1/*-
2 * Copyright (c) 2010 Isilon Systems, Inc.
3 * Copyright (c) 2010 iX Systems, Inc.
4 * Copyright (c) 2010 Panasas, Inc.
5 * Copyright (c) 2013-2018 Mellanox Technologies, Ltd.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice unmodified, this list of conditions, and the following
13 *    disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD$");
32
33#include "opt_stack.h"
34
35#include <sys/param.h>
36#include <sys/systm.h>
37#include <sys/malloc.h>
38#include <sys/kernel.h>
39#include <sys/sysctl.h>
40#include <sys/proc.h>
41#include <sys/sglist.h>
42#include <sys/sleepqueue.h>
43#include <sys/refcount.h>
44#include <sys/lock.h>
45#include <sys/mutex.h>
46#include <sys/bus.h>
47#include <sys/fcntl.h>
48#include <sys/file.h>
49#include <sys/filio.h>
50#include <sys/rwlock.h>
51#include <sys/mman.h>
52#include <sys/stack.h>
53#include <sys/user.h>
54
55#include <vm/vm.h>
56#include <vm/pmap.h>
57#include <vm/vm_object.h>
58#include <vm/vm_page.h>
59#include <vm/vm_pager.h>
60
61#include <machine/stdarg.h>
62
63#if defined(__i386__) || defined(__amd64__)
64#include <machine/md_var.h>
65#endif
66
67#include <linux/kobject.h>
68#include <linux/device.h>
69#include <linux/slab.h>
70#include <linux/module.h>
71#include <linux/moduleparam.h>
72#include <linux/cdev.h>
73#include <linux/file.h>
74#include <linux/sysfs.h>
75#include <linux/mm.h>
76#include <linux/io.h>
77#include <linux/vmalloc.h>
78#include <linux/netdevice.h>
79#include <linux/timer.h>
80#include <linux/interrupt.h>
81#include <linux/uaccess.h>
82#include <linux/list.h>
83#include <linux/kthread.h>
84#include <linux/kernel.h>
85#include <linux/compat.h>
86#include <linux/poll.h>
87#include <linux/smp.h>
88#include <linux/wait_bit.h>
89
90#if defined(__i386__) || defined(__amd64__)
91#include <asm/smp.h>
92#endif
93
94SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW, 0, "LinuxKPI parameters");
95
96int linuxkpi_debug;
97SYSCTL_INT(_compat_linuxkpi, OID_AUTO, debug, CTLFLAG_RWTUN,
98    &linuxkpi_debug, 0, "Set to enable pr_debug() prints. Clear to disable.");
99
100MALLOC_DEFINE(M_KMALLOC, "linux", "Linux kmalloc compat");
101
102#include <linux/rbtree.h>
103/* Undo Linux compat changes. */
104#undef RB_ROOT
105#undef file
106#undef cdev
107#define	RB_ROOT(head)	(head)->rbh_root
108
109static void linux_cdev_deref(struct linux_cdev *ldev);
110static struct vm_area_struct *linux_cdev_handle_find(void *handle);
111
112struct kobject linux_class_root;
113struct device linux_root_device;
114struct class linux_class_misc;
115struct list_head pci_drivers;
116struct list_head pci_devices;
117spinlock_t pci_lock;
118
119unsigned long linux_timer_hz_mask;
120
121wait_queue_head_t linux_bit_waitq;
122wait_queue_head_t linux_var_waitq;
123
124int
125panic_cmp(struct rb_node *one, struct rb_node *two)
126{
127	panic("no cmp");
128}
129
130RB_GENERATE(linux_root, rb_node, __entry, panic_cmp);
131
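/*
 * Set the name of a kobject from a printf-style format string and
 * argument list.  Mirroring Linux behaviour, any '/' characters in the
 * resulting name are replaced by '!' so that the name can be used as a
 * sysfs directory entry.
 */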
132int
133kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list args)
134{
135	va_list tmp_va;
136	int len;
137	char *old;
138	char *name;
139	char dummy;
140
141	old = kobj->name;
142
143	if (old && fmt == NULL)
144		return (0);
145
146	/* compute length of string */
147	va_copy(tmp_va, args);
148	len = vsnprintf(&dummy, 0, fmt, tmp_va);
149	va_end(tmp_va);
150
151	/* account for zero termination */
152	len++;
153
154	/* check for error */
155	if (len < 1)
156		return (-EINVAL);
157
158	/* allocate memory for string */
159	name = kzalloc(len, GFP_KERNEL);
160	if (name == NULL)
161		return (-ENOMEM);
162	vsnprintf(name, len, fmt, args);
163	kobj->name = name;
164
165	/* free old string */
166	kfree(old);
167
168	/* filter new string */
169	for (; *name != '\0'; name++)
170		if (*name == '/')
171			*name = '!';
172	return (0);
173}
174
175int
176kobject_set_name(struct kobject *kobj, const char *fmt, ...)
177{
178	va_list args;
179	int error;
180
181	va_start(args, fmt);
182	error = kobject_set_name_vargs(kobj, fmt, args);
183	va_end(args);
184
185	return (error);
186}
187
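/*
 * Finish adding a kobject: create its sysfs directory and populate it
 * with the ktype's default attributes.  The directory is removed again
 * if creating any attribute file fails.
 */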
188static int
189kobject_add_complete(struct kobject *kobj, struct kobject *parent)
190{
191	const struct kobj_type *t;
192	int error;
193
194	kobj->parent = parent;
195	error = sysfs_create_dir(kobj);
196	if (error == 0 && kobj->ktype && kobj->ktype->default_attrs) {
197		struct attribute **attr;
198		t = kobj->ktype;
199
200		for (attr = t->default_attrs; *attr != NULL; attr++) {
201			error = sysfs_create_file(kobj, *attr);
202			if (error)
203				break;
204		}
205		if (error)
206			sysfs_remove_dir(kobj);
207
208	}
209	return (error);
210}
211
212int
213kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...)
214{
215	va_list args;
216	int error;
217
218	va_start(args, fmt);
219	error = kobject_set_name_vargs(kobj, fmt, args);
220	va_end(args);
221	if (error)
222		return (error);
223
224	return kobject_add_complete(kobj, parent);
225}
226
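/*
 * Final kref release handler for a kobject: remove the sysfs directory
 * and invoke the ktype release callback.  The name is freed last so
 * that the release callback may still reference it.
 */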
227void
228linux_kobject_release(struct kref *kref)
229{
230	struct kobject *kobj;
231	char *name;
232
233	kobj = container_of(kref, struct kobject, kref);
234	sysfs_remove_dir(kobj);
235	name = kobj->name;
236	if (kobj->ktype && kobj->ktype->release)
237		kobj->ktype->release(kobj);
238	kfree(name);
239}
240
241static void
242linux_kobject_kfree(struct kobject *kobj)
243{
244	kfree(kobj);
245}
246
247static void
248linux_kobject_kfree_name(struct kobject *kobj)
249{
250	if (kobj) {
251		kfree(kobj->name);
252	}
253}
254
255const struct kobj_type linux_kfree_type = {
256	.release = linux_kobject_kfree
257};
258
259static void
260linux_device_release(struct device *dev)
261{
262	pr_debug("linux_device_release: %s\n", dev_name(dev));
263	kfree(dev);
264}
265
266static ssize_t
267linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf)
268{
269	struct class_attribute *dattr;
270	ssize_t error;
271
272	dattr = container_of(attr, struct class_attribute, attr);
273	error = -EIO;
274	if (dattr->show)
275		error = dattr->show(container_of(kobj, struct class, kobj),
276		    dattr, buf);
277	return (error);
278}
279
280static ssize_t
281linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf,
282    size_t count)
283{
284	struct class_attribute *dattr;
285	ssize_t error;
286
287	dattr = container_of(attr, struct class_attribute, attr);
288	error = -EIO;
289	if (dattr->store)
290		error = dattr->store(container_of(kobj, struct class, kobj),
291		    dattr, buf, count);
292	return (error);
293}
294
295static void
296linux_class_release(struct kobject *kobj)
297{
298	struct class *class;
299
300	class = container_of(kobj, struct class, kobj);
301	if (class->class_release)
302		class->class_release(class);
303}
304
305static const struct sysfs_ops linux_class_sysfs = {
306	.show  = linux_class_show,
307	.store = linux_class_store,
308};
309
310const struct kobj_type linux_class_ktype = {
311	.release = linux_class_release,
312	.sysfs_ops = &linux_class_sysfs
313};
314
315static void
316linux_dev_release(struct kobject *kobj)
317{
318	struct device *dev;
319
320	dev = container_of(kobj, struct device, kobj);
	/* This is the release precedence defined by Linux. */
322	if (dev->release)
323		dev->release(dev);
324	else if (dev->class && dev->class->dev_release)
325		dev->class->dev_release(dev);
326}
327
328static ssize_t
329linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf)
330{
331	struct device_attribute *dattr;
332	ssize_t error;
333
334	dattr = container_of(attr, struct device_attribute, attr);
335	error = -EIO;
336	if (dattr->show)
337		error = dattr->show(container_of(kobj, struct device, kobj),
338		    dattr, buf);
339	return (error);
340}
341
342static ssize_t
343linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf,
344    size_t count)
345{
346	struct device_attribute *dattr;
347	ssize_t error;
348
349	dattr = container_of(attr, struct device_attribute, attr);
350	error = -EIO;
351	if (dattr->store)
352		error = dattr->store(container_of(kobj, struct device, kobj),
353		    dattr, buf, count);
354	return (error);
355}
356
357static const struct sysfs_ops linux_dev_sysfs = {
358	.show  = linux_dev_show,
359	.store = linux_dev_store,
360};
361
362const struct kobj_type linux_dev_ktype = {
363	.release = linux_dev_release,
364	.sysfs_ops = &linux_dev_sysfs
365};
366
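/*
 * Linux device_create(): allocate and register a new device under the
 * given class with a printf-style name.  The device is freed by
 * linux_device_release() once its last reference is dropped.
 */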
367struct device *
368device_create(struct class *class, struct device *parent, dev_t devt,
369    void *drvdata, const char *fmt, ...)
370{
371	struct device *dev;
372	va_list args;
373
	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
375	dev->parent = parent;
376	dev->class = class;
377	dev->devt = devt;
378	dev->driver_data = drvdata;
379	dev->release = linux_device_release;
380	va_start(args, fmt);
381	kobject_set_name_vargs(&dev->kobj, fmt, args);
382	va_end(args);
383	device_register(dev);
384
385	return (dev);
386}
387
388int
389kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype,
390    struct kobject *parent, const char *fmt, ...)
391{
392	va_list args;
393	int error;
394
395	kobject_init(kobj, ktype);
396	kobj->ktype = ktype;
397	kobj->parent = parent;
398	kobj->name = NULL;
399
400	va_start(args, fmt);
401	error = kobject_set_name_vargs(kobj, fmt, args);
402	va_end(args);
403	if (error)
404		return (error);
405	return kobject_add_complete(kobj, parent);
406}
407
408static void
409linux_kq_lock(void *arg)
410{
411	spinlock_t *s = arg;
412
413	spin_lock(s);
}

static void
416linux_kq_unlock(void *arg)
417{
418	spinlock_t *s = arg;
419
420	spin_unlock(s);
421}
422
423static void
424linux_kq_lock_owned(void *arg)
425{
426#ifdef INVARIANTS
427	spinlock_t *s = arg;
428
429	mtx_assert(&s->m, MA_OWNED);
430#endif
431}
432
433static void
434linux_kq_lock_unowned(void *arg)
435{
436#ifdef INVARIANTS
437	spinlock_t *s = arg;
438
439	mtx_assert(&s->m, MA_NOTOWNED);
440#endif
441}
442
static void linux_file_kqfilter_poll(struct linux_file *, int);
445
446struct linux_file *
447linux_file_alloc(void)
448{
449	struct linux_file *filp;
450
451	filp = kzalloc(sizeof(*filp), GFP_KERNEL);
452
453	/* set initial refcount */
454	filp->f_count = 1;
455
	/* set up fields needed by kqueue support */
457	spin_lock_init(&filp->f_kqlock);
458	knlist_init(&filp->f_selinfo.si_note, &filp->f_kqlock,
459	    linux_kq_lock, linux_kq_unlock,
460	    linux_kq_lock_owned, linux_kq_lock_unowned);
461
462	return (filp);
463}
464
465void
466linux_file_free(struct linux_file *filp)
467{
468	if (filp->_file == NULL) {
469		if (filp->f_shmem != NULL)
470			vm_object_deallocate(filp->f_shmem);
471		kfree(filp);
472	} else {
473		/*
474		 * The close method of the character device or file
475		 * will free the linux_file structure:
476		 */
477		_fdrop(filp->_file, curthread);
478	}
479}
480
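/*
 * OBJT_DEVICE pager fault handler, used for mappings whose vm_ops do
 * not provide a fault() callback.  The faulting offset is turned into
 * a physical address relative to vm_pfn and satisfied by installing a
 * fictitious page.
 */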
481static int
482linux_cdev_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot,
483    vm_page_t *mres)
484{
485	struct vm_area_struct *vmap;
486
487	vmap = linux_cdev_handle_find(vm_obj->handle);
488
489	MPASS(vmap != NULL);
490	MPASS(vmap->vm_private_data == vm_obj->handle);
491
492	if (likely(vmap->vm_ops != NULL && offset < vmap->vm_len)) {
493		vm_paddr_t paddr = IDX_TO_OFF(vmap->vm_pfn) + offset;
494		vm_page_t page;
495
496		if (((*mres)->flags & PG_FICTITIOUS) != 0) {
497			/*
498			 * If the passed in result page is a fake
499			 * page, update it with the new physical
500			 * address.
501			 */
502			page = *mres;
503			vm_page_updatefake(page, paddr, vm_obj->memattr);
504		} else {
			/*
			 * Replace the passed in "mres" page with our
			 * own fake page and free up all of the
			 * original pages.
			 */
510			VM_OBJECT_WUNLOCK(vm_obj);
511			page = vm_page_getfake(paddr, vm_obj->memattr);
512			VM_OBJECT_WLOCK(vm_obj);
513
514			vm_page_replace_checked(page, vm_obj,
515			    (*mres)->pindex, *mres);
516
517			vm_page_lock(*mres);
518			vm_page_free(*mres);
519			vm_page_unlock(*mres);
520			*mres = page;
521		}
522		page->valid = VM_PAGE_BITS_ALL;
523		return (VM_PAGER_OK);
524	}
525	return (VM_PAGER_FAIL);
526}
527
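/*
 * OBJT_MGTDEVICE pager populate handler.  The request is forwarded to
 * the Linux vm_ops->fault() callback and the resulting VM_FAULT_* code
 * is translated into the corresponding VM_PAGER_* value.
 */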
528static int
529linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type,
530    vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
531{
532	struct vm_area_struct *vmap;
533	int err;
534
535	/* get VM area structure */
536	vmap = linux_cdev_handle_find(vm_obj->handle);
537	MPASS(vmap != NULL);
538	MPASS(vmap->vm_private_data == vm_obj->handle);
539
540	VM_OBJECT_WUNLOCK(vm_obj);
541
542	linux_set_current(curthread);
543
544	down_write(&vmap->vm_mm->mmap_sem);
545	if (unlikely(vmap->vm_ops == NULL)) {
546		err = VM_FAULT_SIGBUS;
547	} else {
548		struct vm_fault vmf;
549
550		/* fill out VM fault structure */
551		vmf.virtual_address = (void *)(uintptr_t)IDX_TO_OFF(pidx);
552		vmf.flags = (fault_type & VM_PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
553		vmf.pgoff = 0;
554		vmf.page = NULL;
555		vmf.vma = vmap;
556
557		vmap->vm_pfn_count = 0;
558		vmap->vm_pfn_pcount = &vmap->vm_pfn_count;
559		vmap->vm_obj = vm_obj;
560
561		err = vmap->vm_ops->fault(vmap, &vmf);
562
563		while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) {
564			kern_yield(PRI_USER);
565			err = vmap->vm_ops->fault(vmap, &vmf);
566		}
567	}
568
569	/* translate return code */
570	switch (err) {
571	case VM_FAULT_OOM:
572		err = VM_PAGER_AGAIN;
573		break;
574	case VM_FAULT_SIGBUS:
575		err = VM_PAGER_BAD;
576		break;
577	case VM_FAULT_NOPAGE:
578		/*
579		 * By contract the fault handler will return having
580		 * busied all the pages itself. If pidx is already
581		 * found in the object, it will simply xbusy the first
582		 * page and return with vm_pfn_count set to 1.
583		 */
584		*first = vmap->vm_pfn_first;
585		*last = *first + vmap->vm_pfn_count - 1;
586		err = VM_PAGER_OK;
587		break;
588	default:
589		err = VM_PAGER_ERROR;
590		break;
591	}
592	up_write(&vmap->vm_mm->mmap_sem);
593	VM_OBJECT_WLOCK(vm_obj);
594	return (err);
595}
596
597static struct rwlock linux_vma_lock;
598static TAILQ_HEAD(, vm_area_struct) linux_vma_head =
599    TAILQ_HEAD_INITIALIZER(linux_vma_head);
600
601static void
602linux_cdev_handle_free(struct vm_area_struct *vmap)
603{
604	/* Drop reference on vm_file */
605	if (vmap->vm_file != NULL)
606		fput(vmap->vm_file);
607
608	/* Drop reference on mm_struct */
609	mmput(vmap->vm_mm);
610
611	kfree(vmap);
612}
613
614static void
615linux_cdev_handle_remove(struct vm_area_struct *vmap)
616{
617	rw_wlock(&linux_vma_lock);
618	TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry);
619	rw_wunlock(&linux_vma_lock);
620}
621
622static struct vm_area_struct *
623linux_cdev_handle_find(void *handle)
624{
625	struct vm_area_struct *vmap;
626
627	rw_rlock(&linux_vma_lock);
628	TAILQ_FOREACH(vmap, &linux_vma_head, vm_entry) {
629		if (vmap->vm_private_data == handle)
630			break;
631	}
632	rw_runlock(&linux_vma_lock);
633	return (vmap);
634}
635
636static int
637linux_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
638		      vm_ooffset_t foff, struct ucred *cred, u_short *color)
639{
640
641	MPASS(linux_cdev_handle_find(handle) != NULL);
642	*color = 0;
643	return (0);
644}
645
646static void
647linux_cdev_pager_dtor(void *handle)
648{
649	const struct vm_operations_struct *vm_ops;
650	struct vm_area_struct *vmap;
651
652	vmap = linux_cdev_handle_find(handle);
653	MPASS(vmap != NULL);
654
655	/*
656	 * Remove handle before calling close operation to prevent
657	 * other threads from reusing the handle pointer.
658	 */
659	linux_cdev_handle_remove(vmap);
660
661	down_write(&vmap->vm_mm->mmap_sem);
662	vm_ops = vmap->vm_ops;
663	if (likely(vm_ops != NULL))
664		vm_ops->close(vmap);
665	up_write(&vmap->vm_mm->mmap_sem);
666
667	linux_cdev_handle_free(vmap);
668}
669
670static struct cdev_pager_ops linux_cdev_pager_ops[2] = {
671  {
672	/* OBJT_MGTDEVICE */
673	.cdev_pg_populate	= linux_cdev_pager_populate,
674	.cdev_pg_ctor	= linux_cdev_pager_ctor,
675	.cdev_pg_dtor	= linux_cdev_pager_dtor
676  },
677  {
678	/* OBJT_DEVICE */
679	.cdev_pg_fault	= linux_cdev_pager_fault,
680	.cdev_pg_ctor	= linux_cdev_pager_ctor,
681	.cdev_pg_dtor	= linux_cdev_pager_dtor
682  },
683};
684
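/*
 * Emulate Linux zap_vma_ptes(): remove all physical mappings of the
 * pages backing the given range of the VMA's VM object.
 */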
685int
686zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
687    unsigned long size)
688{
689	vm_object_t obj;
690	vm_page_t m;
691
692	obj = vma->vm_obj;
693	if (obj == NULL || (obj->flags & OBJ_UNMANAGED) != 0)
694		return (-ENOTSUP);
695	VM_OBJECT_RLOCK(obj);
696	for (m = vm_page_find_least(obj, OFF_TO_IDX(address));
697	    m != NULL && m->pindex < OFF_TO_IDX(address + size);
698	    m = TAILQ_NEXT(m, listq))
699		pmap_remove_all(m);
700	VM_OBJECT_RUNLOCK(obj);
701	return (0);
702}
703
704static struct file_operations dummy_ldev_ops = {
705	/* XXXKIB */
706};
707
708static struct linux_cdev dummy_ldev = {
709	.ops = &dummy_ldev_ops,
710};
711
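/*
 * Reference bits kept in linux_cdev.siref: LDEV_SI_DTR marks the
 * device as being destroyed, while transient references taken by
 * linux_get_fop() are counted in units of LDEV_SI_REF.
 */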
712#define	LDEV_SI_DTR	0x0001
713#define	LDEV_SI_REF	0x0002
714
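/*
 * Resolve the file operations and character device backing a Linux
 * file.  Statically allocated cdevs are referenced through their
 * refcount; dynamically allocated ones take a transient siref
 * reference, falling back to the dummy device when destruction is
 * already in progress.  The reference is released by linux_drop_fop().
 */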
715static void
716linux_get_fop(struct linux_file *filp, const struct file_operations **fop,
717    struct linux_cdev **dev)
718{
719	struct linux_cdev *ldev;
720	u_int siref;
721
722	ldev = filp->f_cdev;
723	*fop = filp->f_op;
724	if (ldev != NULL) {
725		if (ldev->kobj.ktype == &linux_cdev_static_ktype) {
726			refcount_acquire(&ldev->refs);
727		} else {
728			for (siref = ldev->siref;;) {
729				if ((siref & LDEV_SI_DTR) != 0) {
730					ldev = &dummy_ldev;
731					*fop = ldev->ops;
732					siref = ldev->siref;
733					MPASS((ldev->siref & LDEV_SI_DTR) == 0);
734				} else if (atomic_fcmpset_int(&ldev->siref,
735				    &siref, siref + LDEV_SI_REF)) {
736					break;
737				}
738			}
739		}
740	}
741	*dev = ldev;
742}
743
744static void
745linux_drop_fop(struct linux_cdev *ldev)
746{
747
748	if (ldev == NULL)
749		return;
750	if (ldev->kobj.ktype == &linux_cdev_static_ktype) {
751		linux_cdev_deref(ldev);
752	} else {
753		MPASS(ldev->kobj.ktype == &linux_cdev_ktype);
754		MPASS((ldev->siref & ~LDEV_SI_DTR) != 0);
755		atomic_subtract_int(&ldev->siref, LDEV_SI_REF);
756	}
757}
758
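/*
 * Invoke a file operation with the calling thread's td_fpop temporarily
 * pointing at the given struct file, so that code consulting td_fpop
 * during the call sees the correct file, and return the result.
 */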
759#define	OPW(fp,td,code) ({			\
760	struct file *__fpop;			\
761	__typeof(code) __retval;		\
762						\
763	__fpop = (td)->td_fpop;			\
764	(td)->td_fpop = (fp);			\
765	__retval = (code);			\
766	(td)->td_fpop = __fpop;			\
767	__retval;				\
768})
769
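/*
 * cdevsw d_fdopen handler: allocate a struct linux_file, call the
 * driver's open() method and take over the struct file via finit().
 * Returning ENXIO after finit() is the convention that tells the
 * caller the open has been completed with private file operations.
 */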
770static int
771linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td,
772    struct file *file)
773{
774	struct linux_cdev *ldev;
775	struct linux_file *filp;
776	const struct file_operations *fop;
777	int error;
778
779	ldev = dev->si_drv1;
780
781	filp = linux_file_alloc();
782	filp->f_dentry = &filp->f_dentry_store;
783	filp->f_op = ldev->ops;
784	filp->f_mode = file->f_flag;
785	filp->f_flags = file->f_flag;
786	filp->f_vnode = file->f_vnode;
787	filp->_file = file;
788	refcount_acquire(&ldev->refs);
789	filp->f_cdev = ldev;
790
791	linux_set_current(td);
792	linux_get_fop(filp, &fop, &ldev);
793
794	if (fop->open != NULL) {
795		error = -fop->open(file->f_vnode, filp);
796		if (error != 0) {
797			linux_drop_fop(ldev);
798			linux_cdev_deref(filp->f_cdev);
799			kfree(filp);
800			return (error);
801		}
802	}
803
804	/* hold on to the vnode - used for fstat() */
805	vhold(filp->f_vnode);
806
807	/* release the file from devfs */
808	finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops);
809	linux_drop_fop(ldev);
810	return (ENXIO);
811}
812
813#define	LINUX_IOCTL_MIN_PTR 0x10000UL
814#define	LINUX_IOCTL_MAX_PTR (LINUX_IOCTL_MIN_PTR + IOCPARM_MAX)
815
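/*
 * Helper for linux_copyin()/linux_copyout(): addresses inside the
 * LINUX_IOCTL_MIN_PTR..LINUX_IOCTL_MAX_PTR window are pseudo user-space
 * pointers handed to Linux ioctl handlers and are rewritten to point
 * into the kernel buffer set up by linux_file_ioctl_sub().  Returns
 * non-zero when the address was handled, with *uaddr set to NULL on
 * failure.
 */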
816static inline int
817linux_remap_address(void **uaddr, size_t len)
818{
819	uintptr_t uaddr_val = (uintptr_t)(*uaddr);
820
821	if (unlikely(uaddr_val >= LINUX_IOCTL_MIN_PTR &&
822	    uaddr_val < LINUX_IOCTL_MAX_PTR)) {
823		struct task_struct *pts = current;
824		if (pts == NULL) {
825			*uaddr = NULL;
826			return (1);
827		}
828
829		/* compute data offset */
830		uaddr_val -= LINUX_IOCTL_MIN_PTR;
831
832		/* check that length is within bounds */
833		if ((len > IOCPARM_MAX) ||
834		    (uaddr_val + len) > pts->bsd_ioctl_len) {
835			*uaddr = NULL;
836			return (1);
837		}
838
839		/* re-add kernel buffer address */
840		uaddr_val += (uintptr_t)pts->bsd_ioctl_data;
841
842		/* update address location */
843		*uaddr = (void *)uaddr_val;
844		return (1);
845	}
846	return (0);
847}
848
849int
850linux_copyin(const void *uaddr, void *kaddr, size_t len)
851{
852	if (linux_remap_address(__DECONST(void **, &uaddr), len)) {
853		if (uaddr == NULL)
854			return (-EFAULT);
855		memcpy(kaddr, uaddr, len);
856		return (0);
857	}
858	return (-copyin(uaddr, kaddr, len));
859}
860
861int
862linux_copyout(const void *kaddr, void *uaddr, size_t len)
863{
864	if (linux_remap_address(&uaddr, len)) {
865		if (uaddr == NULL)
866			return (-EFAULT);
867		memcpy(uaddr, kaddr, len);
868		return (0);
869	}
870	return (-copyout(kaddr, uaddr, len));
871}
872
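/*
 * Emulate Linux clear_user(): zero a range of user memory.  Returns 0
 * on success or the original length if any byte could not be written.
 */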
873size_t
874linux_clear_user(void *_uaddr, size_t _len)
875{
876	uint8_t *uaddr = _uaddr;
877	size_t len = _len;
878
879	/* make sure uaddr is aligned before going into the fast loop */
880	while (((uintptr_t)uaddr & 7) != 0 && len > 7) {
881		if (subyte(uaddr, 0))
882			return (_len);
883		uaddr++;
884		len--;
885	}
886
887	/* zero 8 bytes at a time */
888	while (len > 7) {
889#ifdef __LP64__
890		if (suword64(uaddr, 0))
891			return (_len);
892#else
893		if (suword32(uaddr, 0))
894			return (_len);
895		if (suword32(uaddr + 4, 0))
896			return (_len);
897#endif
898		uaddr += 8;
899		len -= 8;
900	}
901
902	/* zero fill end, if any */
903	while (len > 0) {
904		if (subyte(uaddr, 0))
905			return (_len);
906		uaddr++;
907		len--;
908	}
909	return (0);
910}
911
912int
913linux_access_ok(const void *uaddr, size_t len)
914{
915	uintptr_t saddr;
916	uintptr_t eaddr;
917
918	/* get start and end address */
919	saddr = (uintptr_t)uaddr;
920	eaddr = (uintptr_t)uaddr + len;
921
922	/* verify addresses are valid for userspace */
923	return ((saddr == eaddr) ||
924	    (eaddr > saddr && eaddr <= VM_MAXUSER_ADDRESS));
925}
926
927/*
928 * This function should return either EINTR or ERESTART depending on
929 * the signal type sent to this thread:
930 */
931static int
932linux_get_error(struct task_struct *task, int error)
933{
934	/* check for signal type interrupt code */
935	if (error == EINTR || error == ERESTARTSYS || error == ERESTART) {
936		error = -linux_schedule_get_interrupt_value(task);
937		if (error == 0)
938			error = EINTR;
939	}
940	return (error);
941}
942
943static int
944linux_file_ioctl_sub(struct file *fp, struct linux_file *filp,
945    const struct file_operations *fop, u_long cmd, caddr_t data,
946    struct thread *td)
947{
948	struct task_struct *task = current;
949	unsigned size;
950	int error;
951
952	size = IOCPARM_LEN(cmd);
953	/* refer to logic in sys_ioctl() */
954	if (size > 0) {
955		/*
956		 * Setup hint for linux_copyin() and linux_copyout().
957		 *
958		 * Background: Linux code expects a user-space address
959		 * while FreeBSD supplies a kernel-space address.
960		 */
961		task->bsd_ioctl_data = data;
962		task->bsd_ioctl_len = size;
963		data = (void *)LINUX_IOCTL_MIN_PTR;
964	} else {
965		/* fetch user-space pointer */
966		data = *(void **)data;
967	}
968#if defined(__amd64__)
969	if (td->td_proc->p_elf_machine == EM_386) {
970		/* try the compat IOCTL handler first */
971		if (fop->compat_ioctl != NULL) {
972			error = -OPW(fp, td, fop->compat_ioctl(filp,
973			    cmd, (u_long)data));
974		} else {
975			error = ENOTTY;
976		}
977
978		/* fallback to the regular IOCTL handler, if any */
979		if (error == ENOTTY && fop->unlocked_ioctl != NULL) {
980			error = -OPW(fp, td, fop->unlocked_ioctl(filp,
981			    cmd, (u_long)data));
982		}
983	} else
984#endif
985	{
986		if (fop->unlocked_ioctl != NULL) {
987			error = -OPW(fp, td, fop->unlocked_ioctl(filp,
988			    cmd, (u_long)data));
989		} else {
990			error = ENOTTY;
991		}
992	}
993	if (size > 0) {
994		task->bsd_ioctl_data = NULL;
995		task->bsd_ioctl_len = 0;
996	}
997
998	if (error == EWOULDBLOCK) {
999		/* update kqfilter status, if any */
1000		linux_file_kqfilter_poll(filp,
1001		    LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
1002	} else {
1003		error = linux_get_error(task, error);
1004	}
1005	return (error);
1006}
1007
1008#define	LINUX_POLL_TABLE_NORMAL ((poll_table *)1)
1009
1010/*
1011 * This function atomically updates the poll wakeup state and returns
1012 * the previous state at the time of update.
1013 */
1014static uint8_t
1015linux_poll_wakeup_state(atomic_t *v, const uint8_t *pstate)
1016{
1017	int c, old;
1018
1019	c = v->counter;
1020
1021	while ((old = atomic_cmpxchg(v, c, pstate[c])) != c)
1022		c = old;
1023
1024	return (c);
1025}
1026
1028static int
1029linux_poll_wakeup_callback(wait_queue_t *wq, unsigned int wq_state, int flags, void *key)
1030{
1031	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
1032		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */
1033		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */
1034		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_READY,
1035		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_READY, /* NOP */
1036	};
1037	struct linux_file *filp = container_of(wq, struct linux_file, f_wait_queue.wq);
1038
1039	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
1040	case LINUX_FWQ_STATE_QUEUED:
1041		linux_poll_wakeup(filp);
1042		return (1);
1043	default:
1044		return (0);
1045	}
1046}
1047
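/*
 * Emulate Linux poll_wait(): record select interest when called from
 * the select/poll path and, on first use, attach the file's wait queue
 * entry to the given wait queue head so that wake-ups are translated
 * into selwakeup() and kqueue events.
 */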
1048void
1049linux_poll_wait(struct linux_file *filp, wait_queue_head_t *wqh, poll_table *p)
1050{
1051	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
1052		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_NOT_READY,
1053		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */
1054		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_QUEUED, /* NOP */
1055		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_QUEUED,
1056	};
1057
1058	/* check if we are called inside the select system call */
1059	if (p == LINUX_POLL_TABLE_NORMAL)
1060		selrecord(curthread, &filp->f_selinfo);
1061
1062	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
1063	case LINUX_FWQ_STATE_INIT:
1064		/* NOTE: file handles can only belong to one wait-queue */
1065		filp->f_wait_queue.wqh = wqh;
1066		filp->f_wait_queue.wq.func = &linux_poll_wakeup_callback;
1067		add_wait_queue(wqh, &filp->f_wait_queue.wq);
1068		atomic_set(&filp->f_wait_queue.state, LINUX_FWQ_STATE_QUEUED);
1069		break;
1070	default:
1071		break;
1072	}
1073}
1074
1075static void
1076linux_poll_wait_dequeue(struct linux_file *filp)
1077{
1078	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
1079		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT,	/* NOP */
1080		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_INIT,
1081		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_INIT,
1082		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_INIT,
1083	};
1084
1085	seldrain(&filp->f_selinfo);
1086
1087	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
1088	case LINUX_FWQ_STATE_NOT_READY:
1089	case LINUX_FWQ_STATE_QUEUED:
1090	case LINUX_FWQ_STATE_READY:
1091		remove_wait_queue(filp->f_wait_queue.wqh, &filp->f_wait_queue.wq);
1092		break;
1093	default:
1094		break;
1095	}
1096}
1097
1098void
1099linux_poll_wakeup(struct linux_file *filp)
1100{
1101	/* this function should be NULL-safe */
1102	if (filp == NULL)
1103		return;
1104
1105	selwakeup(&filp->f_selinfo);
1106
1107	spin_lock(&filp->f_kqlock);
1108	filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ |
1109	    LINUX_KQ_FLAG_NEED_WRITE;
1110
1111	/* make sure the "knote" gets woken up */
1112	KNOTE_LOCKED(&filp->f_selinfo.si_note, 1);
1113	spin_unlock(&filp->f_kqlock);
1114}
1115
1116static void
1117linux_file_kqfilter_detach(struct knote *kn)
1118{
1119	struct linux_file *filp = kn->kn_hook;
1120
1121	spin_lock(&filp->f_kqlock);
1122	knlist_remove(&filp->f_selinfo.si_note, kn, 1);
1123	spin_unlock(&filp->f_kqlock);
1124}
1125
1126static int
1127linux_file_kqfilter_read_event(struct knote *kn, long hint)
1128{
1129	struct linux_file *filp = kn->kn_hook;
1130
1131	mtx_assert(&filp->f_kqlock.m, MA_OWNED);
1132
1133	return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_READ) ? 1 : 0);
1134}
1135
1136static int
1137linux_file_kqfilter_write_event(struct knote *kn, long hint)
1138{
1139	struct linux_file *filp = kn->kn_hook;
1140
1141	mtx_assert(&filp->f_kqlock.m, MA_OWNED);
1142
1143	return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_WRITE) ? 1 : 0);
1144}
1145
1146static struct filterops linux_dev_kqfiltops_read = {
1147	.f_isfd = 1,
1148	.f_detach = linux_file_kqfilter_detach,
1149	.f_event = linux_file_kqfilter_read_event,
1150};
1151
1152static struct filterops linux_dev_kqfiltops_write = {
1153	.f_isfd = 1,
1154	.f_detach = linux_file_kqfilter_detach,
1155	.f_event = linux_file_kqfilter_write_event,
1156};
1157
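/*
 * Re-evaluate the driver's poll() state and latch the read/write
 * readiness flags, activating any knotes attached to the file.
 */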
1158static void
1159linux_file_kqfilter_poll(struct linux_file *filp, int kqflags)
1160{
1161	struct thread *td;
1162	const struct file_operations *fop;
1163	struct linux_cdev *ldev;
1164	int temp;
1165
1166	if ((filp->f_kqflags & kqflags) == 0)
1167		return;
1168
1169	td = curthread;
1170
1171	linux_get_fop(filp, &fop, &ldev);
1172	/* get the latest polling state */
1173	temp = OPW(filp->_file, td, fop->poll(filp, NULL));
1174	linux_drop_fop(ldev);
1175
1176	spin_lock(&filp->f_kqlock);
1177	/* clear kqflags */
1178	filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ |
1179	    LINUX_KQ_FLAG_NEED_WRITE);
1180	/* update kqflags */
1181	if ((temp & (POLLIN | POLLOUT)) != 0) {
1182		if ((temp & POLLIN) != 0)
1183			filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ;
1184		if ((temp & POLLOUT) != 0)
1185			filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE;
1186
1187		/* make sure the "knote" gets woken up */
1188		KNOTE_LOCKED(&filp->f_selinfo.si_note, 0);
1189	}
1190	spin_unlock(&filp->f_kqlock);
1191}
1192
1193static int
1194linux_file_kqfilter(struct file *file, struct knote *kn)
1195{
1196	struct linux_file *filp;
1197	struct thread *td;
1198	int error;
1199
1200	td = curthread;
1201	filp = (struct linux_file *)file->f_data;
1202	filp->f_flags = file->f_flag;
1203	if (filp->f_op->poll == NULL)
1204		return (EINVAL);
1205
1206	spin_lock(&filp->f_kqlock);
1207	switch (kn->kn_filter) {
1208	case EVFILT_READ:
1209		filp->f_kqflags |= LINUX_KQ_FLAG_HAS_READ;
1210		kn->kn_fop = &linux_dev_kqfiltops_read;
1211		kn->kn_hook = filp;
1212		knlist_add(&filp->f_selinfo.si_note, kn, 1);
1213		error = 0;
1214		break;
1215	case EVFILT_WRITE:
1216		filp->f_kqflags |= LINUX_KQ_FLAG_HAS_WRITE;
1217		kn->kn_fop = &linux_dev_kqfiltops_write;
1218		kn->kn_hook = filp;
1219		knlist_add(&filp->f_selinfo.si_note, kn, 1);
1220		error = 0;
1221		break;
1222	default:
1223		error = EINVAL;
1224		break;
1225	}
1226	spin_unlock(&filp->f_kqlock);
1227
1228	if (error == 0) {
1229		linux_set_current(td);
1230
1231		/* update kqfilter status, if any */
1232		linux_file_kqfilter_poll(filp,
1233		    LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
1234	}
1235	return (error);
1236}
1237
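/*
 * Implement mmap for Linux character devices: build a vm_area_struct,
 * call the driver's mmap() method and convert the result into a VM
 * object, either a device pager object (managed when a fault() handler
 * is installed) or an OBJT_SG object describing the physical range.
 */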
1238static int
1239linux_file_mmap_single(struct file *fp, const struct file_operations *fop,
1240    vm_ooffset_t *offset, vm_size_t size, struct vm_object **object,
1241    int nprot, struct thread *td)
1242{
1243	struct task_struct *task;
1244	struct vm_area_struct *vmap;
1245	struct mm_struct *mm;
1246	struct linux_file *filp;
1247	vm_memattr_t attr;
1248	int error;
1249
1250	filp = (struct linux_file *)fp->f_data;
1251	filp->f_flags = fp->f_flag;
1252
1253	if (fop->mmap == NULL)
1254		return (EOPNOTSUPP);
1255
1256	linux_set_current(td);
1257
1258	/*
1259	 * The same VM object might be shared by multiple processes
1260	 * and the mm_struct is usually freed when a process exits.
1261	 *
1262	 * The atomic reference below makes sure the mm_struct is
1263	 * available as long as the vmap is in the linux_vma_head.
1264	 */
1265	task = current;
1266	mm = task->mm;
1267	if (atomic_inc_not_zero(&mm->mm_users) == 0)
1268		return (EINVAL);
1269
1270	vmap = kzalloc(sizeof(*vmap), GFP_KERNEL);
1271	vmap->vm_start = 0;
1272	vmap->vm_end = size;
1273	vmap->vm_pgoff = *offset / PAGE_SIZE;
1274	vmap->vm_pfn = 0;
1275	vmap->vm_flags = vmap->vm_page_prot = (nprot & VM_PROT_ALL);
1276	vmap->vm_ops = NULL;
1277	vmap->vm_file = get_file(filp);
1278	vmap->vm_mm = mm;
1279
1280	if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) {
1281		error = linux_get_error(task, EINTR);
1282	} else {
1283		error = -OPW(fp, td, fop->mmap(filp, vmap));
1284		error = linux_get_error(task, error);
1285		up_write(&vmap->vm_mm->mmap_sem);
1286	}
1287
1288	if (error != 0) {
1289		linux_cdev_handle_free(vmap);
1290		return (error);
1291	}
1292
1293	attr = pgprot2cachemode(vmap->vm_page_prot);
1294
1295	if (vmap->vm_ops != NULL) {
1296		struct vm_area_struct *ptr;
1297		void *vm_private_data;
1298		bool vm_no_fault;
1299
1300		if (vmap->vm_ops->open == NULL ||
1301		    vmap->vm_ops->close == NULL ||
1302		    vmap->vm_private_data == NULL) {
1303			/* free allocated VM area struct */
1304			linux_cdev_handle_free(vmap);
1305			return (EINVAL);
1306		}
1307
1308		vm_private_data = vmap->vm_private_data;
1309
1310		rw_wlock(&linux_vma_lock);
1311		TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) {
1312			if (ptr->vm_private_data == vm_private_data)
1313				break;
1314		}
1315		/* check if there is an existing VM area struct */
1316		if (ptr != NULL) {
1317			/* check if the VM area structure is invalid */
1318			if (ptr->vm_ops == NULL ||
1319			    ptr->vm_ops->open == NULL ||
1320			    ptr->vm_ops->close == NULL) {
1321				error = ESTALE;
1322				vm_no_fault = 1;
1323			} else {
1324				error = EEXIST;
1325				vm_no_fault = (ptr->vm_ops->fault == NULL);
1326			}
1327		} else {
1328			/* insert VM area structure into list */
1329			TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry);
1330			error = 0;
1331			vm_no_fault = (vmap->vm_ops->fault == NULL);
1332		}
1333		rw_wunlock(&linux_vma_lock);
1334
1335		if (error != 0) {
1336			/* free allocated VM area struct */
1337			linux_cdev_handle_free(vmap);
1338			/* check for stale VM area struct */
1339			if (error != EEXIST)
1340				return (error);
1341		}
1342
1343		/* check if there is no fault handler */
1344		if (vm_no_fault) {
1345			*object = cdev_pager_allocate(vm_private_data, OBJT_DEVICE,
1346			    &linux_cdev_pager_ops[1], size, nprot, *offset,
1347			    td->td_ucred);
1348		} else {
1349			*object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE,
1350			    &linux_cdev_pager_ops[0], size, nprot, *offset,
1351			    td->td_ucred);
1352		}
1353
1354		/* check if allocating the VM object failed */
1355		if (*object == NULL) {
1356			if (error == 0) {
1357				/* remove VM area struct from list */
1358				linux_cdev_handle_remove(vmap);
1359				/* free allocated VM area struct */
1360				linux_cdev_handle_free(vmap);
1361			}
1362			return (EINVAL);
1363		}
1364	} else {
1365		struct sglist *sg;
1366
1367		sg = sglist_alloc(1, M_WAITOK);
1368		sglist_append_phys(sg,
1369		    (vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len);
1370
1371		*object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len,
1372		    nprot, 0, td->td_ucred);
1373
1374		linux_cdev_handle_free(vmap);
1375
1376		if (*object == NULL) {
1377			sglist_free(sg);
1378			return (EINVAL);
1379		}
1380	}
1381
1382	if (attr != VM_MEMATTR_DEFAULT) {
1383		VM_OBJECT_WLOCK(*object);
1384		vm_object_set_memattr(*object, attr);
1385		VM_OBJECT_WUNLOCK(*object);
1386	}
1387	*offset = 0;
1388	return (0);
1389}
1390
1391struct cdevsw linuxcdevsw = {
1392	.d_version = D_VERSION,
1393	.d_fdopen = linux_dev_fdopen,
1394	.d_name = "lkpidev",
1395};
1396
1397static int
1398linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred,
1399    int flags, struct thread *td)
1400{
1401	struct linux_file *filp;
1402	const struct file_operations *fop;
1403	struct linux_cdev *ldev;
1404	ssize_t bytes;
1405	int error;
1406
1407	error = 0;
1408	filp = (struct linux_file *)file->f_data;
1409	filp->f_flags = file->f_flag;
1410	/* XXX no support for I/O vectors currently */
1411	if (uio->uio_iovcnt != 1)
1412		return (EOPNOTSUPP);
1413	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
1414		return (EINVAL);
1415	linux_set_current(td);
1416	linux_get_fop(filp, &fop, &ldev);
1417	if (fop->read != NULL) {
1418		bytes = OPW(file, td, fop->read(filp,
1419		    uio->uio_iov->iov_base,
1420		    uio->uio_iov->iov_len, &uio->uio_offset));
1421		if (bytes >= 0) {
1422			uio->uio_iov->iov_base =
1423			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
1424			uio->uio_iov->iov_len -= bytes;
1425			uio->uio_resid -= bytes;
1426		} else {
1427			error = linux_get_error(current, -bytes);
1428		}
1429	} else
1430		error = ENXIO;
1431
1432	/* update kqfilter status, if any */
1433	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ);
1434	linux_drop_fop(ldev);
1435
1436	return (error);
1437}
1438
1439static int
1440linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred,
1441    int flags, struct thread *td)
1442{
1443	struct linux_file *filp;
1444	const struct file_operations *fop;
1445	struct linux_cdev *ldev;
1446	ssize_t bytes;
1447	int error;
1448
1449	filp = (struct linux_file *)file->f_data;
1450	filp->f_flags = file->f_flag;
1451	/* XXX no support for I/O vectors currently */
1452	if (uio->uio_iovcnt != 1)
1453		return (EOPNOTSUPP);
1454	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
1455		return (EINVAL);
1456	linux_set_current(td);
1457	linux_get_fop(filp, &fop, &ldev);
1458	if (fop->write != NULL) {
1459		bytes = OPW(file, td, fop->write(filp,
1460		    uio->uio_iov->iov_base,
1461		    uio->uio_iov->iov_len, &uio->uio_offset));
1462		if (bytes >= 0) {
1463			uio->uio_iov->iov_base =
1464			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
1465			uio->uio_iov->iov_len -= bytes;
1466			uio->uio_resid -= bytes;
1467			error = 0;
1468		} else {
1469			error = linux_get_error(current, -bytes);
1470		}
1471	} else
1472		error = ENXIO;
1473
1474	/* update kqfilter status, if any */
1475	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE);
1476
1477	linux_drop_fop(ldev);
1478
1479	return (error);
1480}
1481
1482static int
1483linux_file_poll(struct file *file, int events, struct ucred *active_cred,
1484    struct thread *td)
1485{
1486	struct linux_file *filp;
1487	const struct file_operations *fop;
1488	struct linux_cdev *ldev;
1489	int revents;
1490
1491	filp = (struct linux_file *)file->f_data;
1492	filp->f_flags = file->f_flag;
1493	linux_set_current(td);
1494	linux_get_fop(filp, &fop, &ldev);
1495	if (fop->poll != NULL) {
1496		revents = OPW(file, td, fop->poll(filp,
1497		    LINUX_POLL_TABLE_NORMAL)) & events;
1498	} else {
1499		revents = 0;
1500	}
1501	linux_drop_fop(ldev);
1502	return (revents);
1503}
1504
1505static int
1506linux_file_close(struct file *file, struct thread *td)
1507{
1508	struct linux_file *filp;
1509	int (*release)(struct inode *, struct linux_file *);
1510	const struct file_operations *fop;
1511	struct linux_cdev *ldev;
1512	int error;
1513
1514	filp = (struct linux_file *)file->f_data;
1515
1516	KASSERT(file_count(filp) == 0,
1517	    ("File refcount(%d) is not zero", file_count(filp)));
1518
1519	if (td == NULL)
1520		td = curthread;
1521
1522	error = 0;
1523	filp->f_flags = file->f_flag;
1524	linux_set_current(td);
1525	linux_poll_wait_dequeue(filp);
1526	linux_get_fop(filp, &fop, &ldev);
1527	/*
1528	 * Always use the real release function, if any, to avoid
1529	 * leaking device resources:
1530	 */
1531	release = filp->f_op->release;
1532	if (release != NULL)
1533		error = -OPW(file, td, release(filp->f_vnode, filp));
1534	funsetown(&filp->f_sigio);
1535	if (filp->f_vnode != NULL)
1536		vdrop(filp->f_vnode);
1537	linux_drop_fop(ldev);
1538	ldev = filp->f_cdev;
1539	if (ldev != NULL)
1540		linux_cdev_deref(ldev);
1541	kfree(filp);
1542
1543	return (error);
1544}
1545
1546static int
1547linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred,
1548    struct thread *td)
1549{
1550	struct linux_file *filp;
1551	const struct file_operations *fop;
1552	struct linux_cdev *ldev;
1553	struct fiodgname_arg *fgn;
1554	const char *p;
1555	int error, i;
1556
1557	error = 0;
1558	filp = (struct linux_file *)fp->f_data;
1559	filp->f_flags = fp->f_flag;
1560	linux_get_fop(filp, &fop, &ldev);
1561
1562	linux_set_current(td);
1563	switch (cmd) {
1564	case FIONBIO:
1565		break;
1566	case FIOASYNC:
1567		if (fop->fasync == NULL)
1568			break;
1569		error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC));
1570		break;
1571	case FIOSETOWN:
1572		error = fsetown(*(int *)data, &filp->f_sigio);
1573		if (error == 0) {
1574			if (fop->fasync == NULL)
1575				break;
1576			error = -OPW(fp, td, fop->fasync(0, filp,
1577			    fp->f_flag & FASYNC));
1578		}
1579		break;
1580	case FIOGETOWN:
1581		*(int *)data = fgetown(&filp->f_sigio);
1582		break;
1583	case FIODGNAME:
1584		if (filp->f_cdev == NULL || filp->f_cdev->cdev == NULL) {
1585			error = ENXIO;
1586			break;
1587		}
1588		fgn = data;
1589		p = devtoname(filp->f_cdev->cdev);
1590		i = strlen(p) + 1;
1591		if (i > fgn->len) {
1592			error = EINVAL;
1593			break;
1594		}
1595		error = copyout(p, fgn->buf, i);
1596		break;
1597	default:
1598		error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td);
1599		break;
1600	}
1601	linux_drop_fop(ldev);
1602	return (error);
1603}
1604
1605static int
1606linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot,
1607    vm_prot_t *maxprotp, int *flagsp, struct file *fp,
1608    vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp)
1609{
1610	/*
1611	 * Character devices do not provide private mappings
1612	 * of any kind:
1613	 */
1614	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
1615	    (prot & VM_PROT_WRITE) != 0)
1616		return (EACCES);
1617	if ((*flagsp & (MAP_PRIVATE | MAP_COPY)) != 0)
1618		return (EINVAL);
1619
1620	return (linux_file_mmap_single(fp, fop, foff, objsize, objp,
1621	    (int)prot, td));
1622}
1623
1624static int
1625linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
1626    vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
1627    struct thread *td)
1628{
1629	struct linux_file *filp;
1630	const struct file_operations *fop;
1631	struct linux_cdev *ldev;
1632	struct mount *mp;
1633	struct vnode *vp;
1634	vm_object_t object;
1635	vm_prot_t maxprot;
1636	int error;
1637
1638	filp = (struct linux_file *)fp->f_data;
1639
1640	vp = filp->f_vnode;
1641	if (vp == NULL)
1642		return (EOPNOTSUPP);
1643
1644	/*
1645	 * Ensure that file and memory protections are
1646	 * compatible.
1647	 */
1648	mp = vp->v_mount;
1649	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
1650		maxprot = VM_PROT_NONE;
1651		if ((prot & VM_PROT_EXECUTE) != 0)
1652			return (EACCES);
1653	} else
1654		maxprot = VM_PROT_EXECUTE;
1655	if ((fp->f_flag & FREAD) != 0)
1656		maxprot |= VM_PROT_READ;
1657	else if ((prot & VM_PROT_READ) != 0)
1658		return (EACCES);
1659
1660	/*
1661	 * If we are sharing potential changes via MAP_SHARED and we
1662	 * are trying to get write permission although we opened it
1663	 * without asking for it, bail out.
1664	 *
1665	 * Note that most character devices always share mappings.
1666	 *
1667	 * Rely on linux_file_mmap_sub() to fail invalid MAP_PRIVATE
1668	 * requests rather than doing it here.
1669	 */
1670	if ((flags & MAP_SHARED) != 0) {
1671		if ((fp->f_flag & FWRITE) != 0)
1672			maxprot |= VM_PROT_WRITE;
1673		else if ((prot & VM_PROT_WRITE) != 0)
1674			return (EACCES);
1675	}
1676	maxprot &= cap_maxprot;
1677
1678	linux_get_fop(filp, &fop, &ldev);
1679	error = linux_file_mmap_sub(td, size, prot, &maxprot, &flags, fp,
1680	    &foff, fop, &object);
1681	if (error != 0)
1682		goto out;
1683
1684	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
1685	    foff, FALSE, td);
1686	if (error != 0)
1687		vm_object_deallocate(object);
1688out:
1689	linux_drop_fop(ldev);
1690	return (error);
1691}
1692
1693static int
1694linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
1695    struct thread *td)
1696{
1697	struct linux_file *filp;
1698	struct vnode *vp;
1699	int error;
1700
1701	filp = (struct linux_file *)fp->f_data;
1702	if (filp->f_vnode == NULL)
1703		return (EOPNOTSUPP);
1704
1705	vp = filp->f_vnode;
1706
1707	vn_lock(vp, LK_SHARED | LK_RETRY);
1708	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
1709	VOP_UNLOCK(vp, 0);
1710
1711	return (error);
1712}
1713
1714static int
1715linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif,
1716    struct filedesc *fdp)
1717{
1718	struct linux_file *filp;
1719	struct vnode *vp;
1720	int error;
1721
1722	filp = fp->f_data;
1723	vp = filp->f_vnode;
1724	if (vp == NULL) {
1725		error = 0;
1726		kif->kf_type = KF_TYPE_DEV;
1727	} else {
1728		vref(vp);
1729		FILEDESC_SUNLOCK(fdp);
1730		error = vn_fill_kinfo_vnode(vp, kif);
1731		vrele(vp);
1732		kif->kf_type = KF_TYPE_VNODE;
1733		FILEDESC_SLOCK(fdp);
1734	}
1735	return (error);
1736}
1737
1738unsigned int
1739linux_iminor(struct inode *inode)
1740{
1741	struct linux_cdev *ldev;
1742
1743	if (inode == NULL || inode->v_rdev == NULL ||
1744	    inode->v_rdev->si_devsw != &linuxcdevsw)
1745		return (-1U);
1746	ldev = inode->v_rdev->si_drv1;
1747	if (ldev == NULL)
1748		return (-1U);
1749
1750	return (minor(ldev->dev));
1751}
1752
1753struct fileops linuxfileops = {
1754	.fo_read = linux_file_read,
1755	.fo_write = linux_file_write,
1756	.fo_truncate = invfo_truncate,
1757	.fo_kqfilter = linux_file_kqfilter,
1758	.fo_stat = linux_file_stat,
1759	.fo_fill_kinfo = linux_file_fill_kinfo,
1760	.fo_poll = linux_file_poll,
1761	.fo_close = linux_file_close,
1762	.fo_ioctl = linux_file_ioctl,
1763	.fo_mmap = linux_file_mmap,
1764	.fo_chmod = invfo_chmod,
1765	.fo_chown = invfo_chown,
1766	.fo_sendfile = invfo_sendfile,
1767	.fo_flags = DFLAG_PASSABLE,
1768};
1769
1770/*
1771 * Hash of vmmap addresses.  This is infrequently accessed and does not
1772 * need to be particularly large.  This is done because we must store the
1773 * caller's idea of the map size to properly unmap.
1774 */
1775struct vmmap {
1776	LIST_ENTRY(vmmap)	vm_next;
1777	void 			*vm_addr;
1778	unsigned long		vm_size;
1779};
1780
1781struct vmmaphd {
1782	struct vmmap *lh_first;
1783};
1784#define	VMMAP_HASH_SIZE	64
1785#define	VMMAP_HASH_MASK	(VMMAP_HASH_SIZE - 1)
#define	VM_HASH(addr)	(((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK)
1787static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE];
1788static struct mtx vmmaplock;
1789
1790static void
1791vmmap_add(void *addr, unsigned long size)
1792{
1793	struct vmmap *vmmap;
1794
1795	vmmap = kmalloc(sizeof(*vmmap), GFP_KERNEL);
1796	mtx_lock(&vmmaplock);
1797	vmmap->vm_size = size;
1798	vmmap->vm_addr = addr;
1799	LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], vmmap, vm_next);
1800	mtx_unlock(&vmmaplock);
1801}
1802
1803static struct vmmap *
1804vmmap_remove(void *addr)
1805{
1806	struct vmmap *vmmap;
1807
1808	mtx_lock(&vmmaplock);
1809	LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next)
1810		if (vmmap->vm_addr == addr)
1811			break;
1812	if (vmmap)
1813		LIST_REMOVE(vmmap, vm_next);
1814	mtx_unlock(&vmmaplock);
1815
1816	return (vmmap);
1817}
1818
1819#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__)
1820void *
1821_ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr)
1822{
1823	void *addr;
1824
1825	addr = pmap_mapdev_attr(phys_addr, size, attr);
1826	if (addr == NULL)
1827		return (NULL);
1828	vmmap_add(addr, size);
1829
1830	return (addr);
1831}
1832#endif
1833
1834void
1835iounmap(void *addr)
1836{
1837	struct vmmap *vmmap;
1838
1839	vmmap = vmmap_remove(addr);
1840	if (vmmap == NULL)
1841		return;
1842#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__)
1843	pmap_unmapdev((vm_offset_t)addr, vmmap->vm_size);
1844#endif
1845	kfree(vmmap);
1846}
1847
1849void *
1850vmap(struct page **pages, unsigned int count, unsigned long flags, int prot)
1851{
1852	vm_offset_t off;
1853	size_t size;
1854
1855	size = count * PAGE_SIZE;
1856	off = kva_alloc(size);
1857	if (off == 0)
1858		return (NULL);
1859	vmmap_add((void *)off, size);
1860	pmap_qenter(off, pages, count);
1861
1862	return ((void *)off);
1863}
1864
1865void
1866vunmap(void *addr)
1867{
1868	struct vmmap *vmmap;
1869
1870	vmmap = vmmap_remove(addr);
1871	if (vmmap == NULL)
1872		return;
1873	pmap_qremove((vm_offset_t)addr, vmmap->vm_size / PAGE_SIZE);
1874	kva_free((vm_offset_t)addr, vmmap->vm_size);
1875	kfree(vmmap);
1876}
1877
1878char *
1879kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
1880{
1881	unsigned int len;
1882	char *p;
1883	va_list aq;
1884
1885	va_copy(aq, ap);
1886	len = vsnprintf(NULL, 0, fmt, aq);
1887	va_end(aq);
1888
1889	p = kmalloc(len + 1, gfp);
1890	if (p != NULL)
1891		vsnprintf(p, len + 1, fmt, ap);
1892
1893	return (p);
1894}
1895
1896char *
1897kasprintf(gfp_t gfp, const char *fmt, ...)
1898{
1899	va_list ap;
1900	char *p;
1901
1902	va_start(ap, fmt);
1903	p = kvasprintf(gfp, fmt, ap);
1904	va_end(ap);
1905
1906	return (p);
1907}
1908
1909static void
1910linux_timer_callback_wrapper(void *context)
1911{
1912	struct timer_list *timer;
1913
1914	timer = context;
1915
1916	if (linux_set_current_flags(curthread, M_NOWAIT)) {
1917		/* try again later */
1918		callout_reset(&timer->callout, 1,
1919		    &linux_timer_callback_wrapper, timer);
1920		return;
1921	}
1922
1923	timer->function(timer->data);
1924}
1925
1926int
1927mod_timer(struct timer_list *timer, int expires)
1928{
1929	int ret;
1930
1931	timer->expires = expires;
1932	ret = callout_reset(&timer->callout,
1933	    linux_timer_jiffies_until(expires),
1934	    &linux_timer_callback_wrapper, timer);
1935
1936	MPASS(ret == 0 || ret == 1);
1937
1938	return (ret == 1);
1939}
1940
1941void
1942add_timer(struct timer_list *timer)
1943{
1944
1945	callout_reset(&timer->callout,
1946	    linux_timer_jiffies_until(timer->expires),
1947	    &linux_timer_callback_wrapper, timer);
1948}
1949
1950void
1951add_timer_on(struct timer_list *timer, int cpu)
1952{
1953
1954	callout_reset_on(&timer->callout,
1955	    linux_timer_jiffies_until(timer->expires),
1956	    &linux_timer_callback_wrapper, timer, cpu);
1957}
1958
1959int
1960del_timer(struct timer_list *timer)
1961{
1962
1963	if (callout_stop(&(timer)->callout) == -1)
1964		return (0);
1965	return (1);
1966}
1967
1968int
1969del_timer_sync(struct timer_list *timer)
1970{
1971
1972	if (callout_drain(&(timer)->callout) == -1)
1973		return (0);
1974	return (1);
1975}
1976
/* greatest common divisor, Euclid's algorithm */
1978static uint64_t
1979lkpi_gcd_64(uint64_t a, uint64_t b)
1980{
1981	uint64_t an;
1982	uint64_t bn;
1983
1984	while (b != 0) {
1985		an = b;
1986		bn = a % b;
1987		a = an;
1988		b = bn;
1989	}
1990	return (a);
1991}
1992
1993uint64_t lkpi_nsec2hz_rem;
1994uint64_t lkpi_nsec2hz_div = 1000000000ULL;
1995uint64_t lkpi_nsec2hz_max;
1996
1997uint64_t lkpi_usec2hz_rem;
1998uint64_t lkpi_usec2hz_div = 1000000ULL;
1999uint64_t lkpi_usec2hz_max;
2000
2001uint64_t lkpi_msec2hz_rem;
2002uint64_t lkpi_msec2hz_div = 1000ULL;
2003uint64_t lkpi_msec2hz_max;
2004
2005static void
2006linux_timer_init(void *arg)
2007{
2008	uint64_t gcd;
2009
2010	/*
2011	 * Compute an internal HZ value which can divide 2**32 to
2012	 * avoid timer rounding problems when the tick value wraps
2013	 * around 2**32:
2014	 */
2015	linux_timer_hz_mask = 1;
2016	while (linux_timer_hz_mask < (unsigned long)hz)
2017		linux_timer_hz_mask *= 2;
2018	linux_timer_hz_mask--;
2019
2020	/* compute some internal constants */
2021
2022	lkpi_nsec2hz_rem = hz;
2023	lkpi_usec2hz_rem = hz;
2024	lkpi_msec2hz_rem = hz;
2025
2026	gcd = lkpi_gcd_64(lkpi_nsec2hz_rem, lkpi_nsec2hz_div);
2027	lkpi_nsec2hz_rem /= gcd;
2028	lkpi_nsec2hz_div /= gcd;
2029	lkpi_nsec2hz_max = -1ULL / lkpi_nsec2hz_rem;
2030
2031	gcd = lkpi_gcd_64(lkpi_usec2hz_rem, lkpi_usec2hz_div);
2032	lkpi_usec2hz_rem /= gcd;
2033	lkpi_usec2hz_div /= gcd;
2034	lkpi_usec2hz_max = -1ULL / lkpi_usec2hz_rem;
2035
2036	gcd = lkpi_gcd_64(lkpi_msec2hz_rem, lkpi_msec2hz_div);
2037	lkpi_msec2hz_rem /= gcd;
2038	lkpi_msec2hz_div /= gcd;
2039	lkpi_msec2hz_max = -1ULL / lkpi_msec2hz_rem;
2040}
2041SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL);
2042
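/*
 * Wake up threads sleeping on a completion.  With "all" set the
 * completion is marked permanently done and every sleeper is woken;
 * otherwise the done count is bumped and a single sleeper is woken.
 */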
2043void
2044linux_complete_common(struct completion *c, int all)
2045{
2046	int wakeup_swapper;
2047
2048	sleepq_lock(c);
2049	if (all) {
2050		c->done = UINT_MAX;
2051		wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0);
2052	} else {
2053		if (c->done != UINT_MAX)
2054			c->done++;
2055		wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0);
2056	}
2057	sleepq_release(c);
2058	if (wakeup_swapper)
2059		kick_proc0();
2060}
2061
2062/*
2063 * Indefinite wait for done != 0 with or without signals.
2064 */
2065int
2066linux_wait_for_common(struct completion *c, int flags)
2067{
2068	struct task_struct *task;
2069	int error;
2070
2071	if (SCHEDULER_STOPPED())
2072		return (0);
2073
2074	task = current;
2075
2076	if (flags != 0)
2077		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
2078	else
2079		flags = SLEEPQ_SLEEP;
2080	error = 0;
2081	for (;;) {
2082		sleepq_lock(c);
2083		if (c->done)
2084			break;
2085		sleepq_add(c, NULL, "completion", flags, 0);
2086		if (flags & SLEEPQ_INTERRUPTIBLE) {
2087			DROP_GIANT();
2088			error = -sleepq_wait_sig(c, 0);
2089			PICKUP_GIANT();
2090			if (error != 0) {
2091				linux_schedule_save_interrupt_value(task, error);
2092				error = -ERESTARTSYS;
2093				goto intr;
2094			}
2095		} else {
2096			DROP_GIANT();
2097			sleepq_wait(c, 0);
2098			PICKUP_GIANT();
2099		}
2100	}
2101	if (c->done != UINT_MAX)
2102		c->done--;
2103	sleepq_release(c);
2104
2105intr:
2106	return (error);
2107}
2108
2109/*
2110 * Time limited wait for done != 0 with or without signals.
2111 */
2112int
2113linux_wait_for_timeout_common(struct completion *c, int timeout, int flags)
2114{
2115	struct task_struct *task;
2116	int end = jiffies + timeout;
2117	int error;
2118
2119	if (SCHEDULER_STOPPED())
2120		return (0);
2121
2122	task = current;
2123
2124	if (flags != 0)
2125		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
2126	else
2127		flags = SLEEPQ_SLEEP;
2128
2129	for (;;) {
2130		sleepq_lock(c);
2131		if (c->done)
2132			break;
2133		sleepq_add(c, NULL, "completion", flags, 0);
2134		sleepq_set_timeout(c, linux_timer_jiffies_until(end));
2135
2136		DROP_GIANT();
2137		if (flags & SLEEPQ_INTERRUPTIBLE)
2138			error = -sleepq_timedwait_sig(c, 0);
2139		else
2140			error = -sleepq_timedwait(c, 0);
2141		PICKUP_GIANT();
2142
2143		if (error != 0) {
2144			/* check for timeout */
2145			if (error == -EWOULDBLOCK) {
2146				error = 0;	/* timeout */
2147			} else {
2148				/* signal happened */
2149				linux_schedule_save_interrupt_value(task, error);
2150				error = -ERESTARTSYS;
2151			}
2152			goto done;
2153		}
2154	}
2155	if (c->done != UINT_MAX)
2156		c->done--;
2157	sleepq_release(c);
2158
2159	/* return how many jiffies are left */
2160	error = linux_timer_jiffies_until(end);
2161done:
2162	return (error);
2163}

int
linux_try_wait_for_completion(struct completion *c)
{
	int isdone;

	sleepq_lock(c);
	isdone = (c->done != 0);
	if (c->done != 0 && c->done != UINT_MAX)
		c->done--;
	sleepq_release(c);
	return (isdone);
}

int
linux_completion_done(struct completion *c)
{
	int isdone;

	sleepq_lock(c);
	isdone = (c->done != 0);
	sleepq_release(c);
	return (isdone);
}

static void
linux_cdev_deref(struct linux_cdev *ldev)
{
	if (refcount_release(&ldev->refs) &&
	    ldev->kobj.ktype == &linux_cdev_ktype)
		kfree(ldev);
}

static void
linux_cdev_release(struct kobject *kobj)
{
	struct linux_cdev *cdev;
	struct kobject *parent;

	cdev = container_of(kobj, struct linux_cdev, kobj);
	parent = kobj->parent;
	linux_destroy_dev(cdev);
	linux_cdev_deref(cdev);
	kobject_put(parent);
}

static void
linux_cdev_static_release(struct kobject *kobj)
{
	struct cdev *cdev;
	struct linux_cdev *ldev;

	ldev = container_of(kobj, struct linux_cdev, kobj);
	cdev = ldev->cdev;
	if (cdev != NULL) {
		destroy_dev(cdev);
		ldev->cdev = NULL;
	}
	kobject_put(kobj->parent);
}

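/*
 * Tear down the FreeBSD character device backing a Linux cdev.  The
 * LDEV_SI_DTR bit in "siref" marks the device as being destroyed, and
 * the pause loop below waits for the remaining siref references
 * (presumably held by threads still executing the device's cdevsw
 * paths elsewhere in this file) to drain before destroy_dev() runs.
 */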
void
linux_destroy_dev(struct linux_cdev *ldev)
{

	if (ldev->cdev == NULL)
		return;

	MPASS((ldev->siref & LDEV_SI_DTR) == 0);
	MPASS(ldev->kobj.ktype == &linux_cdev_ktype);

	atomic_set_int(&ldev->siref, LDEV_SI_DTR);
	while ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) != 0)
		pause("ldevdtr", hz / 4);

	destroy_dev(ldev->cdev);
	ldev->cdev = NULL;
}

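/*
 * Two kobject types are provided: linux_cdev_ktype frees the wrapper on
 * final release (via linux_cdev_deref() above) and so fits cdevs
 * allocated by the compat layer, while linux_cdev_static_ktype only
 * destroys the FreeBSD device node and appears intended for cdev
 * structures embedded in a caller-owned object.
 */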
const struct kobj_type linux_cdev_ktype = {
	.release = linux_cdev_release,
};

const struct kobj_type linux_cdev_static_ktype = {
	.release = linux_cdev_static_release,
};

static void
linux_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate)
{
	struct notifier_block *nb;

	nb = arg;
	if (linkstate == LINK_STATE_UP)
		nb->notifier_call(nb, NETDEV_UP, ifp);
	else
		nb->notifier_call(nb, NETDEV_DOWN, ifp);
}

static void
linux_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp)
{
	struct notifier_block *nb;

	nb = arg;
	nb->notifier_call(nb, NETDEV_REGISTER, ifp);
}

static void
linux_handle_ifnet_departure_event(void *arg, struct ifnet *ifp)
{
	struct notifier_block *nb;

	nb = arg;
	nb->notifier_call(nb, NETDEV_UNREGISTER, ifp);
}

static void
linux_handle_iflladdr_event(void *arg, struct ifnet *ifp)
{
	struct notifier_block *nb;

	nb = arg;
	nb->notifier_call(nb, NETDEV_CHANGEADDR, ifp);
}

static void
linux_handle_ifaddr_event(void *arg, struct ifnet *ifp)
{
	struct notifier_block *nb;

	nb = arg;
	nb->notifier_call(nb, NETDEV_CHANGEIFADDR, ifp);
}

int
register_netdevice_notifier(struct notifier_block *nb)
{

	nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER(
	    ifnet_link_event, linux_handle_ifnet_link_event, nb, 0);
	nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER(
	    ifnet_arrival_event, linux_handle_ifnet_arrival_event, nb, 0);
	nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER(
	    ifnet_departure_event, linux_handle_ifnet_departure_event, nb, 0);
	nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER(
	    iflladdr_event, linux_handle_iflladdr_event, nb, 0);

	return (0);
}

int
register_inetaddr_notifier(struct notifier_block *nb)
{

	nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER(
	    ifaddr_event, linux_handle_ifaddr_event, nb, 0);
	return (0);
}

int
unregister_netdevice_notifier(struct notifier_block *nb)
{

	EVENTHANDLER_DEREGISTER(ifnet_link_event,
	    nb->tags[NETDEV_UP]);
	EVENTHANDLER_DEREGISTER(ifnet_arrival_event,
	    nb->tags[NETDEV_REGISTER]);
	EVENTHANDLER_DEREGISTER(ifnet_departure_event,
	    nb->tags[NETDEV_UNREGISTER]);
	EVENTHANDLER_DEREGISTER(iflladdr_event,
	    nb->tags[NETDEV_CHANGEADDR]);

	return (0);
}

int
unregister_inetaddr_notifier(struct notifier_block *nb)
{

	EVENTHANDLER_DEREGISTER(ifaddr_event,
	    nb->tags[NETDEV_CHANGEIFADDR]);

	return (0);
}
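
/*
 * Illustrative sketch (assumed consumer code, not part of this file) of
 * how the notifier bridge above is typically used; the notifier_call
 * prototype is the usual Linux one and the example names are made up:
 *
 *	static int
 *	example_netdev_event(struct notifier_block *nb, unsigned long event,
 *	    void *ptr)
 *	{
 *		if (event == NETDEV_UP)
 *			pr_debug("interface came up\n");
 *		return (NOTIFY_DONE);
 *	}
 *
 *	static struct notifier_block example_nb = {
 *		.notifier_call = example_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&example_nb);
 *	...
 *	unregister_netdevice_notifier(&example_nb);
 *
 * Each Linux netdevice event is delivered from the corresponding
 * FreeBSD eventhandler registered in register_netdevice_notifier().
 */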

struct list_sort_thunk {
	int (*cmp)(void *, struct list_head *, struct list_head *);
	void *priv;
};

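/*
 * Adapter used by list_sort() below: qsort_r() hands us pointers to
 * array slots holding "struct list_head *" values, so dereference both
 * slots and forward to the caller's element comparison stored in the
 * thunk.
 */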
static inline int
linux_le_cmp(void *priv, const void *d1, const void *d2)
{
	struct list_head *le1, *le2;
	struct list_sort_thunk *thunk;

	thunk = priv;
	le1 = *(__DECONST(struct list_head **, d1));
	le2 = *(__DECONST(struct list_head **, d2));
	return ((thunk->cmp)(thunk->priv, le1, le2));
}

void
list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv,
    struct list_head *a, struct list_head *b))
{
	struct list_sort_thunk thunk;
	struct list_head **ar, *le;
	size_t count, i;

	count = 0;
	list_for_each(le, head)
		count++;
	ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK);
	i = 0;
	list_for_each(le, head)
		ar[i++] = le;
	thunk.cmp = cmp;
	thunk.priv = priv;
	qsort_r(ar, count, sizeof(struct list_head *), &thunk, linux_le_cmp);
	INIT_LIST_HEAD(head);
	for (i = 0; i < count; i++)
		list_add_tail(ar[i], head);
	free(ar, M_KMALLOC);
}
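
/*
 * Illustrative sketch (assumed consumer code, not part of this file) of
 * a list_sort() comparison callback; "struct example_entry" and its
 * "key" field are invented for the example:
 *
 *	struct example_entry {
 *		struct list_head entry;
 *		int key;
 *	};
 *
 *	static int
 *	example_cmp(void *priv, struct list_head *a, struct list_head *b)
 *	{
 *		struct example_entry *ea = container_of(a,
 *		    struct example_entry, entry);
 *		struct example_entry *eb = container_of(b,
 *		    struct example_entry, entry);
 *
 *		return ((ea->key > eb->key) - (ea->key < eb->key));
 *	}
 *
 *	list_sort(NULL, &example_list, example_cmp);
 *
 * Note that list_sort() above allocates the pointer array with M_WAITOK
 * and therefore must not be called from a context that cannot sleep.
 */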

void
linux_irq_handler(void *ent)
{
	struct irq_ent *irqe;

	if (linux_set_current_flags(curthread, M_NOWAIT))
		return;

	irqe = ent;
	irqe->handler(irqe->irq, irqe->arg);
}

#if defined(__i386__) || defined(__amd64__)
int
linux_wbinvd_on_all_cpus(void)
{

	pmap_invalidate_cache();
	return (0);
}
#endif

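/*
 * Run "callback" on every CPU, including the calling one, via a FreeBSD
 * smp_rendezvous(); the Linux on_each_cpu() wrapper is assumed to
 * funnel into this helper.
 */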
int
linux_on_each_cpu(void callback(void *), void *data)
{

	smp_rendezvous(smp_no_rendezvous_barrier, callback,
	    smp_no_rendezvous_barrier, data);
	return (0);
}

int
linux_in_atomic(void)
{

	return ((curthread->td_pflags & TDP_NOFAULTING) != 0);
}

struct linux_cdev *
linux_find_cdev(const char *name, unsigned major, unsigned minor)
{
	dev_t dev = MKDEV(major, minor);
	struct cdev *cdev;

	dev_lock();
	LIST_FOREACH(cdev, &linuxcdevsw.d_devs, si_list) {
		struct linux_cdev *ldev = cdev->si_drv1;
		if (ldev->dev == dev &&
		    strcmp(kobject_name(&ldev->kobj), name) == 0) {
			break;
		}
	}
	dev_unlock();

	return (cdev != NULL ? cdev->si_drv1 : NULL);
}

int
__register_chrdev(unsigned int major, unsigned int baseminor,
    unsigned int count, const char *name,
    const struct file_operations *fops)
{
	struct linux_cdev *cdev;
	int ret = 0;
	int i;

	for (i = baseminor; i < baseminor + count; i++) {
		cdev = cdev_alloc();
		cdev->ops = fops;
		kobject_set_name(&cdev->kobj, name);

		ret = cdev_add(cdev, makedev(major, i), 1);
		if (ret != 0)
			break;
	}
	return (ret);
}

int
__register_chrdev_p(unsigned int major, unsigned int baseminor,
    unsigned int count, const char *name,
    const struct file_operations *fops, uid_t uid,
    gid_t gid, int mode)
{
	struct linux_cdev *cdev;
	int ret = 0;
	int i;

	for (i = baseminor; i < baseminor + count; i++) {
		cdev = cdev_alloc();
		cdev->ops = fops;
		kobject_set_name(&cdev->kobj, name);

		ret = cdev_add_ext(cdev, makedev(major, i), uid, gid, mode);
		if (ret != 0)
			break;
	}
	return (ret);
}

void
__unregister_chrdev(unsigned int major, unsigned int baseminor,
    unsigned int count, const char *name)
{
	struct linux_cdev *cdevp;
	int i;

	for (i = baseminor; i < baseminor + count; i++) {
		cdevp = linux_find_cdev(name, major, i);
		if (cdevp != NULL)
			cdev_del(cdevp);
	}
}
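
/*
 * Illustrative sketch (assumed consumer code, not part of this file) of
 * the chrdev registration helpers above; the major number, fops contents
 * and handler name are made up:
 *
 *	static const struct file_operations example_fops = {
 *		.owner = THIS_MODULE,
 *		.open = example_open,
 *	};
 *
 *	error = __register_chrdev(EXAMPLE_MAJOR, 0, 1, "example",
 *	    &example_fops);
 *	...
 *	__unregister_chrdev(EXAMPLE_MAJOR, 0, 1, "example");
 *
 * Each minor in [baseminor, baseminor + count) gets its own linux_cdev
 * via cdev_alloc()/cdev_add(), and __unregister_chrdev() looks the
 * minors up again by name and device number before calling cdev_del().
 */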

void
linux_dump_stack(void)
{
#ifdef STACK
	struct stack st;

	stack_zero(&st);
	stack_save(&st);
	stack_print(&st);
#endif
}

#if defined(__i386__) || defined(__amd64__)
bool linux_cpu_has_clflush;
#endif

static void
linux_compat_init(void *arg)
{
	struct sysctl_oid *rootoid;
	int i;

#if defined(__i386__) || defined(__amd64__)
	linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH);
#endif
	rw_init(&linux_vma_lock, "lkpi-vma-lock");

	rootoid = SYSCTL_ADD_ROOT_NODE(NULL,
	    OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys");
	kobject_init(&linux_class_root, &linux_class_ktype);
	kobject_set_name(&linux_class_root, "class");
	linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid),
	    OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class");
	kobject_init(&linux_root_device.kobj, &linux_dev_ktype);
	kobject_set_name(&linux_root_device.kobj, "device");
	linux_root_device.kobj.oidp = SYSCTL_ADD_NODE(NULL,
	    SYSCTL_CHILDREN(rootoid), OID_AUTO, "device", CTLFLAG_RD, NULL,
	    "device");
	linux_root_device.bsddev = root_bus;
	linux_class_misc.name = "misc";
	class_register(&linux_class_misc);
	INIT_LIST_HEAD(&pci_drivers);
	INIT_LIST_HEAD(&pci_devices);
	spin_lock_init(&pci_lock);
	mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF);
	for (i = 0; i < VMMAP_HASH_SIZE; i++)
		LIST_INIT(&vmmaphead[i]);
	init_waitqueue_head(&linux_bit_waitq);
	init_waitqueue_head(&linux_var_waitq);
}
SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL);

static void
linux_compat_uninit(void *arg)
{
	linux_kobject_kfree_name(&linux_class_root);
	linux_kobject_kfree_name(&linux_root_device.kobj);
	linux_kobject_kfree_name(&linux_class_misc.kobj);

	mtx_destroy(&vmmaplock);
	spin_lock_destroy(&pci_lock);
	rw_destroy(&linux_vma_lock);
}
SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL);

/*
 * NOTE: Linux frequently uses "unsigned long" for pointer to integer
 * conversion and vice versa, where in FreeBSD "uintptr_t" would be
 * used. Assert these types have the same size, else some parts of the
 * LinuxKPI may not work like expected:
 */
CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t));