linux_compat.c revision 311803
1/*-
2 * Copyright (c) 2010 Isilon Systems, Inc.
3 * Copyright (c) 2010 iX Systems, Inc.
4 * Copyright (c) 2010 Panasas, Inc.
5 * Copyright (c) 2013-2016 Mellanox Technologies, Ltd.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice unmodified, this list of conditions, and the following
13 *    disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: stable/11/sys/compat/linuxkpi/common/src/linux_compat.c 311803 2017-01-09 17:25:23Z hselasky $");
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/malloc.h>
36#include <sys/kernel.h>
37#include <sys/sysctl.h>
38#include <sys/proc.h>
39#include <sys/sglist.h>
40#include <sys/sleepqueue.h>
41#include <sys/lock.h>
42#include <sys/mutex.h>
43#include <sys/bus.h>
44#include <sys/fcntl.h>
45#include <sys/file.h>
46#include <sys/filio.h>
47#include <sys/rwlock.h>
48
49#include <vm/vm.h>
50#include <vm/pmap.h>
51
52#include <machine/stdarg.h>
53
54#if defined(__i386__) || defined(__amd64__)
55#include <machine/md_var.h>
56#endif
57
58#include <linux/kobject.h>
59#include <linux/device.h>
60#include <linux/slab.h>
61#include <linux/module.h>
62#include <linux/moduleparam.h>
63#include <linux/cdev.h>
64#include <linux/file.h>
65#include <linux/sysfs.h>
66#include <linux/mm.h>
67#include <linux/io.h>
68#include <linux/vmalloc.h>
69#include <linux/netdevice.h>
70#include <linux/timer.h>
71#include <linux/workqueue.h>
72#include <linux/rcupdate.h>
73#include <linux/interrupt.h>
74#include <linux/uaccess.h>
75#include <linux/kernel.h>
76#include <linux/list.h>
77#include <linux/compat.h>
78
79#include <vm/vm_pager.h>
80
/* Declares the "linuxkpi" sysctl node below the _compat parent. */
SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW, 0, "LinuxKPI parameters");

/* Defines the M_KMALLOC malloc type ("linux"). */
MALLOC_DEFINE(M_KMALLOC, "linux", "Linux kmalloc compat");

#include <linux/rbtree.h>
/* Undo Linux compat changes. */
#undef RB_ROOT
#undef file
#undef cdev
#define	RB_ROOT(head)	(head)->rbh_root

/*
 * File-scope LinuxKPI objects.  Their initialization is not part of the
 * code visible here.
 */
struct kobject linux_class_root;
struct device linux_root_device;
struct class linux_class_misc;
struct list_head pci_drivers;
struct list_head pci_devices;
struct net init_net;
spinlock_t pci_lock;
struct sx linux_global_rcu_lock;

/* Set by linux_timer_init(): (smallest power of two >= hz) - 1. */
unsigned long linux_timer_hz_mask;
102
/*
 * Comparison callback handed to RB_GENERATE() below.  It panics
 * unconditionally, so any generated tree operation that needs to compare
 * two nodes will panic with "no cmp".
 */
int
panic_cmp(struct rb_node *one, struct rb_node *two)
{
	panic("no cmp");
}

/* Instantiate the red-black tree functions for struct linux_root. */
RB_GENERATE(linux_root, rb_node, __entry, panic_cmp);
110
111int
112kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list args)
113{
114	va_list tmp_va;
115	int len;
116	char *old;
117	char *name;
118	char dummy;
119
120	old = kobj->name;
121
122	if (old && fmt == NULL)
123		return (0);
124
125	/* compute length of string */
126	va_copy(tmp_va, args);
127	len = vsnprintf(&dummy, 0, fmt, tmp_va);
128	va_end(tmp_va);
129
130	/* account for zero termination */
131	len++;
132
133	/* check for error */
134	if (len < 1)
135		return (-EINVAL);
136
137	/* allocate memory for string */
138	name = kzalloc(len, GFP_KERNEL);
139	if (name == NULL)
140		return (-ENOMEM);
141	vsnprintf(name, len, fmt, args);
142	kobj->name = name;
143
144	/* free old string */
145	kfree(old);
146
147	/* filter new string */
148	for (; *name != '\0'; name++)
149		if (*name == '/')
150			*name = '!';
151	return (0);
152}
153
154int
155kobject_set_name(struct kobject *kobj, const char *fmt, ...)
156{
157	va_list args;
158	int error;
159
160	va_start(args, fmt);
161	error = kobject_set_name_vargs(kobj, fmt, args);
162	va_end(args);
163
164	return (error);
165}
166
167static int
168kobject_add_complete(struct kobject *kobj, struct kobject *parent)
169{
170	const struct kobj_type *t;
171	int error;
172
173	kobj->parent = parent;
174	error = sysfs_create_dir(kobj);
175	if (error == 0 && kobj->ktype && kobj->ktype->default_attrs) {
176		struct attribute **attr;
177		t = kobj->ktype;
178
179		for (attr = t->default_attrs; *attr != NULL; attr++) {
180			error = sysfs_create_file(kobj, *attr);
181			if (error)
182				break;
183		}
184		if (error)
185			sysfs_remove_dir(kobj);
186
187	}
188	return (error);
189}
190
191int
192kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...)
193{
194	va_list args;
195	int error;
196
197	va_start(args, fmt);
198	error = kobject_set_name_vargs(kobj, fmt, args);
199	va_end(args);
200	if (error)
201		return (error);
202
203	return kobject_add_complete(kobj, parent);
204}
205
206void
207linux_kobject_release(struct kref *kref)
208{
209	struct kobject *kobj;
210	char *name;
211
212	kobj = container_of(kref, struct kobject, kref);
213	sysfs_remove_dir(kobj);
214	name = kobj->name;
215	if (kobj->ktype && kobj->ktype->release)
216		kobj->ktype->release(kobj);
217	kfree(name);
218}
219
/* ktype release method that frees the kobject itself with kfree(). */
static void
linux_kobject_kfree(struct kobject *kobj)
{
	kfree(kobj);
}
225
226static void
227linux_kobject_kfree_name(struct kobject *kobj)
228{
229	if (kobj) {
230		kfree(kobj->name);
231	}
232}
233
/* ktype whose release method simply kfree()s the kobject. */
const struct kobj_type linux_kfree_type = {
	.release = linux_kobject_kfree
};
237
/*
 * Release method installed by device_create(): logs the device name at
 * debug level and frees the device structure.
 */
static void
linux_device_release(struct device *dev)
{
	pr_debug("linux_device_release: %s\n", dev_name(dev));
	kfree(dev);
}
244
245static ssize_t
246linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf)
247{
248	struct class_attribute *dattr;
249	ssize_t error;
250
251	dattr = container_of(attr, struct class_attribute, attr);
252	error = -EIO;
253	if (dattr->show)
254		error = dattr->show(container_of(kobj, struct class, kobj),
255		    dattr, buf);
256	return (error);
257}
258
259static ssize_t
260linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf,
261    size_t count)
262{
263	struct class_attribute *dattr;
264	ssize_t error;
265
266	dattr = container_of(attr, struct class_attribute, attr);
267	error = -EIO;
268	if (dattr->store)
269		error = dattr->store(container_of(kobj, struct class, kobj),
270		    dattr, buf, count);
271	return (error);
272}
273
274static void
275linux_class_release(struct kobject *kobj)
276{
277	struct class *class;
278
279	class = container_of(kobj, struct class, kobj);
280	if (class->class_release)
281		class->class_release(class);
282}
283
/* sysfs operations used for kobjects embedded in a struct class. */
static const struct sysfs_ops linux_class_sysfs = {
	.show  = linux_class_show,
	.store = linux_class_store,
};

/* ktype for class kobjects: class release method plus the ops above. */
const struct kobj_type linux_class_ktype = {
	.release = linux_class_release,
	.sysfs_ops = &linux_class_sysfs
};
293
294static void
295linux_dev_release(struct kobject *kobj)
296{
297	struct device *dev;
298
299	dev = container_of(kobj, struct device, kobj);
300	/* This is the precedence defined by linux. */
301	if (dev->release)
302		dev->release(dev);
303	else if (dev->class && dev->class->dev_release)
304		dev->class->dev_release(dev);
305}
306
307static ssize_t
308linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf)
309{
310	struct device_attribute *dattr;
311	ssize_t error;
312
313	dattr = container_of(attr, struct device_attribute, attr);
314	error = -EIO;
315	if (dattr->show)
316		error = dattr->show(container_of(kobj, struct device, kobj),
317		    dattr, buf);
318	return (error);
319}
320
321static ssize_t
322linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf,
323    size_t count)
324{
325	struct device_attribute *dattr;
326	ssize_t error;
327
328	dattr = container_of(attr, struct device_attribute, attr);
329	error = -EIO;
330	if (dattr->store)
331		error = dattr->store(container_of(kobj, struct device, kobj),
332		    dattr, buf, count);
333	return (error);
334}
335
/* sysfs operations used for kobjects embedded in a struct device. */
static const struct sysfs_ops linux_dev_sysfs = {
	.show  = linux_dev_show,
	.store = linux_dev_store,
};

/* ktype for device kobjects: device release method plus the ops above. */
const struct kobj_type linux_dev_ktype = {
	.release = linux_dev_release,
	.sysfs_ops = &linux_dev_sysfs
};
345
346struct device *
347device_create(struct class *class, struct device *parent, dev_t devt,
348    void *drvdata, const char *fmt, ...)
349{
350	struct device *dev;
351	va_list args;
352
353	dev = kzalloc(sizeof(*dev), M_WAITOK);
354	dev->parent = parent;
355	dev->class = class;
356	dev->devt = devt;
357	dev->driver_data = drvdata;
358	dev->release = linux_device_release;
359	va_start(args, fmt);
360	kobject_set_name_vargs(&dev->kobj, fmt, args);
361	va_end(args);
362	device_register(dev);
363
364	return (dev);
365}
366
367int
368kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype,
369    struct kobject *parent, const char *fmt, ...)
370{
371	va_list args;
372	int error;
373
374	kobject_init(kobj, ktype);
375	kobj->ktype = ktype;
376	kobj->parent = parent;
377	kobj->name = NULL;
378
379	va_start(args, fmt);
380	error = kobject_set_name_vargs(kobj, fmt, args);
381	va_end(args);
382	if (error)
383		return (error);
384	return kobject_add_complete(kobj, parent);
385}
386
/*
 * Zero the caller-supplied task structure "t", fill it in for thread "td"
 * and install it as that thread's task structure.  Paired with
 * linux_clear_current().
 */
void
linux_set_current(struct thread *td, struct task_struct *t)
{
	memset(t, 0, sizeof(*t));
	task_struct_fill(td, t);
	task_struct_set(td, t);
}
394
/* Detach the task structure installed by linux_set_current() from "td". */
void
linux_clear_current(struct thread *td)
{
	task_struct_set(td, NULL);
}
400
401static void
402linux_file_dtor(void *cdp)
403{
404	struct linux_file *filp;
405	struct task_struct t;
406	struct thread *td;
407
408	td = curthread;
409	filp = cdp;
410	linux_set_current(td, &t);
411	filp->f_op->release(filp->f_vnode, filp);
412	linux_clear_current(td);
413	vdrop(filp->f_vnode);
414	kfree(filp);
415}
416
417static int
418linux_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
419{
420	struct linux_cdev *ldev;
421	struct linux_file *filp;
422	struct task_struct t;
423	struct file *file;
424	int error;
425
426	file = td->td_fpop;
427	ldev = dev->si_drv1;
428	if (ldev == NULL)
429		return (ENODEV);
430	filp = kzalloc(sizeof(*filp), GFP_KERNEL);
431	filp->f_dentry = &filp->f_dentry_store;
432	filp->f_op = ldev->ops;
433	filp->f_flags = file->f_flag;
434	vhold(file->f_vnode);
435	filp->f_vnode = file->f_vnode;
436	linux_set_current(td, &t);
437	if (filp->f_op->open) {
438		error = -filp->f_op->open(file->f_vnode, filp);
439		if (error) {
440			kfree(filp);
441			goto done;
442		}
443	}
444	error = devfs_set_cdevpriv(filp, linux_file_dtor);
445	if (error) {
446		filp->f_op->release(file->f_vnode, filp);
447		kfree(filp);
448	}
449done:
450	linux_clear_current(td);
451	return (error);
452}
453
454static int
455linux_dev_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
456{
457	struct linux_cdev *ldev;
458	struct linux_file *filp;
459	struct file *file;
460	int error;
461
462	file = td->td_fpop;
463	ldev = dev->si_drv1;
464	if (ldev == NULL)
465		return (0);
466	if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
467		return (error);
468	filp->f_flags = file->f_flag;
469        devfs_clear_cdevpriv();
470
471
472	return (0);
473}
474
/*
 * Address window [LINUX_IOCTL_MIN_PTR, LINUX_IOCTL_MAX_PTR) used as a
 * stand-in for the ioctl data buffer: linux_dev_ioctl() passes
 * LINUX_IOCTL_MIN_PTR to the Linux handler and linux_remap_address()
 * translates addresses inside the window back to the kernel buffer.
 */
#define	LINUX_IOCTL_MIN_PTR 0x10000UL
#define	LINUX_IOCTL_MAX_PTR (LINUX_IOCTL_MIN_PTR + IOCPARM_MAX)
477
478static inline int
479linux_remap_address(void **uaddr, size_t len)
480{
481	uintptr_t uaddr_val = (uintptr_t)(*uaddr);
482
483	if (unlikely(uaddr_val >= LINUX_IOCTL_MIN_PTR &&
484	    uaddr_val < LINUX_IOCTL_MAX_PTR)) {
485		struct task_struct *pts = current;
486		if (pts == NULL) {
487			*uaddr = NULL;
488			return (1);
489		}
490
491		/* compute data offset */
492		uaddr_val -= LINUX_IOCTL_MIN_PTR;
493
494		/* check that length is within bounds */
495		if ((len > IOCPARM_MAX) ||
496		    (uaddr_val + len) > pts->bsd_ioctl_len) {
497			*uaddr = NULL;
498			return (1);
499		}
500
501		/* re-add kernel buffer address */
502		uaddr_val += (uintptr_t)pts->bsd_ioctl_data;
503
504		/* update address location */
505		*uaddr = (void *)uaddr_val;
506		return (1);
507	}
508	return (0);
509}
510
511int
512linux_copyin(const void *uaddr, void *kaddr, size_t len)
513{
514	if (linux_remap_address(__DECONST(void **, &uaddr), len)) {
515		if (uaddr == NULL)
516			return (-EFAULT);
517		memcpy(kaddr, uaddr, len);
518		return (0);
519	}
520	return (-copyin(uaddr, kaddr, len));
521}
522
523int
524linux_copyout(const void *kaddr, void *uaddr, size_t len)
525{
526	if (linux_remap_address(&uaddr, len)) {
527		if (uaddr == NULL)
528			return (-EFAULT);
529		memcpy(uaddr, kaddr, len);
530		return (0);
531	}
532	return (-copyout(kaddr, uaddr, len));
533}
534
535static int
536linux_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
537    struct thread *td)
538{
539	struct linux_cdev *ldev;
540	struct linux_file *filp;
541	struct task_struct t;
542	struct file *file;
543	unsigned size;
544	int error;
545
546	file = td->td_fpop;
547	ldev = dev->si_drv1;
548	if (ldev == NULL)
549		return (0);
550	if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
551		return (error);
552	filp->f_flags = file->f_flag;
553	linux_set_current(td, &t);
554	size = IOCPARM_LEN(cmd);
555	/* refer to logic in sys_ioctl() */
556	if (size > 0) {
557		/*
558		 * Setup hint for linux_copyin() and linux_copyout().
559		 *
560		 * Background: Linux code expects a user-space address
561		 * while FreeBSD supplies a kernel-space address.
562		 */
563		t.bsd_ioctl_data = data;
564		t.bsd_ioctl_len = size;
565		data = (void *)LINUX_IOCTL_MIN_PTR;
566	} else {
567		/* fetch user-space pointer */
568		data = *(void **)data;
569	}
570	if (filp->f_op->unlocked_ioctl)
571		error = -filp->f_op->unlocked_ioctl(filp, cmd, (u_long)data);
572	else
573		error = ENOTTY;
574	linux_clear_current(td);
575
576	return (error);
577}
578
579static int
580linux_dev_read(struct cdev *dev, struct uio *uio, int ioflag)
581{
582	struct linux_cdev *ldev;
583	struct linux_file *filp;
584	struct task_struct t;
585	struct thread *td;
586	struct file *file;
587	ssize_t bytes;
588	int error;
589
590	td = curthread;
591	file = td->td_fpop;
592	ldev = dev->si_drv1;
593	if (ldev == NULL)
594		return (0);
595	if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
596		return (error);
597	filp->f_flags = file->f_flag;
598	/* XXX no support for I/O vectors currently */
599	if (uio->uio_iovcnt != 1)
600		return (EOPNOTSUPP);
601	linux_set_current(td, &t);
602	if (filp->f_op->read) {
603		bytes = filp->f_op->read(filp, uio->uio_iov->iov_base,
604		    uio->uio_iov->iov_len, &uio->uio_offset);
605		if (bytes >= 0) {
606			uio->uio_iov->iov_base =
607			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
608			uio->uio_iov->iov_len -= bytes;
609			uio->uio_resid -= bytes;
610		} else
611			error = -bytes;
612	} else
613		error = ENXIO;
614	linux_clear_current(td);
615
616	return (error);
617}
618
619static int
620linux_dev_write(struct cdev *dev, struct uio *uio, int ioflag)
621{
622	struct linux_cdev *ldev;
623	struct linux_file *filp;
624	struct task_struct t;
625	struct thread *td;
626	struct file *file;
627	ssize_t bytes;
628	int error;
629
630	td = curthread;
631	file = td->td_fpop;
632	ldev = dev->si_drv1;
633	if (ldev == NULL)
634		return (0);
635	if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
636		return (error);
637	filp->f_flags = file->f_flag;
638	/* XXX no support for I/O vectors currently */
639	if (uio->uio_iovcnt != 1)
640		return (EOPNOTSUPP);
641	linux_set_current(td, &t);
642	if (filp->f_op->write) {
643		bytes = filp->f_op->write(filp, uio->uio_iov->iov_base,
644		    uio->uio_iov->iov_len, &uio->uio_offset);
645		if (bytes >= 0) {
646			uio->uio_iov->iov_base =
647			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
648			uio->uio_iov->iov_len -= bytes;
649			uio->uio_resid -= bytes;
650		} else
651			error = -bytes;
652	} else
653		error = ENXIO;
654	linux_clear_current(td);
655
656	return (error);
657}
658
659static int
660linux_dev_poll(struct cdev *dev, int events, struct thread *td)
661{
662	struct linux_cdev *ldev;
663	struct linux_file *filp;
664	struct task_struct t;
665	struct file *file;
666	int revents;
667	int error;
668
669	file = td->td_fpop;
670	ldev = dev->si_drv1;
671	if (ldev == NULL)
672		return (0);
673	if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
674		return (error);
675	filp->f_flags = file->f_flag;
676	linux_set_current(td, &t);
677	if (filp->f_op->poll)
678		revents = filp->f_op->poll(filp, NULL) & events;
679	else
680		revents = 0;
681	linux_clear_current(td);
682
683	return (revents);
684}
685
686static int
687linux_dev_mmap_single(struct cdev *dev, vm_ooffset_t *offset,
688    vm_size_t size, struct vm_object **object, int nprot)
689{
690	struct linux_cdev *ldev;
691	struct linux_file *filp;
692	struct thread *td;
693	struct task_struct t;
694	struct file *file;
695	struct vm_area_struct vma;
696	int error;
697
698	td = curthread;
699	file = td->td_fpop;
700	ldev = dev->si_drv1;
701	if (ldev == NULL)
702		return (ENODEV);
703	if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
704		return (error);
705	filp->f_flags = file->f_flag;
706	linux_set_current(td, &t);
707	vma.vm_start = 0;
708	vma.vm_end = size;
709	vma.vm_pgoff = *offset / PAGE_SIZE;
710	vma.vm_pfn = 0;
711	vma.vm_page_prot = VM_MEMATTR_DEFAULT;
712	if (filp->f_op->mmap) {
713		error = -filp->f_op->mmap(filp, &vma);
714		if (error == 0) {
715			struct sglist *sg;
716
717			sg = sglist_alloc(1, M_WAITOK);
718			sglist_append_phys(sg,
719			    (vm_paddr_t)vma.vm_pfn << PAGE_SHIFT, vma.vm_len);
720			*object = vm_pager_allocate(OBJT_SG, sg, vma.vm_len,
721			    nprot, 0, td->td_ucred);
722		        if (*object == NULL) {
723				sglist_free(sg);
724				error = EINVAL;
725				goto done;
726			}
727			*offset = 0;
728			if (vma.vm_page_prot != VM_MEMATTR_DEFAULT) {
729				VM_OBJECT_WLOCK(*object);
730				vm_object_set_memattr(*object,
731				    vma.vm_page_prot);
732				VM_OBJECT_WUNLOCK(*object);
733			}
734		}
735	} else
736		error = ENODEV;
737done:
738	linux_clear_current(td);
739	return (error);
740}
741
/*
 * Character device switch that routes the devfs entry points to the
 * linux_dev_*() wrappers in this file.
 */
struct cdevsw linuxcdevsw = {
	.d_version = D_VERSION,
	.d_flags = D_TRACKCLOSE,
	.d_open = linux_dev_open,
	.d_close = linux_dev_close,
	.d_read = linux_dev_read,
	.d_write = linux_dev_write,
	.d_ioctl = linux_dev_ioctl,
	.d_mmap_single = linux_dev_mmap_single,
	.d_poll = linux_dev_poll,
};
753
754static int
755linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred,
756    int flags, struct thread *td)
757{
758	struct linux_file *filp;
759	struct task_struct t;
760	ssize_t bytes;
761	int error;
762
763	error = 0;
764	filp = (struct linux_file *)file->f_data;
765	filp->f_flags = file->f_flag;
766	/* XXX no support for I/O vectors currently */
767	if (uio->uio_iovcnt != 1)
768		return (EOPNOTSUPP);
769	linux_set_current(td, &t);
770	if (filp->f_op->read) {
771		bytes = filp->f_op->read(filp, uio->uio_iov->iov_base,
772		    uio->uio_iov->iov_len, &uio->uio_offset);
773		if (bytes >= 0) {
774			uio->uio_iov->iov_base =
775			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
776			uio->uio_iov->iov_len -= bytes;
777			uio->uio_resid -= bytes;
778		} else
779			error = -bytes;
780	} else
781		error = ENXIO;
782	linux_clear_current(td);
783
784	return (error);
785}
786
787static int
788linux_file_poll(struct file *file, int events, struct ucred *active_cred,
789    struct thread *td)
790{
791	struct linux_file *filp;
792	struct task_struct t;
793	int revents;
794
795	filp = (struct linux_file *)file->f_data;
796	filp->f_flags = file->f_flag;
797	linux_set_current(td, &t);
798	if (filp->f_op->poll)
799		revents = filp->f_op->poll(filp, NULL) & events;
800	else
801		revents = 0;
802	linux_clear_current(td);
803
804	return (revents);
805}
806
807static int
808linux_file_close(struct file *file, struct thread *td)
809{
810	struct linux_file *filp;
811	struct task_struct t;
812	int error;
813
814	filp = (struct linux_file *)file->f_data;
815	filp->f_flags = file->f_flag;
816	linux_set_current(td, &t);
817	error = -filp->f_op->release(NULL, filp);
818	linux_clear_current(td);
819	funsetown(&filp->f_sigio);
820	kfree(filp);
821
822	return (error);
823}
824
825static int
826linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred,
827    struct thread *td)
828{
829	struct linux_file *filp;
830	struct task_struct t;
831	int error;
832
833	filp = (struct linux_file *)fp->f_data;
834	filp->f_flags = fp->f_flag;
835	error = 0;
836
837	linux_set_current(td, &t);
838	switch (cmd) {
839	case FIONBIO:
840		break;
841	case FIOASYNC:
842		if (filp->f_op->fasync == NULL)
843			break;
844		error = filp->f_op->fasync(0, filp, fp->f_flag & FASYNC);
845		break;
846	case FIOSETOWN:
847		error = fsetown(*(int *)data, &filp->f_sigio);
848		if (error == 0)
849			error = filp->f_op->fasync(0, filp,
850			    fp->f_flag & FASYNC);
851		break;
852	case FIOGETOWN:
853		*(int *)data = fgetown(&filp->f_sigio);
854		break;
855	default:
856		error = ENOTTY;
857		break;
858	}
859	linux_clear_current(td);
860	return (error);
861}
862
/* fo_stat method: stat is not supported; always returns EOPNOTSUPP. */
static int
linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
    struct thread *td)
{

	return (EOPNOTSUPP);
}
870
/* fo_fill_kinfo method: leaves "kif" untouched and reports success. */
static int
linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif,
    struct filedesc *fdp)
{

	return (0);
}
878
/*
 * File operations for LinuxKPI file descriptors.  Read, stat, fill_kinfo,
 * poll, close and ioctl are served by the linux_file_*() functions in
 * this file; the remaining entries use the invfo_*() handlers.
 */
struct fileops linuxfileops = {
	.fo_read = linux_file_read,
	.fo_write = invfo_rdwr,
	.fo_truncate = invfo_truncate,
	.fo_kqfilter = invfo_kqfilter,
	.fo_stat = linux_file_stat,
	.fo_fill_kinfo = linux_file_fill_kinfo,
	.fo_poll = linux_file_poll,
	.fo_close = linux_file_close,
	.fo_ioctl = linux_file_ioctl,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
};
893
894/*
895 * Hash of vmmap addresses.  This is infrequently accessed and does not
896 * need to be particularly large.  This is done because we must store the
897 * caller's idea of the map size to properly unmap.
898 */
/* One recorded mapping: its address and the size given at map time. */
struct vmmap {
	LIST_ENTRY(vmmap)	vm_next;	/* hash chain linkage */
	void 			*vm_addr;	/* mapped address (lookup key) */
	unsigned long		vm_size;	/* size passed to vmmap_add() */
};

/* Head of one hash chain of struct vmmap entries. */
struct vmmaphd {
	struct vmmap *lh_first;
};
/*
 * Number of hash chains (a power of two, so the mask below is valid) and
 * the hash function: the page number of the address, reduced to a chain
 * index.  The expansion is fully parenthesized so that VM_HASH() can be
 * used safely inside larger expressions.
 */
#define	VMMAP_HASH_SIZE	64
#define	VMMAP_HASH_MASK	(VMMAP_HASH_SIZE - 1)
#define	VM_HASH(addr)	(((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK)
/* Hash table of recorded mappings, indexed by VM_HASH(address). */
static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE];
/* Held by vmmap_add() and vmmap_remove() while they touch the table. */
static struct mtx vmmaplock;
913
914static void
915vmmap_add(void *addr, unsigned long size)
916{
917	struct vmmap *vmmap;
918
919	vmmap = kmalloc(sizeof(*vmmap), GFP_KERNEL);
920	mtx_lock(&vmmaplock);
921	vmmap->vm_size = size;
922	vmmap->vm_addr = addr;
923	LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], vmmap, vm_next);
924	mtx_unlock(&vmmaplock);
925}
926
927static struct vmmap *
928vmmap_remove(void *addr)
929{
930	struct vmmap *vmmap;
931
932	mtx_lock(&vmmaplock);
933	LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next)
934		if (vmmap->vm_addr == addr)
935			break;
936	if (vmmap)
937		LIST_REMOVE(vmmap, vm_next);
938	mtx_unlock(&vmmaplock);
939
940	return (vmmap);
941}
942
943#if defined(__i386__) || defined(__amd64__)
944void *
945_ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr)
946{
947	void *addr;
948
949	addr = pmap_mapdev_attr(phys_addr, size, attr);
950	if (addr == NULL)
951		return (NULL);
952	vmmap_add(addr, size);
953
954	return (addr);
955}
956#endif
957
958void
959iounmap(void *addr)
960{
961	struct vmmap *vmmap;
962
963	vmmap = vmmap_remove(addr);
964	if (vmmap == NULL)
965		return;
966#if defined(__i386__) || defined(__amd64__)
967	pmap_unmapdev((vm_offset_t)addr, vmmap->vm_size);
968#endif
969	kfree(vmmap);
970}
971
972
973void *
974vmap(struct page **pages, unsigned int count, unsigned long flags, int prot)
975{
976	vm_offset_t off;
977	size_t size;
978
979	size = count * PAGE_SIZE;
980	off = kva_alloc(size);
981	if (off == 0)
982		return (NULL);
983	vmmap_add((void *)off, size);
984	pmap_qenter(off, pages, count);
985
986	return ((void *)off);
987}
988
989void
990vunmap(void *addr)
991{
992	struct vmmap *vmmap;
993
994	vmmap = vmmap_remove(addr);
995	if (vmmap == NULL)
996		return;
997	pmap_qremove((vm_offset_t)addr, vmmap->vm_size / PAGE_SIZE);
998	kva_free((vm_offset_t)addr, vmmap->vm_size);
999	kfree(vmmap);
1000}
1001
1002char *
1003kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
1004{
1005	unsigned int len;
1006	char *p;
1007	va_list aq;
1008
1009	va_copy(aq, ap);
1010	len = vsnprintf(NULL, 0, fmt, aq);
1011	va_end(aq);
1012
1013	p = kmalloc(len + 1, gfp);
1014	if (p != NULL)
1015		vsnprintf(p, len + 1, fmt, ap);
1016
1017	return (p);
1018}
1019
1020char *
1021kasprintf(gfp_t gfp, const char *fmt, ...)
1022{
1023	va_list ap;
1024	char *p;
1025
1026	va_start(ap, fmt);
1027	p = kvasprintf(gfp, fmt, ap);
1028	va_end(ap);
1029
1030	return (p);
1031}
1032
1033static void
1034linux_timer_callback_wrapper(void *context)
1035{
1036	struct timer_list *timer;
1037
1038	timer = context;
1039	timer->function(timer->data);
1040}
1041
1042void
1043mod_timer(struct timer_list *timer, unsigned long expires)
1044{
1045
1046	timer->expires = expires;
1047	callout_reset(&timer->timer_callout,
1048	    linux_timer_jiffies_until(expires),
1049	    &linux_timer_callback_wrapper, timer);
1050}
1051
/*
 * Arm the timer's callout for the expiry already stored in
 * timer->expires.
 */
void
add_timer(struct timer_list *timer)
{

	callout_reset(&timer->timer_callout,
	    linux_timer_jiffies_until(timer->expires),
	    &linux_timer_callback_wrapper, timer);
}
1060
1061static void
1062linux_timer_init(void *arg)
1063{
1064
1065	/*
1066	 * Compute an internal HZ value which can divide 2**32 to
1067	 * avoid timer rounding problems when the tick value wraps
1068	 * around 2**32:
1069	 */
1070	linux_timer_hz_mask = 1;
1071	while (linux_timer_hz_mask < (unsigned long)hz)
1072		linux_timer_hz_mask *= 2;
1073	linux_timer_hz_mask--;
1074}
1075SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL);
1076
1077void
1078linux_complete_common(struct completion *c, int all)
1079{
1080	int wakeup_swapper;
1081
1082	sleepq_lock(c);
1083	c->done++;
1084	if (all)
1085		wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0);
1086	else
1087		wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0);
1088	sleepq_release(c);
1089	if (wakeup_swapper)
1090		kick_proc0();
1091}
1092
1093/*
1094 * Indefinite wait for done != 0 with or without signals.
1095 */
1096long
1097linux_wait_for_common(struct completion *c, int flags)
1098{
1099	if (SCHEDULER_STOPPED())
1100		return (0);
1101
1102	if (flags != 0)
1103		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
1104	else
1105		flags = SLEEPQ_SLEEP;
1106	for (;;) {
1107		sleepq_lock(c);
1108		if (c->done)
1109			break;
1110		sleepq_add(c, NULL, "completion", flags, 0);
1111		if (flags & SLEEPQ_INTERRUPTIBLE) {
1112			if (sleepq_wait_sig(c, 0) != 0)
1113				return (-ERESTARTSYS);
1114		} else
1115			sleepq_wait(c, 0);
1116	}
1117	c->done--;
1118	sleepq_release(c);
1119
1120	return (0);
1121}
1122
1123/*
1124 * Time limited wait for done != 0 with or without signals.
1125 */
1126long
1127linux_wait_for_timeout_common(struct completion *c, long timeout, int flags)
1128{
1129	long end = jiffies + timeout;
1130
1131	if (SCHEDULER_STOPPED())
1132		return (0);
1133
1134	if (flags != 0)
1135		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
1136	else
1137		flags = SLEEPQ_SLEEP;
1138	for (;;) {
1139		int ret;
1140
1141		sleepq_lock(c);
1142		if (c->done)
1143			break;
1144		sleepq_add(c, NULL, "completion", flags, 0);
1145		sleepq_set_timeout(c, linux_timer_jiffies_until(end));
1146		if (flags & SLEEPQ_INTERRUPTIBLE)
1147			ret = sleepq_timedwait_sig(c, 0);
1148		else
1149			ret = sleepq_timedwait(c, 0);
1150		if (ret != 0) {
1151			/* check for timeout or signal */
1152			if (ret == EWOULDBLOCK)
1153				return (0);
1154			else
1155				return (-ERESTARTSYS);
1156		}
1157	}
1158	c->done--;
1159	sleepq_release(c);
1160
1161	/* return how many jiffies are left */
1162	return (linux_timer_jiffies_until(end));
1163}
1164
1165int
1166linux_try_wait_for_completion(struct completion *c)
1167{
1168	int isdone;
1169
1170	isdone = 1;
1171	sleepq_lock(c);
1172	if (c->done)
1173		c->done--;
1174	else
1175		isdone = 0;
1176	sleepq_release(c);
1177	return (isdone);
1178}
1179
1180int
1181linux_completion_done(struct completion *c)
1182{
1183	int isdone;
1184
1185	isdone = 1;
1186	sleepq_lock(c);
1187	if (c->done == 0)
1188		isdone = 0;
1189	sleepq_release(c);
1190	return (isdone);
1191}
1192
1193void
1194linux_delayed_work_fn(void *arg)
1195{
1196	struct delayed_work *work;
1197
1198	work = arg;
1199	taskqueue_enqueue(work->work.taskqueue, &work->work.work_task);
1200}
1201
1202void
1203linux_work_fn(void *context, int pending)
1204{
1205	struct work_struct *work;
1206
1207	work = context;
1208	work->fn(work);
1209}
1210
/* Task handler that intentionally does nothing. */
void
linux_flush_fn(void *context, int pending)
{
}
1215
1216struct workqueue_struct *
1217linux_create_workqueue_common(const char *name, int cpus)
1218{
1219	struct workqueue_struct *wq;
1220
1221	wq = kmalloc(sizeof(*wq), M_WAITOK);
1222	wq->taskqueue = taskqueue_create(name, M_WAITOK,
1223	    taskqueue_thread_enqueue,  &wq->taskqueue);
1224	atomic_set(&wq->draining, 0);
1225	taskqueue_start_threads(&wq->taskqueue, cpus, PWAIT, "%s", name);
1226
1227	return (wq);
1228}
1229
/* Free the workqueue's taskqueue and then the workqueue itself. */
void
destroy_workqueue(struct workqueue_struct *wq)
{
	taskqueue_free(wq->taskqueue);
	kfree(wq);
}
1236
1237static void
1238linux_cdev_release(struct kobject *kobj)
1239{
1240	struct linux_cdev *cdev;
1241	struct kobject *parent;
1242
1243	cdev = container_of(kobj, struct linux_cdev, kobj);
1244	parent = kobj->parent;
1245	if (cdev->cdev)
1246		destroy_dev(cdev->cdev);
1247	kfree(cdev);
1248	kobject_put(parent);
1249}
1250
1251static void
1252linux_cdev_static_release(struct kobject *kobj)
1253{
1254	struct linux_cdev *cdev;
1255	struct kobject *parent;
1256
1257	cdev = container_of(kobj, struct linux_cdev, kobj);
1258	parent = kobj->parent;
1259	if (cdev->cdev)
1260		destroy_dev(cdev->cdev);
1261	kobject_put(parent);
1262}
1263
/* ktype for Linux cdevs that are freed by their release method. */
const struct kobj_type linux_cdev_ktype = {
	.release = linux_cdev_release,
};

/* ktype for Linux cdevs whose release method does not free them. */
const struct kobj_type linux_cdev_static_ktype = {
	.release = linux_cdev_static_release,
};
1271
1272static void
1273linux_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate)
1274{
1275	struct notifier_block *nb;
1276
1277	nb = arg;
1278	if (linkstate == LINK_STATE_UP)
1279		nb->notifier_call(nb, NETDEV_UP, ifp);
1280	else
1281		nb->notifier_call(nb, NETDEV_DOWN, ifp);
1282}
1283
1284static void
1285linux_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp)
1286{
1287	struct notifier_block *nb;
1288
1289	nb = arg;
1290	nb->notifier_call(nb, NETDEV_REGISTER, ifp);
1291}
1292
1293static void
1294linux_handle_ifnet_departure_event(void *arg, struct ifnet *ifp)
1295{
1296	struct notifier_block *nb;
1297
1298	nb = arg;
1299	nb->notifier_call(nb, NETDEV_UNREGISTER, ifp);
1300}
1301
1302static void
1303linux_handle_iflladdr_event(void *arg, struct ifnet *ifp)
1304{
1305	struct notifier_block *nb;
1306
1307	nb = arg;
1308	nb->notifier_call(nb, NETDEV_CHANGEADDR, ifp);
1309}
1310
1311static void
1312linux_handle_ifaddr_event(void *arg, struct ifnet *ifp)
1313{
1314	struct notifier_block *nb;
1315
1316	nb = arg;
1317	nb->notifier_call(nb, NETDEV_CHANGEIFADDR, ifp);
1318}
1319
1320int
1321register_netdevice_notifier(struct notifier_block *nb)
1322{
1323
1324	nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER(
1325	    ifnet_link_event, linux_handle_ifnet_link_event, nb, 0);
1326	nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER(
1327	    ifnet_arrival_event, linux_handle_ifnet_arrival_event, nb, 0);
1328	nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER(
1329	    ifnet_departure_event, linux_handle_ifnet_departure_event, nb, 0);
1330	nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER(
1331	    iflladdr_event, linux_handle_iflladdr_event, nb, 0);
1332
1333	return (0);
1334}
1335
1336int
1337register_inetaddr_notifier(struct notifier_block *nb)
1338{
1339
1340        nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER(
1341            ifaddr_event, linux_handle_ifaddr_event, nb, 0);
1342        return (0);
1343}
1344
1345int
1346unregister_netdevice_notifier(struct notifier_block *nb)
1347{
1348
1349        EVENTHANDLER_DEREGISTER(ifnet_link_event,
1350	    nb->tags[NETDEV_UP]);
1351        EVENTHANDLER_DEREGISTER(ifnet_arrival_event,
1352	    nb->tags[NETDEV_REGISTER]);
1353        EVENTHANDLER_DEREGISTER(ifnet_departure_event,
1354	    nb->tags[NETDEV_UNREGISTER]);
1355        EVENTHANDLER_DEREGISTER(iflladdr_event,
1356	    nb->tags[NETDEV_CHANGEADDR]);
1357
1358	return (0);
1359}
1360
1361int
1362unregister_inetaddr_notifier(struct notifier_block *nb)
1363{
1364
1365        EVENTHANDLER_DEREGISTER(ifaddr_event,
1366            nb->tags[NETDEV_CHANGEIFADDR]);
1367
1368        return (0);
1369}
1370
/*
 * Context handed to qsort_r() by list_sort(): the caller's comparison
 * callback together with the opaque pointer it expects as its first
 * argument.
 */
struct list_sort_thunk {
	/* Caller-supplied comparison of two list entries. */
	int (*cmp)(void *priv, struct list_head *a, struct list_head *b);
	/* Passed through unchanged as the first argument of cmp. */
	void *priv;
};
1375
1376static inline int
1377linux_le_cmp(void *priv, const void *d1, const void *d2)
1378{
1379	struct list_head *le1, *le2;
1380	struct list_sort_thunk *thunk;
1381
1382	thunk = priv;
1383	le1 = *(__DECONST(struct list_head **, d1));
1384	le2 = *(__DECONST(struct list_head **, d2));
1385	return ((thunk->cmp)(thunk->priv, le1, le2));
1386}
1387
1388void
1389list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv,
1390    struct list_head *a, struct list_head *b))
1391{
1392	struct list_sort_thunk thunk;
1393	struct list_head **ar, *le;
1394	size_t count, i;
1395
1396	count = 0;
1397	list_for_each(le, head)
1398		count++;
1399	ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK);
1400	i = 0;
1401	list_for_each(le, head)
1402		ar[i++] = le;
1403	thunk.cmp = cmp;
1404	thunk.priv = priv;
1405	qsort_r(ar, count, sizeof(struct list_head *), &thunk, linux_le_cmp);
1406	INIT_LIST_HEAD(head);
1407	for (i = 0; i < count; i++)
1408		list_add_tail(ar[i], head);
1409	free(ar, M_KMALLOC);
1410}
1411
1412void
1413linux_irq_handler(void *ent)
1414{
1415	struct irq_ent *irqe;
1416
1417	irqe = ent;
1418	irqe->handler(irqe->irq, irqe->arg);
1419}
1420
1421struct linux_cdev *
1422linux_find_cdev(const char *name, unsigned major, unsigned minor)
1423{
1424	int unit = MKDEV(major, minor);
1425	struct cdev *cdev;
1426
1427	dev_lock();
1428	LIST_FOREACH(cdev, &linuxcdevsw.d_devs, si_list) {
1429		struct linux_cdev *ldev = cdev->si_drv1;
1430		if (dev2unit(cdev) == unit &&
1431		    strcmp(kobject_name(&ldev->kobj), name) == 0) {
1432			break;
1433		}
1434	}
1435	dev_unlock();
1436
1437	return (cdev != NULL ? cdev->si_drv1 : NULL);
1438}
1439
1440int
1441__register_chrdev(unsigned int major, unsigned int baseminor,
1442    unsigned int count, const char *name,
1443    const struct file_operations *fops)
1444{
1445	struct linux_cdev *cdev;
1446	int ret = 0;
1447	int i;
1448
1449	for (i = baseminor; i < baseminor + count; i++) {
1450		cdev = cdev_alloc();
1451		cdev_init(cdev, fops);
1452		kobject_set_name(&cdev->kobj, name);
1453
1454		ret = cdev_add(cdev, makedev(major, i), 1);
1455		if (ret != 0)
1456			break;
1457	}
1458	return (ret);
1459}
1460
1461int
1462__register_chrdev_p(unsigned int major, unsigned int baseminor,
1463    unsigned int count, const char *name,
1464    const struct file_operations *fops, uid_t uid,
1465    gid_t gid, int mode)
1466{
1467	struct linux_cdev *cdev;
1468	int ret = 0;
1469	int i;
1470
1471	for (i = baseminor; i < baseminor + count; i++) {
1472		cdev = cdev_alloc();
1473		cdev_init(cdev, fops);
1474		kobject_set_name(&cdev->kobj, name);
1475
1476		ret = cdev_add_ext(cdev, makedev(major, i), uid, gid, mode);
1477		if (ret != 0)
1478			break;
1479	}
1480	return (ret);
1481}
1482
1483void
1484__unregister_chrdev(unsigned int major, unsigned int baseminor,
1485    unsigned int count, const char *name)
1486{
1487	struct linux_cdev *cdevp;
1488	int i;
1489
1490	for (i = baseminor; i < baseminor + count; i++) {
1491		cdevp = linux_find_cdev(name, major, i);
1492		if (cdevp != NULL)
1493			cdev_del(cdevp);
1494	}
1495}
1496
1497#if defined(__i386__) || defined(__amd64__)
1498bool linux_cpu_has_clflush;
1499#endif
1500
1501static void
1502linux_compat_init(void *arg)
1503{
1504	struct sysctl_oid *rootoid;
1505	int i;
1506
1507#if defined(__i386__) || defined(__amd64__)
1508	linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH);
1509#endif
1510	sx_init(&linux_global_rcu_lock, "LinuxGlobalRCU");
1511
1512	rootoid = SYSCTL_ADD_ROOT_NODE(NULL,
1513	    OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys");
1514	kobject_init(&linux_class_root, &linux_class_ktype);
1515	kobject_set_name(&linux_class_root, "class");
1516	linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid),
1517	    OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class");
1518	kobject_init(&linux_root_device.kobj, &linux_dev_ktype);
1519	kobject_set_name(&linux_root_device.kobj, "device");
1520	linux_root_device.kobj.oidp = SYSCTL_ADD_NODE(NULL,
1521	    SYSCTL_CHILDREN(rootoid), OID_AUTO, "device", CTLFLAG_RD, NULL,
1522	    "device");
1523	linux_root_device.bsddev = root_bus;
1524	linux_class_misc.name = "misc";
1525	class_register(&linux_class_misc);
1526	INIT_LIST_HEAD(&pci_drivers);
1527	INIT_LIST_HEAD(&pci_devices);
1528	spin_lock_init(&pci_lock);
1529	mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF);
1530	for (i = 0; i < VMMAP_HASH_SIZE; i++)
1531		LIST_INIT(&vmmaphead[i]);
1532}
1533SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL);
1534
1535static void
1536linux_compat_uninit(void *arg)
1537{
1538	linux_kobject_kfree_name(&linux_class_root);
1539	linux_kobject_kfree_name(&linux_root_device.kobj);
1540	linux_kobject_kfree_name(&linux_class_misc.kobj);
1541
1542	synchronize_rcu();
1543	sx_destroy(&linux_global_rcu_lock);
1544}
1545SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL);
1546
1547/*
1548 * NOTE: Linux frequently uses "unsigned long" for pointer to integer
1549 * conversion and vice versa, where in FreeBSD "uintptr_t" would be
1550 * used. Assert these types have the same size, else some parts of the
1551 * LinuxKPI may not work like expected:
1552 */
1553CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t));
1554