kernel.c revision 271533
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
24 * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
25 */
26
27#include <assert.h>
28#include <fcntl.h>
29#include <poll.h>
30#include <stdio.h>
31#include <stdlib.h>
32#include <string.h>
33#include <zlib.h>
34#include <libgen.h>
35#include <sys/spa.h>
36#include <sys/stat.h>
37#include <sys/processor.h>
38#include <sys/zfs_context.h>
39#include <sys/rrwlock.h>
40#include <sys/zmod.h>
41#include <sys/utsname.h>
42#include <sys/systeminfo.h>
43
44/*
45 * Emulation of kernel services in userland.
46 */
47
/*
 * Global state shared by the emulated kernel services in this file.
 */
int aok;
/* Set by kernel_init() from sysconf(_SC_PHYS_PAGES). */
uint64_t physmem;
/*
 * Dummy, never-dereferenced address: vn_openat() only compares its
 * startvp argument against it.
 */
vnode_t *rootdir = (vnode_t *)0xabcd1234;
/* Filled in by kernel_init() with the decimal host id, or "0". */
char hw_serial[HW_HOSTID_LEN];
#ifdef illumos
/* Initialized by kernel_init(). */
kmutex_t cpu_lock;
#endif

/* If set, all blocks read will be copied to the specified directory. */
char *vn_dumpdir = NULL;

/* Positional initializer; NOTE(review): relies on the member order of struct utsname. */
struct utsname utsname = {
	"userland", "libzpool", "1", "1", "na"
};

/* this only exists to have its address taken */
struct proc p0;
65
66/*
67 * =========================================================================
68 * threads
69 * =========================================================================
70 */
71/*ARGSUSED*/
72kthread_t *
73zk_thread_create(void (*func)(), void *arg)
74{
75	thread_t tid;
76
77	VERIFY(thr_create(0, 0, (void *(*)(void *))func, arg, THR_DETACHED,
78	    &tid) == 0);
79
80	return ((void *)(uintptr_t)tid);
81}
82
83/*
84 * =========================================================================
85 * kstats
86 * =========================================================================
87 */
88/*ARGSUSED*/
89kstat_t *
90kstat_create(char *module, int instance, char *name, char *class,
91    uchar_t type, ulong_t ndata, uchar_t ks_flag)
92{
93	return (NULL);
94}
95
96/*ARGSUSED*/
97void
98kstat_install(kstat_t *ksp)
99{}
100
101/*ARGSUSED*/
102void
103kstat_delete(kstat_t *ksp)
104{}
105
106/*
107 * =========================================================================
108 * mutexes
109 * =========================================================================
110 */
111void
112zmutex_init(kmutex_t *mp)
113{
114	mp->m_owner = NULL;
115	mp->initialized = B_TRUE;
116	(void) _mutex_init(&mp->m_lock, USYNC_THREAD, NULL);
117}
118
119void
120zmutex_destroy(kmutex_t *mp)
121{
122	ASSERT(mp->initialized == B_TRUE);
123	ASSERT(mp->m_owner == NULL);
124	(void) _mutex_destroy(&(mp)->m_lock);
125	mp->m_owner = (void *)-1UL;
126	mp->initialized = B_FALSE;
127}
128
129int
130zmutex_owned(kmutex_t *mp)
131{
132	ASSERT(mp->initialized == B_TRUE);
133
134	return (mp->m_owner == curthread);
135}
136
137void
138mutex_enter(kmutex_t *mp)
139{
140	ASSERT(mp->initialized == B_TRUE);
141	ASSERT(mp->m_owner != (void *)-1UL);
142	ASSERT(mp->m_owner != curthread);
143	VERIFY(mutex_lock(&mp->m_lock) == 0);
144	ASSERT(mp->m_owner == NULL);
145	mp->m_owner = curthread;
146}
147
148int
149mutex_tryenter(kmutex_t *mp)
150{
151	ASSERT(mp->initialized == B_TRUE);
152	ASSERT(mp->m_owner != (void *)-1UL);
153	if (0 == mutex_trylock(&mp->m_lock)) {
154		ASSERT(mp->m_owner == NULL);
155		mp->m_owner = curthread;
156		return (1);
157	} else {
158		return (0);
159	}
160}
161
162void
163mutex_exit(kmutex_t *mp)
164{
165	ASSERT(mp->initialized == B_TRUE);
166	ASSERT(mutex_owner(mp) == curthread);
167	mp->m_owner = NULL;
168	VERIFY(mutex_unlock(&mp->m_lock) == 0);
169}
170
171void *
172mutex_owner(kmutex_t *mp)
173{
174	ASSERT(mp->initialized == B_TRUE);
175	return (mp->m_owner);
176}
177
178/*
179 * =========================================================================
180 * rwlocks
181 * =========================================================================
182 */
183/*ARGSUSED*/
184void
185rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
186{
187	rwlock_init(&rwlp->rw_lock, USYNC_THREAD, NULL);
188	rwlp->rw_owner = NULL;
189	rwlp->initialized = B_TRUE;
190	rwlp->rw_count = 0;
191}
192
193void
194rw_destroy(krwlock_t *rwlp)
195{
196	ASSERT(rwlp->rw_count == 0);
197	rwlock_destroy(&rwlp->rw_lock);
198	rwlp->rw_owner = (void *)-1UL;
199	rwlp->initialized = B_FALSE;
200}
201
202void
203rw_enter(krwlock_t *rwlp, krw_t rw)
204{
205	//ASSERT(!RW_LOCK_HELD(rwlp));
206	ASSERT(rwlp->initialized == B_TRUE);
207	ASSERT(rwlp->rw_owner != (void *)-1UL);
208	ASSERT(rwlp->rw_owner != curthread);
209
210	if (rw == RW_READER) {
211		VERIFY(rw_rdlock(&rwlp->rw_lock) == 0);
212		ASSERT(rwlp->rw_count >= 0);
213		atomic_add_int(&rwlp->rw_count, 1);
214	} else {
215		VERIFY(rw_wrlock(&rwlp->rw_lock) == 0);
216		ASSERT(rwlp->rw_count == 0);
217		rwlp->rw_count = -1;
218		rwlp->rw_owner = curthread;
219	}
220}
221
222void
223rw_exit(krwlock_t *rwlp)
224{
225	ASSERT(rwlp->initialized == B_TRUE);
226	ASSERT(rwlp->rw_owner != (void *)-1UL);
227
228	if (rwlp->rw_owner == curthread) {
229		/* Write locked. */
230		ASSERT(rwlp->rw_count == -1);
231		rwlp->rw_count = 0;
232		rwlp->rw_owner = NULL;
233	} else {
234		/* Read locked. */
235		ASSERT(rwlp->rw_count > 0);
236		atomic_add_int(&rwlp->rw_count, -1);
237	}
238	VERIFY(rw_unlock(&rwlp->rw_lock) == 0);
239}
240
241int
242rw_tryenter(krwlock_t *rwlp, krw_t rw)
243{
244	int rv;
245
246	ASSERT(rwlp->initialized == B_TRUE);
247	ASSERT(rwlp->rw_owner != (void *)-1UL);
248	ASSERT(rwlp->rw_owner != curthread);
249
250	if (rw == RW_READER)
251		rv = rw_tryrdlock(&rwlp->rw_lock);
252	else
253		rv = rw_trywrlock(&rwlp->rw_lock);
254
255	if (rv == 0) {
256		ASSERT(rwlp->rw_owner == NULL);
257		if (rw == RW_READER) {
258			ASSERT(rwlp->rw_count >= 0);
259			atomic_add_int(&rwlp->rw_count, 1);
260		} else {
261			ASSERT(rwlp->rw_count == 0);
262			rwlp->rw_count = -1;
263			rwlp->rw_owner = curthread;
264		}
265		return (1);
266	}
267
268	return (0);
269}
270
271/*ARGSUSED*/
272int
273rw_tryupgrade(krwlock_t *rwlp)
274{
275	ASSERT(rwlp->initialized == B_TRUE);
276	ASSERT(rwlp->rw_owner != (void *)-1UL);
277
278	return (0);
279}
280
281int
282rw_lock_held(krwlock_t *rwlp)
283{
284
285	return (rwlp->rw_count != 0);
286}
287
288/*
289 * =========================================================================
290 * condition variables
291 * =========================================================================
292 */
293/*ARGSUSED*/
294void
295cv_init(kcondvar_t *cv, char *name, int type, void *arg)
296{
297	VERIFY(cond_init(cv, name, NULL) == 0);
298}
299
300void
301cv_destroy(kcondvar_t *cv)
302{
303	VERIFY(cond_destroy(cv) == 0);
304}
305
306void
307cv_wait(kcondvar_t *cv, kmutex_t *mp)
308{
309	ASSERT(mutex_owner(mp) == curthread);
310	mp->m_owner = NULL;
311	int ret = cond_wait(cv, &mp->m_lock);
312	VERIFY(ret == 0 || ret == EINTR);
313	mp->m_owner = curthread;
314}
315
316clock_t
317cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
318{
319	int error;
320	struct timespec ts;
321	struct timeval tv;
322	clock_t delta;
323
324	abstime += ddi_get_lbolt();
325top:
326	delta = abstime - ddi_get_lbolt();
327	if (delta <= 0)
328		return (-1);
329
330	if (gettimeofday(&tv, NULL) != 0)
331		assert(!"gettimeofday() failed");
332
333	ts.tv_sec = tv.tv_sec + delta / hz;
334	ts.tv_nsec = tv.tv_usec * 1000 + (delta % hz) * (NANOSEC / hz);
335	ASSERT(ts.tv_nsec >= 0);
336
337	if (ts.tv_nsec >= NANOSEC) {
338		ts.tv_sec++;
339		ts.tv_nsec -= NANOSEC;
340	}
341
342	ASSERT(mutex_owner(mp) == curthread);
343	mp->m_owner = NULL;
344	error = pthread_cond_timedwait(cv, &mp->m_lock, &ts);
345	mp->m_owner = curthread;
346
347	if (error == EINTR)
348		goto top;
349
350	if (error == ETIMEDOUT)
351		return (-1);
352
353	ASSERT(error == 0);
354
355	return (1);
356}
357
358/*ARGSUSED*/
359clock_t
360cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
361    int flag)
362{
363	int error;
364	timestruc_t ts;
365	hrtime_t delta;
366
367	ASSERT(flag == 0);
368
369top:
370	delta = tim - gethrtime();
371	if (delta <= 0)
372		return (-1);
373
374	ts.tv_sec = delta / NANOSEC;
375	ts.tv_nsec = delta % NANOSEC;
376
377	ASSERT(mutex_owner(mp) == curthread);
378	mp->m_owner = NULL;
379	error = pthread_cond_timedwait(cv, &mp->m_lock, &ts);
380	mp->m_owner = curthread;
381
382	if (error == ETIMEDOUT)
383		return (-1);
384
385	if (error == EINTR)
386		goto top;
387
388	ASSERT(error == 0);
389
390	return (1);
391}
392
393void
394cv_signal(kcondvar_t *cv)
395{
396	VERIFY(cond_signal(cv) == 0);
397}
398
399void
400cv_broadcast(kcondvar_t *cv)
401{
402	VERIFY(cond_broadcast(cv) == 0);
403}
404
405/*
406 * =========================================================================
407 * vnode operations
408 * =========================================================================
409 */
410/*
411 * Note: for the xxxat() versions of these functions, we assume that the
412 * starting vp is always rootdir (which is true for spa_directory.c, the only
413 * ZFS consumer of these interfaces).  We assert this is true, and then emulate
414 * them by adding '/' in front of the path.
415 */
416
417/*ARGSUSED*/
418int
419vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
420{
421	int fd;
422	int dump_fd;
423	vnode_t *vp;
424	int old_umask;
425	char realpath[MAXPATHLEN];
426	struct stat64 st;
427
428	/*
429	 * If we're accessing a real disk from userland, we need to use
430	 * the character interface to avoid caching.  This is particularly
431	 * important if we're trying to look at a real in-kernel storage
432	 * pool from userland, e.g. via zdb, because otherwise we won't
433	 * see the changes occurring under the segmap cache.
434	 * On the other hand, the stupid character device returns zero
435	 * for its size.  So -- gag -- we open the block device to get
436	 * its size, and remember it for subsequent VOP_GETATTR().
437	 */
438	if (strncmp(path, "/dev/", 5) == 0) {
439		char *dsk;
440		fd = open64(path, O_RDONLY);
441		if (fd == -1)
442			return (errno);
443		if (fstat64(fd, &st) == -1) {
444			close(fd);
445			return (errno);
446		}
447		close(fd);
448		(void) sprintf(realpath, "%s", path);
449		dsk = strstr(path, "/dsk/");
450		if (dsk != NULL)
451			(void) sprintf(realpath + (dsk - path) + 1, "r%s",
452			    dsk + 1);
453	} else {
454		(void) sprintf(realpath, "%s", path);
455		if (!(flags & FCREAT) && stat64(realpath, &st) == -1)
456			return (errno);
457	}
458
459	if (flags & FCREAT)
460		old_umask = umask(0);
461
462	/*
463	 * The construct 'flags - FREAD' conveniently maps combinations of
464	 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR.
465	 */
466	fd = open64(realpath, flags - FREAD, mode);
467
468	if (flags & FCREAT)
469		(void) umask(old_umask);
470
471	if (vn_dumpdir != NULL) {
472		char dumppath[MAXPATHLEN];
473		(void) snprintf(dumppath, sizeof (dumppath),
474		    "%s/%s", vn_dumpdir, basename(realpath));
475		dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666);
476		if (dump_fd == -1)
477			return (errno);
478	} else {
479		dump_fd = -1;
480	}
481
482	if (fd == -1)
483		return (errno);
484
485	if (fstat64(fd, &st) == -1) {
486		close(fd);
487		return (errno);
488	}
489
490	(void) fcntl(fd, F_SETFD, FD_CLOEXEC);
491
492	*vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL);
493
494	vp->v_fd = fd;
495	vp->v_size = st.st_size;
496	vp->v_path = spa_strdup(path);
497	vp->v_dump_fd = dump_fd;
498
499	return (0);
500}
501
502/*ARGSUSED*/
503int
504vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2,
505    int x3, vnode_t *startvp, int fd)
506{
507	char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL);
508	int ret;
509
510	ASSERT(startvp == rootdir);
511	(void) sprintf(realpath, "/%s", path);
512
513	/* fd ignored for now, need if want to simulate nbmand support */
514	ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3);
515
516	umem_free(realpath, strlen(path) + 2);
517
518	return (ret);
519}
520
521/*ARGSUSED*/
522int
523vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset,
524	int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp)
525{
526	ssize_t iolen, split;
527
528	if (uio == UIO_READ) {
529		iolen = pread64(vp->v_fd, addr, len, offset);
530		if (vp->v_dump_fd != -1) {
531			int status =
532			    pwrite64(vp->v_dump_fd, addr, iolen, offset);
533			ASSERT(status != -1);
534		}
535	} else {
536		/*
537		 * To simulate partial disk writes, we split writes into two
538		 * system calls so that the process can be killed in between.
539		 */
540		int sectors = len >> SPA_MINBLOCKSHIFT;
541		split = (sectors > 0 ? rand() % sectors : 0) <<
542		    SPA_MINBLOCKSHIFT;
543		iolen = pwrite64(vp->v_fd, addr, split, offset);
544		iolen += pwrite64(vp->v_fd, (char *)addr + split,
545		    len - split, offset + split);
546	}
547
548	if (iolen == -1)
549		return (errno);
550	if (residp)
551		*residp = len - iolen;
552	else if (iolen != len)
553		return (EIO);
554	return (0);
555}
556
557void
558vn_close(vnode_t *vp, int openflag, cred_t *cr, kthread_t *td)
559{
560	close(vp->v_fd);
561	if (vp->v_dump_fd != -1)
562		close(vp->v_dump_fd);
563	spa_strfree(vp->v_path);
564	umem_free(vp, sizeof (vnode_t));
565}
566
567/*
568 * At a minimum we need to update the size since vdev_reopen()
569 * will no longer call vn_openat().
570 */
571int
572fop_getattr(vnode_t *vp, vattr_t *vap)
573{
574	struct stat64 st;
575
576	if (fstat64(vp->v_fd, &st) == -1) {
577		close(vp->v_fd);
578		return (errno);
579	}
580
581	vap->va_size = st.st_size;
582	return (0);
583}
584
585#ifdef ZFS_DEBUG
586
587/*
588 * =========================================================================
589 * Figure out which debugging statements to print
590 * =========================================================================
591 */
592
/* Comma-separated list of debug selectors, or NULL when none is set. */
static char *dprintf_string;
/* Non-zero when every debug statement should be printed. */
static int dprintf_print_all;

/*
 * Report whether string appears as a complete, comma-delimited entry of
 * dprintf_string.
 * String format: file1.c,function_name1,file2.c,file3.c
 *
 * Returns 1 on a match and 0 otherwise (including when no list is set).
 */
int
dprintf_find_string(const char *string)
{
	int len = strlen(string);
	char *entry;

	for (entry = dprintf_string; entry != NULL; ) {
		if (strncmp(entry, string, len) == 0) {
			/* A match must end exactly at an entry boundary. */
			char next = entry[len];

			if (next == ',' || next == '\0')
				return (1);
		}

		/* Move to the entry after the next comma, if any. */
		entry = strchr(entry, ',');
		if (entry != NULL)
			entry++;
	}

	return (0);
}
617
618void
619dprintf_setup(int *argc, char **argv)
620{
621	int i, j;
622
623	/*
624	 * Debugging can be specified two ways: by setting the
625	 * environment variable ZFS_DEBUG, or by including a
626	 * "debug=..."  argument on the command line.  The command
627	 * line setting overrides the environment variable.
628	 */
629
630	for (i = 1; i < *argc; i++) {
631		int len = strlen("debug=");
632		/* First look for a command line argument */
633		if (strncmp("debug=", argv[i], len) == 0) {
634			dprintf_string = argv[i] + len;
635			/* Remove from args */
636			for (j = i; j < *argc; j++)
637				argv[j] = argv[j+1];
638			argv[j] = NULL;
639			(*argc)--;
640		}
641	}
642
643	if (dprintf_string == NULL) {
644		/* Look for ZFS_DEBUG environment variable */
645		dprintf_string = getenv("ZFS_DEBUG");
646	}
647
648	/*
649	 * Are we just turning on all debugging?
650	 */
651	if (dprintf_find_string("on"))
652		dprintf_print_all = 1;
653
654	if (dprintf_string != NULL)
655		zfs_flags |= ZFS_DEBUG_DPRINTF;
656}
657
/*
 * Stub sysctl handler: ignores its arguments and reports success (0).
 */
int
sysctl_handle_64(SYSCTL_HANDLER_ARGS)
{
	const int ok = 0;

	return (ok);
}
663
664/*
665 * =========================================================================
666 * debug printfs
667 * =========================================================================
668 */
669void
670__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
671{
672	const char *newfile;
673	va_list adx;
674
675	/*
676	 * Get rid of annoying "../common/" prefix to filename.
677	 */
678	newfile = strrchr(file, '/');
679	if (newfile != NULL) {
680		newfile = newfile + 1; /* Get rid of leading / */
681	} else {
682		newfile = file;
683	}
684
685	if (dprintf_print_all ||
686	    dprintf_find_string(newfile) ||
687	    dprintf_find_string(func)) {
688		/* Print out just the function name if requested */
689		flockfile(stdout);
690		if (dprintf_find_string("pid"))
691			(void) printf("%d ", getpid());
692		if (dprintf_find_string("tid"))
693			(void) printf("%lu ", thr_self());
694#if 0
695		if (dprintf_find_string("cpu"))
696			(void) printf("%u ", getcpuid());
697#endif
698		if (dprintf_find_string("time"))
699			(void) printf("%llu ", gethrtime());
700		if (dprintf_find_string("long"))
701			(void) printf("%s, line %d: ", newfile, line);
702		(void) printf("%s: ", func);
703		va_start(adx, fmt);
704		(void) vprintf(fmt, adx);
705		va_end(adx);
706		funlockfile(stdout);
707	}
708}
709
710#endif /* ZFS_DEBUG */
711
712/*
713 * =========================================================================
714 * cmn_err() and panic()
715 * =========================================================================
716 */
717static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
718static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
719
/*
 * Print "error: <formatted message>" and a newline to stderr, then abort
 * the process.
 */
void
vpanic(const char *fmt, va_list adx)
{
	FILE *out = stderr;

	(void) fprintf(out, "error: ");
	(void) vfprintf(out, fmt, adx);
	(void) fprintf(out, "\n");

	abort();	/* think of it as a "user-level crash dump" */
}
729
/*
 * printf-style front end to vpanic().
 */
void
panic(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vpanic(fmt, args);
	va_end(args);
}
739
740void
741vcmn_err(int ce, const char *fmt, va_list adx)
742{
743	if (ce == CE_PANIC)
744		vpanic(fmt, adx);
745	if (ce != CE_NOTE) {	/* suppress noise in userland stress testing */
746		(void) fprintf(stderr, "%s", ce_prefix[ce]);
747		(void) vfprintf(stderr, fmt, adx);
748		(void) fprintf(stderr, "%s", ce_suffix[ce]);
749	}
750}
751
/*
 * printf-style front end to vcmn_err().
 */
/*PRINTFLIKE2*/
void
cmn_err(int ce, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vcmn_err(ce, fmt, args);
	va_end(args);
}
762
763/*
764 * =========================================================================
765 * kobj interfaces
766 * =========================================================================
767 */
768struct _buf *
769kobj_open_file(char *name)
770{
771	struct _buf *file;
772	vnode_t *vp;
773
774	/* set vp as the _fd field of the file */
775	if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir,
776	    -1) != 0)
777		return ((void *)-1UL);
778
779	file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL);
780	file->_fd = (intptr_t)vp;
781	return (file);
782}
783
784int
785kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
786{
787	ssize_t resid;
788
789	vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off,
790	    UIO_SYSSPACE, 0, 0, 0, &resid);
791
792	return (size - resid);
793}
794
795void
796kobj_close_file(struct _buf *file)
797{
798	vn_close((vnode_t *)file->_fd, 0, NULL, NULL);
799	umem_free(file, sizeof (struct _buf));
800}
801
802int
803kobj_get_filesize(struct _buf *file, uint64_t *size)
804{
805	struct stat64 st;
806	vnode_t *vp = (vnode_t *)file->_fd;
807
808	if (fstat64(vp->v_fd, &st) == -1) {
809		vn_close(vp, 0, NULL, NULL);
810		return (errno);
811	}
812	*size = st.st_size;
813	return (0);
814}
815
816/*
817 * =========================================================================
818 * misc routines
819 * =========================================================================
820 */
821
822void
823delay(clock_t ticks)
824{
825	poll(0, 0, ticks * (1000 / hz));
826}
827
828#if 0
/*
 * Find highest one bit set.
 *	Returns bit number + 1 of highest bit that is set, otherwise returns 0.
 */
int
highbit64(uint64_t i)
{
	/* Halving search: test the upper part, keep whichever half has bits. */
	static const int widths[] = { 32, 16, 8, 4, 2, 1 };
	int h = 1;
	unsigned k;

	if (i == 0)
		return (0);

	for (k = 0; k < sizeof (widths) / sizeof (widths[0]); k++) {
		if ((i >> widths[k]) != 0) {
			h += widths[k];
			i >>= widths[k];
		}
	}

	return (h);
}
860#endif
861
/* Descriptors for /dev/random and /dev/urandom; opened by kernel_init(). */
static int random_fd = -1, urandom_fd = -1;

/*
 * Fill ptr[0..len) with bytes read from fd, looping over short reads.
 *
 * A read interrupted by a signal is retried.  Any other read failure, or
 * end of file, is fatal.  Always returns 0.
 */
static int
random_get_bytes_common(uint8_t *ptr, size_t len, int fd)
{
	size_t resid = len;
	ssize_t bytes;

	ASSERT(fd != -1);

	while (resid != 0) {
		bytes = read(fd, ptr, resid);
		if (bytes == -1 && errno == EINTR)
			continue;
		/*
		 * Must hold in every build: -1 would corrupt ptr and resid,
		 * and 0 would loop forever.
		 */
		VERIFY(bytes > 0);
		ptr += bytes;
		resid -= bytes;
	}

	return (0);
}
881
882int
883random_get_bytes(uint8_t *ptr, size_t len)
884{
885	return (random_get_bytes_common(ptr, len, random_fd));
886}
887
888int
889random_get_pseudo_bytes(uint8_t *ptr, size_t len)
890{
891	return (random_get_bytes_common(ptr, len, urandom_fd));
892}
893
/*
 * Convert the leading number in hw_serial to an unsigned long.
 *
 * The converted value is stored in *result.  If nptr is not NULL, a
 * pointer to the first unconverted character is stored in *nptr.
 *
 * Returns 0 on success, EINVAL if no digits could be converted, or the
 * errno set by strtoul() (e.g. ERANGE on overflow).
 */
int
ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result)
{
	char *end;

	/* Reset errno so a stale value is never mistaken for a failure. */
	errno = 0;
	*result = strtoul(hw_serial, &end, base);

	if (nptr != NULL)
		*nptr = end;

	if (end == hw_serial)
		return (EINVAL);

	return (errno);
}
904
905int
906ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
907{
908	char *end;
909
910	*result = strtoull(str, &end, base);
911	if (*result == 0)
912		return (errno);
913	return (0);
914}
915
916#ifdef illumos
917/* ARGSUSED */
918cyclic_id_t
919cyclic_add(cyc_handler_t *hdlr, cyc_time_t *when)
920{
921	return (1);
922}
923
924/* ARGSUSED */
925void
926cyclic_remove(cyclic_id_t id)
927{
928}
929
930/* ARGSUSED */
931int
932cyclic_reprogram(cyclic_id_t id, hrtime_t expiration)
933{
934	return (1);
935}
936#endif
937
938/*
939 * =========================================================================
940 * kernel emulation setup & teardown
941 * =========================================================================
942 */
/*
 * umem "nofail" callback (registered in kernel_init()): report the
 * out-of-memory condition on stderr and abort.  Never returns in practice.
 */
static int
umem_out_of_memory(void)
{
	char errmsg[] = "out of memory -- generating core dump\n";

	/* sizeof counts the terminating NUL, which must not be written. */
	(void) write(fileno(stderr), errmsg, sizeof (errmsg) - 1);
	abort();
	return (0);
}
952
953void
954kernel_init(int mode)
955{
956	extern uint_t rrw_tsd_key;
957
958	umem_nofail_callback(umem_out_of_memory);
959
960	physmem = sysconf(_SC_PHYS_PAGES);
961
962	dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
963	    (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
964
965	(void) snprintf(hw_serial, sizeof (hw_serial), "%lu",
966	    (mode & FWRITE) ? (unsigned long)gethostid() : 0);
967
968	VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1);
969	VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1);
970
971	system_taskq_init();
972
973#ifdef illumos
974	mutex_init(&cpu_lock, NULL, MUTEX_DEFAULT, NULL);
975#endif
976
977	spa_init(mode);
978
979	tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
980}
981
982void
983kernel_fini(void)
984{
985	spa_fini();
986
987	system_taskq_fini();
988
989	close(random_fd);
990	close(urandom_fd);
991
992	random_fd = -1;
993	urandom_fd = -1;
994}
995
996int
997z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen)
998{
999	int ret;
1000	uLongf len = *dstlen;
1001
1002	if ((ret = uncompress(dst, &len, src, srclen)) == Z_OK)
1003		*dstlen = (size_t)len;
1004
1005	return (ret);
1006}
1007
1008int
1009z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen,
1010    int level)
1011{
1012	int ret;
1013	uLongf len = *dstlen;
1014
1015	if ((ret = compress2(dst, &len, src, srclen, level)) == Z_OK)
1016		*dstlen = (size_t)len;
1017
1018	return (ret);
1019}
1020
1021uid_t
1022crgetuid(cred_t *cr)
1023{
1024	return (0);
1025}
1026
1027uid_t
1028crgetruid(cred_t *cr)
1029{
1030	return (0);
1031}
1032
1033gid_t
1034crgetgid(cred_t *cr)
1035{
1036	return (0);
1037}
1038
1039int
1040crgetngroups(cred_t *cr)
1041{
1042	return (0);
1043}
1044
1045gid_t *
1046crgetgroups(cred_t *cr)
1047{
1048	return (NULL);
1049}
1050
1051int
1052zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
1053{
1054	return (0);
1055}
1056
1057int
1058zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
1059{
1060	return (0);
1061}
1062
1063int
1064zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
1065{
1066	return (0);
1067}
1068
1069ksiddomain_t *
1070ksid_lookupdomain(const char *dom)
1071{
1072	ksiddomain_t *kd;
1073
1074	kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL);
1075	kd->kd_name = spa_strdup(dom);
1076	return (kd);
1077}
1078
1079void
1080ksiddomain_rele(ksiddomain_t *ksid)
1081{
1082	spa_strfree(ksid->kd_name);
1083	umem_free(ksid, sizeof (ksiddomain_t));
1084}
1085
1086/*
1087 * Do not change the length of the returned string; it must be freed
1088 * with strfree().
1089 */
1090char *
1091kmem_asprintf(const char *fmt, ...)
1092{
1093	int size;
1094	va_list adx;
1095	char *buf;
1096
1097	va_start(adx, fmt);
1098	size = vsnprintf(NULL, 0, fmt, adx) + 1;
1099	va_end(adx);
1100
1101	buf = kmem_alloc(size, KM_SLEEP);
1102
1103	va_start(adx, fmt);
1104	size = vsnprintf(buf, size, fmt, adx);
1105	va_end(adx);
1106
1107	return (buf);
1108}
1109
1110/* ARGSUSED */
1111int
1112zfs_onexit_fd_hold(int fd, minor_t *minorp)
1113{
1114	*minorp = 0;
1115	return (0);
1116}
1117
/*
 * Stub: does nothing.
 */
/* ARGSUSED */
void
zfs_onexit_fd_rele(int fd)
{
}
1123
1124/* ARGSUSED */
1125int
1126zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
1127    uint64_t *action_handle)
1128{
1129	return (0);
1130}
1131
1132/* ARGSUSED */
1133int
1134zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire)
1135{
1136	return (0);
1137}
1138
1139/* ARGSUSED */
1140int
1141zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data)
1142{
1143	return (0);
1144}
1145
1146#ifdef __FreeBSD__
/*
 * Stub: name is ignored, nothing is created; always returns 0.
 */
/* ARGSUSED */
int
zvol_create_minors(const char *name)
{
	const int rc = 0;

	return (rc);
}
1153#endif
1154
1155#ifdef illumos
1156void
1157bioinit(buf_t *bp)
1158{
1159	bzero(bp, sizeof (buf_t));
1160}
1161
1162void
1163biodone(buf_t *bp)
1164{
1165	if (bp->b_iodone != NULL) {
1166		(*(bp->b_iodone))(bp);
1167		return;
1168	}
1169	ASSERT((bp->b_flags & B_DONE) == 0);
1170	bp->b_flags |= B_DONE;
1171}
1172
1173void
1174bioerror(buf_t *bp, int error)
1175{
1176	ASSERT(bp != NULL);
1177	ASSERT(error >= 0);
1178
1179	if (error != 0) {
1180		bp->b_flags |= B_ERROR;
1181	} else {
1182		bp->b_flags &= ~B_ERROR;
1183	}
1184	bp->b_error = error;
1185}
1186
1187
1188int
1189geterror(struct buf *bp)
1190{
1191	int error = 0;
1192
1193	if (bp->b_flags & B_ERROR) {
1194		error = bp->b_error;
1195		if (!error)
1196			error = EIO;
1197	}
1198	return (error);
1199}
1200#endif
1201