1/*     $NetBSD: kernel.c,v 1.5 2010/12/28 13:36:09 haad Exp $  */
2
3/*
4 * CDDL HEADER START
5 *
6 * The contents of this file are subject to the terms of the
7 * Common Development and Distribution License (the "License").
8 * You may not use this file except in compliance with the License.
9 *
10 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
11 * or http://www.opensolaris.org/os/licensing.
12 * See the License for the specific language governing permissions
13 * and limitations under the License.
14 *
15 * When distributing Covered Code, include this CDDL HEADER in each
16 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
17 * If applicable, add the following below this CDDL HEADER, with the
18 * fields enclosed by brackets "[]" replaced with your own identifying
19 * information: Portions Copyright [yyyy] [name of copyright owner]
20 *
21 * CDDL HEADER END
22 */
23
24/*
25 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
26 * Use is subject to license terms.
27 */
28
29#pragma ident	"%Z%%M%	%I%	%E% SMI"
30
31#include <sys/cdefs.h>
32__RCSID("$NetBSD: kernel.c,v 1.5 2010/12/28 13:36:09 haad Exp $");
33
34#include <sys/zfs_context.h>
35#include <sys/sysctl.h>
36#include <assert.h>
37#include <fcntl.h>
38#include <poll.h>
39#include <stdio.h>
40#include <stdlib.h>
41#include <errno.h>
42#include <string.h>
43#include <zlib.h>
44#include <sys/spa.h>
45#include <sys/stat.h>
46#include <sys/processor.h>
47#include <sys/zmod.h>
48#include <sys/utsname.h>
49
50/*
51 * Emulation of kernel services in userland.
52 */
53
/*
 * Global state that the emulated kernel code expects to find.
 */
#ifdef XXXNETBSD
int hz = 119;	/* frequency when using gethrtime() >> 23 for lbolt */
#endif
/* NOTE(review): presumably the "assertions are OK to fail" switch -- confirm */
int aok;
/* Amount of physical memory; NOTE(review): unit (pages vs. bytes) -- confirm */
uint64_t physmem;
/*
 * Fake root vnode.  The value is only a recognizable sentinel: vn_openat()
 * compares its starting vnode against it.
 */
vnode_t *rootdir = (vnode_t *)0xabcd1234;
/* Host id rendered as a decimal string; filled in by kernel_init() */
char hw_serial[11];
/* System page size in bytes; set by kernel_init(), used by ptob() */
size_t pgsize;

/* Only the first member is initialized; the remaining members are zeroed. */
struct utsname utsname = {
	"userland"
};

/* this only exists to have its address taken */
struct proc p0;
69
70/*
71 * =========================================================================
72 * threads
73 * =========================================================================
74 */
75/*ARGSUSED*/
76kthread_t *
77zk_thread_create(void (*func)(), void *arg)
78{
79	thread_t tid;
80
81	VERIFY(thr_create(0, 0, (void *(*)(void *))func, arg, THR_DETACHED,
82	    &tid) == 0);
83
84	return ((void *)(uintptr_t)tid);
85}
86
87/*
88 * =========================================================================
89 * kstats
90 * =========================================================================
91 */
92/*ARGSUSED*/
93kstat_t *
94kstat_create(char *module, int instance, char *name, char *class,
95    uchar_t type, ulong_t ndata, uchar_t ks_flag)
96{
97	return (NULL);
98}
99
100/*ARGSUSED*/
101void
102kstat_install(kstat_t *ksp)
103{}
104
105/*ARGSUSED*/
106void
107kstat_delete(kstat_t *ksp)
108{}
109
110/*
111 * =========================================================================
112 * vnode operations
113 * =========================================================================
114 */
115/*
116 * Note: for the xxxat() versions of these functions, we assume that the
117 * starting vp is always rootdir (which is true for spa_directory.c, the only
118 * ZFS consumer of these interfaces).  We assert this is true, and then emulate
119 * them by adding '/' in front of the path.
120 */
121
122/*ARGSUSED*/
123int
124vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
125{
126	int fd;
127	vnode_t *vp;
128	int old_umask;
129	char realpath[MAXPATHLEN];
130	struct stat64 st;
131
132	/*
133	 * If we're accessing a real disk from userland, we need to use
134	 * the character interface to avoid caching.  This is particularly
135	 * important if we're trying to look at a real in-kernel storage
136	 * pool from userland, e.g. via zdb, because otherwise we won't
137	 * see the changes occurring under the segmap cache.
138	 * On the other hand, the stupid character device returns zero
139	 * for its size.  So -- gag -- we open the block device to get
140	 * its size, and remember it for subsequent VOP_GETATTR().
141	 */
142	if (strncmp(path, "/dev/", 5) == 0) {
143		char *dsk;
144		fd = open64(path, O_RDONLY);
145		if (fd == -1)
146			return (errno);
147		if (fstat64(fd, &st) == -1) {
148			close(fd);
149			return (errno);
150		}
151		close(fd);
152		(void) sprintf(realpath, "%s", path);
153		dsk = strstr(path, "/dsk/");
154		if (dsk != NULL)
155			(void) sprintf(realpath + (dsk - path) + 1, "r%s",
156			    dsk + 1);
157	} else {
158		(void) sprintf(realpath, "%s", path);
159		if (!(flags & FCREAT) && stat64(realpath, &st) == -1)
160			return (errno);
161	}
162
163	if (flags & FCREAT)
164		old_umask = umask(0);
165
166	/*
167	 * The construct 'flags - FREAD' conveniently maps combinations of
168	 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR.
169	 */
170	fd = open64(realpath, flags - FREAD, mode);
171
172	if (flags & FCREAT)
173		(void) umask(old_umask);
174
175	if (fd == -1)
176		return (errno);
177
178	if (fstat64(fd, &st) == -1) {
179		close(fd);
180		return (errno);
181	}
182
183	(void) fcntl(fd, F_SETFD, FD_CLOEXEC);
184
185	*vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL);
186
187	vp->v_fd = fd;
188	if (S_ISCHR(st.st_mode)) {
189#ifdef XXXAD
190		ioctl(fd, DIOCGMEDIASIZE, &vp->v_size);
191#endif
192	} else
193		vp->v_size = st.st_size;
194	vp->v_path = spa_strdup(path);
195
196	return (0);
197}
198
199int
200vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2,
201    int x3, vnode_t *startvp, int fd)
202{
203	char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL);
204	int ret;
205
206	ASSERT(startvp == rootdir);
207	(void) sprintf(realpath, "/%s", path);
208
209	ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3);
210
211	umem_free(realpath, strlen(path) + 2);
212
213	return (ret);
214}
215
216int
217vn_getattr(vnode_t *vp, vattr_t *va)
218{
219	int fd;
220	struct stat64 st;
221
222	fd = vp->v_fd;
223
224	if (fstat64(fd, &st) == -1)
225		return (errno);
226
227	vp->v_size = st.st_size;
228	va->va_size = st.st_size;
229
230	return 0;
231}
232
233
234/*ARGSUSED*/
235int
236vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset,
237	int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp)
238{
239	ssize_t iolen, split;
240
241	if (uio == UIO_READ) {
242		iolen = pread64(vp->v_fd, addr, len, offset);
243	} else {
244		/*
245		 * To simulate partial disk writes, we split writes into two
246		 * system calls so that the process can be killed in between.
247		 */
248		split = (len > 0 ? rand() % len : 0);
249		iolen = pwrite64(vp->v_fd, addr, split, offset);
250		iolen += pwrite64(vp->v_fd, (char *)addr + split,
251		    len - split, offset + split);
252	}
253
254	if (iolen == -1)
255		return (errno);
256	if (residp)
257		*residp = len - iolen;
258	else if (iolen != len)
259		return (EIO);
260	return (0);
261}
262
263void
264vn_close(vnode_t *vp)
265{
266	close(vp->v_fd);
267	spa_strfree(vp->v_path);
268	umem_free(vp, sizeof (vnode_t));
269}
270
271#ifdef ZFS_DEBUG
272
273/*
274 * =========================================================================
275 * Figure out which debugging statements to print
276 * =========================================================================
277 */
278
/* Comma-separated list of debug selectors, or NULL when none was given */
static char *dprintf_string;
/* Non-zero once the selector "on" has been seen by dprintf_setup() */
static int dprintf_print_all;

/*
 * Decide whether 'string' is one of the entries of dprintf_string.
 * List format: file1.c,function_name1,file2.c,file3.c
 *
 * An entry matches only as a whole (it must be followed by ',' or the end
 * of the list).  Returns 1 on a match and 0 otherwise, including when no
 * list is set.
 */
int
dprintf_find_string(const char *string)
{
	const char *entry = dprintf_string;
	int want = strlen(string);

	while (entry != NULL) {
		const char *comma;

		if (strncmp(entry, string, want) == 0) {
			char terminator = entry[want];

			if (terminator == ',' || terminator == '\0')
				return (1);
		}

		/* Advance to the entry after the next comma, if any. */
		comma = strchr(entry, ',');
		entry = (comma != NULL) ? comma + 1 : NULL;
	}

	return (0);
}
303
304void
305dprintf_setup(int *argc, char **argv)
306{
307	int i, j;
308
309	/*
310	 * Debugging can be specified two ways: by setting the
311	 * environment variable ZFS_DEBUG, or by including a
312	 * "debug=..."  argument on the command line.  The command
313	 * line setting overrides the environment variable.
314	 */
315
316	for (i = 1; i < *argc; i++) {
317		int len = strlen("debug=");
318		/* First look for a command line argument */
319		if (strncmp("debug=", argv[i], len) == 0) {
320			dprintf_string = argv[i] + len;
321			/* Remove from args */
322			for (j = i; j < *argc; j++)
323				argv[j] = argv[j+1];
324			argv[j] = NULL;
325			(*argc)--;
326		}
327	}
328
329	if (dprintf_string == NULL) {
330		/* Look for ZFS_DEBUG environment variable */
331		dprintf_string = getenv("ZFS_DEBUG");
332	}
333
334	/*
335	 * Are we just turning on all debugging?
336	 */
337	if (dprintf_find_string("on"))
338		dprintf_print_all = 1;
339}
340
341/*
342 * =========================================================================
343 * debug printfs
344 * =========================================================================
345 */
346void
347__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
348{
349	const char *newfile;
350	va_list adx;
351
352	/*
353	 * Get rid of annoying "../common/" prefix to filename.
354	 */
355	newfile = strrchr(file, '/');
356	if (newfile != NULL) {
357		newfile = newfile + 1; /* Get rid of leading / */
358	} else {
359		newfile = file;
360	}
361
362	if (dprintf_print_all ||
363	    dprintf_find_string(newfile) ||
364	    dprintf_find_string(func)) {
365		/* Print out just the function name if requested */
366		flockfile(stdout);
367		if (dprintf_find_string("pid"))
368			(void) printf("%d ", getpid());
369		if (dprintf_find_string("tid"))
370			(void) printf("%u ", thr_self());
371#if 0
372		if (dprintf_find_string("cpu"))
373			(void) printf("%u ", getcpuid());
374#endif
375		if (dprintf_find_string("time"))
376			(void) printf("%llu ", gethrtime());
377		if (dprintf_find_string("long"))
378			(void) printf("%s, line %d: ", newfile, line);
379		(void) printf("%s: ", func);
380		va_start(adx, fmt);
381		(void) vprintf(fmt, adx);
382		va_end(adx);
383		funlockfile(stdout);
384	}
385}
386
387#endif /* ZFS_DEBUG */
388
389/*
390 * =========================================================================
391 * cmn_err() and panic()
392 * =========================================================================
393 */
/*
 * Text printed before and after a message, indexed by the CE_* level that
 * vcmn_err() receives.  Both tables have CE_IGNORE rows.
 */
static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
396
/*
 * Print "error: " followed by the formatted message and a newline on
 * stderr, then abort the process.  Does not return.
 */
void
vpanic(const char *fmt, va_list adx)
{
	FILE *out = stderr;

	(void) fputs("error: ", out);
	(void) vfprintf(out, fmt, adx);
	(void) fputc('\n', out);

	abort();	/* think of it as a "user-level crash dump" */
}
406
/*
 * Variadic front end for vpanic(): report the formatted message and abort.
 */
void
panic(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vpanic(fmt, args);
	va_end(args);
}
416
417void
418vcmn_err(int ce, const char *fmt, va_list adx)
419{
420	if (ce == CE_PANIC)
421		vpanic(fmt, adx);
422	if (ce != CE_NOTE) {	/* suppress noise in userland stress testing */
423		(void) fprintf(stderr, "%s", ce_prefix[ce]);
424		(void) vfprintf(stderr, fmt, adx);
425		(void) fprintf(stderr, "%s", ce_suffix[ce]);
426	}
427}
428
/*
 * Variadic front end for vcmn_err(): report a printf-style message at
 * level 'ce'.
 */
/*PRINTFLIKE2*/
void
cmn_err(int ce, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vcmn_err(ce, fmt, args);
	va_end(args);
}
439
440/*
441 * =========================================================================
442 * kobj interfaces
443 * =========================================================================
444 */
445struct _buf *
446kobj_open_file(char *name)
447{
448	struct _buf *file;
449	vnode_t *vp;
450
451	/* set vp as the _fd field of the file */
452	if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, 0) != 0)
453		return ((void *)-1UL);
454
455	file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL);
456	file->_fd = (intptr_t)vp;
457	return (file);
458}
459
460int
461kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
462{
463	ssize_t resid;
464
465	vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off,
466	    UIO_SYSSPACE, 0, 0, 0, &resid);
467
468	return (size - resid);
469}
470
471void
472kobj_close_file(struct _buf *file)
473{
474	vn_close((vnode_t *)file->_fd);
475	umem_free(file, sizeof (struct _buf));
476}
477
478int
479kobj_get_filesize(struct _buf *file, uint64_t *size)
480{
481	struct stat64 st;
482	vnode_t *vp = (vnode_t *)file->_fd;
483
484	if (fstat64(vp->v_fd, &st) == -1) {
485		vn_close(vp);
486		return (errno);
487	}
488	*size = st.st_size;
489	return (0);
490}
491
492/*
493 * =========================================================================
494 * misc routines
495 * =========================================================================
496 */
497
498void
499delay(clock_t ticks)
500{
501	poll(0, 0, ticks * (1000 / hz));
502}
503
504#if 0
505/*
506 * Find highest one bit set.
507 *	Returns bit number + 1 of highest bit that is set, otherwise returns 0.
508 * High order bit is 31 (or 63 in _LP64 kernel).
509 */
510int
511highbit(ulong_t i)
512{
513	register int h = 1;
514
515	if (i == 0)
516		return (0);
517#ifdef _LP64
518	if (i & 0xffffffff00000000ul) {
519		h += 32; i >>= 32;
520	}
521#endif
522	if (i & 0xffff0000) {
523		h += 16; i >>= 16;
524	}
525	if (i & 0xff00) {
526		h += 8; i >>= 8;
527	}
528	if (i & 0xf0) {
529		h += 4; i >>= 4;
530	}
531	if (i & 0xc) {
532		h += 2; i >>= 2;
533	}
534	if (i & 0x2) {
535		h += 1;
536	}
537	return (h);
538}
539#endif
540
/*
 * Fill ptr[0 .. len-1] with bytes read from the device 'devname'.
 *
 * Returns 0 on success or an errno value: the error from open() or read(),
 * or EIO if the device reports end-of-file before 'len' bytes were
 * delivered.  Reads interrupted by a signal are retried.
 *
 * Failures are reported through the return value rather than through
 * assertions so that a build without assertions cannot walk 'ptr'
 * backwards or spin forever.
 */
static int
random_get_bytes_common(uint8_t *ptr, size_t len, char *devname)
{
	int fd = open(devname, O_RDONLY);
	size_t resid = len;
	ssize_t bytes;
	int err;

	if (fd == -1)
		return (errno);

	while (resid != 0) {
		bytes = read(fd, ptr, resid);
		if (bytes == -1) {
			if (errno == EINTR)
				continue;
			/* close() may overwrite errno; capture it first */
			err = errno;
			(void) close(fd);
			return (err);
		}
		if (bytes == 0) {
			/* EOF: no more data will ever arrive */
			(void) close(fd);
			return (EIO);
		}
		ptr += bytes;
		resid -= bytes;
	}

	(void) close(fd);

	return (0);
}
561
/*
 * Fill ptr[0 .. len-1] with bytes from /dev/random.  Returns the result of
 * random_get_bytes_common().
 */
int
random_get_bytes(uint8_t *ptr, size_t len)
{
	int rc;

	rc = random_get_bytes_common(ptr, len, "/dev/random");
	return (rc);
}
567
/*
 * Fill ptr[0 .. len-1] with bytes from /dev/urandom.  Returns the result of
 * random_get_bytes_common().
 */
int
random_get_pseudo_bytes(uint8_t *ptr, size_t len)
{
	int rc;

	rc = random_get_bytes_common(ptr, len, "/dev/urandom");
	return (rc);
}
573
/*
 * Convert the leading number in 'hw_serial' to an unsigned long.
 *
 *	hw_serial	string to convert
 *	nptr		if non-NULL, receives a pointer to the first
 *			character that was not consumed
 *	base		numeric base, as for strtoul()
 *	result		out: the converted value
 *
 * Returns 0 on success, ERANGE if the value does not fit in an unsigned
 * long, and EINVAL if the base is rejected or no digits were found.
 */
int
ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result)
{
	char *end;

	/* strtoul() reports errors only through errno, so start clean. */
	errno = 0;
	*result = strtoul(hw_serial, &end, base);

	if (nptr != NULL)
		*nptr = end;

	if (errno != 0)
		return (errno);
	if (end == hw_serial)
		return (EINVAL);	/* no number formed */
	return (0);
}
584
585/*
586 * =========================================================================
587 * kernel emulation setup & teardown
588 * =========================================================================
589 */
/*
 * Allocation-failure callback for umem: write a short notice to stderr and
 * abort.  The return statement is never reached; it only satisfies the
 * callback's int return type.
 */
static int
umem_out_of_memory(void)
{
	static const char errmsg[] = "out of memory -- generating core dump\n";

	/* sizeof counts the terminating NUL, which must not be written */
	(void) write(fileno(stderr), errmsg, sizeof (errmsg) - 1);
	abort();
	return (0);
}
599
600void
601kernel_init(int mode)
602{
603	umem_nofail_callback(umem_out_of_memory);
604	uint64_t physmem;
605	size_t len = sizeof(physmem);
606	static int mib[2] = { CTL_HW, HW_USERMEM64 };
607
608	if (sysctl(mib, sizeof(mib), &physmem, &len, NULL, 0) != 0) {
609		len = 1048576 * 128;
610	}
611
612	pgsize = sysconf(_SC_PAGE_SIZE);
613	dprintf("physmem = %llu pages (%.2f GB)\n",
614	    physmem / pgsize, (double)physmem / (1ULL << 30));
615
616	snprintf(hw_serial, sizeof (hw_serial), "%ld", gethostid());
617
618	system_taskq_init();
619
620	spa_init(mode);
621}
622
/*
 * Tear down the userland kernel emulation; currently this only shuts down
 * the SPA.
 */
void
kernel_fini(void)
{
	spa_fini();
	return;
}
628
629int
630z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen)
631{
632	int ret;
633	uLongf len = *dstlen;
634
635	if ((ret = uncompress(dst, &len, src, srclen)) == Z_OK)
636		*dstlen = (size_t)len;
637
638	return (ret);
639}
640
641int
642z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen,
643    int level)
644{
645	int ret;
646	uLongf len = *dstlen;
647
648	if ((ret = compress2(dst, &len, src, srclen, level)) == Z_OK)
649		*dstlen = (size_t)len;
650
651	return (ret);
652}
653
654uid_t
655crgetuid(cred_t *cr)
656{
657	return (0);
658}
659
660gid_t
661crgetgid(cred_t *cr)
662{
663	return (0);
664}
665
666int
667crgetngroups(cred_t *cr)
668{
669	return (0);
670}
671
672gid_t *
673crgetgroups(cred_t *cr)
674{
675	return (NULL);
676}
677
678int
679zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
680{
681	return (0);
682}
683
684int
685zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
686{
687	return (0);
688}
689
690int
691zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
692{
693	return (0);
694}
695
696ksiddomain_t *
697ksid_lookupdomain(const char *dom)
698{
699	ksiddomain_t *kd;
700
701	kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL);
702	kd->kd_name = spa_strdup(dom);
703	return (kd);
704}
705
706void
707ksiddomain_rele(ksiddomain_t *ksid)
708{
709	spa_strfree(ksid->kd_name);
710	umem_free(ksid, sizeof (ksiddomain_t));
711}
712
713size_t
714ptob(size_t npg)
715{
716
717	return npg * pgsize;
718}
719
/* Userland stub: prints nothing and ignores 'fmt'. */
void
print_timestamp(int fmt)
{
	(void) fmt;
}
726
727/*
728 * Do not change the length of the returned string; it must be freed
729 * with strfree().
730 */
731char *
732kmem_asprintf(const char *fmt, ...)
733{
734	int size;
735	va_list adx;
736	char *buf;
737
738	va_start(adx, fmt);
739	size = vsnprintf(NULL, 0, fmt, adx) + 1;
740	va_end(adx);
741
742	buf = kmem_alloc(size, KM_SLEEP);
743
744	va_start(adx, fmt);
745	size = vsnprintf(buf, size, fmt, adx);
746	va_end(adx);
747
748	return (buf);
749}
750