kern_sysctl.c revision 335699
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Mike Karels at Berkeley Software Design, Inc.
7 *
8 * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
9 * project, to make these variables more userfriendly.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 4. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 *	@(#)kern_sysctl.c	8.4 (Berkeley) 4/14/94
36 */
37
38#include <sys/cdefs.h>
39__FBSDID("$FreeBSD: stable/11/sys/kern/kern_sysctl.c 335699 2018-06-27 07:24:07Z hselasky $");
40
41#include "opt_capsicum.h"
42#include "opt_compat.h"
43#include "opt_ktrace.h"
44
45#include <sys/param.h>
46#include <sys/fail.h>
47#include <sys/systm.h>
48#include <sys/capsicum.h>
49#include <sys/kernel.h>
50#include <sys/sysctl.h>
51#include <sys/malloc.h>
52#include <sys/priv.h>
53#include <sys/proc.h>
54#include <sys/jail.h>
55#include <sys/lock.h>
56#include <sys/mutex.h>
57#include <sys/rmlock.h>
58#include <sys/sbuf.h>
59#include <sys/sx.h>
60#include <sys/sysproto.h>
61#include <sys/uio.h>
62#ifdef KTRACE
63#include <sys/ktrace.h>
64#endif
65
66#include <net/vnet.h>
67
68#include <security/mac/mac_framework.h>
69
70#include <vm/vm.h>
71#include <vm/vm_extern.h>
72
/*
 * Malloc types used by the sysctl code.
 * M_SYSCTL:    scratch buffers, e.g. the name copy made by the name2oid handler.
 * M_SYSCTLOID: dynamic OIDs, their name/description strings and context
 *              entries.
 * M_SYSCTLTMP: temporary output buffers, per its description string.
 */
73static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic");
74static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids");
75static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer");
76
77/*
78 * The sysctllock protects the MIB tree.  It also protects sysctl
79 * contexts used with dynamic sysctls.  The sysctl_register_oid() and
80 * sysctl_unregister_oid() routines require the sysctllock to already
81 * be held, so the sysctl_wlock() and sysctl_wunlock() routines are
82 * provided for the few places in the kernel which need to use that
83 * API rather than using the dynamic API.  Use of the dynamic API is
84 * strongly encouraged for most code.
85 *
86 * The sysctlmemlock is used to limit the amount of user memory wired for
87 * sysctl requests.  This is implemented by serializing any userland
88 * sysctl requests larger than a single page via an exclusive lock.
89 */
/* Lock objects: MIB tree lock and the wired-memory serialization lock. */
90static struct rmlock sysctllock;
91static struct sx __exclusive_cache_line sysctlmemlock;
92
/* Exclusive (writer) acquire/release of the MIB tree lock. */
93#define	SYSCTL_WLOCK()		rm_wlock(&sysctllock)
94#define	SYSCTL_WUNLOCK()	rm_wunlock(&sysctllock)
/* Shared (reader) acquire/release; the caller supplies the rm_priotracker. */
95#define	SYSCTL_RLOCK(tracker)	rm_rlock(&sysctllock, (tracker))
96#define	SYSCTL_RUNLOCK(tracker)	rm_runlock(&sysctllock, (tracker))
/* Ownership query and assertions on the MIB tree lock. */
97#define	SYSCTL_WLOCKED()	rm_wowned(&sysctllock)
98#define	SYSCTL_ASSERT_LOCKED()	rm_assert(&sysctllock, RA_LOCKED)
99#define	SYSCTL_ASSERT_WLOCKED()	rm_assert(&sysctllock, RA_WLOCKED)
100#define	SYSCTL_ASSERT_RLOCKED()	rm_assert(&sysctllock, RA_RLOCKED)
/* The lock is created sleepable (RM_SLEEPABLE). */
101#define	SYSCTL_INIT()		rm_init_flags(&sysctllock, "sysctl lock", \
102				    RM_SLEEPABLE)
/* Sleep on channel "ch" using the MIB tree lock as the interlock. */
103#define	SYSCTL_SLEEP(ch, wmesg, timo)					\
104				rm_sleep(ch, &sysctllock, 0, wmesg, timo)
105
/* Forward declaration; the definition is not in this part of the file. */
106static int sysctl_root(SYSCTL_HANDLER_ARGS);
107
108/* Root list */
/* Head of the top-level OID list; tree walks in this file start here. */
109struct sysctl_oid_list sysctl__children = SLIST_HEAD_INITIALIZER(&sysctl__children);
110
/* Removal worker, defined further down; asserts that the write lock is held. */
111static int	sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del,
112		    int recurse);
/* Kernel-space old/new copy callbacks installed in struct sysctl_req. */
113static int	sysctl_old_kernel(struct sysctl_req *, const void *, size_t);
114static int	sysctl_new_kernel(struct sysctl_req *, void *, size_t);
115
116static struct sysctl_oid *
117sysctl_find_oidname(const char *name, struct sysctl_oid_list *list)
118{
119	struct sysctl_oid *oidp;
120
121	SYSCTL_ASSERT_LOCKED();
122	SLIST_FOREACH(oidp, list, oid_link) {
123		if (strcmp(oidp->oid_name, name) == 0) {
124			return (oidp);
125		}
126	}
127	return (NULL);
128}
129
130/*
131 * Initialization of the MIB tree.
132 *
133 * Order by number in each list.
134 */
/*
 * Exported wrapper: take the sysctl lock exclusively.  Meant for the few
 * callers that use sysctl_register_oid()/sysctl_unregister_oid() directly.
 */
void
sysctl_wlock(void)
{
	SYSCTL_WLOCK();
}
141
/*
 * Exported wrapper: release the exclusive sysctl lock taken with
 * sysctl_wlock().
 */
void
sysctl_wunlock(void)
{
	SYSCTL_WUNLOCK();
}
148
/*
 * Run an OID's handler with the sysctl lock temporarily released.
 *
 * On entry the caller holds the sysctl lock: as a reader when "tracker" is
 * non-NULL (that tracker is used to drop and retake it), as the writer when
 * "tracker" is NULL.  The same kind of lock is held again on return.
 *
 * oid        - OID whose oid_handler is invoked
 * arg1, arg2 - passed through to the handler unchanged
 * req        - request descriptor passed through to the handler
 *
 * Returns the value left in "error" after the handler and the fail point.
 */
149static int
150sysctl_root_handler_locked(struct sysctl_oid *oid, void *arg1, intmax_t arg2,
151    struct sysctl_req *req, struct rm_priotracker *tracker)
152{
153	int error;
154
	/*
	 * Count this invocation before the lock is dropped, so that
	 * sysctl_remove_oid_locked() can wait for a dynamic OID's handler
	 * to finish before freeing it.
	 */
155	if (oid->oid_kind & CTLFLAG_DYN)
156		atomic_add_int(&oid->oid_running, 1);
157
158	if (tracker != NULL)
159		SYSCTL_RUNLOCK(tracker);
160	else
161		SYSCTL_WUNLOCK();
162
	/* Handlers not flagged CTLFLAG_MPSAFE run under Giant. */
163	if (!(oid->oid_kind & CTLFLAG_MPSAFE))
164		mtx_lock(&Giant);
165	error = oid->oid_handler(oid, arg1, arg2, req);
166	if (!(oid->oid_kind & CTLFLAG_MPSAFE))
167		mtx_unlock(&Giant);
168
	/*
	 * NOTE(review): the fail point receives "error"; presumably it can
	 * replace the handler's result when armed -- confirm in <sys/fail.h>.
	 */
169	KFAIL_POINT_ERROR(_debug_fail_point, sysctl_running, error);
170
171	if (tracker != NULL)
172		SYSCTL_RLOCK(tracker);
173	else
174		SYSCTL_WLOCK();
175
	/*
	 * Drop the running count; if this was the last runner and the OID
	 * has been marked CTLFLAG_DYING, wake the thread sleeping on
	 * &oid->oid_running.
	 */
176	if (oid->oid_kind & CTLFLAG_DYN) {
177		if (atomic_fetchadd_int(&oid->oid_running, -1) == 1 &&
178		    (oid->oid_kind & CTLFLAG_DYING) != 0)
179			wakeup(&oid->oid_running);
180	}
181
182	return (error);
183}
184
185static void
186sysctl_load_tunable_by_oid_locked(struct sysctl_oid *oidp)
187{
188	struct sysctl_req req;
189	struct sysctl_oid *curr;
190	char *penv = NULL;
191	char path[96];
192	ssize_t rem = sizeof(path);
193	ssize_t len;
194	uint8_t data[512] __aligned(sizeof(uint64_t));
195	int size;
196	int error;
197
198	path[--rem] = 0;
199
200	for (curr = oidp; curr != NULL; curr = SYSCTL_PARENT(curr)) {
201		len = strlen(curr->oid_name);
202		rem -= len;
203		if (curr != oidp)
204			rem -= 1;
205		if (rem < 0) {
206			printf("OID path exceeds %d bytes\n", (int)sizeof(path));
207			return;
208		}
209		memcpy(path + rem, curr->oid_name, len);
210		if (curr != oidp)
211			path[rem + len] = '.';
212	}
213
214	memset(&req, 0, sizeof(req));
215
216	req.td = curthread;
217	req.oldfunc = sysctl_old_kernel;
218	req.newfunc = sysctl_new_kernel;
219	req.lock = REQ_UNWIRED;
220
221	switch (oidp->oid_kind & CTLTYPE) {
222	case CTLTYPE_INT:
223		if (getenv_array(path + rem, data, sizeof(data), &size,
224		    sizeof(int), GETENV_SIGNED) == 0)
225			return;
226		req.newlen = size;
227		req.newptr = data;
228		break;
229	case CTLTYPE_UINT:
230		if (getenv_array(path + rem, data, sizeof(data), &size,
231		    sizeof(int), GETENV_UNSIGNED) == 0)
232			return;
233		req.newlen = size;
234		req.newptr = data;
235		break;
236	case CTLTYPE_LONG:
237		if (getenv_array(path + rem, data, sizeof(data), &size,
238		    sizeof(long), GETENV_SIGNED) == 0)
239			return;
240		req.newlen = size;
241		req.newptr = data;
242		break;
243	case CTLTYPE_ULONG:
244		if (getenv_array(path + rem, data, sizeof(data), &size,
245		    sizeof(long), GETENV_UNSIGNED) == 0)
246			return;
247		req.newlen = size;
248		req.newptr = data;
249		break;
250	case CTLTYPE_S8:
251		if (getenv_array(path + rem, data, sizeof(data), &size,
252		    sizeof(int8_t), GETENV_SIGNED) == 0)
253			return;
254		req.newlen = size;
255		req.newptr = data;
256		break;
257	case CTLTYPE_S16:
258		if (getenv_array(path + rem, data, sizeof(data), &size,
259		    sizeof(int16_t), GETENV_SIGNED) == 0)
260			return;
261		req.newlen = size;
262		req.newptr = data;
263		break;
264	case CTLTYPE_S32:
265		if (getenv_array(path + rem, data, sizeof(data), &size,
266		    sizeof(int32_t), GETENV_SIGNED) == 0)
267			return;
268		req.newlen = size;
269		req.newptr = data;
270		break;
271	case CTLTYPE_S64:
272		if (getenv_array(path + rem, data, sizeof(data), &size,
273		    sizeof(int64_t), GETENV_SIGNED) == 0)
274			return;
275		req.newlen = size;
276		req.newptr = data;
277		break;
278	case CTLTYPE_U8:
279		if (getenv_array(path + rem, data, sizeof(data), &size,
280		    sizeof(uint8_t), GETENV_UNSIGNED) == 0)
281			return;
282		req.newlen = size;
283		req.newptr = data;
284		break;
285	case CTLTYPE_U16:
286		if (getenv_array(path + rem, data, sizeof(data), &size,
287		    sizeof(uint16_t), GETENV_UNSIGNED) == 0)
288			return;
289		req.newlen = size;
290		req.newptr = data;
291		break;
292	case CTLTYPE_U32:
293		if (getenv_array(path + rem, data, sizeof(data), &size,
294		    sizeof(uint32_t), GETENV_UNSIGNED) == 0)
295			return;
296		req.newlen = size;
297		req.newptr = data;
298		break;
299	case CTLTYPE_U64:
300		if (getenv_array(path + rem, data, sizeof(data), &size,
301		    sizeof(uint64_t), GETENV_UNSIGNED) == 0)
302			return;
303		req.newlen = size;
304		req.newptr = data;
305		break;
306	case CTLTYPE_STRING:
307		penv = kern_getenv(path + rem);
308		if (penv == NULL)
309			return;
310		req.newlen = strlen(penv);
311		req.newptr = penv;
312		break;
313	default:
314		return;
315	}
316	error = sysctl_root_handler_locked(oidp, oidp->oid_arg1,
317	    oidp->oid_arg2, &req, NULL);
318	if (error != 0)
319		printf("Setting sysctl %s failed: %d\n", path + rem, error);
320	if (penv != NULL)
321		freeenv(penv);
322}
323
/*
 * Link "oidp" into its parent's child list.
 *
 * The sysctl write lock must be held (asserted).
 *
 * If the parent already has an entry of the same name, "oidp" is NOT
 * linked: for an existing node its oid_refcnt is bumped instead, for an
 * existing leaf a message is printed.  Otherwise:
 *  - a negative oid_number is replaced by an automatically chosen one,
 *    taken from a static counter that counts downwards and is reset to
 *    0x7fffffff once it drops below CTL_AUTO_START;
 *  - the entry is inserted so the list stays sorted by number, bumping the
 *    number on collisions (restarting at CTL_AUTO_START on wraparound and
 *    panicking on the third such restart);
 *  - a non-node OID with CTLFLAG_TUN set and CTLFLAG_NOFETCH clear (and,
 *    under VIMAGE, without CTLFLAG_VNET) gets its initial value from the
 *    kernel environment, once.
 */
324void
325sysctl_register_oid(struct sysctl_oid *oidp)
326{
327	struct sysctl_oid_list *parent = oidp->oid_parent;
328	struct sysctl_oid *p;
329	struct sysctl_oid *q;
330	int oid_number;
331	int timeout = 2;
332
333	/*
334	 * First check if another oid with the same name already
335	 * exists in the parent's list.
336	 */
337	SYSCTL_ASSERT_WLOCKED();
338	p = sysctl_find_oidname(oidp->oid_name, parent);
339	if (p != NULL) {
340		if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
341			p->oid_refcnt++;
342			return;
343		} else {
344			printf("can't re-use a leaf (%s)!\n", p->oid_name);
345			return;
346		}
347	}
348	/* get current OID number */
349	oid_number = oidp->oid_number;
350
351#if (OID_AUTO >= 0)
352#error "OID_AUTO is expected to be a negative value"
353#endif
354	/*
355	 * Any negative OID number qualifies as OID_AUTO. Valid OID
356	 * numbers should always be positive.
357	 *
358	 * NOTE: DO NOT change the starting value here, change it in
359	 * <sys/sysctl.h>, and make sure it is at least 256 to
360	 * accommodate e.g. net.inet.raw as a static sysctl node.
361	 */
362	if (oid_number < 0) {
363		static int newoid;
364
365		/*
366		 * By decrementing the next OID number we spend less
367		 * time inserting the OIDs into a sorted list.
368		 */
369		if (--newoid < CTL_AUTO_START)
370			newoid = 0x7fffffff;
371
372		oid_number = newoid;
373	}
374
375	/*
376	 * Insert the OID into the parent's list sorted by OID number.
377	 */
378retry:
	/* q trails p: the last entry that sorts before the new OID, if any. */
379	q = NULL;
380	SLIST_FOREACH(p, parent, oid_link) {
381		/* check if the current OID number is in use */
382		if (oid_number == p->oid_number) {
383			/* get the next valid OID number */
384			if (oid_number < CTL_AUTO_START ||
385			    oid_number == 0x7fffffff) {
386				/* wraparound - restart */
387				oid_number = CTL_AUTO_START;
388				/* don't loop forever */
389				if (!timeout--)
390					panic("sysctl: Out of OID numbers\n");
391				goto retry;
392			} else {
393				oid_number++;
394			}
395		} else if (oid_number < p->oid_number)
396			break;
397		q = p;
398	}
399	/* check for non-auto OID number collision */
400	if (oidp->oid_number >= 0 && oidp->oid_number < CTL_AUTO_START &&
401	    oid_number >= CTL_AUTO_START) {
402		printf("sysctl: OID number(%d) is already in use for '%s'\n",
403		    oidp->oid_number, oidp->oid_name);
404	}
405	/* update the OID number, if any */
406	oidp->oid_number = oid_number;
407	if (q != NULL)
408		SLIST_INSERT_AFTER(q, oidp, oid_link);
409	else
410		SLIST_INSERT_HEAD(parent, oidp, oid_link);
411
412	if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE &&
413#ifdef VIMAGE
414	    (oidp->oid_kind & CTLFLAG_VNET) == 0 &&
415#endif
416	    (oidp->oid_kind & CTLFLAG_TUN) != 0 &&
417	    (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) {
418		/* only fetch value once */
419		oidp->oid_kind |= CTLFLAG_NOFETCH;
420		/* try to fetch value from kernel environment */
421		sysctl_load_tunable_by_oid_locked(oidp);
422	}
423}
424
425void
426sysctl_register_disabled_oid(struct sysctl_oid *oidp)
427{
428
429	/*
430	 * Mark the leaf as dormant if it's not to be immediately enabled.
431	 * We do not disable nodes as they can be shared between modules
432	 * and it is always safe to access a node.
433	 */
434	KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) == 0,
435	    ("internal flag is set in oid_kind"));
436	if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
437		oidp->oid_kind |= CTLFLAG_DORMANT;
438	sysctl_register_oid(oidp);
439}
440
441void
442sysctl_enable_oid(struct sysctl_oid *oidp)
443{
444
445	SYSCTL_ASSERT_WLOCKED();
446	if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
447		KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) == 0,
448		    ("sysctl node is marked as dormant"));
449		return;
450	}
451	KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) != 0,
452	    ("enabling already enabled sysctl oid"));
453	oidp->oid_kind &= ~CTLFLAG_DORMANT;
454}
455
456void
457sysctl_unregister_oid(struct sysctl_oid *oidp)
458{
459	struct sysctl_oid *p;
460	int error;
461
462	SYSCTL_ASSERT_WLOCKED();
463	error = ENOENT;
464	if (oidp->oid_number == OID_AUTO) {
465		error = EINVAL;
466	} else {
467		SLIST_FOREACH(p, oidp->oid_parent, oid_link) {
468			if (p == oidp) {
469				SLIST_REMOVE(oidp->oid_parent, oidp,
470				    sysctl_oid, oid_link);
471				error = 0;
472				break;
473			}
474		}
475	}
476
477	/*
478	 * This can happen when a module fails to register and is
479	 * being unloaded afterwards.  It should not be a panic()
480	 * for normal use.
481	 */
482	if (error)
483		printf("%s: failed to unregister sysctl\n", __func__);
484}
485
486/* Initialize a new context to keep track of dynamically added sysctls. */
487int
488sysctl_ctx_init(struct sysctl_ctx_list *c)
489{
490
491	if (c == NULL) {
492		return (EINVAL);
493	}
494
495	/*
496	 * No locking here, the caller is responsible for not adding
497	 * new nodes to a context until after this function has
498	 * returned.
499	 */
500	TAILQ_INIT(c);
501	return (0);
502}
503
504/* Free the context, and destroy all dynamic oids registered in this context */
/*
 * Two-pass teardown of every OID recorded in "clist", done under the
 * sysctl write lock:
 *  1. Dry run: each entry is deregistered without being freed (del = 0,
 *     recurse = 0).  The pass stops at the first failure.
 *  2. Every entry deregistered in pass 1 is registered again, walking
 *     backwards through the list.
 * If the dry run failed, EBUSY is returned and nothing has been freed.
 * Otherwise each OID is removed for real (del = 1) and its context entry
 * is freed; a failure at that point panics.  Returns 0 on success.
 */
505int
506sysctl_ctx_free(struct sysctl_ctx_list *clist)
507{
508	struct sysctl_ctx_entry *e, *e1;
509	int error;
510
511	error = 0;
512	/*
513	 * First perform a "dry run" to check if it's ok to remove oids.
514	 * XXX FIXME
515	 * XXX This algorithm is a hack. But I don't know any
516	 * XXX better solution for now...
517	 */
518	SYSCTL_WLOCK();
519	TAILQ_FOREACH(e, clist, link) {
520		error = sysctl_remove_oid_locked(e->entry, 0, 0);
521		if (error)
522			break;
523	}
524	/*
525	 * Restore deregistered entries, either from the end,
526	 * or from the place where error occurred.
527	 * e contains the entry that was not unregistered
528	 */
529	if (error)
530		e1 = TAILQ_PREV(e, sysctl_ctx_list, link);
531	else
532		e1 = TAILQ_LAST(clist, sysctl_ctx_list);
533	while (e1 != NULL) {
534		sysctl_register_oid(e1->entry);
535		e1 = TAILQ_PREV(e1, sysctl_ctx_list, link);
536	}
537	if (error) {
538		SYSCTL_WUNLOCK();
539		return(EBUSY);
540	}
541	/* Now really delete the entries */
542	e = TAILQ_FIRST(clist);
543	while (e != NULL) {
		/* Fetch the successor first: "e" is freed below. */
544		e1 = TAILQ_NEXT(e, link);
545		error = sysctl_remove_oid_locked(e->entry, 1, 0);
546		if (error)
547			panic("sysctl_remove_oid: corrupt tree, entry: %s",
548			    e->entry->oid_name);
549		free(e, M_SYSCTLOID);
550		e = e1;
551	}
552	SYSCTL_WUNLOCK();
553	return (error);
554}
555
556/* Add an entry to the context */
557struct sysctl_ctx_entry *
558sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
559{
560	struct sysctl_ctx_entry *e;
561
562	SYSCTL_ASSERT_WLOCKED();
563	if (clist == NULL || oidp == NULL)
564		return(NULL);
565	e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK);
566	e->entry = oidp;
567	TAILQ_INSERT_HEAD(clist, e, link);
568	return (e);
569}
570
571/* Find an entry in the context */
572struct sysctl_ctx_entry *
573sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
574{
575	struct sysctl_ctx_entry *e;
576
577	SYSCTL_ASSERT_WLOCKED();
578	if (clist == NULL || oidp == NULL)
579		return(NULL);
580	TAILQ_FOREACH(e, clist, link) {
581		if(e->entry == oidp)
582			return(e);
583	}
584	return (e);
585}
586
587/*
588 * Delete an entry from the context.
589 * NOTE: this function doesn't free oidp! You have to remove it
590 * with sysctl_remove_oid().
591 */
592int
593sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
594{
595	struct sysctl_ctx_entry *e;
596
597	if (clist == NULL || oidp == NULL)
598		return (EINVAL);
599	SYSCTL_WLOCK();
600	e = sysctl_ctx_entry_find(clist, oidp);
601	if (e != NULL) {
602		TAILQ_REMOVE(clist, e, link);
603		SYSCTL_WUNLOCK();
604		free(e, M_SYSCTLOID);
605		return (0);
606	} else {
607		SYSCTL_WUNLOCK();
608		return (ENOENT);
609	}
610}
611
/*
 * Remove dynamically created sysctl trees.
 * oidp - top of the tree to be removed
 * del - if 0 - just deregister, otherwise free up entries as well
 * recurse - if != 0 traverse the subtree to be deleted
 *
 * Locked wrapper around sysctl_remove_oid_locked(); returns its result.
 */
int
sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse)
{
	int rv;

	SYSCTL_WLOCK();
	rv = sysctl_remove_oid_locked(oidp, del, recurse);
	SYSCTL_WUNLOCK();

	return (rv);
}
628
629int
630sysctl_remove_name(struct sysctl_oid *parent, const char *name,
631    int del, int recurse)
632{
633	struct sysctl_oid *p, *tmp;
634	int error;
635
636	error = ENOENT;
637	SYSCTL_WLOCK();
638	SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(parent), oid_link, tmp) {
639		if (strcmp(p->oid_name, name) == 0) {
640			error = sysctl_remove_oid_locked(p, del, recurse);
641			break;
642		}
643	}
644	SYSCTL_WUNLOCK();
645
646	return (error);
647}
648
649
/*
 * Worker for OID removal; the sysctl write lock must be held.
 *
 * oidp    - OID to remove; must have been created dynamically (CTLFLAG_DYN)
 * del     - non-zero: also free the OID and its name/description strings
 * recurse - non-zero: remove the children of a node as well
 *
 * Returns 0 on success, EINVAL for a NULL or non-dynamic OID or for a
 * zero reference count, ENOTEMPTY for a node on its last reference that
 * still has children while "recurse" is 0, or the error of a failed child
 * removal.
 *
 * An OID with more than one reference only has its count decremented.
 */
650static int
651sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse)
652{
653	struct sysctl_oid *p, *tmp;
654	int error;
655
656	SYSCTL_ASSERT_WLOCKED();
657	if (oidp == NULL)
658		return(EINVAL);
659	if ((oidp->oid_kind & CTLFLAG_DYN) == 0) {
660		printf("Warning: can't remove non-dynamic nodes (%s)!\n",
661		    oidp->oid_name);
662		return (EINVAL);
663	}
664	/*
665	 * WARNING: normal method to do this should be through
666	 * sysctl_ctx_free(). Use recursing as the last resort
667	 * method to purge your sysctl tree of leftovers...
668	 * However, if some other code still references these nodes,
669	 * it will panic.
670	 */
671	if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
		/* Children only matter when this is the node's last reference. */
672		if (oidp->oid_refcnt == 1) {
673			SLIST_FOREACH_SAFE(p,
674			    SYSCTL_CHILDREN(oidp), oid_link, tmp) {
675				if (!recurse) {
676					printf("Warning: failed attempt to "
677					    "remove oid %s with child %s\n",
678					    oidp->oid_name, p->oid_name);
679					return (ENOTEMPTY);
680				}
681				error = sysctl_remove_oid_locked(p, del,
682				    recurse);
683				if (error)
684					return (error);
685			}
686		}
687	}
688	if (oidp->oid_refcnt > 1 ) {
689		oidp->oid_refcnt--;
690	} else {
691		if (oidp->oid_refcnt == 0) {
692			printf("Warning: bad oid_refcnt=%u (%s)!\n",
693				oidp->oid_refcnt, oidp->oid_name);
694			return (EINVAL);
695		}
696		sysctl_unregister_oid(oidp);
697		if (del) {
698			/*
699			 * Wait for all threads running the handler to drain.
700			 * This preserves the previous behavior when the
701			 * sysctl lock was held across a handler invocation,
702			 * and is necessary for module unload correctness.
703			 */
			/*
			 * CTLFLAG_DYING asks the last running handler to
			 * issue the wakeup (sysctl_root_handler_locked()).
			 */
704			while (oidp->oid_running > 0) {
705				oidp->oid_kind |= CTLFLAG_DYING;
706				SYSCTL_SLEEP(&oidp->oid_running, "oidrm", 0);
707			}
708			if (oidp->oid_descr)
709				free(__DECONST(char *, oidp->oid_descr),
710				    M_SYSCTLOID);
711			free(__DECONST(char *, oidp->oid_name), M_SYSCTLOID);
712			free(oidp, M_SYSCTLOID);
713		}
714	}
715	return (0);
716}
717/*
718 * Create new sysctls at run time.
719 * clist may point to a valid context initialized with sysctl_ctx_init().
720 */
721struct sysctl_oid *
722sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent,
723	int number, const char *name, int kind, void *arg1, intmax_t arg2,
724	int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr)
725{
726	struct sysctl_oid *oidp;
727
728	/* You have to hook up somewhere.. */
729	if (parent == NULL)
730		return(NULL);
731	/* Check if the node already exists, otherwise create it */
732	SYSCTL_WLOCK();
733	oidp = sysctl_find_oidname(name, parent);
734	if (oidp != NULL) {
735		if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
736			oidp->oid_refcnt++;
737			/* Update the context */
738			if (clist != NULL)
739				sysctl_ctx_entry_add(clist, oidp);
740			SYSCTL_WUNLOCK();
741			return (oidp);
742		} else {
743			SYSCTL_WUNLOCK();
744			printf("can't re-use a leaf (%s)!\n", name);
745			return (NULL);
746		}
747	}
748	oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO);
749	oidp->oid_parent = parent;
750	SLIST_INIT(&oidp->oid_children);
751	oidp->oid_number = number;
752	oidp->oid_refcnt = 1;
753	oidp->oid_name = strdup(name, M_SYSCTLOID);
754	oidp->oid_handler = handler;
755	oidp->oid_kind = CTLFLAG_DYN | kind;
756	oidp->oid_arg1 = arg1;
757	oidp->oid_arg2 = arg2;
758	oidp->oid_fmt = fmt;
759	if (descr != NULL)
760		oidp->oid_descr = strdup(descr, M_SYSCTLOID);
761	/* Update the context, if used */
762	if (clist != NULL)
763		sysctl_ctx_entry_add(clist, oidp);
764	/* Register this oid */
765	sysctl_register_oid(oidp);
766	SYSCTL_WUNLOCK();
767	return (oidp);
768}
769
770/*
771 * Rename an existing oid.
772 */
773void
774sysctl_rename_oid(struct sysctl_oid *oidp, const char *name)
775{
776	char *newname;
777	char *oldname;
778
779	newname = strdup(name, M_SYSCTLOID);
780	SYSCTL_WLOCK();
781	oldname = __DECONST(char *, oidp->oid_name);
782	oidp->oid_name = newname;
783	SYSCTL_WUNLOCK();
784	free(oldname, M_SYSCTLOID);
785}
786
787/*
788 * Reparent an existing oid.
789 */
790int
791sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent)
792{
793	struct sysctl_oid *oidp;
794
795	SYSCTL_WLOCK();
796	if (oid->oid_parent == parent) {
797		SYSCTL_WUNLOCK();
798		return (0);
799	}
800	oidp = sysctl_find_oidname(oid->oid_name, parent);
801	if (oidp != NULL) {
802		SYSCTL_WUNLOCK();
803		return (EEXIST);
804	}
805	sysctl_unregister_oid(oid);
806	oid->oid_parent = parent;
807	oid->oid_number = OID_AUTO;
808	sysctl_register_oid(oid);
809	SYSCTL_WUNLOCK();
810	return (0);
811}
812
813/*
814 * Register the kernel's oids on startup.
815 */
816SET_DECLARE(sysctl_set, struct sysctl_oid);
817
818static void
819sysctl_register_all(void *arg)
820{
821	struct sysctl_oid **oidp;
822
823	sx_init(&sysctlmemlock, "sysctl mem");
824	SYSCTL_INIT();
825	SYSCTL_WLOCK();
826	SET_FOREACH(oidp, sysctl_set)
827		sysctl_register_oid(*oidp);
828	SYSCTL_WUNLOCK();
829}
830SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_FIRST, sysctl_register_all, 0);
831
832/*
833 * "Staff-functions"
834 *
835 * These functions implement a presently undocumented interface
836 * used by the sysctl program to walk the tree, and get the type
837 * so it can print the value.
838 * This interface is under work and consideration, and should probably
839 * be killed with a big axe by the first person who can find the time.
840 * (be aware though, that the proper interface isn't as obvious as it
841 * may seem, there are various conflicting requirements.)
842 *
843 * {0,0}	printf the entire MIB-tree.
844 * {0,1,...}	return the name of the "..." OID.
845 * {0,2,...}	return the next OID.
846 * {0,3}	return the OID of the name in "new"
847 * {0,4,...}	return the kind & format info for the "..." OID.
848 * {0,5,...}	return the description of the "..." OID.
849 */
850
851#ifdef SYSCTL_DEBUG
852static void
853sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i)
854{
855	int k;
856	struct sysctl_oid *oidp;
857
858	SYSCTL_ASSERT_LOCKED();
859	SLIST_FOREACH(oidp, l, oid_link) {
860
861		for (k=0; k<i; k++)
862			printf(" ");
863
864		printf("%d %s ", oidp->oid_number, oidp->oid_name);
865
866		printf("%c%c",
867			oidp->oid_kind & CTLFLAG_RD ? 'R':' ',
868			oidp->oid_kind & CTLFLAG_WR ? 'W':' ');
869
870		if (oidp->oid_handler)
871			printf(" *Handler");
872
873		switch (oidp->oid_kind & CTLTYPE) {
874			case CTLTYPE_NODE:
875				printf(" Node\n");
876				if (!oidp->oid_handler) {
877					sysctl_sysctl_debug_dump_node(
878					    SYSCTL_CHILDREN(oidp), i + 2);
879				}
880				break;
881			case CTLTYPE_INT:    printf(" Int\n"); break;
882			case CTLTYPE_UINT:   printf(" u_int\n"); break;
883			case CTLTYPE_LONG:   printf(" Long\n"); break;
884			case CTLTYPE_ULONG:  printf(" u_long\n"); break;
885			case CTLTYPE_STRING: printf(" String\n"); break;
886			case CTLTYPE_S8:     printf(" int8_t\n"); break;
887			case CTLTYPE_S16:    printf(" int16_t\n"); break;
888			case CTLTYPE_S32:    printf(" int32_t\n"); break;
889			case CTLTYPE_S64:    printf(" int64_t\n"); break;
890			case CTLTYPE_U8:     printf(" uint8_t\n"); break;
891			case CTLTYPE_U16:    printf(" uint16_t\n"); break;
892			case CTLTYPE_U32:    printf(" uint32_t\n"); break;
893			case CTLTYPE_U64:    printf(" uint64_t\n"); break;
894			case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break;
895			default:	     printf("\n");
896		}
897
898	}
899}
900
901static int
902sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS)
903{
904	struct rm_priotracker tracker;
905	int error;
906
907	error = priv_check(req->td, PRIV_SYSCTL_DEBUG);
908	if (error)
909		return (error);
910	SYSCTL_RLOCK(&tracker);
911	sysctl_sysctl_debug_dump_node(&sysctl__children, 0);
912	SYSCTL_RUNLOCK(&tracker);
913	return (ENOENT);
914}
915
916SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE,
917	0, 0, sysctl_sysctl_debug, "-", "");
918#endif
919
920static int
921sysctl_sysctl_name(SYSCTL_HANDLER_ARGS)
922{
923	int *name = (int *) arg1;
924	u_int namelen = arg2;
925	int error = 0;
926	struct sysctl_oid *oid;
927	struct sysctl_oid_list *lsp = &sysctl__children, *lsp2;
928	struct rm_priotracker tracker;
929	char buf[10];
930
931	SYSCTL_RLOCK(&tracker);
932	while (namelen) {
933		if (!lsp) {
934			snprintf(buf,sizeof(buf),"%d",*name);
935			if (req->oldidx)
936				error = SYSCTL_OUT(req, ".", 1);
937			if (!error)
938				error = SYSCTL_OUT(req, buf, strlen(buf));
939			if (error)
940				goto out;
941			namelen--;
942			name++;
943			continue;
944		}
945		lsp2 = NULL;
946		SLIST_FOREACH(oid, lsp, oid_link) {
947			if (oid->oid_number != *name)
948				continue;
949
950			if (req->oldidx)
951				error = SYSCTL_OUT(req, ".", 1);
952			if (!error)
953				error = SYSCTL_OUT(req, oid->oid_name,
954					strlen(oid->oid_name));
955			if (error)
956				goto out;
957
958			namelen--;
959			name++;
960
961			if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE)
962				break;
963
964			if (oid->oid_handler)
965				break;
966
967			lsp2 = SYSCTL_CHILDREN(oid);
968			break;
969		}
970		lsp = lsp2;
971	}
972	error = SYSCTL_OUT(req, "", 1);
973 out:
974	SYSCTL_RUNLOCK(&tracker);
975	return (error);
976}
977
978/*
979 * XXXRW/JA: Shouldn't return name data for nodes that we don't permit in
980 * capability mode.
981 */
982static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD,
983    sysctl_sysctl_name, "");
984
/*
 * Recursive worker for the "next OID" handler.
 *
 * Scans list "lsp" for the entry that follows the OID given by
 * name[0..namelen-1]; with namelen == 0 the first eligible entry
 * qualifies.  Entries flagged CTLFLAG_SKIP or CTLFLAG_DORMANT are never
 * returned.  Nodes without a handler are descended into, writing deeper
 * components through "next + 1" with "level + 1".
 *
 * next  - out: the OID number for this level is stored in *next
 * len   - out: set to the level of the entry found (number of components)
 * level - depth of "lsp", counted from 1 for the root list
 * oidpp - out: the entry most recently examined
 *
 * Returns 0 when a successor was found and 1 when this list is exhausted.
 * The sysctl lock must be held.
 */
985static int
986sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen,
987	int *next, int *len, int level, struct sysctl_oid **oidpp)
988{
989	struct sysctl_oid *oidp;
990
991	SYSCTL_ASSERT_LOCKED();
992	*len = level;
993	SLIST_FOREACH(oidp, lsp, oid_link) {
		/* Recorded before the skip test, so hidden entries land here too. */
994		*next = oidp->oid_number;
995		*oidpp = oidp;
996
997		if ((oidp->oid_kind & (CTLFLAG_SKIP | CTLFLAG_DORMANT)) != 0)
998			continue;
999
		/* No name left to match: take the first entry from here on. */
1000		if (!namelen) {
1001			if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
1002				return (0);
1003			if (oidp->oid_handler)
1004				/* We really should call the handler here...*/
1005				return (0);
1006			lsp = SYSCTL_CHILDREN(oidp);
1007			if (!sysctl_sysctl_next_ls(lsp, 0, 0, next+1,
1008				len, level+1, oidpp))
1009				return (0);
1010			goto emptynode;
1011		}
1012
		/* Entries numbered below the requested component come before it. */
1013		if (oidp->oid_number < *name)
1014			continue;
1015
		/* First entry past the requested component. */
1016		if (oidp->oid_number > *name) {
1017			if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
1018				return (0);
1019			if (oidp->oid_handler)
1020				return (0);
1021			lsp = SYSCTL_CHILDREN(oidp);
1022			if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1,
1023				next+1, len, level+1, oidpp))
1024				return (0);
1025			goto next;
1026		}
		/*
		 * Exact match on this component: a leaf or handler node is
		 * the starting point itself, so move on to its sibling.
		 */
1027		if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
1028			continue;
1029
1030		if (oidp->oid_handler)
1031			continue;
1032
1033		lsp = SYSCTL_CHILDREN(oidp);
1034		if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1,
1035			len, level+1, oidpp))
1036			return (0);
1037	next:
1038		namelen = 1;
1039	emptynode:
		/* The subtree held nothing: restore the level, try the sibling. */
1040		*len = level;
1041	}
1042	return (1);
1043}
1044
1045static int
1046sysctl_sysctl_next(SYSCTL_HANDLER_ARGS)
1047{
1048	int *name = (int *) arg1;
1049	u_int namelen = arg2;
1050	int i, j, error;
1051	struct sysctl_oid *oid;
1052	struct sysctl_oid_list *lsp = &sysctl__children;
1053	struct rm_priotracker tracker;
1054	int newoid[CTL_MAXNAME];
1055
1056	SYSCTL_RLOCK(&tracker);
1057	i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid);
1058	SYSCTL_RUNLOCK(&tracker);
1059	if (i)
1060		return (ENOENT);
1061	error = SYSCTL_OUT(req, newoid, j * sizeof (int));
1062	return (error);
1063}
1064
1065/*
1066 * XXXRW/JA: Shouldn't return next data for nodes that we don't permit in
1067 * capability mode.
1068 */
1069static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD,
1070    sysctl_sysctl_next, "");
1071
1072static int
1073name2oid(char *name, int *oid, int *len, struct sysctl_oid **oidpp)
1074{
1075	struct sysctl_oid *oidp;
1076	struct sysctl_oid_list *lsp = &sysctl__children;
1077	char *p;
1078
1079	SYSCTL_ASSERT_LOCKED();
1080
1081	for (*len = 0; *len < CTL_MAXNAME;) {
1082		p = strsep(&name, ".");
1083
1084		oidp = SLIST_FIRST(lsp);
1085		for (;; oidp = SLIST_NEXT(oidp, oid_link)) {
1086			if (oidp == NULL)
1087				return (ENOENT);
1088			if (strcmp(p, oidp->oid_name) == 0)
1089				break;
1090		}
1091		*oid++ = oidp->oid_number;
1092		(*len)++;
1093
1094		if (name == NULL || *name == '\0') {
1095			if (oidpp)
1096				*oidpp = oidp;
1097			return (0);
1098		}
1099
1100		if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
1101			break;
1102
1103		if (oidp->oid_handler)
1104			break;
1105
1106		lsp = SYSCTL_CHILDREN(oidp);
1107	}
1108	return (ENOENT);
1109}
1110
1111static int
1112sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS)
1113{
1114	char *p;
1115	int error, oid[CTL_MAXNAME], len = 0;
1116	struct sysctl_oid *op = NULL;
1117	struct rm_priotracker tracker;
1118	char buf[32];
1119
1120	if (!req->newlen)
1121		return (ENOENT);
1122	if (req->newlen >= MAXPATHLEN)	/* XXX arbitrary, undocumented */
1123		return (ENAMETOOLONG);
1124
1125	p = buf;
1126	if (req->newlen >= sizeof(buf))
1127		p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK);
1128
1129	error = SYSCTL_IN(req, p, req->newlen);
1130	if (error) {
1131		if (p != buf)
1132			free(p, M_SYSCTL);
1133		return (error);
1134	}
1135
1136	p [req->newlen] = '\0';
1137
1138	SYSCTL_RLOCK(&tracker);
1139	error = name2oid(p, oid, &len, &op);
1140	SYSCTL_RUNLOCK(&tracker);
1141
1142	if (p != buf)
1143		free(p, M_SYSCTL);
1144
1145	if (error)
1146		return (error);
1147
1148	error = SYSCTL_OUT(req, oid, len * sizeof *oid);
1149	return (error);
1150}
1151
1152/*
1153 * XXXRW/JA: Shouldn't return name2oid data for nodes that we don't permit in
1154 * capability mode.
1155 */
1156SYSCTL_PROC(_sysctl, 3, name2oid,
1157    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE
1158    | CTLFLAG_CAPRW, 0, 0, sysctl_sysctl_name2oid, "I", "");
1159
1160static int
1161sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS)
1162{
1163	struct sysctl_oid *oid;
1164	struct rm_priotracker tracker;
1165	int error;
1166
1167	SYSCTL_RLOCK(&tracker);
1168	error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
1169	if (error)
1170		goto out;
1171
1172	if (oid->oid_fmt == NULL) {
1173		error = ENOENT;
1174		goto out;
1175	}
1176	error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind));
1177	if (error)
1178		goto out;
1179	error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1);
1180 out:
1181	SYSCTL_RUNLOCK(&tracker);
1182	return (error);
1183}
1184
1185
1186static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD,
1187    sysctl_sysctl_oidfmt, "");
1188
1189static int
1190sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS)
1191{
1192	struct sysctl_oid *oid;
1193	struct rm_priotracker tracker;
1194	int error;
1195
1196	SYSCTL_RLOCK(&tracker);
1197	error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
1198	if (error)
1199		goto out;
1200
1201	if (oid->oid_descr == NULL) {
1202		error = ENOENT;
1203		goto out;
1204	}
1205	error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1);
1206 out:
1207	SYSCTL_RUNLOCK(&tracker);
1208	return (error);
1209}
1210
1211static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD,
1212    sysctl_sysctl_oiddescr, "");
1213
1214/*
1215 * Default "handler" functions.
1216 */
1217
1218/*
1219 * Handle a bool.
1220 * Two cases:
1221 *     a variable:  point arg1 at it.
1222 *     a constant:  pass it in arg2.
1223 */
1224
1225int
1226sysctl_handle_bool(SYSCTL_HANDLER_ARGS)
1227{
1228	uint8_t temp;
1229	int error;
1230
1231	/*
1232	 * Attempt to get a coherent snapshot by making a copy of the data.
1233	 */
1234	if (arg1)
1235		temp = *(bool *)arg1 ? 1 : 0;
1236	else
1237		temp = arg2 ? 1 : 0;
1238
1239	error = SYSCTL_OUT(req, &temp, sizeof(temp));
1240	if (error || !req->newptr)
1241		return (error);
1242
1243	if (!arg1)
1244		error = EPERM;
1245	else {
1246		error = SYSCTL_IN(req, &temp, sizeof(temp));
1247		if (!error)
1248			*(bool *)arg1 = temp ? 1 : 0;
1249	}
1250	return (error);
1251}
1252
1253/*
1254 * Handle an int8_t, signed or unsigned.
1255 * Two cases:
1256 *     a variable:  point arg1 at it.
1257 *     a constant:  pass it in arg2.
1258 */
1259
1260int
1261sysctl_handle_8(SYSCTL_HANDLER_ARGS)
1262{
1263	int8_t tmpout;
1264	int error = 0;
1265
1266	/*
1267	 * Attempt to get a coherent snapshot by making a copy of the data.
1268	 */
1269	if (arg1)
1270		tmpout = *(int8_t *)arg1;
1271	else
1272		tmpout = arg2;
1273	error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout));
1274
1275	if (error || !req->newptr)
1276		return (error);
1277
1278	if (!arg1)
1279		error = EPERM;
1280	else
1281		error = SYSCTL_IN(req, arg1, sizeof(tmpout));
1282	return (error);
1283}
1284
1285/*
1286 * Handle an int16_t, signed or unsigned.
1287 * Two cases:
1288 *     a variable:  point arg1 at it.
1289 *     a constant:  pass it in arg2.
1290 */
1291
1292int
1293sysctl_handle_16(SYSCTL_HANDLER_ARGS)
1294{
1295	int16_t tmpout;
1296	int error = 0;
1297
1298	/*
1299	 * Attempt to get a coherent snapshot by making a copy of the data.
1300	 */
1301	if (arg1)
1302		tmpout = *(int16_t *)arg1;
1303	else
1304		tmpout = arg2;
1305	error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout));
1306
1307	if (error || !req->newptr)
1308		return (error);
1309
1310	if (!arg1)
1311		error = EPERM;
1312	else
1313		error = SYSCTL_IN(req, arg1, sizeof(tmpout));
1314	return (error);
1315}
1316
1317/*
1318 * Handle an int32_t, signed or unsigned.
1319 * Two cases:
1320 *     a variable:  point arg1 at it.
1321 *     a constant:  pass it in arg2.
1322 */
1323
1324int
1325sysctl_handle_32(SYSCTL_HANDLER_ARGS)
1326{
1327	int32_t tmpout;
1328	int error = 0;
1329
1330	/*
1331	 * Attempt to get a coherent snapshot by making a copy of the data.
1332	 */
1333	if (arg1)
1334		tmpout = *(int32_t *)arg1;
1335	else
1336		tmpout = arg2;
1337	error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout));
1338
1339	if (error || !req->newptr)
1340		return (error);
1341
1342	if (!arg1)
1343		error = EPERM;
1344	else
1345		error = SYSCTL_IN(req, arg1, sizeof(tmpout));
1346	return (error);
1347}
1348
1349/*
1350 * Handle an int, signed or unsigned.
1351 * Two cases:
1352 *     a variable:  point arg1 at it.
1353 *     a constant:  pass it in arg2.
1354 */
1355
1356int
1357sysctl_handle_int(SYSCTL_HANDLER_ARGS)
1358{
1359	int tmpout, error = 0;
1360
1361	/*
1362	 * Attempt to get a coherent snapshot by making a copy of the data.
1363	 */
1364	if (arg1)
1365		tmpout = *(int *)arg1;
1366	else
1367		tmpout = arg2;
1368	error = SYSCTL_OUT(req, &tmpout, sizeof(int));
1369
1370	if (error || !req->newptr)
1371		return (error);
1372
1373	if (!arg1)
1374		error = EPERM;
1375	else
1376		error = SYSCTL_IN(req, arg1, sizeof(int));
1377	return (error);
1378}
1379
1380/*
1381 * Based on on sysctl_handle_int() convert milliseconds into ticks.
1382 * Note: this is used by TCP.
1383 */
1384
1385int
1386sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
1387{
1388	int error, s, tt;
1389
1390	tt = *(int *)arg1;
1391	s = (int)((int64_t)tt * 1000 / hz);
1392
1393	error = sysctl_handle_int(oidp, &s, 0, req);
1394	if (error || !req->newptr)
1395		return (error);
1396
1397	tt = (int)((int64_t)s * hz / 1000);
1398	if (tt < 1)
1399		return (EINVAL);
1400
1401	*(int *)arg1 = tt;
1402	return (0);
1403}
1404
1405
1406/*
1407 * Handle a long, signed or unsigned.
1408 * Two cases:
1409 *     a variable:  point arg1 at it.
1410 *     a constant:  pass it in arg2.
1411 */
1412
1413int
1414sysctl_handle_long(SYSCTL_HANDLER_ARGS)
1415{
1416	int error = 0;
1417	long tmplong;
1418#ifdef SCTL_MASK32
1419	int tmpint;
1420#endif
1421
1422	/*
1423	 * Attempt to get a coherent snapshot by making a copy of the data.
1424	 */
1425	if (arg1)
1426		tmplong = *(long *)arg1;
1427	else
1428		tmplong = arg2;
1429#ifdef SCTL_MASK32
1430	if (req->flags & SCTL_MASK32) {
1431		tmpint = tmplong;
1432		error = SYSCTL_OUT(req, &tmpint, sizeof(int));
1433	} else
1434#endif
1435		error = SYSCTL_OUT(req, &tmplong, sizeof(long));
1436
1437	if (error || !req->newptr)
1438		return (error);
1439
1440	if (!arg1)
1441		error = EPERM;
1442#ifdef SCTL_MASK32
1443	else if (req->flags & SCTL_MASK32) {
1444		error = SYSCTL_IN(req, &tmpint, sizeof(int));
1445		*(long *)arg1 = (long)tmpint;
1446	}
1447#endif
1448	else
1449		error = SYSCTL_IN(req, arg1, sizeof(long));
1450	return (error);
1451}
1452
1453/*
1454 * Handle a 64 bit int, signed or unsigned.
1455 * Two cases:
1456 *     a variable:  point arg1 at it.
1457 *     a constant:  pass it in arg2.
1458 */
1459int
1460sysctl_handle_64(SYSCTL_HANDLER_ARGS)
1461{
1462	int error = 0;
1463	uint64_t tmpout;
1464
1465	/*
1466	 * Attempt to get a coherent snapshot by making a copy of the data.
1467	 */
1468	if (arg1)
1469		tmpout = *(uint64_t *)arg1;
1470	else
1471		tmpout = arg2;
1472	error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t));
1473
1474	if (error || !req->newptr)
1475		return (error);
1476
1477	if (!arg1)
1478		error = EPERM;
1479	else
1480		error = SYSCTL_IN(req, arg1, sizeof(uint64_t));
1481	return (error);
1482}
1483
1484/*
1485 * Handle our generic '\0' terminated 'C' string.
1486 * Two cases:
1487 * 	a variable string:  point arg1 at it, arg2 is max length.
1488 * 	a constant string:  point arg1 at it, arg2 is zero.
1489 */
1490
1491int
1492sysctl_handle_string(SYSCTL_HANDLER_ARGS)
1493{
1494	size_t outlen;
1495	int error = 0, ro_string = 0;
1496
1497	/*
1498	 * A zero-length buffer indicates a fixed size read-only
1499	 * string:
1500	 */
1501	if (arg2 == 0) {
1502		arg2 = strlen((char *)arg1) + 1;
1503		ro_string = 1;
1504	}
1505
1506	if (req->oldptr != NULL) {
1507		char *tmparg;
1508
1509		if (ro_string) {
1510			tmparg = arg1;
1511		} else {
1512			/* try to make a coherent snapshot of the string */
1513			tmparg = malloc(arg2, M_SYSCTLTMP, M_WAITOK);
1514			memcpy(tmparg, arg1, arg2);
1515		}
1516
1517		outlen = strnlen(tmparg, arg2 - 1) + 1;
1518		error = SYSCTL_OUT(req, tmparg, outlen);
1519
1520		if (!ro_string)
1521			free(tmparg, M_SYSCTLTMP);
1522	} else {
1523		outlen = strnlen((char *)arg1, arg2 - 1) + 1;
1524		error = SYSCTL_OUT(req, NULL, outlen);
1525	}
1526	if (error || !req->newptr)
1527		return (error);
1528
1529	if ((req->newlen - req->newidx) >= arg2) {
1530		error = EINVAL;
1531	} else {
1532		arg2 = (req->newlen - req->newidx);
1533		error = SYSCTL_IN(req, arg1, arg2);
1534		((char *)arg1)[arg2] = '\0';
1535	}
1536	return (error);
1537}
1538
1539/*
1540 * Handle any kind of opaque data.
1541 * arg1 points to it, arg2 is the size.
1542 */
1543
1544int
1545sysctl_handle_opaque(SYSCTL_HANDLER_ARGS)
1546{
1547	int error, tries;
1548	u_int generation;
1549	struct sysctl_req req2;
1550
1551	/*
1552	 * Attempt to get a coherent snapshot, by using the thread
1553	 * pre-emption counter updated from within mi_switch() to
1554	 * determine if we were pre-empted during a bcopy() or
1555	 * copyout(). Make 3 attempts at doing this before giving up.
1556	 * If we encounter an error, stop immediately.
1557	 */
1558	tries = 0;
1559	req2 = *req;
1560retry:
1561	generation = curthread->td_generation;
1562	error = SYSCTL_OUT(req, arg1, arg2);
1563	if (error)
1564		return (error);
1565	tries++;
1566	if (generation != curthread->td_generation && tries < 3) {
1567		*req = req2;
1568		goto retry;
1569	}
1570
1571	error = SYSCTL_IN(req, arg1, arg2);
1572
1573	return (error);
1574}
1575
1576/*
1577 * Transfer functions to/from kernel space.
1578 * XXX: rather untested at this point
1579 */
1580static int
1581sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l)
1582{
1583	size_t i = 0;
1584
1585	if (req->oldptr) {
1586		i = l;
1587		if (req->oldlen <= req->oldidx)
1588			i = 0;
1589		else
1590			if (i > req->oldlen - req->oldidx)
1591				i = req->oldlen - req->oldidx;
1592		if (i > 0)
1593			bcopy(p, (char *)req->oldptr + req->oldidx, i);
1594	}
1595	req->oldidx += l;
1596	if (req->oldptr && i != l)
1597		return (ENOMEM);
1598	return (0);
1599}
1600
1601static int
1602sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l)
1603{
1604	if (!req->newptr)
1605		return (0);
1606	if (req->newlen - req->newidx < l)
1607		return (EINVAL);
1608	bcopy((char *)req->newptr + req->newidx, p, l);
1609	req->newidx += l;
1610	return (0);
1611}
1612
1613int
1614kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old,
1615    size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags)
1616{
1617	int error = 0;
1618	struct sysctl_req req;
1619
1620	bzero(&req, sizeof req);
1621
1622	req.td = td;
1623	req.flags = flags;
1624
1625	if (oldlenp) {
1626		req.oldlen = *oldlenp;
1627	}
1628	req.validlen = req.oldlen;
1629
1630	if (old) {
1631		req.oldptr= old;
1632	}
1633
1634	if (new != NULL) {
1635		req.newlen = newlen;
1636		req.newptr = new;
1637	}
1638
1639	req.oldfunc = sysctl_old_kernel;
1640	req.newfunc = sysctl_new_kernel;
1641	req.lock = REQ_UNWIRED;
1642
1643	error = sysctl_root(0, name, namelen, &req);
1644
1645	if (req.lock == REQ_WIRED && req.validlen > 0)
1646		vsunlock(req.oldptr, req.validlen);
1647
1648	if (error && error != ENOMEM)
1649		return (error);
1650
1651	if (retval) {
1652		if (req.oldptr && req.oldidx > req.validlen)
1653			*retval = req.validlen;
1654		else
1655			*retval = req.oldidx;
1656	}
1657	return (error);
1658}
1659
1660int
1661kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp,
1662    void *new, size_t newlen, size_t *retval, int flags)
1663{
1664        int oid[CTL_MAXNAME];
1665        size_t oidlen, plen;
1666	int error;
1667
1668	oid[0] = 0;		/* sysctl internal magic */
1669	oid[1] = 3;		/* name2oid */
1670	oidlen = sizeof(oid);
1671
1672	error = kernel_sysctl(td, oid, 2, oid, &oidlen,
1673	    (void *)name, strlen(name), &plen, flags);
1674	if (error)
1675		return (error);
1676
1677	error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp,
1678	    new, newlen, retval, flags);
1679	return (error);
1680}
1681
1682/*
1683 * Transfer function to/from user space.
1684 */
1685static int
1686sysctl_old_user(struct sysctl_req *req, const void *p, size_t l)
1687{
1688	size_t i, len, origidx;
1689	int error;
1690
1691	origidx = req->oldidx;
1692	req->oldidx += l;
1693	if (req->oldptr == NULL)
1694		return (0);
1695	/*
1696	 * If we have not wired the user supplied buffer and we are currently
1697	 * holding locks, drop a witness warning, as it's possible that
1698	 * write operations to the user page can sleep.
1699	 */
1700	if (req->lock != REQ_WIRED)
1701		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
1702		    "sysctl_old_user()");
1703	i = l;
1704	len = req->validlen;
1705	if (len <= origidx)
1706		i = 0;
1707	else {
1708		if (i > len - origidx)
1709			i = len - origidx;
1710		if (req->lock == REQ_WIRED) {
1711			error = copyout_nofault(p, (char *)req->oldptr +
1712			    origidx, i);
1713		} else
1714			error = copyout(p, (char *)req->oldptr + origidx, i);
1715		if (error != 0)
1716			return (error);
1717	}
1718	if (i < l)
1719		return (ENOMEM);
1720	return (0);
1721}
1722
1723static int
1724sysctl_new_user(struct sysctl_req *req, void *p, size_t l)
1725{
1726	int error;
1727
1728	if (!req->newptr)
1729		return (0);
1730	if (req->newlen - req->newidx < l)
1731		return (EINVAL);
1732	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
1733	    "sysctl_new_user()");
1734	error = copyin((char *)req->newptr + req->newidx, p, l);
1735	req->newidx += l;
1736	return (error);
1737}
1738
1739/*
1740 * Wire the user space destination buffer.  If set to a value greater than
1741 * zero, the len parameter limits the maximum amount of wired memory.
1742 */
1743int
1744sysctl_wire_old_buffer(struct sysctl_req *req, size_t len)
1745{
1746	int ret;
1747	size_t wiredlen;
1748
1749	wiredlen = (len > 0 && len < req->oldlen) ? len : req->oldlen;
1750	ret = 0;
1751	if (req->lock != REQ_WIRED && req->oldptr &&
1752	    req->oldfunc == sysctl_old_user) {
1753		if (wiredlen != 0) {
1754			ret = vslock(req->oldptr, wiredlen);
1755			if (ret != 0) {
1756				if (ret != ENOMEM)
1757					return (ret);
1758				wiredlen = 0;
1759			}
1760		}
1761		req->lock = REQ_WIRED;
1762		req->validlen = wiredlen;
1763	}
1764	return (0);
1765}
1766
1767int
1768sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
1769    int *nindx, struct sysctl_req *req)
1770{
1771	struct sysctl_oid_list *lsp;
1772	struct sysctl_oid *oid;
1773	int indx;
1774
1775	SYSCTL_ASSERT_LOCKED();
1776	lsp = &sysctl__children;
1777	indx = 0;
1778	while (indx < CTL_MAXNAME) {
1779		SLIST_FOREACH(oid, lsp, oid_link) {
1780			if (oid->oid_number == name[indx])
1781				break;
1782		}
1783		if (oid == NULL)
1784			return (ENOENT);
1785
1786		indx++;
1787		if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
1788			if (oid->oid_handler != NULL || indx == namelen) {
1789				*noid = oid;
1790				if (nindx != NULL)
1791					*nindx = indx;
1792				KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0,
1793				    ("%s found DYING node %p", __func__, oid));
1794				return (0);
1795			}
1796			lsp = SYSCTL_CHILDREN(oid);
1797		} else if (indx == namelen) {
1798			if ((oid->oid_kind & CTLFLAG_DORMANT) != 0)
1799				return (ENOENT);
1800			*noid = oid;
1801			if (nindx != NULL)
1802				*nindx = indx;
1803			KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0,
1804			    ("%s found DYING node %p", __func__, oid));
1805			return (0);
1806		} else {
1807			return (ENOTDIR);
1808		}
1809	}
1810	return (ENOENT);
1811}
1812
1813/*
1814 * Traverse our tree, and find the right node, execute whatever it points
1815 * to, and return the resulting error code.
1816 */
1817
1818static int
1819sysctl_root(SYSCTL_HANDLER_ARGS)
1820{
1821	struct sysctl_oid *oid;
1822	struct rm_priotracker tracker;
1823	int error, indx, lvl;
1824
1825	SYSCTL_RLOCK(&tracker);
1826
1827	error = sysctl_find_oid(arg1, arg2, &oid, &indx, req);
1828	if (error)
1829		goto out;
1830
1831	if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
1832		/*
1833		 * You can't call a sysctl when it's a node, but has
1834		 * no handler.  Inform the user that it's a node.
1835		 * The indx may or may not be the same as namelen.
1836		 */
1837		if (oid->oid_handler == NULL) {
1838			error = EISDIR;
1839			goto out;
1840		}
1841	}
1842
1843	/* Is this sysctl writable? */
1844	if (req->newptr && !(oid->oid_kind & CTLFLAG_WR)) {
1845		error = EPERM;
1846		goto out;
1847	}
1848
1849	KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL"));
1850
1851#ifdef CAPABILITY_MODE
1852	/*
1853	 * If the process is in capability mode, then don't permit reading or
1854	 * writing unless specifically granted for the node.
1855	 */
1856	if (IN_CAPABILITY_MODE(req->td)) {
1857		if ((req->oldptr && !(oid->oid_kind & CTLFLAG_CAPRD)) ||
1858		    (req->newptr && !(oid->oid_kind & CTLFLAG_CAPWR))) {
1859			error = EPERM;
1860			goto out;
1861		}
1862	}
1863#endif
1864
1865	/* Is this sysctl sensitive to securelevels? */
1866	if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) {
1867		lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE;
1868		error = securelevel_gt(req->td->td_ucred, lvl);
1869		if (error)
1870			goto out;
1871	}
1872
1873	/* Is this sysctl writable by only privileged users? */
1874	if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) {
1875		int priv;
1876
1877		if (oid->oid_kind & CTLFLAG_PRISON)
1878			priv = PRIV_SYSCTL_WRITEJAIL;
1879#ifdef VIMAGE
1880		else if ((oid->oid_kind & CTLFLAG_VNET) &&
1881		     prison_owns_vnet(req->td->td_ucred))
1882			priv = PRIV_SYSCTL_WRITEJAIL;
1883#endif
1884		else
1885			priv = PRIV_SYSCTL_WRITE;
1886		error = priv_check(req->td, priv);
1887		if (error)
1888			goto out;
1889	}
1890
1891	if (!oid->oid_handler) {
1892		error = EINVAL;
1893		goto out;
1894	}
1895
1896	if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
1897		arg1 = (int *)arg1 + indx;
1898		arg2 -= indx;
1899	} else {
1900		arg1 = oid->oid_arg1;
1901		arg2 = oid->oid_arg2;
1902	}
1903#ifdef MAC
1904	error = mac_system_check_sysctl(req->td->td_ucred, oid, arg1, arg2,
1905	    req);
1906	if (error != 0)
1907		goto out;
1908#endif
1909#ifdef VIMAGE
1910	if ((oid->oid_kind & CTLFLAG_VNET) && arg1 != NULL)
1911		arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1);
1912#endif
1913	error = sysctl_root_handler_locked(oid, arg1, arg2, req, &tracker);
1914
1915out:
1916	SYSCTL_RUNLOCK(&tracker);
1917	return (error);
1918}
1919
1920#ifndef _SYS_SYSPROTO_H_
1921struct sysctl_args {
1922	int	*name;
1923	u_int	namelen;
1924	void	*old;
1925	size_t	*oldlenp;
1926	void	*new;
1927	size_t	newlen;
1928};
1929#endif
1930int
1931sys___sysctl(struct thread *td, struct sysctl_args *uap)
1932{
1933	int error, i, name[CTL_MAXNAME];
1934	size_t j;
1935
1936	if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
1937		return (EINVAL);
1938
1939 	error = copyin(uap->name, &name, uap->namelen * sizeof(int));
1940 	if (error)
1941		return (error);
1942
1943	error = userland_sysctl(td, name, uap->namelen,
1944		uap->old, uap->oldlenp, 0,
1945		uap->new, uap->newlen, &j, 0);
1946	if (error && error != ENOMEM)
1947		return (error);
1948	if (uap->oldlenp) {
1949		i = copyout(&j, uap->oldlenp, sizeof(j));
1950		if (i)
1951			return (i);
1952	}
1953	return (error);
1954}
1955
1956/*
1957 * This is used from various compatibility syscalls too.  That's why name
1958 * must be in kernel space.
1959 */
1960int
1961userland_sysctl(struct thread *td, int *name, u_int namelen, void *old,
1962    size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval,
1963    int flags)
1964{
1965	int error = 0, memlocked;
1966	struct sysctl_req req;
1967
1968	bzero(&req, sizeof req);
1969
1970	req.td = td;
1971	req.flags = flags;
1972
1973	if (oldlenp) {
1974		if (inkernel) {
1975			req.oldlen = *oldlenp;
1976		} else {
1977			error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp));
1978			if (error)
1979				return (error);
1980		}
1981	}
1982	req.validlen = req.oldlen;
1983	req.oldptr = old;
1984
1985	if (new != NULL) {
1986		req.newlen = newlen;
1987		req.newptr = new;
1988	}
1989
1990	req.oldfunc = sysctl_old_user;
1991	req.newfunc = sysctl_new_user;
1992	req.lock = REQ_UNWIRED;
1993
1994#ifdef KTRACE
1995	if (KTRPOINT(curthread, KTR_SYSCTL))
1996		ktrsysctl(name, namelen);
1997#endif
1998
1999	if (req.oldptr && req.oldlen > PAGE_SIZE) {
2000		memlocked = 1;
2001		sx_xlock(&sysctlmemlock);
2002	} else
2003		memlocked = 0;
2004	CURVNET_SET(TD_TO_VNET(td));
2005
2006	for (;;) {
2007		req.oldidx = 0;
2008		req.newidx = 0;
2009		error = sysctl_root(0, name, namelen, &req);
2010		if (error != EAGAIN)
2011			break;
2012		kern_yield(PRI_USER);
2013	}
2014
2015	CURVNET_RESTORE();
2016
2017	if (req.lock == REQ_WIRED && req.validlen > 0)
2018		vsunlock(req.oldptr, req.validlen);
2019	if (memlocked)
2020		sx_xunlock(&sysctlmemlock);
2021
2022	if (error && error != ENOMEM)
2023		return (error);
2024
2025	if (retval) {
2026		if (req.oldptr && req.oldidx > req.validlen)
2027			*retval = req.validlen;
2028		else
2029			*retval = req.oldidx;
2030	}
2031	return (error);
2032}
2033
/*
 * Drain into a sysctl struct.  The user buffer should be wired if a page
 * fault would cause issue.
 *
 * arg is the struct sysctl_req to write to.  Returns len when the data was
 * written, otherwise the negated error code.
 */
static int
sbuf_sysctl_drain(void *arg, const char *data, int len)
{
	struct sysctl_req *req;
	int error;

	req = arg;
	error = SYSCTL_OUT(req, data, len);
	KASSERT(error >= 0, ("Got unexpected negative value %d", error));
	if (error != 0)
		return (-error);
	return (len);
}
2048
2049struct sbuf *
2050sbuf_new_for_sysctl(struct sbuf *s, char *buf, int length,
2051    struct sysctl_req *req)
2052{
2053
2054	/* Supply a default buffer size if none given. */
2055	if (buf == NULL && length == 0)
2056		length = 64;
2057	s = sbuf_new(s, buf, length, SBUF_FIXEDLEN | SBUF_INCLUDENUL);
2058	sbuf_set_drain(s, sbuf_sysctl_drain, req);
2059	return (s);
2060}
2061