kern_sysctl.c revision 280450
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Mike Karels at Berkeley Software Design, Inc.
7 *
8 * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
9 * project, to make these variables more userfriendly.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 4. Neither the name of the University nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 *	@(#)kern_sysctl.c	8.4 (Berkeley) 4/14/94
36 */
37
38#include <sys/cdefs.h>
39__FBSDID("$FreeBSD: head/sys/kern/kern_sysctl.c 280450 2015-03-24 17:42:53Z hselasky $");
40
41#include "opt_capsicum.h"
42#include "opt_compat.h"
43#include "opt_ktrace.h"
44
45#include <sys/param.h>
46#include <sys/fail.h>
47#include <sys/systm.h>
48#include <sys/capsicum.h>
49#include <sys/kernel.h>
50#include <sys/sysctl.h>
51#include <sys/malloc.h>
52#include <sys/priv.h>
53#include <sys/proc.h>
54#include <sys/jail.h>
55#include <sys/lock.h>
56#include <sys/mutex.h>
57#include <sys/sbuf.h>
58#include <sys/sx.h>
59#include <sys/sysproto.h>
60#include <sys/uio.h>
61#ifdef KTRACE
62#include <sys/ktrace.h>
63#endif
64
65#include <net/vnet.h>
66
67#include <security/mac/mac_framework.h>
68
69#include <vm/vm.h>
70#include <vm/vm_extern.h>
71
/*
 * Malloc types used in this file: M_SYSCTL for name2oid scratch buffers,
 * M_SYSCTLOID for dynamic oids and context entries, M_SYSCTLTMP for
 * temporary output snapshots.
 */
72static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic");
73static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids");
74static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer");
75
76/*
77 * The sysctllock protects the MIB tree.  It also protects sysctl
78 * contexts used with dynamic sysctls.  The sysctl_register_oid() and
79 * sysctl_unregister_oid() routines require the sysctllock to already
80 * be held, so the sysctl_xlock() and sysctl_xunlock() routines are
81 * provided for the few places in the kernel which need to use that
82 * API rather than using the dynamic API.  Use of the dynamic API is
83 * strongly encouraged for most code.
84 *
85 * The sysctlmemlock is used to limit the amount of user memory wired for
86 * sysctl requests.  This is implemented by serializing any userland
87 * sysctl requests larger than a single page via an exclusive lock.
88 */
89static struct sx sysctllock;
90static struct sx sysctlmemlock;
91
/* Shorthand wrappers around the sx(9) primitives applied to sysctllock. */
92#define	SYSCTL_XLOCK()		sx_xlock(&sysctllock)
93#define	SYSCTL_XUNLOCK()	sx_xunlock(&sysctllock)
94#define	SYSCTL_SLOCK()		sx_slock(&sysctllock)
95#define	SYSCTL_SUNLOCK()	sx_sunlock(&sysctllock)
96#define	SYSCTL_XLOCKED()	sx_xlocked(&sysctllock)
97#define	SYSCTL_ASSERT_LOCKED()	sx_assert(&sysctllock, SA_LOCKED)
98#define	SYSCTL_ASSERT_XLOCKED()	sx_assert(&sysctllock, SA_XLOCKED)
99#define	SYSCTL_ASSERT_SLOCKED()	sx_assert(&sysctllock, SA_SLOCKED)
100#define	SYSCTL_INIT()		sx_init(&sysctllock, "sysctl lock")
/* Sleep on ch, using sysctllock as the interlock passed to sx_sleep(). */
101#define	SYSCTL_SLEEP(ch, wmesg, timo)					\
102				sx_sleep(ch, &sysctllock, 0, wmesg, timo)
103
104static int sysctl_root(SYSCTL_HANDLER_ARGS);
105
106/* Root list */
107struct sysctl_oid_list sysctl__children = SLIST_HEAD_INITIALIZER(&sysctl__children);
108
/* Forward declarations for helpers defined later in this file. */
109static int	sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del,
110		    int recurse);
111static int	sysctl_old_kernel(struct sysctl_req *, const void *, size_t);
112static int	sysctl_new_kernel(struct sysctl_req *, void *, size_t);
113
114static void
115sysctl_lock(bool xlock)
116{
117
118	if (xlock)
119		SYSCTL_XLOCK();
120	else
121		SYSCTL_SLOCK();
122}
123
124static bool
125sysctl_unlock(void)
126{
127	bool xlocked;
128
129	xlocked = SYSCTL_XLOCKED();
130	if (xlocked)
131		SYSCTL_XUNLOCK();
132	else
133		SYSCTL_SUNLOCK();
134	return (xlocked);
135}
136
137static struct sysctl_oid *
138sysctl_find_oidname(const char *name, struct sysctl_oid_list *list)
139{
140	struct sysctl_oid *oidp;
141
142	SYSCTL_ASSERT_LOCKED();
143	SLIST_FOREACH(oidp, list, oid_link) {
144		if (strcmp(oidp->oid_name, name) == 0) {
145			return (oidp);
146		}
147	}
148	return (NULL);
149}
150
151/*
152 * Initialization of the MIB tree.
153 *
154 * Order by number in each list.
155 */
156void
157sysctl_xlock(void)
158{
159
160	SYSCTL_XLOCK();
161}
162
163void
164sysctl_xunlock(void)
165{
166
167	SYSCTL_XUNLOCK();
168}
169
/*
 * Run an oid's handler with the sysctl lock temporarily released.
 *
 * Entered with the sysctl lock held (shared or exclusive); the lock is
 * dropped around the handler call and re-taken in the same mode before
 * returning.  For dynamic oids (CTLFLAG_DYN) oid_running is bumped while
 * the handler runs; sysctl_remove_oid_locked() sleeps on &oid_running
 * until that count drains.  Returns the handler's return value.
 */
170static int
171sysctl_root_handler_locked(struct sysctl_oid *oid, void *arg1, intptr_t arg2,
172    struct sysctl_req *req)
173{
174	int error;
175	bool xlocked;
176
	/* Mark the handler as running before the lock is released. */
177	if (oid->oid_kind & CTLFLAG_DYN)
178		atomic_add_int(&oid->oid_running, 1);
	/* Remember the lock mode so it can be restored afterwards. */
179	xlocked = sysctl_unlock();
180
	/* Handlers not flagged CTLFLAG_MPSAFE are run under Giant. */
181	if (!(oid->oid_kind & CTLFLAG_MPSAFE))
182		mtx_lock(&Giant);
183	error = oid->oid_handler(oid, arg1, arg2, req);
184	if (!(oid->oid_kind & CTLFLAG_MPSAFE))
185		mtx_unlock(&Giant);
186
187	sysctl_lock(xlocked);
188	if (oid->oid_kind & CTLFLAG_DYN) {
		/*
		 * atomic_fetchadd_int() returns the previous value, so 1
		 * means this was the last running handler; wake the
		 * remover if the oid has been flagged CTLFLAG_DYING.
		 */
189		if (atomic_fetchadd_int(&oid->oid_running, -1) == 1 &&
190		    (oid->oid_kind & CTLFLAG_DYING) != 0)
191			wakeup(&oid->oid_running);
192	}
193
194	return (error);
195}
196
197static void
198sysctl_load_tunable_by_oid_locked(struct sysctl_oid *oidp)
199{
200	struct sysctl_req req;
201	struct sysctl_oid *curr;
202	char *penv = NULL;
203	char path[64];
204	ssize_t rem = sizeof(path);
205	ssize_t len;
206	int val_int;
207	long val_long;
208	int64_t val_64;
209	quad_t val_quad;
210	int error;
211
212	path[--rem] = 0;
213
214	for (curr = oidp; curr != NULL; curr = SYSCTL_PARENT(curr)) {
215		len = strlen(curr->oid_name);
216		rem -= len;
217		if (curr != oidp)
218			rem -= 1;
219		if (rem < 0) {
220			printf("OID path exceeds %d bytes\n", (int)sizeof(path));
221			return;
222		}
223		memcpy(path + rem, curr->oid_name, len);
224		if (curr != oidp)
225			path[rem + len] = '.';
226	}
227
228	memset(&req, 0, sizeof(req));
229
230	req.td = curthread;
231	req.oldfunc = sysctl_old_kernel;
232	req.newfunc = sysctl_new_kernel;
233	req.lock = REQ_UNWIRED;
234
235	switch (oidp->oid_kind & CTLTYPE) {
236	case CTLTYPE_INT:
237		if (getenv_int(path + rem, &val_int) == 0)
238			return;
239		req.newlen = sizeof(val_int);
240		req.newptr = &val_int;
241		break;
242	case CTLTYPE_UINT:
243		if (getenv_uint(path + rem, (unsigned int *)&val_int) == 0)
244			return;
245		req.newlen = sizeof(val_int);
246		req.newptr = &val_int;
247		break;
248	case CTLTYPE_LONG:
249		if (getenv_long(path + rem, &val_long) == 0)
250			return;
251		req.newlen = sizeof(val_long);
252		req.newptr = &val_long;
253		break;
254	case CTLTYPE_ULONG:
255		if (getenv_ulong(path + rem, (unsigned long *)&val_long) == 0)
256			return;
257		req.newlen = sizeof(val_long);
258		req.newptr = &val_long;
259		break;
260	case CTLTYPE_S64:
261		if (getenv_quad(path + rem, &val_quad) == 0)
262			return;
263		val_64 = val_quad;
264		req.newlen = sizeof(val_64);
265		req.newptr = &val_64;
266		break;
267	case CTLTYPE_U64:
268		/* XXX there is no getenv_uquad() */
269		if (getenv_quad(path + rem, &val_quad) == 0)
270			return;
271		val_64 = val_quad;
272		req.newlen = sizeof(val_64);
273		req.newptr = &val_64;
274		break;
275	case CTLTYPE_STRING:
276		penv = kern_getenv(path + rem);
277		if (penv == NULL)
278			return;
279		req.newlen = strlen(penv);
280		req.newptr = penv;
281		break;
282	default:
283		return;
284	}
285	error = sysctl_root_handler_locked(oidp, oidp->oid_arg1,
286	    oidp->oid_arg2, &req);
287	if (error != 0)
288		printf("Setting sysctl %s failed: %d\n", path + rem, error);
289	if (penv != NULL)
290		freeenv(penv);
291}
292
293void
294sysctl_register_oid(struct sysctl_oid *oidp)
295{
296	struct sysctl_oid_list *parent = oidp->oid_parent;
297	struct sysctl_oid *p;
298	struct sysctl_oid *q;
299
300	/*
301	 * First check if another oid with the same name already
302	 * exists in the parent's list.
303	 */
304	SYSCTL_ASSERT_XLOCKED();
305	p = sysctl_find_oidname(oidp->oid_name, parent);
306	if (p != NULL) {
307		if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
308			p->oid_refcnt++;
309			return;
310		} else {
311			printf("can't re-use a leaf (%s)!\n", p->oid_name);
312			return;
313		}
314	}
315	/*
316	 * If this oid has a number OID_AUTO, give it a number which
317	 * is greater than any current oid.
318	 * NOTE: DO NOT change the starting value here, change it in
319	 * <sys/sysctl.h>, and make sure it is at least 256 to
320	 * accomodate e.g. net.inet.raw as a static sysctl node.
321	 */
322	if (oidp->oid_number == OID_AUTO) {
323		static int newoid = CTL_AUTO_START;
324
325		oidp->oid_number = newoid++;
326		if (newoid == 0x7fffffff)
327			panic("out of oids");
328	}
329#if 0
330	else if (oidp->oid_number >= CTL_AUTO_START) {
331		/* do not panic; this happens when unregistering sysctl sets */
332		printf("static sysctl oid too high: %d", oidp->oid_number);
333	}
334#endif
335
336	/*
337	 * Insert the oid into the parent's list in order.
338	 */
339	q = NULL;
340	SLIST_FOREACH(p, parent, oid_link) {
341		if (oidp->oid_number < p->oid_number)
342			break;
343		q = p;
344	}
345	if (q)
346		SLIST_INSERT_AFTER(q, oidp, oid_link);
347	else
348		SLIST_INSERT_HEAD(parent, oidp, oid_link);
349
350	if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE &&
351#ifdef VIMAGE
352	    (oidp->oid_kind & CTLFLAG_VNET) == 0 &&
353#endif
354	    (oidp->oid_kind & CTLFLAG_TUN) != 0 &&
355	    (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) {
356		/* only fetch value once */
357		oidp->oid_kind |= CTLFLAG_NOFETCH;
358		/* try to fetch value from kernel environment */
359		sysctl_load_tunable_by_oid_locked(oidp);
360	}
361}
362
363void
364sysctl_unregister_oid(struct sysctl_oid *oidp)
365{
366	struct sysctl_oid *p;
367	int error;
368
369	SYSCTL_ASSERT_XLOCKED();
370	error = ENOENT;
371	if (oidp->oid_number == OID_AUTO) {
372		error = EINVAL;
373	} else {
374		SLIST_FOREACH(p, oidp->oid_parent, oid_link) {
375			if (p == oidp) {
376				SLIST_REMOVE(oidp->oid_parent, oidp,
377				    sysctl_oid, oid_link);
378				error = 0;
379				break;
380			}
381		}
382	}
383
384	/*
385	 * This can happen when a module fails to register and is
386	 * being unloaded afterwards.  It should not be a panic()
387	 * for normal use.
388	 */
389	if (error)
390		printf("%s: failed to unregister sysctl\n", __func__);
391}
392
393/* Initialize a new context to keep track of dynamically added sysctls. */
394int
395sysctl_ctx_init(struct sysctl_ctx_list *c)
396{
397
398	if (c == NULL) {
399		return (EINVAL);
400	}
401
402	/*
403	 * No locking here, the caller is responsible for not adding
404	 * new nodes to a context until after this function has
405	 * returned.
406	 */
407	TAILQ_INIT(c);
408	return (0);
409}
410
411/* Free the context, and destroy all dynamic oids registered in this context */
/*
 * Works in two passes under the exclusive sysctl lock: a dry run that
 * only deregisters every entry (del == 0) to see whether removal would
 * succeed, followed by re-registration of whatever was deregistered.
 * If the dry run failed, EBUSY is returned and the context is left
 * intact; otherwise the entries are removed for real and freed.
 * Returns 0 on success.
 */
412int
413sysctl_ctx_free(struct sysctl_ctx_list *clist)
414{
415	struct sysctl_ctx_entry *e, *e1;
416	int error;
417
418	error = 0;
419	/*
420	 * First perform a "dry run" to check if it's ok to remove oids.
421	 * XXX FIXME
422	 * XXX This algorithm is a hack. But I don't know any
423	 * XXX better solution for now...
424	 */
425	SYSCTL_XLOCK();
426	TAILQ_FOREACH(e, clist, link) {
427		error = sysctl_remove_oid_locked(e->entry, 0, 0);
428		if (error)
429			break;
430	}
431	/*
432	 * Restore deregistered entries, either from the end,
433	 * or from the place where error occurred.
434	 * e contains the entry that was not unregistered
435	 */
436	if (error)
437		e1 = TAILQ_PREV(e, sysctl_ctx_list, link);
438	else
439		e1 = TAILQ_LAST(clist, sysctl_ctx_list);
	/* Walk backwards so entries are re-registered in reverse order. */
440	while (e1 != NULL) {
441		sysctl_register_oid(e1->entry);
442		e1 = TAILQ_PREV(e1, sysctl_ctx_list, link);
443	}
444	if (error) {
445		SYSCTL_XUNLOCK();
446		return(EBUSY);
447	}
448	/* Now really delete the entries */
449	e = TAILQ_FIRST(clist);
450	while (e != NULL) {
		/* Fetch the successor first; e is freed below. */
451		e1 = TAILQ_NEXT(e, link);
452		error = sysctl_remove_oid_locked(e->entry, 1, 0);
		/* The dry run succeeded, so a failure here is fatal. */
453		if (error)
454			panic("sysctl_remove_oid: corrupt tree, entry: %s",
455			    e->entry->oid_name);
456		free(e, M_SYSCTLOID);
457		e = e1;
458	}
459	SYSCTL_XUNLOCK();
460	return (error);
461}
462
463/* Add an entry to the context */
464struct sysctl_ctx_entry *
465sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
466{
467	struct sysctl_ctx_entry *e;
468
469	SYSCTL_ASSERT_XLOCKED();
470	if (clist == NULL || oidp == NULL)
471		return(NULL);
472	e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK);
473	e->entry = oidp;
474	TAILQ_INSERT_HEAD(clist, e, link);
475	return (e);
476}
477
478/* Find an entry in the context */
479struct sysctl_ctx_entry *
480sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
481{
482	struct sysctl_ctx_entry *e;
483
484	SYSCTL_ASSERT_XLOCKED();
485	if (clist == NULL || oidp == NULL)
486		return(NULL);
487	TAILQ_FOREACH(e, clist, link) {
488		if(e->entry == oidp)
489			return(e);
490	}
491	return (e);
492}
493
494/*
495 * Delete an entry from the context.
496 * NOTE: this function doesn't free oidp! You have to remove it
497 * with sysctl_remove_oid().
498 */
499int
500sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
501{
502	struct sysctl_ctx_entry *e;
503
504	if (clist == NULL || oidp == NULL)
505		return (EINVAL);
506	SYSCTL_XLOCK();
507	e = sysctl_ctx_entry_find(clist, oidp);
508	if (e != NULL) {
509		TAILQ_REMOVE(clist, e, link);
510		SYSCTL_XUNLOCK();
511		free(e, M_SYSCTLOID);
512		return (0);
513	} else {
514		SYSCTL_XUNLOCK();
515		return (ENOENT);
516	}
517}
518
/*
 * Remove a dynamically created sysctl tree, taking the sysctl lock
 * exclusively around the operation.
 *
 * oidp    - top of the tree to be removed
 * del     - 0 to only deregister, non-zero to free the entries too
 * recurse - non-zero to traverse and remove the subtree as well
 *
 * Returns whatever sysctl_remove_oid_locked() returns.
 */
int
sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse)
{
	int rv;

	SYSCTL_XLOCK();
	rv = sysctl_remove_oid_locked(oidp, del, recurse);
	SYSCTL_XUNLOCK();

	return (rv);
}
535
536int
537sysctl_remove_name(struct sysctl_oid *parent, const char *name,
538    int del, int recurse)
539{
540	struct sysctl_oid *p, *tmp;
541	int error;
542
543	error = ENOENT;
544	SYSCTL_XLOCK();
545	SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(parent), oid_link, tmp) {
546		if (strcmp(p->oid_name, name) == 0) {
547			error = sysctl_remove_oid_locked(p, del, recurse);
548			break;
549		}
550	}
551	SYSCTL_XUNLOCK();
552
553	return (error);
554}
555
556
/*
 * Worker for sysctl_remove_oid(); the sysctl lock must be held
 * exclusively.
 *
 * oidp    - top of the tree to be removed
 * del     - 0 to only deregister, non-zero to free the entries too
 * recurse - non-zero to remove children of a node as well
 *
 * Returns EINVAL for a NULL oid, a non-dynamic oid or an oid with a
 * zero reference count, ENOTEMPTY when a node with children is removed
 * without recurse, the error of a failed child removal, and 0 otherwise.
 * An oid with more than one reference only loses a reference.
 */
557static int
558sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse)
559{
560	struct sysctl_oid *p, *tmp;
561	int error;
562
563	SYSCTL_ASSERT_XLOCKED();
564	if (oidp == NULL)
565		return(EINVAL);
566	if ((oidp->oid_kind & CTLFLAG_DYN) == 0) {
567		printf("can't remove non-dynamic nodes!\n");
568		return (EINVAL);
569	}
570	/*
571	 * WARNING: normal method to do this should be through
572	 * sysctl_ctx_free(). Use recursing as the last resort
573	 * method to purge your sysctl tree of leftovers...
574	 * However, if some other code still references these nodes,
575	 * it will panic.
576	 */
577	if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
		/* Children are only touched when this is the last reference. */
578		if (oidp->oid_refcnt == 1) {
			/* _SAFE variant: the child may be freed in the body. */
579			SLIST_FOREACH_SAFE(p,
580			    SYSCTL_CHILDREN(oidp), oid_link, tmp) {
581				if (!recurse) {
582					printf("Warning: failed attempt to "
583					    "remove oid %s with child %s\n",
584					    oidp->oid_name, p->oid_name);
585					return (ENOTEMPTY);
586				}
587				error = sysctl_remove_oid_locked(p, del,
588				    recurse);
589				if (error)
590					return (error);
591			}
592		}
593	}
594	if (oidp->oid_refcnt > 1 ) {
595		oidp->oid_refcnt--;
596	} else {
597		if (oidp->oid_refcnt == 0) {
598			printf("Warning: bad oid_refcnt=%u (%s)!\n",
599				oidp->oid_refcnt, oidp->oid_name);
600			return (EINVAL);
601		}
602		sysctl_unregister_oid(oidp);
603		if (del) {
604			/*
605			 * Wait for all threads running the handler to drain.
606			 * This preserves the previous behavior when the
607			 * sysctl lock was held across a handler invocation,
608			 * and is necessary for module unload correctness.
609			 */
			/*
			 * sysctl_root_handler_locked() issues the wakeup on
			 * &oid_running once the last handler returns and
			 * CTLFLAG_DYING is set.
			 */
610			while (oidp->oid_running > 0) {
611				oidp->oid_kind |= CTLFLAG_DYING;
612				SYSCTL_SLEEP(&oidp->oid_running, "oidrm", 0);
613			}
614			if (oidp->oid_descr)
615				free(__DECONST(char *, oidp->oid_descr),
616				    M_SYSCTLOID);
617			free(__DECONST(char *, oidp->oid_name), M_SYSCTLOID);
618			free(oidp, M_SYSCTLOID);
619		}
620	}
621	return (0);
622}
623/*
624 * Create new sysctls at run time.
625 * clist may point to a valid context initialized with sysctl_ctx_init().
626 */
627struct sysctl_oid *
628sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent,
629	int number, const char *name, int kind, void *arg1, intptr_t arg2,
630	int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr)
631{
632	struct sysctl_oid *oidp;
633
634	/* You have to hook up somewhere.. */
635	if (parent == NULL)
636		return(NULL);
637	/* Check if the node already exists, otherwise create it */
638	SYSCTL_XLOCK();
639	oidp = sysctl_find_oidname(name, parent);
640	if (oidp != NULL) {
641		if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
642			oidp->oid_refcnt++;
643			/* Update the context */
644			if (clist != NULL)
645				sysctl_ctx_entry_add(clist, oidp);
646			SYSCTL_XUNLOCK();
647			return (oidp);
648		} else {
649			SYSCTL_XUNLOCK();
650			printf("can't re-use a leaf (%s)!\n", name);
651			return (NULL);
652		}
653	}
654	oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO);
655	oidp->oid_parent = parent;
656	SLIST_INIT(&oidp->oid_children);
657	oidp->oid_number = number;
658	oidp->oid_refcnt = 1;
659	oidp->oid_name = strdup(name, M_SYSCTLOID);
660	oidp->oid_handler = handler;
661	oidp->oid_kind = CTLFLAG_DYN | kind;
662	oidp->oid_arg1 = arg1;
663	oidp->oid_arg2 = arg2;
664	oidp->oid_fmt = fmt;
665	if (descr != NULL)
666		oidp->oid_descr = strdup(descr, M_SYSCTLOID);
667	/* Update the context, if used */
668	if (clist != NULL)
669		sysctl_ctx_entry_add(clist, oidp);
670	/* Register this oid */
671	sysctl_register_oid(oidp);
672	SYSCTL_XUNLOCK();
673	return (oidp);
674}
675
676/*
677 * Rename an existing oid.
678 */
679void
680sysctl_rename_oid(struct sysctl_oid *oidp, const char *name)
681{
682	char *newname;
683	char *oldname;
684
685	newname = strdup(name, M_SYSCTLOID);
686	SYSCTL_XLOCK();
687	oldname = __DECONST(char *, oidp->oid_name);
688	oidp->oid_name = newname;
689	SYSCTL_XUNLOCK();
690	free(oldname, M_SYSCTLOID);
691}
692
693/*
694 * Reparent an existing oid.
695 */
696int
697sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent)
698{
699	struct sysctl_oid *oidp;
700
701	SYSCTL_XLOCK();
702	if (oid->oid_parent == parent) {
703		SYSCTL_XUNLOCK();
704		return (0);
705	}
706	oidp = sysctl_find_oidname(oid->oid_name, parent);
707	if (oidp != NULL) {
708		SYSCTL_XUNLOCK();
709		return (EEXIST);
710	}
711	sysctl_unregister_oid(oid);
712	oid->oid_parent = parent;
713	oid->oid_number = OID_AUTO;
714	sysctl_register_oid(oid);
715	SYSCTL_XUNLOCK();
716	return (0);
717}
718
719/*
720 * Register the kernel's oids on startup.
721 */
722SET_DECLARE(sysctl_set, struct sysctl_oid);
723
724static void
725sysctl_register_all(void *arg)
726{
727	struct sysctl_oid **oidp;
728
729	sx_init(&sysctlmemlock, "sysctl mem");
730	SYSCTL_INIT();
731	SYSCTL_XLOCK();
732	SET_FOREACH(oidp, sysctl_set)
733		sysctl_register_oid(*oidp);
734	SYSCTL_XUNLOCK();
735}
736SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_FIRST, sysctl_register_all, 0);
737
738/*
739 * "Staff-functions"
740 *
741 * These functions implement a presently undocumented interface
742 * used by the sysctl program to walk the tree, and get the type
743 * so it can print the value.
744 * This interface is under work and consideration, and should probably
745 * be killed with a big axe by the first person who can find the time.
746 * (Be aware though, that the proper interface isn't as obvious as it
747 * may seem; there are various conflicting requirements.)
748 *
749 * {0,0}	printf the entire MIB-tree.
750 * {0,1,...}	return the name of the "..." OID.
751 * {0,2,...}	return the next OID.
752 * {0,3}	return the OID of the name in "new"
753 * {0,4,...}	return the kind & format info for the "..." OID.
754 * {0,5,...}	return the description of the "..." OID.
755 */
756
757#ifdef SYSCTL_DEBUG
758static void
759sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i)
760{
761	int k;
762	struct sysctl_oid *oidp;
763
764	SYSCTL_ASSERT_LOCKED();
765	SLIST_FOREACH(oidp, l, oid_link) {
766
767		for (k=0; k<i; k++)
768			printf(" ");
769
770		printf("%d %s ", oidp->oid_number, oidp->oid_name);
771
772		printf("%c%c",
773			oidp->oid_kind & CTLFLAG_RD ? 'R':' ',
774			oidp->oid_kind & CTLFLAG_WR ? 'W':' ');
775
776		if (oidp->oid_handler)
777			printf(" *Handler");
778
779		switch (oidp->oid_kind & CTLTYPE) {
780			case CTLTYPE_NODE:
781				printf(" Node\n");
782				if (!oidp->oid_handler) {
783					sysctl_sysctl_debug_dump_node(
784					    SYSCTL_CHILDREN(oidp), i + 2);
785				}
786				break;
787			case CTLTYPE_INT:    printf(" Int\n"); break;
788			case CTLTYPE_UINT:   printf(" u_int\n"); break;
789			case CTLTYPE_LONG:   printf(" Long\n"); break;
790			case CTLTYPE_ULONG:  printf(" u_long\n"); break;
791			case CTLTYPE_STRING: printf(" String\n"); break;
792			case CTLTYPE_U64:    printf(" uint64_t\n"); break;
793			case CTLTYPE_S64:    printf(" int64_t\n"); break;
794			case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break;
795			default:	     printf("\n");
796		}
797
798	}
799}
800
801static int
802sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS)
803{
804	int error;
805
806	error = priv_check(req->td, PRIV_SYSCTL_DEBUG);
807	if (error)
808		return (error);
809	SYSCTL_SLOCK();
810	sysctl_sysctl_debug_dump_node(&sysctl__children, 0);
811	SYSCTL_SUNLOCK();
812	return (ENOENT);
813}
814
815SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE,
816	0, 0, sysctl_sysctl_debug, "-", "");
817#endif
818
819static int
820sysctl_sysctl_name(SYSCTL_HANDLER_ARGS)
821{
822	int *name = (int *) arg1;
823	u_int namelen = arg2;
824	int error = 0;
825	struct sysctl_oid *oid;
826	struct sysctl_oid_list *lsp = &sysctl__children, *lsp2;
827	char buf[10];
828
829	SYSCTL_SLOCK();
830	while (namelen) {
831		if (!lsp) {
832			snprintf(buf,sizeof(buf),"%d",*name);
833			if (req->oldidx)
834				error = SYSCTL_OUT(req, ".", 1);
835			if (!error)
836				error = SYSCTL_OUT(req, buf, strlen(buf));
837			if (error)
838				goto out;
839			namelen--;
840			name++;
841			continue;
842		}
843		lsp2 = 0;
844		SLIST_FOREACH(oid, lsp, oid_link) {
845			if (oid->oid_number != *name)
846				continue;
847
848			if (req->oldidx)
849				error = SYSCTL_OUT(req, ".", 1);
850			if (!error)
851				error = SYSCTL_OUT(req, oid->oid_name,
852					strlen(oid->oid_name));
853			if (error)
854				goto out;
855
856			namelen--;
857			name++;
858
859			if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE)
860				break;
861
862			if (oid->oid_handler)
863				break;
864
865			lsp2 = SYSCTL_CHILDREN(oid);
866			break;
867		}
868		lsp = lsp2;
869	}
870	error = SYSCTL_OUT(req, "", 1);
871 out:
872	SYSCTL_SUNLOCK();
873	return (error);
874}
875
876/*
877 * XXXRW/JA: Shouldn't return name data for nodes that we don't permit in
878 * capability mode.
879 */
880static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD,
881    sysctl_sysctl_name, "");
882
/*
 * Recursive worker for sysctl_sysctl_next(): find the oid that follows
 * the numeric name name[0..namelen) within list lsp.
 *
 * Candidate numbers are written to next[] (one slot per level), *len
 * is set to the depth reached and *oidpp to the oid last examined.
 * Oids flagged CTLFLAG_SKIP are passed over.  Returns 0 when a
 * successor was found and 1 when the list was exhausted.  The sysctl
 * lock must be held.
 *
 * NOTE(review): each recursion level writes to next + 1 and nothing
 * here checks the depth against the caller's buffer (sysctl_sysctl_next
 * passes an array of CTL_MAXNAME ints) - confirm the tree depth is
 * bounded elsewhere.
 */
883static int
884sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen,
885	int *next, int *len, int level, struct sysctl_oid **oidpp)
886{
887	struct sysctl_oid *oidp;
888
889	SYSCTL_ASSERT_LOCKED();
890	*len = level;
891	SLIST_FOREACH(oidp, lsp, oid_link) {
892		*next = oidp->oid_number;
893		*oidpp = oidp;
894
895		if (oidp->oid_kind & CTLFLAG_SKIP)
896			continue;
897
		/*
		 * No name left to match: the first leaf or handler node
		 * is the answer, otherwise look inside the node.
		 */
898		if (!namelen) {
899			if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
900				return (0);
901			if (oidp->oid_handler)
902				/* We really should call the handler here...*/
903				return (0);
904			lsp = SYSCTL_CHILDREN(oidp);
905			if (!sysctl_sysctl_next_ls(lsp, 0, 0, next+1,
906				len, level+1, oidpp))
907				return (0);
908			goto emptynode;
909		}
910
		/* Entries numbered below the requested one are skipped. */
911		if (oidp->oid_number < *name)
912			continue;
913
		/* First entry numbered above the requested one. */
914		if (oidp->oid_number > *name) {
915			if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
916				return (0);
917			if (oidp->oid_handler)
918				return (0);
919			lsp = SYSCTL_CHILDREN(oidp);
920			if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1,
921				next+1, len, level+1, oidpp))
922				return (0);
923			goto next;
924		}
		/*
		 * Exact match: a leaf or handler node has no successor
		 * below it, so move on to the following sibling.
		 */
925		if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
926			continue;
927
928		if (oidp->oid_handler)
929			continue;
930
931		lsp = SYSCTL_CHILDREN(oidp);
932		if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1,
933			len, level+1, oidpp))
934			return (0);
935	next:
		/* The subtree held no successor; only name[0] still applies. */
936		namelen = 1;
937	emptynode:
		/* Undo the depth recorded by the failed recursion. */
938		*len = level;
939	}
940	return (1);
941}
942
943static int
944sysctl_sysctl_next(SYSCTL_HANDLER_ARGS)
945{
946	int *name = (int *) arg1;
947	u_int namelen = arg2;
948	int i, j, error;
949	struct sysctl_oid *oid;
950	struct sysctl_oid_list *lsp = &sysctl__children;
951	int newoid[CTL_MAXNAME];
952
953	SYSCTL_SLOCK();
954	i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid);
955	SYSCTL_SUNLOCK();
956	if (i)
957		return (ENOENT);
958	error = SYSCTL_OUT(req, newoid, j * sizeof (int));
959	return (error);
960}
961
962/*
963 * XXXRW/JA: Shouldn't return next data for nodes that we don't permit in
964 * capability mode.
965 */
966static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD,
967    sysctl_sysctl_next, "");
968
969static int
970name2oid(char *name, int *oid, int *len, struct sysctl_oid **oidpp)
971{
972	struct sysctl_oid *oidp;
973	struct sysctl_oid_list *lsp = &sysctl__children;
974	char *p;
975
976	SYSCTL_ASSERT_LOCKED();
977
978	for (*len = 0; *len < CTL_MAXNAME;) {
979		p = strsep(&name, ".");
980
981		oidp = SLIST_FIRST(lsp);
982		for (;; oidp = SLIST_NEXT(oidp, oid_link)) {
983			if (oidp == NULL)
984				return (ENOENT);
985			if (strcmp(p, oidp->oid_name) == 0)
986				break;
987		}
988		*oid++ = oidp->oid_number;
989		(*len)++;
990
991		if (name == NULL || *name == '\0') {
992			if (oidpp)
993				*oidpp = oidp;
994			return (0);
995		}
996
997		if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
998			break;
999
1000		if (oidp->oid_handler)
1001			break;
1002
1003		lsp = SYSCTL_CHILDREN(oidp);
1004	}
1005	return (ENOENT);
1006}
1007
1008static int
1009sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS)
1010{
1011	char *p;
1012	int error, oid[CTL_MAXNAME], len = 0;
1013	struct sysctl_oid *op = 0;
1014
1015	if (!req->newlen)
1016		return (ENOENT);
1017	if (req->newlen >= MAXPATHLEN)	/* XXX arbitrary, undocumented */
1018		return (ENAMETOOLONG);
1019
1020	p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK);
1021
1022	error = SYSCTL_IN(req, p, req->newlen);
1023	if (error) {
1024		free(p, M_SYSCTL);
1025		return (error);
1026	}
1027
1028	p [req->newlen] = '\0';
1029
1030	SYSCTL_SLOCK();
1031	error = name2oid(p, oid, &len, &op);
1032	SYSCTL_SUNLOCK();
1033
1034	free(p, M_SYSCTL);
1035
1036	if (error)
1037		return (error);
1038
1039	error = SYSCTL_OUT(req, oid, len * sizeof *oid);
1040	return (error);
1041}
1042
1043/*
1044 * XXXRW/JA: Shouldn't return name2oid data for nodes that we don't permit in
1045 * capability mode.
1046 */
1047SYSCTL_PROC(_sysctl, 3, name2oid,
1048    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE
1049    | CTLFLAG_CAPRW, 0, 0, sysctl_sysctl_name2oid, "I", "");
1050
1051static int
1052sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS)
1053{
1054	struct sysctl_oid *oid;
1055	int error;
1056
1057	SYSCTL_SLOCK();
1058	error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
1059	if (error)
1060		goto out;
1061
1062	if (oid->oid_fmt == NULL) {
1063		error = ENOENT;
1064		goto out;
1065	}
1066	error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind));
1067	if (error)
1068		goto out;
1069	error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1);
1070 out:
1071	SYSCTL_SUNLOCK();
1072	return (error);
1073}
1074
1075
1076static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD,
1077    sysctl_sysctl_oidfmt, "");
1078
1079static int
1080sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS)
1081{
1082	struct sysctl_oid *oid;
1083	int error;
1084
1085	SYSCTL_SLOCK();
1086	error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
1087	if (error)
1088		goto out;
1089
1090	if (oid->oid_descr == NULL) {
1091		error = ENOENT;
1092		goto out;
1093	}
1094	error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1);
1095 out:
1096	SYSCTL_SUNLOCK();
1097	return (error);
1098}
1099
1100static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD,
1101    sysctl_sysctl_oiddescr, "");
1102
1103/*
1104 * Default "handler" functions.
1105 */
1106
1107/*
1108 * Handle an int, signed or unsigned.
1109 * Two cases:
1110 *     a variable:  point arg1 at it.
1111 *     a constant:  pass it in arg2.
1112 */
1113
1114int
1115sysctl_handle_int(SYSCTL_HANDLER_ARGS)
1116{
1117	int tmpout, error = 0;
1118
1119	/*
1120	 * Attempt to get a coherent snapshot by making a copy of the data.
1121	 */
1122	if (arg1)
1123		tmpout = *(int *)arg1;
1124	else
1125		tmpout = arg2;
1126	error = SYSCTL_OUT(req, &tmpout, sizeof(int));
1127
1128	if (error || !req->newptr)
1129		return (error);
1130
1131	if (!arg1)
1132		error = EPERM;
1133	else
1134		error = SYSCTL_IN(req, arg1, sizeof(int));
1135	return (error);
1136}
1137
1138/*
1139 * Based on on sysctl_handle_int() convert milliseconds into ticks.
1140 * Note: this is used by TCP.
1141 */
1142
1143int
1144sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
1145{
1146	int error, s, tt;
1147
1148	tt = *(int *)arg1;
1149	s = (int)((int64_t)tt * 1000 / hz);
1150
1151	error = sysctl_handle_int(oidp, &s, 0, req);
1152	if (error || !req->newptr)
1153		return (error);
1154
1155	tt = (int)((int64_t)s * hz / 1000);
1156	if (tt < 1)
1157		return (EINVAL);
1158
1159	*(int *)arg1 = tt;
1160	return (0);
1161}
1162
1163
1164/*
1165 * Handle a long, signed or unsigned.
1166 * Two cases:
1167 *     a variable:  point arg1 at it.
1168 *     a constant:  pass it in arg2.
1169 */
1170
1171int
1172sysctl_handle_long(SYSCTL_HANDLER_ARGS)
1173{
1174	int error = 0;
1175	long tmplong;
1176#ifdef SCTL_MASK32
1177	int tmpint;
1178#endif
1179
1180	/*
1181	 * Attempt to get a coherent snapshot by making a copy of the data.
1182	 */
1183	if (arg1)
1184		tmplong = *(long *)arg1;
1185	else
1186		tmplong = arg2;
1187#ifdef SCTL_MASK32
1188	if (req->flags & SCTL_MASK32) {
1189		tmpint = tmplong;
1190		error = SYSCTL_OUT(req, &tmpint, sizeof(int));
1191	} else
1192#endif
1193		error = SYSCTL_OUT(req, &tmplong, sizeof(long));
1194
1195	if (error || !req->newptr)
1196		return (error);
1197
1198	if (!arg1)
1199		error = EPERM;
1200#ifdef SCTL_MASK32
1201	else if (req->flags & SCTL_MASK32) {
1202		error = SYSCTL_IN(req, &tmpint, sizeof(int));
1203		*(long *)arg1 = (long)tmpint;
1204	}
1205#endif
1206	else
1207		error = SYSCTL_IN(req, arg1, sizeof(long));
1208	return (error);
1209}
1210
1211/*
1212 * Handle a 64 bit int, signed or unsigned.
1213 * Two cases:
1214 *     a variable:  point arg1 at it.
1215 *     a constant:  pass it in arg2.
1216 */
1217int
1218sysctl_handle_64(SYSCTL_HANDLER_ARGS)
1219{
1220	int error = 0;
1221	uint64_t tmpout;
1222
1223	/*
1224	 * Attempt to get a coherent snapshot by making a copy of the data.
1225	 */
1226	if (arg1)
1227		tmpout = *(uint64_t *)arg1;
1228	else
1229		tmpout = arg2;
1230	error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t));
1231
1232	if (error || !req->newptr)
1233		return (error);
1234
1235	if (!arg1)
1236		error = EPERM;
1237	else
1238		error = SYSCTL_IN(req, arg1, sizeof(uint64_t));
1239	return (error);
1240}
1241
1242/*
1243 * Handle our generic '\0' terminated 'C' string.
1244 * Two cases:
1245 * 	a variable string:  point arg1 at it, arg2 is max length.
1246 * 	a constant string:  point arg1 at it, arg2 is zero.
1247 */
1248
1249int
1250sysctl_handle_string(SYSCTL_HANDLER_ARGS)
1251{
1252	size_t outlen;
1253	int error = 0, ro_string = 0;
1254
1255	/*
1256	 * A zero-length buffer indicates a fixed size read-only
1257	 * string:
1258	 */
1259	if (arg2 == 0) {
1260		arg2 = strlen((char *)arg1) + 1;
1261		ro_string = 1;
1262	}
1263
1264	if (req->oldptr != NULL) {
1265		char *tmparg;
1266
1267		if (ro_string) {
1268			tmparg = arg1;
1269		} else {
1270			/* try to make a coherent snapshot of the string */
1271			tmparg = malloc(arg2, M_SYSCTLTMP, M_WAITOK);
1272			memcpy(tmparg, arg1, arg2);
1273		}
1274
1275		outlen = strnlen(tmparg, arg2 - 1) + 1;
1276		error = SYSCTL_OUT(req, tmparg, outlen);
1277
1278		if (!ro_string)
1279			free(tmparg, M_SYSCTLTMP);
1280	} else {
1281		outlen = strnlen((char *)arg1, arg2 - 1) + 1;
1282		error = SYSCTL_OUT(req, NULL, outlen);
1283	}
1284	if (error || !req->newptr)
1285		return (error);
1286
1287	if ((req->newlen - req->newidx) >= arg2) {
1288		error = EINVAL;
1289	} else {
1290		arg2 = (req->newlen - req->newidx);
1291		error = SYSCTL_IN(req, arg1, arg2);
1292		((char *)arg1)[arg2] = '\0';
1293	}
1294	return (error);
1295}
1296
1297/*
1298 * Handle any kind of opaque data.
1299 * arg1 points to it, arg2 is the size.
1300 */
1301
1302int
1303sysctl_handle_opaque(SYSCTL_HANDLER_ARGS)
1304{
1305	int error, tries;
1306	u_int generation;
1307	struct sysctl_req req2;
1308
1309	/*
1310	 * Attempt to get a coherent snapshot, by using the thread
1311	 * pre-emption counter updated from within mi_switch() to
1312	 * determine if we were pre-empted during a bcopy() or
1313	 * copyout(). Make 3 attempts at doing this before giving up.
1314	 * If we encounter an error, stop immediately.
1315	 */
1316	tries = 0;
1317	req2 = *req;
1318retry:
1319	generation = curthread->td_generation;
1320	error = SYSCTL_OUT(req, arg1, arg2);
1321	if (error)
1322		return (error);
1323	tries++;
1324	if (generation != curthread->td_generation && tries < 3) {
1325		*req = req2;
1326		goto retry;
1327	}
1328
1329	error = SYSCTL_IN(req, arg1, arg2);
1330
1331	return (error);
1332}
1333
1334/*
1335 * Transfer functions to/from kernel space.
1336 * XXX: rather untested at this point
1337 */
1338static int
1339sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l)
1340{
1341	size_t i = 0;
1342
1343	if (req->oldptr) {
1344		i = l;
1345		if (req->oldlen <= req->oldidx)
1346			i = 0;
1347		else
1348			if (i > req->oldlen - req->oldidx)
1349				i = req->oldlen - req->oldidx;
1350		if (i > 0)
1351			bcopy(p, (char *)req->oldptr + req->oldidx, i);
1352	}
1353	req->oldidx += l;
1354	if (req->oldptr && i != l)
1355		return (ENOMEM);
1356	return (0);
1357}
1358
1359static int
1360sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l)
1361{
1362	if (!req->newptr)
1363		return (0);
1364	if (req->newlen - req->newidx < l)
1365		return (EINVAL);
1366	bcopy((char *)req->newptr + req->newidx, p, l);
1367	req->newidx += l;
1368	return (0);
1369}
1370
1371int
1372kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old,
1373    size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags)
1374{
1375	int error = 0;
1376	struct sysctl_req req;
1377
1378	bzero(&req, sizeof req);
1379
1380	req.td = td;
1381	req.flags = flags;
1382
1383	if (oldlenp) {
1384		req.oldlen = *oldlenp;
1385	}
1386	req.validlen = req.oldlen;
1387
1388	if (old) {
1389		req.oldptr= old;
1390	}
1391
1392	if (new != NULL) {
1393		req.newlen = newlen;
1394		req.newptr = new;
1395	}
1396
1397	req.oldfunc = sysctl_old_kernel;
1398	req.newfunc = sysctl_new_kernel;
1399	req.lock = REQ_UNWIRED;
1400
1401	SYSCTL_SLOCK();
1402	error = sysctl_root(0, name, namelen, &req);
1403	SYSCTL_SUNLOCK();
1404
1405	if (req.lock == REQ_WIRED && req.validlen > 0)
1406		vsunlock(req.oldptr, req.validlen);
1407
1408	if (error && error != ENOMEM)
1409		return (error);
1410
1411	if (retval) {
1412		if (req.oldptr && req.oldidx > req.validlen)
1413			*retval = req.validlen;
1414		else
1415			*retval = req.oldidx;
1416	}
1417	return (error);
1418}
1419
1420int
1421kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp,
1422    void *new, size_t newlen, size_t *retval, int flags)
1423{
1424        int oid[CTL_MAXNAME];
1425        size_t oidlen, plen;
1426	int error;
1427
1428	oid[0] = 0;		/* sysctl internal magic */
1429	oid[1] = 3;		/* name2oid */
1430	oidlen = sizeof(oid);
1431
1432	error = kernel_sysctl(td, oid, 2, oid, &oidlen,
1433	    (void *)name, strlen(name), &plen, flags);
1434	if (error)
1435		return (error);
1436
1437	error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp,
1438	    new, newlen, retval, flags);
1439	return (error);
1440}
1441
1442/*
1443 * Transfer function to/from user space.
1444 */
1445static int
1446sysctl_old_user(struct sysctl_req *req, const void *p, size_t l)
1447{
1448	size_t i, len, origidx;
1449	int error;
1450
1451	origidx = req->oldidx;
1452	req->oldidx += l;
1453	if (req->oldptr == NULL)
1454		return (0);
1455	/*
1456	 * If we have not wired the user supplied buffer and we are currently
1457	 * holding locks, drop a witness warning, as it's possible that
1458	 * write operations to the user page can sleep.
1459	 */
1460	if (req->lock != REQ_WIRED)
1461		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
1462		    "sysctl_old_user()");
1463	i = l;
1464	len = req->validlen;
1465	if (len <= origidx)
1466		i = 0;
1467	else {
1468		if (i > len - origidx)
1469			i = len - origidx;
1470		if (req->lock == REQ_WIRED) {
1471			error = copyout_nofault(p, (char *)req->oldptr +
1472			    origidx, i);
1473		} else
1474			error = copyout(p, (char *)req->oldptr + origidx, i);
1475		if (error != 0)
1476			return (error);
1477	}
1478	if (i < l)
1479		return (ENOMEM);
1480	return (0);
1481}
1482
1483static int
1484sysctl_new_user(struct sysctl_req *req, void *p, size_t l)
1485{
1486	int error;
1487
1488	if (!req->newptr)
1489		return (0);
1490	if (req->newlen - req->newidx < l)
1491		return (EINVAL);
1492	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
1493	    "sysctl_new_user()");
1494	error = copyin((char *)req->newptr + req->newidx, p, l);
1495	req->newidx += l;
1496	return (error);
1497}
1498
1499/*
1500 * Wire the user space destination buffer.  If set to a value greater than
1501 * zero, the len parameter limits the maximum amount of wired memory.
1502 */
1503int
1504sysctl_wire_old_buffer(struct sysctl_req *req, size_t len)
1505{
1506	int ret;
1507	size_t wiredlen;
1508
1509	wiredlen = (len > 0 && len < req->oldlen) ? len : req->oldlen;
1510	ret = 0;
1511	if (req->lock != REQ_WIRED && req->oldptr &&
1512	    req->oldfunc == sysctl_old_user) {
1513		if (wiredlen != 0) {
1514			ret = vslock(req->oldptr, wiredlen);
1515			if (ret != 0) {
1516				if (ret != ENOMEM)
1517					return (ret);
1518				wiredlen = 0;
1519			}
1520		}
1521		req->lock = REQ_WIRED;
1522		req->validlen = wiredlen;
1523	}
1524	return (0);
1525}
1526
1527int
1528sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
1529    int *nindx, struct sysctl_req *req)
1530{
1531	struct sysctl_oid_list *lsp;
1532	struct sysctl_oid *oid;
1533	int indx;
1534
1535	SYSCTL_ASSERT_LOCKED();
1536	lsp = &sysctl__children;
1537	indx = 0;
1538	while (indx < CTL_MAXNAME) {
1539		SLIST_FOREACH(oid, lsp, oid_link) {
1540			if (oid->oid_number == name[indx])
1541				break;
1542		}
1543		if (oid == NULL)
1544			return (ENOENT);
1545
1546		indx++;
1547		if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
1548			if (oid->oid_handler != NULL || indx == namelen) {
1549				*noid = oid;
1550				if (nindx != NULL)
1551					*nindx = indx;
1552				KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0,
1553				    ("%s found DYING node %p", __func__, oid));
1554				return (0);
1555			}
1556			lsp = SYSCTL_CHILDREN(oid);
1557		} else if (indx == namelen) {
1558			*noid = oid;
1559			if (nindx != NULL)
1560				*nindx = indx;
1561			KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0,
1562			    ("%s found DYING node %p", __func__, oid));
1563			return (0);
1564		} else {
1565			return (ENOTDIR);
1566		}
1567	}
1568	return (ENOENT);
1569}
1570
1571/*
1572 * Traverse our tree, and find the right node, execute whatever it points
1573 * to, and return the resulting error code.
1574 */
1575
1576static int
1577sysctl_root(SYSCTL_HANDLER_ARGS)
1578{
1579	struct sysctl_oid *oid;
1580	int error, indx, lvl;
1581
1582	SYSCTL_ASSERT_SLOCKED();
1583
1584	error = sysctl_find_oid(arg1, arg2, &oid, &indx, req);
1585	if (error)
1586		return (error);
1587
1588	if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
1589		/*
1590		 * You can't call a sysctl when it's a node, but has
1591		 * no handler.  Inform the user that it's a node.
1592		 * The indx may or may not be the same as namelen.
1593		 */
1594		if (oid->oid_handler == NULL)
1595			return (EISDIR);
1596	}
1597
1598	/* Is this sysctl writable? */
1599	if (req->newptr && !(oid->oid_kind & CTLFLAG_WR))
1600		return (EPERM);
1601
1602	KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL"));
1603
1604#ifdef CAPABILITY_MODE
1605	/*
1606	 * If the process is in capability mode, then don't permit reading or
1607	 * writing unless specifically granted for the node.
1608	 */
1609	if (IN_CAPABILITY_MODE(req->td)) {
1610		if (req->oldptr && !(oid->oid_kind & CTLFLAG_CAPRD))
1611			return (EPERM);
1612		if (req->newptr && !(oid->oid_kind & CTLFLAG_CAPWR))
1613			return (EPERM);
1614	}
1615#endif
1616
1617	/* Is this sysctl sensitive to securelevels? */
1618	if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) {
1619		lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE;
1620		error = securelevel_gt(req->td->td_ucred, lvl);
1621		if (error)
1622			return (error);
1623	}
1624
1625	/* Is this sysctl writable by only privileged users? */
1626	if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) {
1627		int priv;
1628
1629		if (oid->oid_kind & CTLFLAG_PRISON)
1630			priv = PRIV_SYSCTL_WRITEJAIL;
1631#ifdef VIMAGE
1632		else if ((oid->oid_kind & CTLFLAG_VNET) &&
1633		     prison_owns_vnet(req->td->td_ucred))
1634			priv = PRIV_SYSCTL_WRITEJAIL;
1635#endif
1636		else
1637			priv = PRIV_SYSCTL_WRITE;
1638		error = priv_check(req->td, priv);
1639		if (error)
1640			return (error);
1641	}
1642
1643	if (!oid->oid_handler)
1644		return (EINVAL);
1645
1646	if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
1647		arg1 = (int *)arg1 + indx;
1648		arg2 -= indx;
1649	} else {
1650		arg1 = oid->oid_arg1;
1651		arg2 = oid->oid_arg2;
1652	}
1653#ifdef MAC
1654	error = mac_system_check_sysctl(req->td->td_ucred, oid, arg1, arg2,
1655	    req);
1656	if (error != 0)
1657		return (error);
1658#endif
1659#ifdef VIMAGE
1660	if ((oid->oid_kind & CTLFLAG_VNET) && arg1 != NULL)
1661		arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1);
1662#endif
1663	error = sysctl_root_handler_locked(oid, arg1, arg2, req);
1664
1665	KFAIL_POINT_ERROR(_debug_fail_point, sysctl_running, error);
1666
1667	return (error);
1668}
1669
1670#ifndef _SYS_SYSPROTO_H_
1671struct sysctl_args {
1672	int	*name;
1673	u_int	namelen;
1674	void	*old;
1675	size_t	*oldlenp;
1676	void	*new;
1677	size_t	newlen;
1678};
1679#endif
1680int
1681sys___sysctl(struct thread *td, struct sysctl_args *uap)
1682{
1683	int error, i, name[CTL_MAXNAME];
1684	size_t j;
1685
1686	if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
1687		return (EINVAL);
1688
1689 	error = copyin(uap->name, &name, uap->namelen * sizeof(int));
1690 	if (error)
1691		return (error);
1692
1693	error = userland_sysctl(td, name, uap->namelen,
1694		uap->old, uap->oldlenp, 0,
1695		uap->new, uap->newlen, &j, 0);
1696	if (error && error != ENOMEM)
1697		return (error);
1698	if (uap->oldlenp) {
1699		i = copyout(&j, uap->oldlenp, sizeof(j));
1700		if (i)
1701			return (i);
1702	}
1703	return (error);
1704}
1705
1706/*
1707 * This is used from various compatibility syscalls too.  That's why name
1708 * must be in kernel space.
1709 */
1710int
1711userland_sysctl(struct thread *td, int *name, u_int namelen, void *old,
1712    size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval,
1713    int flags)
1714{
1715	int error = 0, memlocked;
1716	struct sysctl_req req;
1717
1718	bzero(&req, sizeof req);
1719
1720	req.td = td;
1721	req.flags = flags;
1722
1723	if (oldlenp) {
1724		if (inkernel) {
1725			req.oldlen = *oldlenp;
1726		} else {
1727			error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp));
1728			if (error)
1729				return (error);
1730		}
1731	}
1732	req.validlen = req.oldlen;
1733
1734	if (old) {
1735		if (!useracc(old, req.oldlen, VM_PROT_WRITE))
1736			return (EFAULT);
1737		req.oldptr= old;
1738	}
1739
1740	if (new != NULL) {
1741		if (!useracc(new, newlen, VM_PROT_READ))
1742			return (EFAULT);
1743		req.newlen = newlen;
1744		req.newptr = new;
1745	}
1746
1747	req.oldfunc = sysctl_old_user;
1748	req.newfunc = sysctl_new_user;
1749	req.lock = REQ_UNWIRED;
1750
1751#ifdef KTRACE
1752	if (KTRPOINT(curthread, KTR_SYSCTL))
1753		ktrsysctl(name, namelen);
1754#endif
1755
1756	if (req.oldlen > PAGE_SIZE) {
1757		memlocked = 1;
1758		sx_xlock(&sysctlmemlock);
1759	} else
1760		memlocked = 0;
1761	CURVNET_SET(TD_TO_VNET(td));
1762
1763	for (;;) {
1764		req.oldidx = 0;
1765		req.newidx = 0;
1766		SYSCTL_SLOCK();
1767		error = sysctl_root(0, name, namelen, &req);
1768		SYSCTL_SUNLOCK();
1769		if (error != EAGAIN)
1770			break;
1771		kern_yield(PRI_USER);
1772	}
1773
1774	CURVNET_RESTORE();
1775
1776	if (req.lock == REQ_WIRED && req.validlen > 0)
1777		vsunlock(req.oldptr, req.validlen);
1778	if (memlocked)
1779		sx_xunlock(&sysctlmemlock);
1780
1781	if (error && error != ENOMEM)
1782		return (error);
1783
1784	if (retval) {
1785		if (req.oldptr && req.oldidx > req.validlen)
1786			*retval = req.validlen;
1787		else
1788			*retval = req.oldidx;
1789	}
1790	return (error);
1791}
1792
/*
 * Drain into a sysctl struct.  The user buffer should be wired if a page
 * fault would cause issue.
 *
 * arg is the struct sysctl_req to write to.  Returns len when the data
 * was handed to SYSCTL_OUT() successfully, otherwise the negated error.
 */
static int
sbuf_sysctl_drain(void *arg, const char *data, int len)
{
	struct sysctl_req *req;
	int error;

	req = arg;
	error = SYSCTL_OUT(req, data, len);
	KASSERT(error >= 0, ("Got unexpected negative value %d", error));
	if (error != 0)
		return (-error);
	return (len);
}
1807
1808struct sbuf *
1809sbuf_new_for_sysctl(struct sbuf *s, char *buf, int length,
1810    struct sysctl_req *req)
1811{
1812
1813	/* Supply a default buffer size if none given. */
1814	if (buf == NULL && length == 0)
1815		length = 64;
1816	s = sbuf_new(s, buf, length, SBUF_FIXEDLEN | SBUF_INCLUDENUL);
1817	sbuf_set_drain(s, sbuf_sysctl_drain, req);
1818	return (s);
1819}
1820