1/*	$NetBSD$	*/
2
3/*-
4 * Copyright (c) 2003, 2007, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Brown.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32/*-
33 * Copyright (c) 1982, 1986, 1989, 1993
34 *	The Regents of the University of California.  All rights reserved.
35 *
36 * This code is derived from software contributed to Berkeley by
37 * Mike Karels at Berkeley Software Design, Inc.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 *    notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 *    notice, this list of conditions and the following disclaimer in the
46 *    documentation and/or other materials provided with the distribution.
47 * 3. Neither the name of the University nor the names of its contributors
48 *    may be used to endorse or promote products derived from this software
49 *    without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * SUCH DAMAGE.
62 *
63 *	@(#)kern_sysctl.c	8.9 (Berkeley) 5/20/95
64 */
65
66/*
67 * sysctl system call.
68 */
69
70#include <sys/cdefs.h>
71__KERNEL_RCSID(0, "$NetBSD$");
72
73#include "opt_defcorename.h"
74#include "ksyms.h"
75
76#include <sys/param.h>
77#define __COMPAT_SYSCTL
78#include <sys/sysctl.h>
79#include <sys/systm.h>
80#include <sys/buf.h>
81#include <sys/ksyms.h>
82#include <sys/malloc.h>
83#include <sys/mount.h>
84#include <sys/syscallargs.h>
85#include <sys/kauth.h>
86#include <sys/ktrace.h>
87#include <sys/cprng.h>
88
/*
 * NOTE(review): presumably the upper bound on a node description
 * string; it is not referenced in this part of the file -- confirm at
 * its use sites.
 */
#define	MAXDESCLEN	1024
/*
 * malloc(9) types used by this file.  M_SYSCTLDATA is what
 * sysctl_create() uses for node-owned data and for scratch buffers.
 */
MALLOC_DEFINE(M_SYSCTLNODE, "sysctlnode", "sysctl node structures");
MALLOC_DEFINE(M_SYSCTLDATA, "sysctldata", "misc sysctl data");

/* file-local helpers, defined further down */
static int sysctl_mmap(SYSCTLFN_PROTO);
static int sysctl_alloc(struct sysctlnode *, int);
static int sysctl_realloc(struct sysctlnode *);

/* conversion of node descriptions coming in from / going out to callers */
static int sysctl_cvt_in(struct lwp *, int *, const void *, size_t,
			 struct sysctlnode *);
static int sysctl_cvt_out(struct lwp *, int, const struct sysctlnode *,
			  void *, size_t, size_t *);

/* maintenance of struct sysctllog records */
static int sysctl_log_add(struct sysctllog **, const struct sysctlnode *);
static int sysctl_log_realloc(struct sysctllog *);

/* signature of the boot-time setup functions collected in the link set */
typedef void (*sysctl_setup_func)(struct sysctllog **);
106
/*
 * Log record passed (by reference) to the setup functions and kept up
 * to date by sysctl_log_add() / sysctl_log_realloc().
 *
 * NOTE(review): the per-field notes below are inferred from the names
 * only; confirm against sysctl_log_add().
 */
struct sysctllog {
	const struct sysctlnode *log_root;	/* presumably the tree logged */
	int *log_num;				/* presumably the logged mib numbers */
	int log_size;				/* presumably capacity of log_num */
	int log_left;				/* presumably unused entries left */
};
112
113/*
114 * the "root" of the new sysctl tree
115 */
116struct sysctlnode sysctl_root = {
117	.sysctl_flags = SYSCTL_VERSION|
118	    CTLFLAG_ROOT|CTLFLAG_READWRITE|
119	    CTLTYPE_NODE,
120	.sysctl_num = 0,
121	/*
122	 * XXX once all ports are on gcc3, we can get rid of this
123	 * ugliness and simply make it into
124	 *
125	 *	.sysctl_size = sizeof(struct sysctlnode),
126	 */
127	sysc_init_field(_sysctl_size, sizeof(struct sysctlnode)),
128	.sysctl_name = "(root)",
129};
130
/*
 * link set of functions that add nodes at boot time (see also
 * sysctl_buildtree()).  sysctl_init() walks this set and calls every
 * entry with a NULL log pointer.
 */
__link_set_decl(sysctl_funcs, sysctl_setup_func);

/*
 * The `sysctl_treelock' is intended to serialize access to the sysctl
 * tree.  XXX This has serious problems; allocating memory and
 * copying data out with the lock held is insane.
 *
 * It is taken by sysctl_lock()/sysctl_relock() and dropped by
 * sysctl_unlock().
 */
krwlock_t sysctl_treelock;

/*
 * Initialized in sysctl_init().
 * NOTE(review): its users are not in this part of the file; confirm
 * what it protects.
 */
kmutex_t sysctl_file_marker_lock;
145
/*
 * Attributes stored in the kernel.
 */
/* host name buffer and its companion length variable */
char hostname[MAXHOSTNAMELEN];
int hostnamelen;

/* domain name buffer and its companion length variable */
char domainname[MAXHOSTNAMELEN];
int domainnamelen;

long hostid;

/*
 * Default core file name; DEFCORENAME may be supplied by the kernel
 * configuration (opt_defcorename.h) and falls back to "%n.core".
 */
#ifndef DEFCORENAME
#define	DEFCORENAME	"%n.core"
#endif
char defcorename[MAXPATHLEN] = DEFCORENAME;

/* strong PRNG handle, created in sysctl_finalize() */
cprng_strong_t *sysctl_prng;
163
164/*
165 * ********************************************************************
166 * Section 0: Some simple glue
167 * ********************************************************************
168 * By wrapping copyin(), copyout(), and copyinstr() like this, we can
169 * stop caring about who's calling us and simplify some code a bunch.
170 * ********************************************************************
171 */
172int
173sysctl_copyin(struct lwp *l, const void *uaddr, void *kaddr, size_t len)
174{
175	int error;
176
177	if (l != NULL) {
178		error = copyin(uaddr, kaddr, len);
179		ktrmibio(-1, UIO_WRITE, uaddr, len, error);
180	} else {
181		error = kcopy(uaddr, kaddr, len);
182	}
183
184	return error;
185}
186
187int
188sysctl_copyout(struct lwp *l, const void *kaddr, void *uaddr, size_t len)
189{
190	int error;
191
192	if (l != NULL) {
193		error = copyout(kaddr, uaddr, len);
194		ktrmibio(-1, UIO_READ, uaddr, len, error);
195	} else {
196		error = kcopy(kaddr, uaddr, len);
197	}
198
199	return error;
200}
201
202int
203sysctl_copyinstr(struct lwp *l, const void *uaddr, void *kaddr,
204		 size_t len, size_t *done)
205{
206	int error;
207
208	if (l != NULL) {
209		error = copyinstr(uaddr, kaddr, len, done);
210		ktrmibio(-1, UIO_WRITE, uaddr, len, error);
211	} else {
212		error = copystr(uaddr, kaddr, len, done);
213	}
214
215	return error;
216}
217
218/*
219 * ********************************************************************
220 * Initialize sysctl subsystem.
221 * ********************************************************************
222 */
223void
224sysctl_init(void)
225{
226	sysctl_setup_func * const *sysctl_setup, f;
227
228	rw_init(&sysctl_treelock);
229
230	/*
231	 * dynamic mib numbers start here
232	 */
233	sysctl_root.sysctl_num = CREATE_BASE;
234
235        __link_set_foreach(sysctl_setup, sysctl_funcs) {
236		/*
237		 * XXX - why do i have to coerce the pointers like this?
238		 */
239		f = (void*)*sysctl_setup;
240		(*f)(NULL);
241	}
242
243	mutex_init(&sysctl_file_marker_lock, MUTEX_DEFAULT, IPL_NONE);
244}
245
246/*
247 * Setting this means no more permanent nodes can be added,
248 * trees that claim to be readonly at the root now are, and if
249 * the main tree is readonly, *everything* is.
250 *
251 * Also starts up the PRNG used for the "random" sysctl: it's
252 * better to start it later than sooner.
253 *
254 * Call this at the end of kernel init.
255 */
256void
257sysctl_finalize(void)
258{
259        sysctl_prng = cprng_strong_create("sysctl", IPL_NONE,
260					  CPRNG_INIT_ANY|CPRNG_REKEY_ANY);
261	sysctl_root.sysctl_flags |= CTLFLAG_PERMANENT;
262}
263
264/*
265 * ********************************************************************
266 * The main native sysctl system call itself.
267 * ********************************************************************
268 */
269int
270sys___sysctl(struct lwp *l, const struct sys___sysctl_args *uap, register_t *retval)
271{
272	/* {
273		syscallarg(const int *) name;
274		syscallarg(u_int) namelen;
275		syscallarg(void *) old;
276		syscallarg(size_t *) oldlenp;
277		syscallarg(const void *) new;
278		syscallarg(size_t) newlen;
279	} */
280	int error, nerror, name[CTL_MAXNAME];
281	size_t oldlen, savelen, *oldlenp;
282
283	/*
284	 * get oldlen
285	 */
286	oldlen = 0;
287	oldlenp = SCARG(uap, oldlenp);
288	if (oldlenp != NULL) {
289		error = copyin(oldlenp, &oldlen, sizeof(oldlen));
290		if (error)
291			return (error);
292	}
293	savelen = oldlen;
294
295	/*
296	 * top-level sysctl names may or may not be non-terminal, but
297	 * we don't care
298	 */
299	if (SCARG(uap, namelen) > CTL_MAXNAME || SCARG(uap, namelen) < 1)
300		return (EINVAL);
301	error = copyin(SCARG(uap, name), &name,
302		       SCARG(uap, namelen) * sizeof(int));
303	if (error)
304		return (error);
305
306	ktrmib(name, SCARG(uap, namelen));
307
308	sysctl_lock(SCARG(uap, new) != NULL);
309
310	/*
311	 * do sysctl work (NULL means main built-in default tree)
312	 */
313	error = sysctl_dispatch(&name[0], SCARG(uap, namelen),
314				SCARG(uap, old), &oldlen,
315				SCARG(uap, new), SCARG(uap, newlen),
316				&name[0], l, NULL);
317
318	/*
319	 * release the sysctl lock
320	 */
321	sysctl_unlock();
322
323	/*
324	 * set caller's oldlen to new value even in the face of an
325	 * error (if this gets an error and they didn't have one, they
326	 * get this one)
327	 */
328	if (oldlenp) {
329		nerror = copyout(&oldlen, oldlenp, sizeof(oldlen));
330		if (error == 0)
331			error = nerror;
332	}
333
334	/*
335	 * if the only problem is that we weren't given enough space,
336	 * that's an ENOMEM error
337	 */
338	if (error == 0 && SCARG(uap, old) != NULL && savelen < oldlen)
339		error = ENOMEM;
340
341	return (error);
342}
343
344/*
345 * ********************************************************************
346 * Section 1: How the tree is used
347 * ********************************************************************
348 * Implementations of sysctl for emulations should typically need only
349 * these three functions in this order: lock the tree, dispatch
350 * request into it, unlock the tree.
351 * ********************************************************************
352 */
353void
354sysctl_lock(bool write)
355{
356
357	if (write) {
358		rw_enter(&sysctl_treelock, RW_WRITER);
359		curlwp->l_pflag |= LP_SYSCTLWRITE;
360	} else {
361		rw_enter(&sysctl_treelock, RW_READER);
362		curlwp->l_pflag &= ~LP_SYSCTLWRITE;
363	}
364}
365
366void
367sysctl_relock(void)
368{
369
370	if ((curlwp->l_pflag & LP_SYSCTLWRITE) != 0) {
371		rw_enter(&sysctl_treelock, RW_WRITER);
372	} else {
373		rw_enter(&sysctl_treelock, RW_READER);
374	}
375}
376
377/*
378 * ********************************************************************
379 * the main sysctl dispatch routine.  scans the given tree and picks a
380 * function to call based on what it finds.
381 * ********************************************************************
382 */
383int
384sysctl_dispatch(SYSCTLFN_ARGS)
385{
386	int error;
387	sysctlfn fn;
388	int ni;
389
390	KASSERT(rw_lock_held(&sysctl_treelock));
391
392	if (rnode && SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
393		printf("sysctl_dispatch: rnode %p wrong version\n", rnode);
394		error = EINVAL;
395		goto out;
396	}
397
398	fn = NULL;
399	error = sysctl_locate(l, name, namelen, &rnode, &ni);
400
401	if (rnode->sysctl_func != NULL) {
402		/*
403		 * the node we ended up at has a function, so call it.  it can
404		 * hand off to query or create if it wants to.
405		 */
406		fn = rnode->sysctl_func;
407	} else if (error == 0) {
408		/*
409		 * we found the node they were looking for, so do a lookup.
410		 */
411		fn = (sysctlfn)sysctl_lookup; /* XXX may write to rnode */
412	} else if (error == ENOENT && (ni + 1) == namelen && name[ni] < 0) {
413		/*
414		 * prospective parent node found, but the terminal node was
415		 * not.  generic operations associate with the parent.
416		 */
417		switch (name[ni]) {
418		case CTL_QUERY:
419			fn = sysctl_query;
420			break;
421		case CTL_CREATE:
422#if NKSYMS > 0
423		case CTL_CREATESYM:
424#endif /* NKSYMS > 0 */
425			if (newp == NULL) {
426				error = EINVAL;
427				break;
428			}
429			KASSERT(rw_write_held(&sysctl_treelock));
430			fn = (sysctlfn)sysctl_create; /* we own the rnode */
431			break;
432		case CTL_DESTROY:
433			if (newp == NULL) {
434				error = EINVAL;
435				break;
436			}
437			KASSERT(rw_write_held(&sysctl_treelock));
438			fn = (sysctlfn)sysctl_destroy; /* we own the rnode */
439			break;
440		case CTL_MMAP:
441			fn = (sysctlfn)sysctl_mmap; /* we own the rnode */
442			break;
443		case CTL_DESCRIBE:
444			fn = sysctl_describe;
445			break;
446		default:
447			error = EOPNOTSUPP;
448			break;
449		}
450	}
451
452	/*
453	 * after all of that, maybe we found someone who knows how to
454	 * get us what we want?
455	 */
456	if (fn != NULL)
457		error = (*fn)(name + ni, namelen - ni, oldp, oldlenp,
458			      newp, newlen, name, l, rnode);
459	else if (error == 0)
460		error = EOPNOTSUPP;
461
462out:
463	return (error);
464}
465
466/*
467 * ********************************************************************
468 * Releases the tree lock.
469 * ********************************************************************
470 */
471void
472sysctl_unlock(void)
473{
474
475	rw_exit(&sysctl_treelock);
476}
477
478/*
479 * ********************************************************************
480 * Section 2: The main tree interfaces
481 * ********************************************************************
482 * This is how sysctl_dispatch() does its work, and you can too, by
483 * calling these routines from helpers (though typically only
484 * sysctl_lookup() will be used).  The tree MUST BE LOCKED when these
485 * are called.
486 * ********************************************************************
487 */
488
/*
 * sysctl_locate -- Walks the mib `name' (namelen entries) downward
 * from the tree given via *rnode, or from the native tree
 * (sysctl_root) when *rnode is NULL.
 *
 * On return *rnode is the deepest node that was reached and, if nip
 * is not NULL, *nip is the number of mib entries that were consumed.
 *
 * Returns 0 when the whole name was matched, ENOENT when a component
 * has no matching child, ENOTDIR when the name continues below
 * something that is not a node, EINVAL when the tree has the wrong
 * version, or the error from kauth_authorize_system() when l may not
 * traverse a CTLFLAG_PRIVATE node.  In the last two cases *rnode is
 * left as it was on entry (after the NULL -> root substitution) and
 * *nip is 0.
 *
 * The tree must be locked by the caller.
 */
int
sysctl_locate(struct lwp *l, const int *name, u_int namelen,
	      const struct sysctlnode **rnode, int *nip)
{
	const struct sysctlnode *node, *pnode;
	int tn, si, ni, error, alias;

	KASSERT(rw_lock_held(&sysctl_treelock));

	/*
	 * basic checks and setup: default to the native tree, and an
	 * empty name trivially matches the root it was given
	 */
	if (*rnode == NULL)
		*rnode = &sysctl_root;
	if (nip)
		*nip = 0;
	if (namelen == 0)
		return (0);

	/*
	 * search starts from "root"
	 */
	pnode = *rnode;
	if (SYSCTL_VERS(pnode->sysctl_flags) != SYSCTL_VERSION) {
		printf("sysctl_locate: pnode %p wrong version\n", pnode);
		return (EINVAL);
	}
	node = pnode->sysctl_child;
	error = 0;

	/*
	 * descend one level per mib entry.  throughout the loop pnode
	 * is the last node matched and node is its array of children
	 * (NULL if it has none or is not a CTLTYPE_NODE).
	 */
	for (ni = 0; ni < namelen; ni++) {
		/*
		 * walked off bottom of tree
		 */
		if (node == NULL) {
			if (SYSCTL_TYPE(pnode->sysctl_flags) == CTLTYPE_NODE)
				error = ENOENT;
			else
				error = ENOTDIR;
			break;
		}
		/*
		 * can anyone traverse this node or only root?  (kernel
		 * callers, l == NULL, are never checked)
		 */
		if (l != NULL && (pnode->sysctl_flags & CTLFLAG_PRIVATE) &&
		    (error = kauth_authorize_system(l->l_cred,
		    KAUTH_SYSTEM_SYSCTL, KAUTH_REQ_SYSTEM_SYSCTL_PRVT,
		    NULL, NULL, NULL)) != 0)
			return (error);
		/*
		 * find a child node with the right number
		 */
		tn = name[ni];
		alias = 0;

		si = 0;
		/*
		 * Note: ANYNUMBER only matches positive integers.
		 * Since ANYNUMBER is only permitted on single-node
		 * sub-trees (eg proc), check before the loop and skip
		 * it if we can.
		 */
		if ((node[si].sysctl_flags & CTLFLAG_ANYNUMBER) && (tn >= 0))
			goto foundit;
		for (; si < pnode->sysctl_clen; si++) {
			if (node[si].sysctl_num == tn) {
				if (node[si].sysctl_flags & CTLFLAG_ALIAS) {
					/*
					 * an alias: give up once the
					 * chain gets too long, else
					 * rescan the siblings from the
					 * start (si = -1 becomes 0 via
					 * the loop increment) for the
					 * number it points at
					 */
					if (alias++ == 4)
						break;
					else {
						tn = node[si].sysctl_alias;
						si = -1;
					}
				} else
					goto foundit;
			}
		}
		/*
		 * if we ran off the end, it obviously doesn't exist
		 */
		error = ENOENT;
		break;

		/*
		 * so far so good, move on down the line
		 */
	  foundit:
		pnode = &node[si];
		if (SYSCTL_TYPE(pnode->sysctl_flags) == CTLTYPE_NODE)
			node = node[si].sysctl_child;
		else
			node = NULL;
	}

	/*
	 * report how far we got, whether or not the search succeeded
	 */
	*rnode = pnode;
	if (nip)
		*nip = ni;

	return (error);
}
599
600/*
601 * sysctl_query -- The auto-discovery engine.  Copies out the structs
602 * describing nodes under the given node and handles overlay trees.
603 */
604int
605sysctl_query(SYSCTLFN_ARGS)
606{
607	int error, ni, elim, v;
608	size_t out, left, t;
609	const struct sysctlnode *enode, *onode;
610	struct sysctlnode qnode;
611
612	KASSERT(rw_lock_held(&sysctl_treelock));
613
614	if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
615		printf("sysctl_query: rnode %p wrong version\n", rnode);
616		return (EINVAL);
617	}
618
619	if (SYSCTL_TYPE(rnode->sysctl_flags) != CTLTYPE_NODE)
620		return (ENOTDIR);
621	if (namelen != 1 || name[0] != CTL_QUERY)
622		return (EINVAL);
623
624	error = 0;
625	out = 0;
626	left = *oldlenp;
627	elim = 0;
628	enode = NULL;
629
630	/*
631	 * translate the given request to a current node
632	 */
633	error = sysctl_cvt_in(l, &v, newp, newlen, &qnode);
634	if (error)
635		return (error);
636
637	/*
638	 * if the request specifies a version, check it
639	 */
640	if (qnode.sysctl_ver != 0) {
641		enode = rnode;
642		if (qnode.sysctl_ver != enode->sysctl_ver &&
643		    qnode.sysctl_ver != sysctl_rootof(enode)->sysctl_ver)
644			return (EINVAL);
645	}
646
647	/*
648	 * process has overlay tree
649	 */
650	if (l && l->l_proc->p_emul->e_sysctlovly) {
651		enode = l->l_proc->p_emul->e_sysctlovly;
652		elim = (name - oname);
653		error = sysctl_locate(l, oname, elim, &enode, NULL);
654		if (error == 0) {
655			/* ah, found parent in overlay */
656			elim = enode->sysctl_clen;
657			enode = enode->sysctl_child;
658		} else {
659			error = 0;
660			elim = 0;
661			enode = NULL;
662		}
663	}
664
665	for (ni = 0; ni < rnode->sysctl_clen; ni++) {
666		onode = &rnode->sysctl_child[ni];
667		if (enode && enode->sysctl_num == onode->sysctl_num) {
668			if (SYSCTL_TYPE(enode->sysctl_flags) != CTLTYPE_NODE)
669				onode = enode;
670			if (--elim > 0)
671				enode++;
672			else
673				enode = NULL;
674		}
675		error = sysctl_cvt_out(l, v, onode, oldp, left, &t);
676		if (error)
677			return (error);
678		if (oldp != NULL)
679			oldp = (char*)oldp + t;
680		out += t;
681		left -= MIN(left, t);
682	}
683
684	/*
685	 * overlay trees *MUST* be entirely consumed
686	 */
687	KASSERT(enode == NULL);
688
689	*oldlenp = out;
690
691	return (error);
692}
693
694/*
695 * sysctl_create -- Adds a node (the description of which is taken
696 * from newp) to the tree, returning a copy of it in the space pointed
697 * to by oldp.  In the event that the requested slot is already taken
698 * (either by name or by number), the offending node is returned
699 * instead.  Yes, this is complex, but we want to make sure everything
700 * is proper.
701 */
702#ifdef SYSCTL_DEBUG_CREATE
703int _sysctl_create(SYSCTLFN_ARGS);
704int
705_sysctl_create(SYSCTLFN_ARGS)
706#else
707int
708sysctl_create(SYSCTLFN_ARGS)
709#endif
710{
711	struct sysctlnode nnode, *node, *pnode;
712	int error, ni, at, nm, type, nsz, sz, flags, anum, v;
713	void *own;
714
715	KASSERT(rw_write_held(&sysctl_treelock));
716
717	error = 0;
718	own = NULL;
719	anum = -1;
720
721	if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
722		printf("sysctl_create: rnode %p wrong version\n", rnode);
723		return (EINVAL);
724	}
725
726	if (namelen != 1 || (name[namelen - 1] != CTL_CREATE
727#if NKSYMS > 0
728			     && name[namelen - 1] != CTL_CREATESYM
729#endif /* NKSYMS > 0 */
730			     ))
731		return (EINVAL);
732
733	/*
734	 * processes can only add nodes at securelevel 0, must be
735	 * root, and can't add nodes to a parent that's not writeable
736	 */
737	if (l != NULL) {
738#ifndef SYSCTL_DISALLOW_CREATE
739		error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL,
740		    KAUTH_REQ_SYSTEM_SYSCTL_ADD, NULL, NULL, NULL);
741		if (error)
742			return (error);
743		if (!(rnode->sysctl_flags & CTLFLAG_READWRITE))
744#endif /* SYSCTL_DISALLOW_CREATE */
745			return (EPERM);
746	}
747
748	/*
749	 * nothing can add a node if:
750	 * we've finished initial set up of this tree and
751	 * (the tree itself is not writeable or
752	 * the entire sysctl system is not writeable)
753	 */
754	if ((sysctl_rootof(rnode)->sysctl_flags & CTLFLAG_PERMANENT) &&
755	    (!(sysctl_rootof(rnode)->sysctl_flags & CTLFLAG_READWRITE) ||
756	     !(sysctl_root.sysctl_flags & CTLFLAG_READWRITE)))
757		return (EPERM);
758
759	/*
760	 * it must be a "node", not a "int" or something
761	 */
762	if (SYSCTL_TYPE(rnode->sysctl_flags) != CTLTYPE_NODE)
763		return (ENOTDIR);
764	if (rnode->sysctl_flags & CTLFLAG_ALIAS) {
765		printf("sysctl_create: attempt to add node to aliased "
766		       "node %p\n", rnode);
767		return (EINVAL);
768	}
769	pnode = __UNCONST(rnode); /* we are adding children to this node */
770
771	if (newp == NULL)
772		return (EINVAL);
773	error = sysctl_cvt_in(l, &v, newp, newlen, &nnode);
774	if (error)
775		return (error);
776
777	/*
778	 * nodes passed in don't *have* parents
779	 */
780	if (nnode.sysctl_parent != NULL)
781		return (EINVAL);
782
783	/*
784	 * if we are indeed adding it, it should be a "good" name and
785	 * number
786	 */
787	nm = nnode.sysctl_num;
788#if NKSYMS > 0
789	if (nm == CTL_CREATESYM)
790		nm = CTL_CREATE;
791#endif /* NKSYMS > 0 */
792	if (nm < 0 && nm != CTL_CREATE)
793		return (EINVAL);
794
795	/*
796	 * the name can't start with a digit
797	 */
798	if (nnode.sysctl_name[0] >= '0' &&
799	    nnode.sysctl_name[0] <= '9')
800		return (EINVAL);
801
802	/*
803	 * the name must be only alphanumerics or - or _, longer than
804	 * 0 bytes and less that SYSCTL_NAMELEN
805	 */
806	nsz = 0;
807	while (nsz < SYSCTL_NAMELEN && nnode.sysctl_name[nsz] != '\0') {
808		if ((nnode.sysctl_name[nsz] >= '0' &&
809		     nnode.sysctl_name[nsz] <= '9') ||
810		    (nnode.sysctl_name[nsz] >= 'A' &&
811		     nnode.sysctl_name[nsz] <= 'Z') ||
812		    (nnode.sysctl_name[nsz] >= 'a' &&
813		     nnode.sysctl_name[nsz] <= 'z') ||
814		    nnode.sysctl_name[nsz] == '-' ||
815		    nnode.sysctl_name[nsz] == '_')
816			nsz++;
817		else
818			return (EINVAL);
819	}
820	if (nsz == 0 || nsz == SYSCTL_NAMELEN)
821		return (EINVAL);
822
823	/*
824	 * various checks revolve around size vs type, etc
825	 */
826	type = SYSCTL_TYPE(nnode.sysctl_flags);
827	flags = SYSCTL_FLAGS(nnode.sysctl_flags);
828	sz = nnode.sysctl_size;
829
830	/*
831	 * find out if there's a collision, and if so, let the caller
832	 * know what they collided with
833	 */
834	node = pnode->sysctl_child;
835	at = 0;
836	if (node) {
837		if ((flags | node->sysctl_flags) & CTLFLAG_ANYNUMBER)
838			/* No siblings for a CTLFLAG_ANYNUMBER node */
839			return EINVAL;
840		for (ni = 0; ni < pnode->sysctl_clen; ni++) {
841			if (nm == node[ni].sysctl_num ||
842			    strcmp(nnode.sysctl_name, node[ni].sysctl_name) == 0) {
843				/*
844				 * ignore error here, since we
845				 * are already fixed on EEXIST
846				 */
847				(void)sysctl_cvt_out(l, v, &node[ni], oldp,
848						     *oldlenp, oldlenp);
849				return (EEXIST);
850			}
851			if (nm > node[ni].sysctl_num)
852				at++;
853		}
854	}
855
856	/*
857	 * use sysctl_ver to add to the tree iff it hasn't changed
858	 */
859	if (nnode.sysctl_ver != 0) {
860		/*
861		 * a specified value must match either the parent
862		 * node's version or the root node's version
863		 */
864		if (nnode.sysctl_ver != sysctl_rootof(rnode)->sysctl_ver &&
865		    nnode.sysctl_ver != rnode->sysctl_ver) {
866			return (EINVAL);
867		}
868	}
869
870	/*
871	 * only the kernel can assign functions to entries
872	 */
873	if (l != NULL && nnode.sysctl_func != NULL)
874		return (EPERM);
875
876	/*
877	 * only the kernel can create permanent entries, and only then
878	 * before the kernel is finished setting itself up
879	 */
880	if (l != NULL && (flags & ~SYSCTL_USERFLAGS))
881		return (EPERM);
882	if ((flags & CTLFLAG_PERMANENT) &
883	    (sysctl_root.sysctl_flags & CTLFLAG_PERMANENT))
884		return (EPERM);
885	if ((flags & (CTLFLAG_OWNDATA | CTLFLAG_IMMEDIATE)) ==
886	    (CTLFLAG_OWNDATA | CTLFLAG_IMMEDIATE))
887		return (EINVAL);
888	if ((flags & CTLFLAG_IMMEDIATE) &&
889	    type != CTLTYPE_INT && type != CTLTYPE_QUAD && type != CTLTYPE_BOOL)
890		return (EINVAL);
891
892	/*
893	 * check size, or set it if unset and we can figure it out.
894	 * kernel created nodes are allowed to have a function instead
895	 * of a size (or a data pointer).
896	 */
897	switch (type) {
898	case CTLTYPE_NODE:
899		/*
900		 * only *i* can assert the size of a node
901		 */
902		if (flags & CTLFLAG_ALIAS) {
903			anum = nnode.sysctl_alias;
904			if (anum < 0)
905				return (EINVAL);
906			nnode.sysctl_alias = 0;
907		}
908		if (sz != 0 || nnode.sysctl_data != NULL)
909			return (EINVAL);
910		if (nnode.sysctl_csize != 0 ||
911		    nnode.sysctl_clen != 0 ||
912		    nnode.sysctl_child != 0)
913			return (EINVAL);
914		if (flags & CTLFLAG_OWNDATA)
915			return (EINVAL);
916		sz = sizeof(struct sysctlnode);
917		break;
918	case CTLTYPE_INT:
919		/*
920		 * since an int is an int, if the size is not given or
921		 * is wrong, we can "int-uit" it.
922		 */
923		if (sz != 0 && sz != sizeof(int))
924			return (EINVAL);
925		sz = sizeof(int);
926		break;
927	case CTLTYPE_STRING:
928		/*
929		 * strings are a little more tricky
930		 */
931		if (sz == 0) {
932			if (l == NULL) {
933				if (nnode.sysctl_func == NULL) {
934					if (nnode.sysctl_data == NULL)
935						return (EINVAL);
936					else
937						sz = strlen(nnode.sysctl_data) +
938						    1;
939				}
940			} else if (nnode.sysctl_data == NULL &&
941				 flags & CTLFLAG_OWNDATA) {
942				return (EINVAL);
943			} else {
944				char *vp, *e;
945				size_t s;
946
947				/*
948				 * we want a rough idea of what the
949				 * size is now
950				 */
951				vp = malloc(PAGE_SIZE, M_SYSCTLDATA,
952					     M_WAITOK|M_CANFAIL);
953				if (vp == NULL)
954					return (ENOMEM);
955				e = nnode.sysctl_data;
956				do {
957					error = copyinstr(e, vp, PAGE_SIZE, &s);
958					if (error) {
959						if (error != ENAMETOOLONG) {
960							free(vp, M_SYSCTLDATA);
961							return (error);
962						}
963						e += PAGE_SIZE;
964						if ((e - 32 * PAGE_SIZE) >
965						    (char*)nnode.sysctl_data) {
966							free(vp, M_SYSCTLDATA);
967							return (ERANGE);
968						}
969					}
970				} while (error != 0);
971				sz = s + (e - (char*)nnode.sysctl_data);
972				free(vp, M_SYSCTLDATA);
973			}
974		}
975		break;
976	case CTLTYPE_QUAD:
977		if (sz != 0 && sz != sizeof(u_quad_t))
978			return (EINVAL);
979		sz = sizeof(u_quad_t);
980		break;
981	case CTLTYPE_BOOL:
982		/*
983		 * since an bool is an bool, if the size is not given or
984		 * is wrong, we can "intuit" it.
985		 */
986		if (sz != 0 && sz != sizeof(bool))
987			return (EINVAL);
988		sz = sizeof(bool);
989		break;
990	case CTLTYPE_STRUCT:
991		if (sz == 0) {
992			if (l != NULL || nnode.sysctl_func == NULL)
993				return (EINVAL);
994			if (flags & CTLFLAG_OWNDATA)
995				return (EINVAL);
996		}
997		break;
998	default:
999		return (EINVAL);
1000	}
1001
1002	/*
1003	 * at this point, if sz is zero, we *must* have a
1004	 * function to go with it and we can't own it.
1005	 */
1006
1007	/*
1008	 *  l  ptr own
1009	 *  0   0   0  -> EINVAL (if no func)
1010	 *  0   0   1  -> own
1011	 *  0   1   0  -> kptr
1012	 *  0   1   1  -> kptr
1013	 *  1   0   0  -> EINVAL
1014	 *  1   0   1  -> own
1015	 *  1   1   0  -> kptr, no own (fault on lookup)
1016	 *  1   1   1  -> uptr, own
1017	 */
1018	if (type != CTLTYPE_NODE) {
1019		if (sz != 0) {
1020			if (flags & CTLFLAG_OWNDATA) {
1021				own = malloc(sz, M_SYSCTLDATA,
1022					     M_WAITOK|M_CANFAIL);
1023				if (own == NULL)
1024					return ENOMEM;
1025				if (nnode.sysctl_data == NULL)
1026					memset(own, 0, sz);
1027				else {
1028					error = sysctl_copyin(l,
1029					    nnode.sysctl_data, own, sz);
1030					if (error != 0) {
1031						free(own, M_SYSCTLDATA);
1032						return (error);
1033					}
1034				}
1035			} else if ((nnode.sysctl_data != NULL) &&
1036				 !(flags & CTLFLAG_IMMEDIATE)) {
1037#if NKSYMS > 0
1038				if (name[namelen - 1] == CTL_CREATESYM) {
1039					char symname[128]; /* XXX enough? */
1040					u_long symaddr;
1041					size_t symlen;
1042
1043					error = sysctl_copyinstr(l,
1044					    nnode.sysctl_data, symname,
1045					    sizeof(symname), &symlen);
1046					if (error)
1047						return (error);
1048					error = ksyms_getval(NULL, symname,
1049					    &symaddr, KSYMS_EXTERN);
1050					if (error)
1051						return (error); /* EINVAL? */
1052					nnode.sysctl_data = (void*)symaddr;
1053				}
1054#endif /* NKSYMS > 0 */
1055				/*
1056				 * Ideally, we'd like to verify here
1057				 * that this address is acceptable,
1058				 * but...
1059				 *
1060				 * - it might be valid now, only to
1061				 *   become invalid later
1062				 *
1063				 * - it might be invalid only for the
1064				 *   moment and valid later
1065				 *
1066				 * - or something else.
1067				 *
1068				 * Since we can't get a good answer,
1069				 * we'll just accept the address as
1070				 * given, and fault on individual
1071				 * lookups.
1072				 */
1073			}
1074		} else if (nnode.sysctl_func == NULL)
1075			return (EINVAL);
1076	}
1077
1078	/*
1079	 * a process can't assign a function to a node, and the kernel
1080	 * can't create a node that has no function or data.
1081	 * (XXX somewhat redundant check)
1082	 */
1083	if (l != NULL || nnode.sysctl_func == NULL) {
1084		if (type != CTLTYPE_NODE &&
1085		    nnode.sysctl_data == NULL &&
1086		    !(flags & CTLFLAG_IMMEDIATE) &&
1087		    own == NULL)
1088			return (EINVAL);
1089	}
1090
1091#ifdef SYSCTL_DISALLOW_KWRITE
1092	/*
1093	 * a process can't create a writable node unless it refers to
1094	 * new data.
1095	 */
1096	if (l != NULL && own == NULL && type != CTLTYPE_NODE &&
1097	    (flags & CTLFLAG_READWRITE) != CTLFLAG_READONLY &&
1098	    !(flags & CTLFLAG_IMMEDIATE))
1099		return (EPERM);
1100#endif /* SYSCTL_DISALLOW_KWRITE */
1101
1102	/*
1103	 * make sure there's somewhere to put the new stuff.
1104	 */
1105	if (pnode->sysctl_child == NULL) {
1106		if (flags & CTLFLAG_ANYNUMBER)
1107			error = sysctl_alloc(pnode, 1);
1108		else
1109			error = sysctl_alloc(pnode, 0);
1110		if (error) {
1111			if (own != NULL)
1112				free(own, M_SYSCTLDATA);
1113			return (error);
1114		}
1115	}
1116	node = pnode->sysctl_child;
1117
1118	/*
1119	 * no collisions, so pick a good dynamic number if we need to.
1120	 */
1121	if (nm == CTL_CREATE) {
1122		nm = ++sysctl_root.sysctl_num;
1123		for (ni = 0; ni < pnode->sysctl_clen; ni++) {
1124			if (nm == node[ni].sysctl_num) {
1125				nm++;
1126				ni = -1;
1127			} else if (nm > node[ni].sysctl_num)
1128				at = ni + 1;
1129		}
1130	}
1131
1132	/*
1133	 * oops...ran out of space
1134	 */
1135	if (pnode->sysctl_clen == pnode->sysctl_csize) {
1136		error = sysctl_realloc(pnode);
1137		if (error) {
1138			if (own != NULL)
1139				free(own, M_SYSCTLDATA);
1140			return (error);
1141		}
1142		node = pnode->sysctl_child;
1143	}
1144
1145	/*
1146	 * insert new node data
1147	 */
1148	if (at < pnode->sysctl_clen) {
1149		int t;
1150
1151		/*
1152		 * move the nodes that should come after the new one
1153		 */
1154		memmove(&node[at + 1], &node[at],
1155			(pnode->sysctl_clen - at) * sizeof(struct sysctlnode));
1156		memset(&node[at], 0, sizeof(struct sysctlnode));
1157		node[at].sysctl_parent = pnode;
1158		/*
1159		 * and...reparent any children of any moved nodes
1160		 */
1161		for (ni = at; ni <= pnode->sysctl_clen; ni++)
1162			if (node[ni].sysctl_child != NULL)
1163				for (t = 0; t < node[ni].sysctl_csize; t++)
1164					node[ni].sysctl_child[t].sysctl_parent =
1165						&node[ni];
1166	}
1167	node = &node[at];
1168	pnode->sysctl_clen++;
1169
1170	strlcpy(node->sysctl_name, nnode.sysctl_name,
1171		sizeof(node->sysctl_name));
1172	node->sysctl_num = nm;
1173	node->sysctl_size = sz;
1174	node->sysctl_flags = SYSCTL_VERSION|type|flags; /* XXX other trees */
1175	node->sysctl_csize = 0;
1176	node->sysctl_clen = 0;
1177	if (own) {
1178		node->sysctl_data = own;
1179		node->sysctl_flags |= CTLFLAG_OWNDATA;
1180	} else if (flags & CTLFLAG_ALIAS) {
1181		node->sysctl_alias = anum;
1182	} else if (flags & CTLFLAG_IMMEDIATE) {
1183		switch (type) {
1184		case CTLTYPE_BOOL:
1185			node->sysctl_bdata = nnode.sysctl_bdata;
1186			break;
1187		case CTLTYPE_INT:
1188			node->sysctl_idata = nnode.sysctl_idata;
1189			break;
1190		case CTLTYPE_QUAD:
1191			node->sysctl_qdata = nnode.sysctl_qdata;
1192			break;
1193		}
1194	} else {
1195		node->sysctl_data = nnode.sysctl_data;
1196		node->sysctl_flags &= ~CTLFLAG_OWNDATA;
1197	}
1198        node->sysctl_func = nnode.sysctl_func;
1199        node->sysctl_child = NULL;
1200	/* node->sysctl_parent should already be done */
1201
1202	/*
1203	 * update "version" on path to "root"
1204	 */
1205	for (; rnode->sysctl_parent != NULL; rnode = rnode->sysctl_parent)
1206		;
1207	pnode = node;
1208	for (nm = rnode->sysctl_ver + 1; pnode != NULL;
1209	     pnode = pnode->sysctl_parent)
1210		pnode->sysctl_ver = nm;
1211
1212	/* If this fails, the node is already added - the user won't know! */
1213	error = sysctl_cvt_out(l, v, node, oldp, *oldlenp, oldlenp);
1214
1215	return (error);
1216}
1217
1218/*
1219 * ********************************************************************
1220 * A wrapper around sysctl_create() that prints the thing we're trying
1221 * to add.
1222 * ********************************************************************
1223 */
1224#ifdef SYSCTL_DEBUG_CREATE
1225int
1226sysctl_create(SYSCTLFN_ARGS)
1227{
1228	const struct sysctlnode *node;
1229	int k, rc, ni, nl = namelen + (name - oname);
1230
1231	node = newp;
1232
1233	printf("namelen %d (", nl);
1234	for (ni = 0; ni < nl - 1; ni++)
1235		printf(" %d", oname[ni]);
1236	printf(" %d )\t[%s]\tflags %08x (%08x %d %zu)\n",
1237	       k = node->sysctl_num,
1238	       node->sysctl_name,
1239	       node->sysctl_flags,
1240	       SYSCTL_FLAGS(node->sysctl_flags),
1241	       SYSCTL_TYPE(node->sysctl_flags),
1242	       node->sysctl_size);
1243
1244	node = rnode;
1245	rc = _sysctl_create(SYSCTLFN_CALL(rnode));
1246
1247	printf("sysctl_create(");
1248	for (ni = 0; ni < nl - 1; ni++)
1249		printf(" %d", oname[ni]);
1250	printf(" %d ) returned %d\n", k, rc);
1251
1252	return (rc);
1253}
1254#endif /* SYSCTL_DEBUG_CREATE */
1255
1256/*
1257 * sysctl_destroy -- Removes a node (as described by newp) from the
1258 * given tree, returning (if successful) a copy of the dead node in
1259 * oldp.  Since we're removing stuff, there's not much to check.
1260 */
1261int
1262sysctl_destroy(SYSCTLFN_ARGS)
1263{
1264	struct sysctlnode *node, *pnode, onode, nnode;
1265	int ni, error, v;
1266
1267	KASSERT(rw_write_held(&sysctl_treelock));
1268
1269	if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
1270		printf("sysctl_destroy: rnode %p wrong version\n", rnode);
1271		return (EINVAL);
1272	}
1273
1274	error = 0;
1275
1276	if (namelen != 1 || name[namelen - 1] != CTL_DESTROY)
1277		return (EINVAL);
1278
1279	/*
1280	 * processes can only destroy nodes at securelevel 0, must be
1281	 * root, and can't remove nodes from a parent that's not
1282	 * writeable
1283	 */
1284	if (l != NULL) {
1285#ifndef SYSCTL_DISALLOW_CREATE
1286		error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL,
1287		    KAUTH_REQ_SYSTEM_SYSCTL_DELETE, NULL, NULL, NULL);
1288		if (error)
1289			return (error);
1290		if (!(rnode->sysctl_flags & CTLFLAG_READWRITE))
1291#endif /* SYSCTL_DISALLOW_CREATE */
1292			return (EPERM);
1293	}
1294
1295	/*
1296	 * nothing can remove a node if:
1297	 * the node is permanent (checked later) or
1298	 * the tree itself is not writeable or
1299	 * the entire sysctl system is not writeable
1300	 *
1301	 * note that we ignore whether setup is complete or not,
1302	 * because these rules always apply.
1303	 */
1304	if (!(sysctl_rootof(rnode)->sysctl_flags & CTLFLAG_READWRITE) ||
1305	    !(sysctl_root.sysctl_flags & CTLFLAG_READWRITE))
1306		return (EPERM);
1307
1308	if (newp == NULL)
1309		return (EINVAL);
1310	error = sysctl_cvt_in(l, &v, newp, newlen, &nnode);
1311	if (error)
1312		return (error);
1313	memset(&onode, 0, sizeof(struct sysctlnode));
1314
1315	node = rnode->sysctl_child;
1316	for (ni = 0; ni < rnode->sysctl_clen; ni++) {
1317		if (nnode.sysctl_num == node[ni].sysctl_num) {
1318			/*
1319			 * if name specified, must match
1320			 */
1321			if (nnode.sysctl_name[0] != '\0' &&
1322			    strcmp(nnode.sysctl_name, node[ni].sysctl_name))
1323				continue;
1324			/*
1325			 * if version specified, must match
1326			 */
1327			if (nnode.sysctl_ver != 0 &&
1328			    nnode.sysctl_ver != node[ni].sysctl_ver)
1329				continue;
1330			/*
1331			 * this must be the one
1332			 */
1333			break;
1334		}
1335	}
1336	if (ni == rnode->sysctl_clen)
1337		return (ENOENT);
1338	node = &node[ni];
1339	pnode = node->sysctl_parent;
1340
1341	/*
1342	 * if the kernel says permanent, it is, so there.  nyah.
1343	 */
1344	if (SYSCTL_FLAGS(node->sysctl_flags) & CTLFLAG_PERMANENT)
1345		return (EPERM);
1346
1347	/*
1348	 * can't delete non-empty nodes
1349	 */
1350	if (SYSCTL_TYPE(node->sysctl_flags) == CTLTYPE_NODE &&
1351	    node->sysctl_clen != 0)
1352		return (ENOTEMPTY);
1353
1354	/*
1355	 * if the node "owns" data, release it now
1356	 */
1357	if (node->sysctl_flags & CTLFLAG_OWNDATA) {
1358		if (node->sysctl_data != NULL)
1359			free(node->sysctl_data, M_SYSCTLDATA);
1360		node->sysctl_data = NULL;
1361	}
1362	if (node->sysctl_flags & CTLFLAG_OWNDESC) {
1363		if (node->sysctl_desc != NULL)
1364			/*XXXUNCONST*/
1365			free(__UNCONST(node->sysctl_desc), M_SYSCTLDATA);
1366		node->sysctl_desc = NULL;
1367	}
1368
1369	/*
1370	 * if the node to be removed is not the last one on the list,
1371	 * move the remaining nodes up, and reparent any grandchildren
1372	 */
1373	onode = *node;
1374	if (ni < pnode->sysctl_clen - 1) {
1375		int t;
1376
1377		memmove(&pnode->sysctl_child[ni], &pnode->sysctl_child[ni + 1],
1378			(pnode->sysctl_clen - ni - 1) *
1379			sizeof(struct sysctlnode));
1380		for (; ni < pnode->sysctl_clen - 1; ni++)
1381			if (SYSCTL_TYPE(pnode->sysctl_child[ni].sysctl_flags) ==
1382			    CTLTYPE_NODE)
1383				for (t = 0;
1384				     t < pnode->sysctl_child[ni].sysctl_clen;
1385				     t++)
1386					pnode->sysctl_child[ni].sysctl_child[t].
1387						sysctl_parent =
1388						&pnode->sysctl_child[ni];
1389		ni = pnode->sysctl_clen - 1;
1390		node = &pnode->sysctl_child[ni];
1391	}
1392
1393	/*
1394	 * reset the space we just vacated
1395	 */
1396	memset(node, 0, sizeof(struct sysctlnode));
1397	node->sysctl_parent = pnode;
1398	pnode->sysctl_clen--;
1399
1400	/*
1401	 * if this parent just lost its last child, nuke the creche
1402	 */
1403	if (pnode->sysctl_clen == 0) {
1404		free(pnode->sysctl_child, M_SYSCTLNODE);
1405		pnode->sysctl_csize = 0;
1406		pnode->sysctl_child = NULL;
1407	}
1408
1409	/*
1410	 * update "version" on path to "root"
1411	 */
1412        for (; rnode->sysctl_parent != NULL; rnode = rnode->sysctl_parent)
1413                ;
1414	for (ni = rnode->sysctl_ver + 1; pnode != NULL;
1415	     pnode = pnode->sysctl_parent)
1416		pnode->sysctl_ver = ni;
1417
1418	error = sysctl_cvt_out(l, v, &onode, oldp, *oldlenp, oldlenp);
1419
1420	return (error);
1421}
1422
1423/*
1424 * sysctl_lookup -- Handles copyin/copyout of new and old values.
1425 * Partial reads are globally allowed.  Only root can write to things
1426 * unless the node says otherwise.
1427 */
1428int
1429sysctl_lookup(SYSCTLFN_ARGS)
1430{
1431	int error, rw;
1432	size_t sz, len;
1433	void *d;
1434
1435	KASSERT(rw_lock_held(&sysctl_treelock));
1436
1437	if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
1438		printf("sysctl_lookup: rnode %p wrong version\n", rnode);
1439		return (EINVAL);
1440	}
1441
1442	error = 0;
1443
1444	/*
1445	 * you can't "look up" a node.  you can "query" it, but you
1446	 * can't "look it up".
1447	 */
1448	if (SYSCTL_TYPE(rnode->sysctl_flags) == CTLTYPE_NODE || namelen != 0)
1449		return (EINVAL);
1450
1451	/*
1452	 * some nodes are private, so only root can look into them.
1453	 */
1454	if (l != NULL && (rnode->sysctl_flags & CTLFLAG_PRIVATE) &&
1455	    (error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL,
1456	    KAUTH_REQ_SYSTEM_SYSCTL_PRVT, NULL, NULL, NULL)) != 0)
1457		return (error);
1458
1459	/*
1460	 * if a node wants to be writable according to different rules
1461	 * other than "only root can write to stuff unless a flag is
1462	 * set", then it needs its own function which should have been
1463	 * called and not us.
1464	 */
1465	if (l != NULL && newp != NULL &&
1466	    !(rnode->sysctl_flags & CTLFLAG_ANYWRITE) &&
1467	    (error = kauth_authorize_system(l->l_cred,
1468	    KAUTH_SYSTEM_SYSCTL, KAUTH_REQ_SYSTEM_SYSCTL_MODIFY, NULL, NULL,
1469	    NULL)) != 0)
1470		return (error);
1471
1472	/*
1473	 * is this node supposedly writable?
1474	 */
1475	rw = (rnode->sysctl_flags & CTLFLAG_READWRITE) ? 1 : 0;
1476
1477	/*
1478	 * it appears not to be writable at this time, so if someone
1479	 * tried to write to it, we must tell them to go away
1480	 */
1481	if (!rw && newp != NULL)
1482		return (EPERM);
1483
1484	/*
1485	 * step one, copy out the stuff we have presently
1486	 */
1487	if (rnode->sysctl_flags & CTLFLAG_IMMEDIATE) {
1488		/*
1489		 * note that we discard const here because we are
1490		 * modifying the contents of the node (which is okay
1491		 * because it's ours)
1492		 */
1493		switch (SYSCTL_TYPE(rnode->sysctl_flags)) {
1494		case CTLTYPE_BOOL:
1495			d = __UNCONST(&rnode->sysctl_bdata);
1496			break;
1497		case CTLTYPE_INT:
1498			d = __UNCONST(&rnode->sysctl_idata);
1499			break;
1500		case CTLTYPE_QUAD:
1501			d = __UNCONST(&rnode->sysctl_qdata);
1502			break;
1503		default:
1504			return (EINVAL);
1505		}
1506	} else
1507		d = rnode->sysctl_data;
1508	if (SYSCTL_TYPE(rnode->sysctl_flags) == CTLTYPE_STRING)
1509		sz = strlen(d) + 1; /* XXX@@@ possible fault here */
1510	else
1511		sz = rnode->sysctl_size;
1512	if (oldp != NULL)
1513		error = sysctl_copyout(l, d, oldp, MIN(sz, *oldlenp));
1514	if (error)
1515		return (error);
1516	*oldlenp = sz;
1517
1518	/*
1519	 * are we done?
1520	 */
1521	if (newp == NULL || newlen == 0)
1522		return (0);
1523
1524	/*
1525	 * hmm...not done.  must now "copy in" new value.  re-adjust
1526	 * sz to maximum value (strings are "weird").
1527	 */
1528	sz = rnode->sysctl_size;
1529	switch (SYSCTL_TYPE(rnode->sysctl_flags)) {
1530	case CTLTYPE_BOOL: {
1531		bool tmp;
1532		/*
1533		 * these data must be *exactly* the same size coming
1534		 * in.  bool may only be true or false.
1535		 */
1536		if (newlen != sz)
1537			return (EINVAL);
1538		error = sysctl_copyin(l, newp, &tmp, sz);
1539		if (tmp != true && tmp != false)
1540			return EINVAL;
1541		if (error)
1542			break;
1543		*(bool *)d = tmp;
1544		break;
1545	}
1546	case CTLTYPE_INT:
1547	case CTLTYPE_QUAD:
1548	case CTLTYPE_STRUCT:
1549		/*
1550		 * these data must be *exactly* the same size coming
1551		 * in.
1552		 */
1553		if (newlen != sz)
1554			return (EINVAL);
1555		error = sysctl_copyin(l, newp, d, sz);
1556		break;
1557	case CTLTYPE_STRING: {
1558		/*
1559		 * strings, on the other hand, can be shorter, and we
1560		 * let userland be sloppy about the trailing nul.
1561		 */
1562		char *newbuf;
1563
1564		/*
1565		 * too much new string?
1566		 */
1567		if (newlen > sz)
1568			return (EINVAL);
1569
1570		/*
1571		 * temporary copy of new inbound string
1572		 */
1573		len = MIN(sz, newlen);
1574		newbuf = malloc(len, M_SYSCTLDATA, M_WAITOK|M_CANFAIL);
1575		if (newbuf == NULL)
1576			return (ENOMEM);
1577		error = sysctl_copyin(l, newp, newbuf, len);
1578		if (error) {
1579			free(newbuf, M_SYSCTLDATA);
1580			return (error);
1581		}
1582
1583		/*
1584		 * did they NUL terminate it, or do we have space
1585		 * left to do it ourselves?
1586		 */
1587		if (newbuf[len - 1] != '\0' && len == sz) {
1588			free(newbuf, M_SYSCTLDATA);
1589			return (EINVAL);
1590		}
1591
1592		/*
1593		 * looks good, so pop it into place and zero the rest.
1594		 */
1595		if (len > 0)
1596			memcpy(d, newbuf, len);
1597		if (sz != len)
1598			memset((char*)d + len, 0, sz - len);
1599		free(newbuf, M_SYSCTLDATA);
1600		break;
1601	}
1602	default:
1603		return (EINVAL);
1604	}
1605
1606	return (error);
1607}
1608
1609/*
1610 * sysctl_mmap -- Dispatches sysctl mmap requests to those nodes that
1611 * purport to handle it.  This interface isn't fully fleshed out yet,
1612 * unfortunately.
1613 */
1614static int
1615sysctl_mmap(SYSCTLFN_ARGS)
1616{
1617	const struct sysctlnode *node;
1618	struct sysctlnode nnode;
1619	int error;
1620
1621	if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
1622		printf("sysctl_mmap: rnode %p wrong version\n", rnode);
1623		return (EINVAL);
1624	}
1625
1626	/*
1627	 * let's just pretend that didn't happen, m'kay?
1628	 */
1629	if (l == NULL)
1630		return (EPERM);
1631
1632	/*
1633	 * is this a sysctlnode description of an mmap request?
1634	 */
1635	if (newp == NULL || newlen != sizeof(struct sysctlnode))
1636		return (EINVAL);
1637	error = sysctl_copyin(l, newp, &nnode, sizeof(nnode));
1638	if (error)
1639		return (error);
1640
1641	/*
1642	 * does the node they asked for exist?
1643	 */
1644	if (namelen != 1)
1645		return (EOPNOTSUPP);
1646	node = rnode;
1647        error = sysctl_locate(l, &nnode.sysctl_num, 1, &node, NULL);
1648	if (error)
1649		return (error);
1650
1651	/*
1652	 * does this node that we have found purport to handle mmap?
1653	 */
1654	if (node->sysctl_func == NULL ||
1655	    !(node->sysctl_flags & CTLFLAG_MMAP))
1656		return (EOPNOTSUPP);
1657
1658	/*
1659	 * well...okay, they asked for it.
1660	 */
1661	return ((*node->sysctl_func)(SYSCTLFN_CALL(node)));
1662}
1663
1664int
1665sysctl_describe(SYSCTLFN_ARGS)
1666{
1667	struct sysctldesc *d;
1668	void *bf;
1669	size_t sz, left, tot;
1670	int i, error, v = -1;
1671	struct sysctlnode *node;
1672	struct sysctlnode dnode;
1673
1674	if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
1675		printf("sysctl_query: rnode %p wrong version\n", rnode);
1676		return (EINVAL);
1677	}
1678
1679	if (SYSCTL_TYPE(rnode->sysctl_flags) != CTLTYPE_NODE)
1680		return (ENOTDIR);
1681	if (namelen != 1 || name[0] != CTL_DESCRIBE)
1682		return (EINVAL);
1683
1684	/*
1685	 * get ready...
1686	 */
1687	error = 0;
1688	d = bf = malloc(MAXDESCLEN, M_TEMP, M_WAITOK|M_CANFAIL);
1689	if (bf == NULL)
1690		return ENOMEM;
1691	tot = 0;
1692	node = rnode->sysctl_child;
1693	left = *oldlenp;
1694
1695	/*
1696	 * no request -> all descriptions at this level
1697	 * request with desc unset -> just this node
1698	 * request with desc set -> set descr for this node
1699	 */
1700	if (newp != NULL) {
1701		error = sysctl_cvt_in(l, &v, newp, newlen, &dnode);
1702		if (error)
1703			goto out;
1704		if (dnode.sysctl_desc != NULL) {
1705			/*
1706			 * processes cannot set descriptions above
1707			 * securelevel 0.  and must be root.  blah
1708			 * blah blah.  a couple more checks are made
1709			 * once we find the node we want.
1710			 */
1711			if (l != NULL) {
1712#ifndef SYSCTL_DISALLOW_CREATE
1713				error = kauth_authorize_system(l->l_cred,
1714				    KAUTH_SYSTEM_SYSCTL,
1715				    KAUTH_REQ_SYSTEM_SYSCTL_DESC, NULL,
1716				    NULL, NULL);
1717				if (error)
1718					goto out;
1719#else /* SYSCTL_DISALLOW_CREATE */
1720				error = EPERM;
1721				goto out;
1722#endif /* SYSCTL_DISALLOW_CREATE */
1723			}
1724
1725			/*
1726			 * find node and try to set the description on it
1727			 */
1728			for (i = 0; i < rnode->sysctl_clen; i++)
1729				if (node[i].sysctl_num == dnode.sysctl_num)
1730					break;
1731			if (i == rnode->sysctl_clen) {
1732				error = ENOENT;
1733				goto out;
1734			}
1735			node = &node[i];
1736
1737			/*
1738			 * did the caller specify a node version?
1739			 */
1740			if (dnode.sysctl_ver != 0 &&
1741			    dnode.sysctl_ver != node->sysctl_ver) {
1742				error = EINVAL;
1743				goto out;
1744			}
1745
1746			/*
1747			 * okay...some rules:
1748			 * (1) if setup is done and the tree is
1749			 *     read-only or the whole system is
1750			 *     read-only
1751			 * (2) no one can set a description on a
1752			 *     permanent node (it must be set when
1753			 *     using createv)
1754			 * (3) processes cannot *change* a description
1755			 * (4) processes *can*, however, set a
1756			 *     description on a read-only node so that
1757			 *     one can be created and then described
1758			 *     in two steps
1759			 * anything else come to mind?
1760			 */
1761			if ((sysctl_root.sysctl_flags & CTLFLAG_PERMANENT) &&
1762			    (!(sysctl_rootof(node)->sysctl_flags &
1763			       CTLFLAG_READWRITE) ||
1764			     !(sysctl_root.sysctl_flags & CTLFLAG_READWRITE))) {
1765				error = EPERM;
1766				goto out;
1767			}
1768			if (node->sysctl_flags & CTLFLAG_PERMANENT) {
1769				error = EPERM;
1770				goto out;
1771			}
1772			if (l != NULL && node->sysctl_desc != NULL) {
1773				error = EPERM;
1774				goto out;
1775			}
1776
1777			/*
1778			 * right, let's go ahead.  the first step is
1779			 * making the description into something the
1780			 * node can "own", if need be.
1781			 */
1782			if (l != NULL ||
1783			    dnode.sysctl_flags & CTLFLAG_OWNDESC) {
1784				char *nd, *k;
1785
1786				k = malloc(MAXDESCLEN, M_TEMP,
1787				    M_WAITOK|M_CANFAIL);
1788				if (k == NULL) {
1789					error = ENOMEM;
1790					goto out;
1791				}
1792				error = sysctl_copyinstr(l, dnode.sysctl_desc,
1793							 k, MAXDESCLEN, &sz);
1794				if (error) {
1795					free(k, M_TEMP);
1796					goto out;
1797				}
1798				nd = malloc(sz, M_SYSCTLDATA,
1799					    M_WAITOK|M_CANFAIL);
1800				if (nd == NULL) {
1801					free(k, M_TEMP);
1802					error = ENOMEM;
1803					goto out;
1804				}
1805				memcpy(nd, k, sz);
1806				dnode.sysctl_flags |= CTLFLAG_OWNDESC;
1807				dnode.sysctl_desc = nd;
1808				free(k, M_TEMP);
1809			}
1810
1811			/*
1812			 * now "release" the old description and
1813			 * attach the new one.  ta-da.
1814			 */
1815			if ((node->sysctl_flags & CTLFLAG_OWNDESC) &&
1816			    node->sysctl_desc != NULL)
1817				/*XXXUNCONST*/
1818				free(__UNCONST(node->sysctl_desc), M_SYSCTLDATA);
1819			node->sysctl_desc = dnode.sysctl_desc;
1820			node->sysctl_flags |=
1821				(dnode.sysctl_flags & CTLFLAG_OWNDESC);
1822
1823			/*
1824			 * now we "fall out" and into the loop which
1825			 * will copy the new description back out for
1826			 * those interested parties
1827			 */
1828		}
1829	}
1830
1831	/*
1832	 * scan for one description or just retrieve all descriptions
1833	 */
1834	for (i = 0; i < rnode->sysctl_clen; i++) {
1835		/*
1836		 * did they ask for the description of only one node?
1837		 */
1838		if (v != -1 && node[i].sysctl_num != dnode.sysctl_num)
1839			continue;
1840
1841		/*
1842		 * don't describe "private" nodes to non-suser users
1843		 */
1844		if ((node[i].sysctl_flags & CTLFLAG_PRIVATE) && (l != NULL) &&
1845		    !(kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL,
1846		    KAUTH_REQ_SYSTEM_SYSCTL_PRVT, NULL, NULL, NULL)))
1847			continue;
1848
1849		/*
1850		 * is this description "valid"?
1851		 */
1852		memset(bf, 0, MAXDESCLEN);
1853		if (node[i].sysctl_desc == NULL)
1854			sz = 1;
1855		else if (copystr(node[i].sysctl_desc, &d->descr_str[0],
1856				 MAXDESCLEN - sizeof(*d), &sz) != 0) {
1857			/*
1858			 * erase possible partial description
1859			 */
1860			memset(bf, 0, MAXDESCLEN);
1861			sz = 1;
1862		}
1863
1864		/*
1865		 * we've got it, stuff it into the caller's buffer
1866		 */
1867		d->descr_num = node[i].sysctl_num;
1868		d->descr_ver = node[i].sysctl_ver;
1869		d->descr_len = sz; /* includes trailing nul */
1870		sz = (char *)NEXT_DESCR(d) - (char *)d;
1871		if (oldp != NULL && left >= sz) {
1872			error = sysctl_copyout(l, d, oldp, sz);
1873			if (error)
1874				goto out;
1875			left -= sz;
1876			oldp = (void *)__sysc_desc_adv(oldp, d->descr_len);
1877		}
1878		tot += sz;
1879
1880		/*
1881		 * if we get this far with v not "unset", they asked
1882		 * for a specific node and we found it
1883		 */
1884		if (v != -1)
1885			break;
1886	}
1887
1888	/*
1889	 * did we find it after all?
1890	 */
1891	if (v != -1 && tot == 0)
1892		error = ENOENT;
1893	else
1894		*oldlenp = tot;
1895
1896out:
1897	free(bf, M_TEMP);
1898	return (error);
1899}
1900
1901/*
1902 * ********************************************************************
1903 * Section 3: Create and destroy from inside the kernel
1904 * ********************************************************************
1905 * sysctl_createv() and sysctl_destroyv() are simpler-to-use
1906 * interfaces for the kernel to fling new entries into the mib and rip
1907 * them out later.  In the case of sysctl_createv(), the returned copy
1908 * of the node (see sysctl_create()) will be translated back into a
1909 * pointer to the actual node.
1910 *
1911 * Note that sysctl_createv() will return 0 if the create request
1912 * matches an existing node (ala mkdir -p), and that sysctl_destroyv()
1913 * will return 0 if the node to be destroyed already does not exist
1914 * (aka rm -f) or if it is a parent of other nodes.
1915 *
1916 * This allows two (or more) different subsystems to assert sub-tree
1917 * existence before populating their own nodes, and to remove their
1918 * own nodes without orphaning the others when they are done.
1919 * ********************************************************************
1920 */
1921int
1922sysctl_createv(struct sysctllog **log, int cflags,
1923	       const struct sysctlnode **rnode, const struct sysctlnode **cnode,
1924	       int flags, int type, const char *namep, const char *descr,
1925	       sysctlfn func, u_quad_t qv, void *newp, size_t newlen,
1926	       ...)
1927{
1928	va_list ap;
1929	int error, ni, namelen, name[CTL_MAXNAME];
1930	const struct sysctlnode *root, *pnode;
1931	struct sysctlnode nnode, onode, *dnode;
1932	size_t sz;
1933
1934	/*
1935	 * where are we putting this?
1936	 */
1937	if (rnode != NULL && *rnode == NULL) {
1938		printf("sysctl_createv: rnode NULL\n");
1939		return (EINVAL);
1940	}
1941	root = rnode ? *rnode : NULL;
1942	if (cnode != NULL)
1943		*cnode = NULL;
1944	if (cflags != 0)
1945		return (EINVAL);
1946
1947	/*
1948	 * what is it?
1949	 */
1950	flags = SYSCTL_VERSION|SYSCTL_TYPE(type)|SYSCTL_FLAGS(flags);
1951	if (log != NULL)
1952		flags &= ~CTLFLAG_PERMANENT;
1953
1954	/*
1955	 * where do we put it?
1956	 */
1957	va_start(ap, newlen);
1958	namelen = 0;
1959	ni = -1;
1960	do {
1961		if (++ni == CTL_MAXNAME)
1962			return (ENAMETOOLONG);
1963		name[ni] = va_arg(ap, int);
1964		/*
1965		 * sorry, this is not supported from here
1966		 */
1967		if (name[ni] == CTL_CREATESYM)
1968			return (EINVAL);
1969	} while (name[ni] != CTL_EOL && name[ni] != CTL_CREATE);
1970	namelen = ni + (name[ni] == CTL_CREATE ? 1 : 0);
1971	va_end(ap);
1972
1973	/*
1974	 * what's it called
1975	 */
1976	if (strlcpy(nnode.sysctl_name, namep, sizeof(nnode.sysctl_name)) >=
1977	    sizeof(nnode.sysctl_name))
1978		return (ENAMETOOLONG);
1979
1980	/*
1981	 * cons up the description of the new node
1982	 */
1983	nnode.sysctl_num = name[namelen - 1];
1984	name[namelen - 1] = CTL_CREATE;
1985	nnode.sysctl_size = newlen;
1986	nnode.sysctl_flags = flags;
1987	if (type == CTLTYPE_NODE) {
1988		nnode.sysctl_csize = 0;
1989		nnode.sysctl_clen = 0;
1990		nnode.sysctl_child = NULL;
1991		if (flags & CTLFLAG_ALIAS)
1992			nnode.sysctl_alias = qv;
1993	} else if (flags & CTLFLAG_IMMEDIATE) {
1994		switch (type) {
1995		case CTLTYPE_BOOL:
1996			nnode.sysctl_bdata = qv;
1997			break;
1998		case CTLTYPE_INT:
1999			nnode.sysctl_idata = qv;
2000			break;
2001		case CTLTYPE_QUAD:
2002			nnode.sysctl_qdata = qv;
2003			break;
2004		default:
2005			return (EINVAL);
2006		}
2007	} else {
2008		nnode.sysctl_data = newp;
2009	}
2010	nnode.sysctl_func = func;
2011	nnode.sysctl_parent = NULL;
2012	nnode.sysctl_ver = 0;
2013
2014	/*
2015	 * initialize lock state -- we need locks if the main tree has
2016	 * been marked as complete, but since we could be called from
2017	 * either there, or from a device driver (say, at device
2018	 * insertion), or from a module (at module load time, say), we
2019	 * don't really want to "wait"...
2020	 */
2021	sysctl_lock(true);
2022
2023	/*
2024	 * locate the prospective parent of the new node, and if we
2025	 * find it, add the new node.
2026	 */
2027	sz = sizeof(onode);
2028	pnode = root;
2029	error = sysctl_locate(NULL, &name[0], namelen - 1, &pnode, &ni);
2030	if (error) {
2031		printf("sysctl_createv: sysctl_locate(%s) returned %d\n",
2032		       nnode.sysctl_name, error);
2033		sysctl_unlock();
2034		return (error);
2035	}
2036	error = sysctl_create(&name[ni], namelen - ni, &onode, &sz,
2037			      &nnode, sizeof(nnode), &name[0], NULL,
2038			      pnode);
2039
2040	/*
2041	 * unfortunately the node we wanted to create is already
2042	 * there.  if the node that's already there is a reasonable
2043	 * facsimile of the node we wanted to create, just pretend
2044	 * (for the caller's benefit) that we managed to create the
2045	 * node they wanted.
2046	 */
2047	if (error == EEXIST) {
2048		/* name is the same as requested... */
2049		if (strcmp(nnode.sysctl_name, onode.sysctl_name) == 0 &&
2050		    /* they want the same function... */
2051		    nnode.sysctl_func == onode.sysctl_func &&
2052		    /* number is the same as requested, or... */
2053		    (nnode.sysctl_num == onode.sysctl_num ||
2054		     /* they didn't pick a number... */
2055		     nnode.sysctl_num == CTL_CREATE)) {
2056			/*
2057			 * collision here from trying to create
2058			 * something that already existed; let's give
2059			 * our customers a hand and tell them they got
2060			 * what they wanted.
2061			 */
2062#ifdef SYSCTL_DEBUG_CREATE
2063			printf("cleared\n");
2064#endif /* SYSCTL_DEBUG_CREATE */
2065			error = 0;
2066		}
2067	}
2068
2069	if (error == 0 &&
2070	    (cnode != NULL || log != NULL || descr != NULL)) {
2071		/*
2072		 * sysctl_create() gave us back a copy of the node,
2073		 * but we need to know where it actually is...
2074		 */
2075		pnode = root;
2076		error = sysctl_locate(NULL, &name[0], namelen - 1, &pnode, &ni);
2077
2078		/*
2079		 * manual scan of last layer so that aliased nodes
2080		 * aren't followed.
2081		 */
2082		if (error == 0) {
2083			for (ni = 0; ni < pnode->sysctl_clen; ni++)
2084				if (pnode->sysctl_child[ni].sysctl_num ==
2085				    onode.sysctl_num)
2086					break;
2087			if (ni < pnode->sysctl_clen)
2088				pnode = &pnode->sysctl_child[ni];
2089			else
2090				error = ENOENT;
2091		}
2092
2093		/*
2094		 * not expecting an error here, but...
2095		 */
2096		if (error == 0) {
2097			if (log != NULL)
2098				sysctl_log_add(log, pnode);
2099			if (cnode != NULL)
2100				*cnode = pnode;
2101			if (descr != NULL) {
2102				/*
2103				 * allow first caller to *set* a
2104				 * description actually to set it
2105				 *
2106				 * discard const here so we can attach
2107				 * the description
2108				 */
2109				dnode = __UNCONST(pnode);
2110				if (pnode->sysctl_desc != NULL)
2111					/* skip it...we've got one */;
2112				else if (flags & CTLFLAG_OWNDESC) {
2113					size_t l = strlen(descr) + 1;
2114					char *d = malloc(l, M_SYSCTLDATA,
2115							 M_WAITOK|M_CANFAIL);
2116					if (d != NULL) {
2117						memcpy(d, descr, l);
2118						dnode->sysctl_desc = d;
2119						dnode->sysctl_flags |=
2120						    CTLFLAG_OWNDESC;
2121					}
2122				} else
2123					dnode->sysctl_desc = descr;
2124			}
2125		} else {
2126			printf("sysctl_create succeeded but node not found?!\n");
2127			/*
2128			 *  confusing, but the create said it
2129			 * succeeded, so...
2130			 */
2131			error = 0;
2132		}
2133	}
2134
2135	/*
2136	 * now it should be safe to release the lock state.  note that
2137	 * the pointer to the newly created node being passed back may
2138	 * not be "good" for very long.
2139	 */
2140	sysctl_unlock();
2141
2142	if (error != 0) {
2143		printf("sysctl_createv: sysctl_create(%s) returned %d\n",
2144		       nnode.sysctl_name, error);
2145#if 0
2146		if (error != ENOENT)
2147			sysctl_dump(&onode);
2148#endif
2149	}
2150
2151	return (error);
2152}
2153
2154int
2155sysctl_destroyv(struct sysctlnode *rnode, ...)
2156{
2157	va_list ap;
2158	int error, name[CTL_MAXNAME], namelen, ni;
2159	const struct sysctlnode *pnode, *node;
2160	struct sysctlnode dnode, *onode;
2161	size_t sz;
2162
2163	va_start(ap, rnode);
2164	namelen = 0;
2165	ni = 0;
2166	do {
2167		if (ni == CTL_MAXNAME)
2168			return (ENAMETOOLONG);
2169		name[ni] = va_arg(ap, int);
2170	} while (name[ni++] != CTL_EOL);
2171	namelen = ni - 1;
2172	va_end(ap);
2173
2174	/*
2175	 * i can't imagine why we'd be destroying a node when the tree
2176	 * wasn't complete, but who knows?
2177	 */
2178	sysctl_lock(true);
2179
2180	/*
2181	 * where is it?
2182	 */
2183	node = rnode;
2184	error = sysctl_locate(NULL, &name[0], namelen - 1, &node, &ni);
2185	if (error) {
2186		/* they want it gone and it's not there, so... */
2187		sysctl_unlock();
2188		return (error == ENOENT ? 0 : error);
2189	}
2190
2191	/*
2192	 * set up the deletion
2193	 */
2194	pnode = node;
2195	node = &dnode;
2196	memset(&dnode, 0, sizeof(dnode));
2197	dnode.sysctl_flags = SYSCTL_VERSION;
2198	dnode.sysctl_num = name[namelen - 1];
2199
2200	/*
2201	 * we found it, now let's nuke it
2202	 */
2203	name[namelen - 1] = CTL_DESTROY;
2204	sz = 0;
2205	error = sysctl_destroy(&name[namelen - 1], 1, NULL, &sz,
2206			       node, sizeof(*node), &name[0], NULL,
2207			       pnode);
2208	if (error == ENOTEMPTY) {
2209		/*
2210		 * think of trying to delete "foo" when "foo.bar"
2211		 * (which someone else put there) is still in
2212		 * existence
2213		 */
2214		error = 0;
2215
2216		/*
2217		 * dunno who put the description there, but if this
2218		 * node can ever be removed, we need to make sure the
2219		 * string doesn't go out of context.  that means we
2220		 * need to find the node that's still there (don't use
2221		 * sysctl_locate() because that follows aliasing).
2222		 */
2223		node = pnode->sysctl_child;
2224		for (ni = 0; ni < pnode->sysctl_clen; ni++)
2225			if (node[ni].sysctl_num == dnode.sysctl_num)
2226				break;
2227		node = (ni < pnode->sysctl_clen) ? &node[ni] : NULL;
2228
2229		/*
2230		 * if we found it, and this node has a description,
2231		 * and this node can be released, and it doesn't
2232		 * already own its own description...sigh.  :)
2233		 */
2234		if (node != NULL && node->sysctl_desc != NULL &&
2235		    !(node->sysctl_flags & CTLFLAG_PERMANENT) &&
2236		    !(node->sysctl_flags & CTLFLAG_OWNDESC)) {
2237			char *d;
2238
2239			sz = strlen(node->sysctl_desc) + 1;
2240			d = malloc(sz, M_SYSCTLDATA, M_WAITOK|M_CANFAIL);
2241			if (d != NULL) {
2242				/*
2243				 * discard const so that we can
2244				 * re-attach the description
2245				 */
2246				memcpy(d, node->sysctl_desc, sz);
2247				onode = __UNCONST(node);
2248				onode->sysctl_desc = d;
2249				onode->sysctl_flags |= CTLFLAG_OWNDESC;
2250			} else {
2251				/*
2252				 * XXX drop the description?  be
2253				 * afraid?  don't care?
2254				 */
2255			}
2256		}
2257	}
2258
2259        sysctl_unlock();
2260
2261	return (error);
2262}
2263
2264/*
2265 * ********************************************************************
2266 * Deletes an entire n-ary tree.  Not recommended unless you know why
2267 * you're doing it.  Personally, I don't know why you'd even think
2268 * about it.
2269 * ********************************************************************
2270 */
2271void
2272sysctl_free(struct sysctlnode *rnode)
2273{
2274	struct sysctlnode *node, *pnode;
2275
2276	rw_enter(&sysctl_treelock, RW_WRITER);
2277
2278	if (rnode == NULL)
2279		rnode = &sysctl_root;
2280
2281	if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
2282		printf("sysctl_free: rnode %p wrong version\n", rnode);
2283		rw_exit(&sysctl_treelock);
2284		return;
2285	}
2286
2287	pnode = rnode;
2288
2289	node = pnode->sysctl_child;
2290	do {
2291		while (node != NULL && pnode->sysctl_csize > 0) {
2292			while (node <
2293			       &pnode->sysctl_child[pnode->sysctl_clen] &&
2294			       (SYSCTL_TYPE(node->sysctl_flags) !=
2295				CTLTYPE_NODE ||
2296				node->sysctl_csize == 0)) {
2297				if (SYSCTL_FLAGS(node->sysctl_flags) &
2298				    CTLFLAG_OWNDATA) {
2299					if (node->sysctl_data != NULL) {
2300						free(node->sysctl_data,
2301						     M_SYSCTLDATA);
2302						node->sysctl_data = NULL;
2303					}
2304				}
2305				if (SYSCTL_FLAGS(node->sysctl_flags) &
2306				    CTLFLAG_OWNDESC) {
2307					if (node->sysctl_desc != NULL) {
2308						/*XXXUNCONST*/
2309						free(__UNCONST(node->sysctl_desc),
2310						     M_SYSCTLDATA);
2311						node->sysctl_desc = NULL;
2312					}
2313				}
2314				node++;
2315			}
2316			if (node < &pnode->sysctl_child[pnode->sysctl_clen]) {
2317				pnode = node;
2318				node = node->sysctl_child;
2319			} else
2320				break;
2321		}
2322		if (pnode->sysctl_child != NULL)
2323			free(pnode->sysctl_child, M_SYSCTLNODE);
2324		pnode->sysctl_clen = 0;
2325		pnode->sysctl_csize = 0;
2326		pnode->sysctl_child = NULL;
2327		node = pnode;
2328		pnode = node->sysctl_parent;
2329	} while (pnode != NULL && node != rnode);
2330
2331	rw_exit(&sysctl_treelock);
2332}
2333
2334void
2335sysctl_log_print(const struct sysctllog *slog)
2336{
2337	int i, len;
2338
2339	printf("root %p left %d size %d content", (const void *)slog->log_root,
2340	    slog->log_left, slog->log_size);
2341
2342	for (len = 0, i = slog->log_left; i < slog->log_size; i++) {
2343		switch (len) {
2344		case 0:
2345			len = -1;
2346			printf(" version %d", slog->log_num[i]);
2347			break;
2348		case -1:
2349			len = -2;
2350			printf(" type %d", slog->log_num[i]);
2351			break;
2352		case -2:
2353			len =  slog->log_num[i];
2354			printf(" len %d:", slog->log_num[i]);
2355			if (len <= 0)
2356				len = -1;
2357			break;
2358		default:
2359			len--;
2360			printf(" %d", slog->log_num[i]);
2361			break;
2362		}
2363	}
2364	printf(" end\n");
2365}
2366
2367int
2368sysctl_log_add(struct sysctllog **logp, const struct sysctlnode *node)
2369{
2370	const int size0 = 16;
2371	int name[CTL_MAXNAME], namelen, i;
2372	const struct sysctlnode *pnode;
2373	struct sysctllog *log;
2374
2375	if (node->sysctl_flags & CTLFLAG_PERMANENT)
2376		return (0);
2377
2378	if (logp == NULL)
2379		return (0);
2380
2381	if (*logp == NULL) {
2382		log = malloc(sizeof(struct sysctllog),
2383		       M_SYSCTLDATA, M_WAITOK|M_CANFAIL);
2384		if (log == NULL) {
2385			/* XXX print error message? */
2386			return (-1);
2387		}
2388		log->log_num = malloc(size0 * sizeof(int),
2389		       M_SYSCTLDATA, M_WAITOK|M_CANFAIL);
2390		if (log->log_num == NULL) {
2391			/* XXX print error message? */
2392			free(log, M_SYSCTLDATA);
2393			return (-1);
2394		}
2395		memset(log->log_num, 0, size0 * sizeof(int));
2396		log->log_root = NULL;
2397		log->log_size = size0;
2398		log->log_left = size0;
2399		*logp = log;
2400	} else
2401		log = *logp;
2402
2403	/*
2404	 * check that the root is proper.  it's okay to record the
2405	 * address of the root of a tree.  it's the only thing that's
2406	 * guaranteed not to shift around as nodes come and go.
2407	 */
2408	if (log->log_root == NULL)
2409		log->log_root = sysctl_rootof(node);
2410	else if (log->log_root != sysctl_rootof(node)) {
2411		printf("sysctl: log %p root mismatch (%p)\n",
2412		       log->log_root, sysctl_rootof(node));
2413		return (-1);
2414	}
2415
2416	/*
2417	 * we will copy out name in reverse order
2418	 */
2419	for (pnode = node, namelen = 0;
2420	     pnode != NULL && !(pnode->sysctl_flags & CTLFLAG_ROOT);
2421	     pnode = pnode->sysctl_parent)
2422		name[namelen++] = pnode->sysctl_num;
2423
2424	/*
2425	 * do we have space?
2426	 */
2427	if (log->log_left < (namelen + 3))
2428		sysctl_log_realloc(log);
2429	if (log->log_left < (namelen + 3))
2430		return (-1);
2431
2432	/*
2433	 * stuff name in, then namelen, then node type, and finally,
2434	 * the version for non-node nodes.
2435	 */
2436	for (i = 0; i < namelen; i++)
2437		log->log_num[--log->log_left] = name[i];
2438	log->log_num[--log->log_left] = namelen;
2439	log->log_num[--log->log_left] = SYSCTL_TYPE(node->sysctl_flags);
2440	if (log->log_num[log->log_left] != CTLTYPE_NODE)
2441		log->log_num[--log->log_left] = node->sysctl_ver;
2442	else
2443		log->log_num[--log->log_left] = 0;
2444
2445	return (0);
2446}
2447
2448void
2449sysctl_teardown(struct sysctllog **logp)
2450{
2451	const struct sysctlnode *rnode;
2452	struct sysctlnode node;
2453	struct sysctllog *log;
2454	uint namelen;
2455	int *name, t, v, error, ni;
2456	size_t sz;
2457
2458	if (logp == NULL || *logp == NULL)
2459		return;
2460	log = *logp;
2461
2462	rw_enter(&sysctl_treelock, RW_WRITER);
2463	memset(&node, 0, sizeof(node));
2464
2465	while (log->log_left < log->log_size) {
2466		KASSERT((log->log_left + 3 < log->log_size) &&
2467			(log->log_left + log->log_num[log->log_left + 2] <=
2468			 log->log_size));
2469		v = log->log_num[log->log_left++];
2470		t = log->log_num[log->log_left++];
2471		namelen = log->log_num[log->log_left++];
2472		name = &log->log_num[log->log_left];
2473
2474		node.sysctl_num = name[namelen - 1];
2475		node.sysctl_flags = SYSCTL_VERSION|t;
2476		node.sysctl_ver = v;
2477
2478		rnode = log->log_root;
2479		error = sysctl_locate(NULL, &name[0], namelen, &rnode, &ni);
2480		if (error == 0) {
2481			name[namelen - 1] = CTL_DESTROY;
2482			rnode = rnode->sysctl_parent;
2483			sz = 0;
2484			(void)sysctl_destroy(&name[namelen - 1], 1, NULL,
2485					     &sz, &node, sizeof(node),
2486					     &name[0], NULL, rnode);
2487		}
2488
2489		log->log_left += namelen;
2490	}
2491
2492	KASSERT(log->log_size == log->log_left);
2493	free(log->log_num, M_SYSCTLDATA);
2494	free(log, M_SYSCTLDATA);
2495	*logp = NULL;
2496
2497	rw_exit(&sysctl_treelock);
2498}
2499
2500/*
2501 * ********************************************************************
2502 * old_sysctl -- A routine to bridge old-style internal calls to the
2503 * new infrastructure.
2504 * ********************************************************************
2505 */
2506int
2507old_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
2508	   void *newp, size_t newlen, struct lwp *l)
2509{
2510	int error;
2511	size_t oldlen = 0;
2512	size_t savelen;
2513
2514	if (oldlenp) {
2515		oldlen = *oldlenp;
2516	}
2517	savelen = oldlen;
2518
2519	sysctl_lock(newp != NULL);
2520	error = sysctl_dispatch(name, namelen, oldp, &oldlen,
2521				newp, newlen, name, l, NULL);
2522	sysctl_unlock();
2523	if (error == 0 && oldp != NULL && savelen < oldlen)
2524		error = ENOMEM;
2525	if (oldlenp) {
2526		*oldlenp = oldlen;
2527	}
2528
2529	return (error);
2530}
2531
2532/*
2533 * ********************************************************************
2534 * Section 4: Generic helper routines
2535 * ********************************************************************
2536 * "helper" routines that can do more finely grained access control,
2537 * construct structures from disparate information, create the
2538 * appearance of more nodes and sub-trees, etc.  for example, if
2539 * CTL_PROC wanted a helper function, it could respond to a CTL_QUERY
2540 * with a dynamically created list of nodes that represented the
2541 * currently running processes at that instant.
2542 * ********************************************************************
2543 */
2544
2545/*
2546 * first, a few generic helpers that provide:
2547 *
2548 * sysctl_needfunc()		a readonly interface that emits a warning
2549 * sysctl_notavail()		returns EOPNOTSUPP (generic error)
2550 * sysctl_null()		an empty return buffer with no error
2551 */
2552int
2553sysctl_needfunc(SYSCTLFN_ARGS)
2554{
2555	int error;
2556
2557	printf("!!SYSCTL_NEEDFUNC!!\n");
2558
2559	if (newp != NULL || namelen != 0)
2560		return (EOPNOTSUPP);
2561
2562	error = 0;
2563	if (oldp != NULL)
2564		error = sysctl_copyout(l, rnode->sysctl_data, oldp,
2565				       MIN(rnode->sysctl_size, *oldlenp));
2566	*oldlenp = rnode->sysctl_size;
2567
2568	return (error);
2569}
2570
2571int
2572sysctl_notavail(SYSCTLFN_ARGS)
2573{
2574
2575	if (namelen == 1 && name[0] == CTL_QUERY)
2576		return (sysctl_query(SYSCTLFN_CALL(rnode)));
2577
2578	return (EOPNOTSUPP);
2579}
2580
2581int
2582sysctl_null(SYSCTLFN_ARGS)
2583{
2584
2585	*oldlenp = 0;
2586
2587	return (0);
2588}
2589
/*
 * Translate the bits of "word" through a table of (from, to) pairs.
 * For every pair whose "from" mask shares a bit with word, the "to"
 * mask is OR-ed into the result.  The table ends at the first pair
 * whose "from" mask is zero.
 */
u_int
sysctl_map_flags(const u_int *map, u_int word)
{
	const u_int *pair;
	u_int mapped = 0;

	for (pair = map; pair[0] != 0; pair += 2) {
		if (word & pair[0])
			mapped |= pair[1];
	}

	return mapped;
}
2601
2602/*
2603 * ********************************************************************
2604 * Section 5: The machinery that makes it all go
2605 * ********************************************************************
2606 * Memory "manglement" routines.  Not much to this, eh?
2607 * ********************************************************************
2608 */
2609static int
2610sysctl_alloc(struct sysctlnode *p, int x)
2611{
2612	int i;
2613	struct sysctlnode *n;
2614
2615	assert(p->sysctl_child == NULL);
2616
2617	if (x == 1)
2618		n = malloc(sizeof(struct sysctlnode),
2619		       M_SYSCTLNODE, M_WAITOK|M_CANFAIL);
2620	else
2621		n = malloc(SYSCTL_DEFSIZE * sizeof(struct sysctlnode),
2622		       M_SYSCTLNODE, M_WAITOK|M_CANFAIL);
2623	if (n == NULL)
2624		return (ENOMEM);
2625
2626	if (x == 1) {
2627		memset(n, 0, sizeof(struct sysctlnode));
2628		p->sysctl_csize = 1;
2629	} else {
2630		memset(n, 0, SYSCTL_DEFSIZE * sizeof(struct sysctlnode));
2631		p->sysctl_csize = SYSCTL_DEFSIZE;
2632	}
2633	p->sysctl_clen = 0;
2634
2635	for (i = 0; i < p->sysctl_csize; i++)
2636		n[i].sysctl_parent = p;
2637
2638	p->sysctl_child = n;
2639	return (0);
2640}
2641
2642static int
2643sysctl_realloc(struct sysctlnode *p)
2644{
2645	int i, j, olen;
2646	struct sysctlnode *n;
2647
2648	assert(p->sysctl_csize == p->sysctl_clen);
2649
2650	/*
2651	 * how many do we have...how many should we make?
2652	 */
2653	olen = p->sysctl_clen;
2654	n = malloc(2 * olen * sizeof(struct sysctlnode), M_SYSCTLNODE,
2655		   M_WAITOK|M_CANFAIL);
2656	if (n == NULL)
2657		return (ENOMEM);
2658
2659	/*
2660	 * move old children over...initialize new children
2661	 */
2662	memcpy(n, p->sysctl_child, olen * sizeof(struct sysctlnode));
2663	memset(&n[olen], 0, olen * sizeof(struct sysctlnode));
2664	p->sysctl_csize = 2 * olen;
2665
2666	/*
2667	 * reattach moved (and new) children to parent; if a moved
2668	 * child node has children, reattach the parent pointers of
2669	 * grandchildren
2670	 */
2671        for (i = 0; i < p->sysctl_csize; i++) {
2672                n[i].sysctl_parent = p;
2673		if (n[i].sysctl_child != NULL) {
2674			for (j = 0; j < n[i].sysctl_csize; j++)
2675				n[i].sysctl_child[j].sysctl_parent = &n[i];
2676		}
2677	}
2678
2679	/*
2680	 * get out with the old and in with the new
2681	 */
2682	free(p->sysctl_child, M_SYSCTLNODE);
2683	p->sysctl_child = n;
2684
2685	return (0);
2686}
2687
2688static int
2689sysctl_log_realloc(struct sysctllog *log)
2690{
2691	int *n, s, d;
2692
2693	s = log->log_size * 2;
2694	d = log->log_size;
2695
2696	n = malloc(s * sizeof(int), M_SYSCTLDATA, M_WAITOK|M_CANFAIL);
2697	if (n == NULL)
2698		return (-1);
2699
2700	memset(n, 0, s * sizeof(int));
2701	memcpy(&n[d], log->log_num, d * sizeof(int));
2702	free(log->log_num, M_SYSCTLDATA);
2703	log->log_num = n;
2704	if (d)
2705		log->log_left += d;
2706	else
2707		log->log_left = s;
2708	log->log_size = s;
2709
2710	return (0);
2711}
2712
2713/*
2714 * ********************************************************************
2715 * Section 6: Conversion between API versions wrt the sysctlnode
2716 * ********************************************************************
2717 */
2718static int
2719sysctl_cvt_in(struct lwp *l, int *vp, const void *i, size_t sz,
2720	      struct sysctlnode *node)
2721{
2722	int error, flags;
2723
2724	if (i == NULL || sz < sizeof(flags))
2725		return (EINVAL);
2726
2727	error = sysctl_copyin(l, i, &flags, sizeof(flags));
2728	if (error)
2729		return (error);
2730
2731#if (SYSCTL_VERSION != SYSCTL_VERS_1)
2732#error sysctl_cvt_in: no support for SYSCTL_VERSION
2733#endif /*  (SYSCTL_VERSION != SYSCTL_VERS_1) */
2734
2735	if (sz == sizeof(*node) &&
2736	    SYSCTL_VERS(flags) == SYSCTL_VERSION) {
2737		error = sysctl_copyin(l, i, node, sizeof(*node));
2738		if (error)
2739			return (error);
2740		*vp = SYSCTL_VERSION;
2741		return (0);
2742	}
2743
2744	return (EINVAL);
2745}
2746
2747static int
2748sysctl_cvt_out(struct lwp *l, int v, const struct sysctlnode *i,
2749	       void *ovp, size_t left, size_t *szp)
2750{
2751	size_t sz = sizeof(*i);
2752	const void *src = i;
2753	int error;
2754
2755	switch (v) {
2756	case SYSCTL_VERS_0:
2757		return (EINVAL);
2758
2759#if (SYSCTL_VERSION != SYSCTL_VERS_1)
2760#error sysctl_cvt_out: no support for SYSCTL_VERSION
2761#endif /*  (SYSCTL_VERSION != SYSCTL_VERS_1) */
2762
2763	case SYSCTL_VERSION:
2764		/* nothing more to do here */
2765		break;
2766	}
2767
2768	if (ovp != NULL && left >= sz) {
2769		error = sysctl_copyout(l, src, ovp, sz);
2770		if (error)
2771			return (error);
2772	}
2773
2774	if (szp != NULL)
2775		*szp = sz;
2776
2777	return (0);
2778}
2779