1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1980, 1986, 1991, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 *    may be used to endorse or promote products derived from this software
17 *    without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31/************************************************************************
 * Note: In this file a 'fib' is a "forwarding information base",	*
 * which is the new name for an in-kernel routing (next hop) table.	*
34 ***********************************************************************/
35
36#include <sys/cdefs.h>
37#include "opt_route.h"
38
39#include <sys/param.h>
40#include <sys/socket.h>
41#include <sys/systm.h>
42#include <sys/malloc.h>
43#include <sys/jail.h>
44#include <sys/osd.h>
45#include <sys/proc.h>
46#include <sys/sysctl.h>
47#include <sys/syslog.h>
48#include <sys/kernel.h>
49#include <sys/lock.h>
50#include <sys/sx.h>
51#include <sys/domain.h>
52#include <sys/sysproto.h>
53
54#include <net/vnet.h>
55#include <net/route.h>
56#include <net/route/route_ctl.h>
57#include <net/route/route_var.h>
58
/*
 * Compile-time default number of FIBs.
 * When the ROUTETABLES kernel config option is present it is range-checked
 * against (0, RT_MAXFIBS] and becomes RT_NUMFIBS.
 */
#if defined(ROUTETABLES)
#if ROUTETABLES <= 0
#error "ROUTETABLES defined too low"
#elif ROUTETABLES > RT_MAXFIBS
#error "ROUTETABLES defined too big"
#endif
#define	RT_NUMFIBS	ROUTETABLES
#endif /* ROUTETABLES */
/* Nothing configured: fall back to a single table. */
#if !defined(RT_NUMFIBS)
#define	RT_NUMFIBS	1
#endif
73
/*
 * Forward declaration: grow_rtables() is called by sysctl_fibs() before its
 * definition further down in this file.
 */
static void grow_rtables(uint32_t num_fibs);

/*
 * Per-vnet sx lock serializing changes to the routing table array.
 * RTABLES_LOCK()/RTABLES_UNLOCK() take and drop it exclusively;
 * RTABLES_LOCK_ASSERT() checks that it is held (grow_rtables() relies on
 * this).
 */
VNET_DEFINE_STATIC(struct sx, rtables_lock);
#define	V_rtables_lock		VNET(rtables_lock)
#define	RTABLES_LOCK()		sx_xlock(&V_rtables_lock)
#define	RTABLES_UNLOCK()	sx_xunlock(&V_rtables_lock)
#define	RTABLES_LOCK_INIT()	sx_init(&V_rtables_lock, "rtables lock")
#define	RTABLES_LOCK_ASSERT()	sx_assert(&V_rtables_lock, SA_LOCKED)

/*
 * Per-vnet flat array of rib_head pointers, indexed as
 * [fib * (AF_MAX + 1) + address family].  Allocated and replaced by
 * grow_rtables().
 */
VNET_DEFINE_STATIC(struct rib_head **, rt_tables);
#define	V_rt_tables	VNET(rt_tables)

/*
 * Per-vnet number of fibs, starting at the compile-time default.
 * NOTE(review): presumably accessed through V_rt_numfibs, which is defined
 * outside this file -- confirm.
 */
VNET_DEFINE(uint32_t, _rt_numfibs) = RT_NUMFIBS;
87
88/*
89 * Handler for net.my_fibnum.
90 * Returns current fib of the process.
91 */
92static int
93sysctl_my_fibnum(SYSCTL_HANDLER_ARGS)
94{
95        int fibnum;
96        int error;
97
98        fibnum = curthread->td_proc->p_fibnum;
99        error = sysctl_handle_int(oidp, &fibnum, 0, req);
100        return (error);
101}
102SYSCTL_PROC(_net, OID_AUTO, my_fibnum,
103    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
104    &sysctl_my_fibnum, "I",
105    "default FIB of caller");
106
107static uint32_t
108normalize_num_rtables(uint32_t num_rtables)
109{
110
111	if (num_rtables > RT_MAXFIBS)
112		num_rtables = RT_MAXFIBS;
113	else if (num_rtables == 0)
114		num_rtables = 1;
115	return (num_rtables);
116}
117
118/*
119 * Sets the number of fibs in the current vnet.
120 * Function does not allow shrinking number of rtables.
121 */
122static int
123sysctl_fibs(SYSCTL_HANDLER_ARGS)
124{
125	uint32_t new_fibs;
126	int error;
127
128	RTABLES_LOCK();
129	new_fibs = V_rt_numfibs;
130	error = sysctl_handle_32(oidp, &new_fibs, 0, req);
131	if (error == 0) {
132		new_fibs = normalize_num_rtables(new_fibs);
133
134		if (new_fibs < V_rt_numfibs)
135			error = ENOTCAPABLE;
136		if (new_fibs > V_rt_numfibs)
137			grow_rtables(new_fibs);
138	}
139	RTABLES_UNLOCK();
140
141	return (error);
142}
143SYSCTL_PROC(_net, OID_AUTO, fibs,
144    CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
145    NULL, 0, &sysctl_fibs, "IU",
146    "set number of fibs");
147
148/*
149 * Sets fib of a current process.
150 */
151int
152sys_setfib(struct thread *td, struct setfib_args *uap)
153{
154	int error = 0;
155
156	CURVNET_SET(TD_TO_VNET(td));
157	if (uap->fibnum >= 0 && uap->fibnum < V_rt_numfibs)
158		td->td_proc->p_fibnum = uap->fibnum;
159	else
160		error = EINVAL;
161	CURVNET_RESTORE();
162
163	return (error);
164}
165
166static int
167rtables_check_proc_fib(void *obj, void *data)
168{
169	struct prison *pr = obj;
170	struct thread *td = data;
171	int error = 0;
172
173	if (TD_TO_VNET(td) != pr->pr_vnet) {
174		/* number of fibs may be lower in a new vnet */
175		CURVNET_SET(pr->pr_vnet);
176		if (td->td_proc->p_fibnum >= V_rt_numfibs)
177			error = EINVAL;
178		CURVNET_RESTORE();
179	}
180	return (error);
181}
182
/*
 * Destructor passed to osd_jail_register() by rtables_init().
 * Intentionally does nothing.
 */
static void
rtables_prison_destructor(void *data)
{

	(void)data;
}
187
188static void
189rtables_init(void)
190{
191	osd_method_t methods[PR_MAXMETHOD] = {
192	    [PR_METHOD_ATTACH] =	rtables_check_proc_fib,
193	};
194	osd_jail_register(rtables_prison_destructor, methods);
195}
196SYSINIT(rtables_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtables_init, NULL);
197
198
199/*
200 * If required, copy interface routes from existing tables to the
201 * newly-created routing table.
202 */
203static void
204populate_kernel_routes(struct rib_head **new_rt_tables, struct rib_head *rh)
205{
206	for (int i = 0; i < V_rt_numfibs; i++) {
207		struct rib_head *rh_src = new_rt_tables[i * (AF_MAX + 1) + rh->rib_family];
208		if ((rh_src != NULL) && (rh_src != rh))
209			rib_copy_kernel_routes(rh_src, rh);
210	}
211}
212
/*
 * Grows the number of routing tables in the current vnet.
 * Function creates new index array for all rtables and allocates
 *  remaining routing tables.
 *
 * num_tables is the new total number of fibs; it must not be lower than
 *  the current V_rt_numfibs (asserted below).
 * The rtables lock must be held by the caller.
 */
static void
grow_rtables(uint32_t num_tables)
{
	struct domain *dom;
	struct rib_head **prnh, *rh;
	struct rib_head **new_rt_tables, **old_rt_tables;
	int family;

	RTABLES_LOCK_ASSERT();

	KASSERT(num_tables >= V_rt_numfibs, ("num_tables(%u) < rt_numfibs(%u)\n",
				num_tables, V_rt_numfibs));

	/* One zero-filled pointer slot per (fib, address family) pair. */
	new_rt_tables = mallocarray(num_tables * (AF_MAX + 1), sizeof(void *),
	    M_RTABLE, M_WAITOK | M_ZERO);

	if ((num_tables > 1) && (V_rt_add_addr_allfibs == 0))
		printf("WARNING: Adding ifaddrs to all fibs has been turned off "
			"by default. Consider tuning %s if needed\n",
			"net.add_addr_allfibs");

#ifdef FIB_ALGO
	fib_grow_rtables(num_tables);
#endif

	/*
	 * Current rt_tables layout:
	 * fib0[af0, af1, af2, .., AF_MAX]fib1[af0, af1, af2, .., AF_MAX]..
	 * This allows copying the existing tables' data with memcpy().
	 * V_rt_tables is NULL when no array has been installed yet.
	 */
	if (V_rt_tables != NULL)
		memcpy(new_rt_tables, V_rt_tables,
		    V_rt_numfibs * (AF_MAX + 1) * sizeof(void *));

	/*
	 * Populate the remainders: for every domain that provides
	 * dom_rtattach, fill each slot that is still NULL.
	 */
	SLIST_FOREACH(dom, &domains, dom_next) {
		if (dom->dom_rtattach == NULL)
			continue;
		family = dom->dom_family;
		for (int i = 0; i < num_tables; i++) {
			prnh = &new_rt_tables[i * (AF_MAX + 1) + family];
			/* Slot already holds a table (copied above). */
			if (*prnh != NULL)
				continue;
			rh = dom->dom_rtattach(i);
			/* A failed attach is logged; the slot stays NULL. */
			if (rh == NULL)
				log(LOG_ERR, "unable to create routing table for %d.%d\n",
				    dom->dom_family, i);
			else
				populate_kernel_routes(new_rt_tables, rh);
			*prnh = rh;
		}
	}

	/*
	 * Update rtables pointer.
	 * Ensure all writes to new_rt_tables have been completed before
	 *  switching pointer.
	 */
	atomic_thread_fence_rel();
	old_rt_tables = V_rt_tables;
	V_rt_tables = new_rt_tables;

	/* Wait till all cpus see new pointers */
	atomic_thread_fence_rel();
	NET_EPOCH_WAIT();

	/*
	 * Set number of fibs to a new value.
	 * NOTE(review): done only after the new array is published,
	 *  presumably so that readers bounding their index by V_rt_numfibs
	 *  never index the old, shorter array -- confirm.
	 */
	V_rt_numfibs = num_tables;

#ifdef FIB_ALGO
	/* Attach fib algo to the new rtables */
	SLIST_FOREACH(dom, &domains, dom_next) {
		if (dom->dom_rtattach != NULL)
			fib_setup_family(dom->dom_family, num_tables);
	}
#endif

	/* The previous index array is released after the epoch wait above. */
	if (old_rt_tables != NULL)
		free(old_rt_tables, M_RTABLE);
}
298
299static void
300vnet_rtables_init(const void *unused __unused)
301{
302	int num_rtables_base;
303
304	if (IS_DEFAULT_VNET(curvnet)) {
305		num_rtables_base = RT_NUMFIBS;
306		TUNABLE_INT_FETCH("net.fibs", &num_rtables_base);
307		V_rt_numfibs = normalize_num_rtables(num_rtables_base);
308	} else
309		V_rt_numfibs = 1;
310
311	vnet_rtzone_init();
312#ifdef FIB_ALGO
313	vnet_fib_init();
314#endif
315	RTABLES_LOCK_INIT();
316
317	RTABLES_LOCK();
318	grow_rtables(V_rt_numfibs);
319	RTABLES_UNLOCK();
320}
321VNET_SYSINIT(vnet_rtables_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
322    vnet_rtables_init, 0);
323
#ifdef VIMAGE
/*
 * Per-vnet teardown of the routing tables.
 * Detaches every existing table of every domain that provides
 * dom_rtdetach, waits for the scheduled destruction callbacks, then frees
 * the table index array and the remaining per-vnet routing state.
 */
static void
rtables_destroy(const void *unused __unused)
{
	struct rib_head *rnh;
	struct domain *dom;
	int family;

	RTABLES_LOCK();
	SLIST_FOREACH(dom, &domains, dom_next) {
		if (dom->dom_rtdetach == NULL)
			continue;
		family = dom->dom_family;
		for (int i = 0; i < V_rt_numfibs; i++) {
			rnh = rt_tables_get_rnh(i, family);
			/*
			 * grow_rtables() leaves a slot NULL when the domain
			 * has no dom_rtattach or the attach failed; there is
			 * nothing to detach in that case.
			 */
			if (rnh == NULL)
				continue;
			dom->dom_rtdetach(rnh);
		}
	}
	RTABLES_UNLOCK();

	/*
	 * dom_rtdetach calls rt_table_destroy(), which
	 *  schedules deletion for all rtentries, nexthops and control
	 *  structures. Wait for the destruction callbacks to fire.
	 * Note that this should result in freeing all rtentries, but
	 *  nexthops deletions will be scheduled for the next epoch run
	 *  and will be completed after vnet teardown.
	 */
	NET_EPOCH_DRAIN_CALLBACKS();

	free(V_rt_tables, M_RTABLE);
	vnet_rtzone_destroy();
#ifdef FIB_ALGO
	vnet_fib_destroy();
#endif
}
VNET_SYSUNINIT(rtables_destroy, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
    rtables_destroy, 0);
#endif
363
364static inline struct rib_head *
365rt_tables_get_rnh_ptr(uint32_t table, sa_family_t family)
366{
367	struct rib_head **prnh;
368
369	KASSERT(table < V_rt_numfibs,
370	    ("%s: table out of bounds (%d < %d)", __func__, table,
371	     V_rt_numfibs));
372	KASSERT(family < (AF_MAX + 1),
373	    ("%s: fam out of bounds (%d < %d)", __func__, family, AF_MAX + 1));
374
375	/* rnh is [fib=0][af=0]. */
376	prnh = V_rt_tables;
377	/* Get the offset to the requested table and fam. */
378	prnh += table * (AF_MAX + 1) + family;
379
380	return (*prnh);
381}
382
383struct rib_head *
384rt_tables_get_rnh(uint32_t table, sa_family_t family)
385{
386
387	return (rt_tables_get_rnh_ptr(table, family));
388}
389
390struct rib_head *
391rt_tables_get_rnh_safe(uint32_t table, sa_family_t family)
392{
393	if (__predict_false(table >= V_rt_numfibs))
394		return (NULL);
395	if (__predict_false(family >= (AF_MAX + 1)))
396		return (NULL);
397	return (rt_tables_get_rnh_ptr(table, family));
398}
399
400u_int
401rt_tables_get_gen(uint32_t table, sa_family_t family)
402{
403	struct rib_head *rnh;
404
405	rnh = rt_tables_get_rnh_ptr(table, family);
406	KASSERT(rnh != NULL, ("%s: NULL rib_head pointer table %d family %d",
407	    __func__, table, family));
408	return (rnh->rnh_gen);
409}
410