1/*-
2 * Copyright (c) 2007-2008
3 *	Swinburne University of Technology, Melbourne, Australia.
4 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
5 * Copyright (c) 2010 The FreeBSD Foundation
6 * All rights reserved.
7 *
8 * This software was developed at the Centre for Advanced Internet
9 * Architectures, Swinburne University of Technology, by Lawrence Stewart and
10 * James Healy, made possible in part by a grant from the Cisco University
11 * Research Program Fund at Community Foundation Silicon Valley.
12 *
13 * Portions of this software were developed at the Centre for Advanced
14 * Internet Architectures, Swinburne University of Technology, Melbourne,
15 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39/*
40 * This software was first released in 2007 by James Healy and Lawrence Stewart
41 * whilst working on the NewTCP research project at Swinburne University of
42 * Technology's Centre for Advanced Internet Architectures, Melbourne,
43 * Australia, which was made possible in part by a grant from the Cisco
44 * University Research Program Fund at Community Foundation Silicon Valley.
45 * More details are available at:
46 *   http://caia.swin.edu.au/urp/newtcp/
47 */
48
49#include <sys/cdefs.h>
50__FBSDID("$FreeBSD$");
51
52#include <sys/param.h>
53#include <sys/kernel.h>
54#include <sys/libkern.h>
55#include <sys/lock.h>
56#include <sys/malloc.h>
57#include <sys/module.h>
58#include <sys/mutex.h>
59#include <sys/queue.h>
60#include <sys/rwlock.h>
61#include <sys/sbuf.h>
62#include <sys/socket.h>
63#include <sys/socketvar.h>
64#include <sys/sysctl.h>
65
66#include <net/if.h>
67#include <net/if_var.h>
68
69#include <netinet/cc.h>
70#include <netinet/in.h>
71#include <netinet/in_pcb.h>
72#include <netinet/tcp_var.h>
73
74#include <netinet/cc/cc_module.h>
75
/*
 * List of available cc algorithms on the current system. Entries are
 * appended by cc_register_algo() and unlinked by cc_deregister_algo().
 * NOTE(review): this comment used to state that the first element is the
 * system default CC algorithm; the code in this file tracks the default
 * through default_cc_ptr and never reorders the list -- confirm against the
 * definition of CC_DEFAULT().
 */
struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);

/* Protects the cc_list STAILQ. */
struct rwlock cc_list_lock;

/* Per-vnet pointer to the default CC algorithm; starts out as NewReno. */
VNET_DEFINE(struct cc_algo *, default_cc_ptr) = &newreno_cc_algo;
86
87/*
88 * Sysctl handler to show and change the default CC algorithm.
89 */
90static int
91cc_default_algo(SYSCTL_HANDLER_ARGS)
92{
93	char default_cc[TCP_CA_NAME_MAX];
94	struct cc_algo *funcs;
95	int err, found;
96
97	err = found = 0;
98
99	if (req->newptr == NULL) {
100		/* Just print the current default. */
101		CC_LIST_RLOCK();
102		strlcpy(default_cc, CC_DEFAULT()->name, TCP_CA_NAME_MAX);
103		CC_LIST_RUNLOCK();
104		err = sysctl_handle_string(oidp, default_cc, 1, req);
105	} else {
106		/* Find algo with specified name and set it to default. */
107		CC_LIST_RLOCK();
108		STAILQ_FOREACH(funcs, &cc_list, entries) {
109			if (strncmp((char *)req->newptr, funcs->name,
110			    TCP_CA_NAME_MAX) == 0) {
111				found = 1;
112				V_default_cc_ptr = funcs;
113			}
114		}
115		CC_LIST_RUNLOCK();
116
117		if (!found)
118			err = ESRCH;
119	}
120
121	return (err);
122}
123
124/*
125 * Sysctl handler to display the list of available CC algorithms.
126 */
127static int
128cc_list_available(SYSCTL_HANDLER_ARGS)
129{
130	struct cc_algo *algo;
131	struct sbuf *s;
132	int err, first, nalgos;
133
134	err = nalgos = 0;
135	first = 1;
136
137	CC_LIST_RLOCK();
138	STAILQ_FOREACH(algo, &cc_list, entries) {
139		nalgos++;
140	}
141	CC_LIST_RUNLOCK();
142
143	s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN);
144
145	if (s == NULL)
146		return (ENOMEM);
147
148	/*
149	 * It is theoretically possible for the CC list to have grown in size
150	 * since the call to sbuf_new() and therefore for the sbuf to be too
151	 * small. If this were to happen (incredibly unlikely), the sbuf will
152	 * reach an overflow condition, sbuf_printf() will return an error and
153	 * the sysctl will fail gracefully.
154	 */
155	CC_LIST_RLOCK();
156	STAILQ_FOREACH(algo, &cc_list, entries) {
157		err = sbuf_printf(s, first ? "%s" : ", %s", algo->name);
158		if (err) {
159			/* Sbuf overflow condition. */
160			err = EOVERFLOW;
161			break;
162		}
163		first = 0;
164	}
165	CC_LIST_RUNLOCK();
166
167	if (!err) {
168		sbuf_finish(s);
169		err = sysctl_handle_string(oidp, sbuf_data(s), 1, req);
170	}
171
172	sbuf_delete(s);
173	return (err);
174}
175
176/*
177 * Reset the default CC algo to NewReno for any netstack which is using the algo
178 * that is about to go away as its default.
179 */
180static void
181cc_checkreset_default(struct cc_algo *remove_cc)
182{
183	VNET_ITERATOR_DECL(vnet_iter);
184
185	CC_LIST_LOCK_ASSERT();
186
187	VNET_LIST_RLOCK_NOSLEEP();
188	VNET_FOREACH(vnet_iter) {
189		CURVNET_SET(vnet_iter);
190		if (strncmp(CC_DEFAULT()->name, remove_cc->name,
191		    TCP_CA_NAME_MAX) == 0)
192			V_default_cc_ptr = &newreno_cc_algo;
193		CURVNET_RESTORE();
194	}
195	VNET_LIST_RUNLOCK_NOSLEEP();
196}
197
198/*
199 * Initialise CC subsystem on system boot.
200 */
201static void
202cc_init(void)
203{
204	CC_LIST_LOCK_INIT();
205	STAILQ_INIT(&cc_list);
206}
207
208/*
209 * Returns non-zero on success, 0 on failure.
210 */
211int
212cc_deregister_algo(struct cc_algo *remove_cc)
213{
214	struct cc_algo *funcs, *tmpfuncs;
215	int err;
216
217	err = ENOENT;
218
219	/* Never allow newreno to be deregistered. */
220	if (&newreno_cc_algo == remove_cc)
221		return (EPERM);
222
223	/* Remove algo from cc_list so that new connections can't use it. */
224	CC_LIST_WLOCK();
225	STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
226		if (funcs == remove_cc) {
227			cc_checkreset_default(remove_cc);
228			STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
229			err = 0;
230			break;
231		}
232	}
233	CC_LIST_WUNLOCK();
234
235	if (!err)
236		/*
237		 * XXXLAS:
238		 * - We may need to handle non-zero return values in future.
239		 * - If we add CC framework support for protocols other than
240		 *   TCP, we may want a more generic way to handle this step.
241		 */
242		tcp_ccalgounload(remove_cc);
243
244	return (err);
245}
246
247/*
248 * Returns 0 on success, non-zero on failure.
249 */
250int
251cc_register_algo(struct cc_algo *add_cc)
252{
253	struct cc_algo *funcs;
254	int err;
255
256	err = 0;
257
258	/*
259	 * Iterate over list of registered CC algorithms and make sure
260	 * we're not trying to add a duplicate.
261	 */
262	CC_LIST_WLOCK();
263	STAILQ_FOREACH(funcs, &cc_list, entries) {
264		if (funcs == add_cc || strncmp(funcs->name, add_cc->name,
265		    TCP_CA_NAME_MAX) == 0)
266			err = EEXIST;
267	}
268
269	if (!err)
270		STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
271
272	CC_LIST_WUNLOCK();
273
274	return (err);
275}
276
277/*
278 * Handles kld related events. Returns 0 on success, non-zero on failure.
279 */
280int
281cc_modevent(module_t mod, int event_type, void *data)
282{
283	struct cc_algo *algo;
284	int err;
285
286	err = 0;
287	algo = (struct cc_algo *)data;
288
289	switch(event_type) {
290	case MOD_LOAD:
291		if (algo->mod_init != NULL)
292			err = algo->mod_init();
293		if (!err)
294			err = cc_register_algo(algo);
295		break;
296
297	case MOD_QUIESCE:
298	case MOD_SHUTDOWN:
299	case MOD_UNLOAD:
300		err = cc_deregister_algo(algo);
301		if (!err && algo->mod_destroy != NULL)
302			algo->mod_destroy();
303		if (err == ENOENT)
304			err = 0;
305		break;
306
307	default:
308		err = EINVAL;
309		break;
310	}
311
312	return (err);
313}
314
/* Arrange for cc_init() to run at boot to set up the CC list and its lock. */
SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL);

/* Declare sysctl tree and populate it. */
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL,
    "congestion control related settings");

/* Read/write string OID "algorithm", serviced by cc_default_algo(). */
SYSCTL_VNET_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, cc_default_algo, "A", "default congestion control algorithm");

/* Read-only string OID "available", serviced by cc_list_available(). */
SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD,
    NULL, 0, cc_list_available, "A",
    "list available congestion control algorithms");
327