cc.c revision 215391
1/*-
2 * Copyright (c) 2007-2008
3 *	Swinburne University of Technology, Melbourne, Australia.
4 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
5 * Copyright (c) 2010 The FreeBSD Foundation
6 * All rights reserved.
7 *
8 * This software was developed at the Centre for Advanced Internet
9 * Architectures, Swinburne University, by Lawrence Stewart and James Healy,
10 * made possible in part by a grant from the Cisco University Research Program
11 * Fund at Community Foundation Silicon Valley.
12 *
13 * Portions of this software were developed at the Centre for Advanced
14 * Internet Architectures, Swinburne University of Technology, Melbourne,
15 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39/*
40 * This software was first released in 2007 by James Healy and Lawrence Stewart
41 * whilst working on the NewTCP research project at Swinburne University's
42 * Centre for Advanced Internet Architectures, Melbourne, Australia, which was
43 * made possible in part by a grant from the Cisco University Research Program
44 * Fund at Community Foundation Silicon Valley. More details are available at:
45 *   http://caia.swin.edu.au/urp/newtcp/
46 */
47
48#include <sys/cdefs.h>
49__FBSDID("$FreeBSD: head/sys/netinet/cc/cc.c 215391 2010-11-16 07:57:56Z lstewart $");
50
51#include <sys/param.h>
52#include <sys/kernel.h>
53#include <sys/libkern.h>
54#include <sys/lock.h>
55#include <sys/malloc.h>
56#include <sys/module.h>
57#include <sys/mutex.h>
58#include <sys/queue.h>
59#include <sys/rwlock.h>
60#include <sys/sbuf.h>
61#include <sys/socket.h>
62#include <sys/socketvar.h>
63#include <sys/sysctl.h>
64
65#include <net/if.h>
66#include <net/if_var.h>
67
68#include <netinet/cc.h>
69#include <netinet/in.h>
70#include <netinet/in_pcb.h>
71#include <netinet/tcp_var.h>
72
73#include <netinet/cc/cc_module.h>
74
75/*
76 * List of available cc algorithms on the current system. First element
77 * is used as the system default CC algorithm.
78 */
79struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
80
81/* Protects the cc_list TAILQ. */
82struct rwlock cc_list_lock;
83
84/*
85 * Set the default CC algorithm to new_default. The default is identified
86 * by being the first element in the cc_list TAILQ.
87 */
88static void
89cc_set_default(struct cc_algo *new_default)
90{
91	CC_LIST_WLOCK_ASSERT();
92
93	/*
94	 * Make the requested system default CC algorithm the first element in
95	 * the list if it isn't already.
96	 */
97	if (new_default != CC_DEFAULT()) {
98		STAILQ_REMOVE(&cc_list, new_default, cc_algo, entries);
99		STAILQ_INSERT_HEAD(&cc_list, new_default, entries);
100	}
101}
102
103/*
104 * Sysctl handler to show and change the default CC algorithm.
105 */
106static int
107cc_default_algo(SYSCTL_HANDLER_ARGS)
108{
109	struct cc_algo *funcs;
110	int err, found;
111
112	err = found = 0;
113
114	if (req->newptr == NULL) {
115		char default_cc[TCP_CA_NAME_MAX];
116
117		/* Just print the current default. */
118		CC_LIST_RLOCK();
119		strlcpy(default_cc, CC_DEFAULT()->name, TCP_CA_NAME_MAX);
120		CC_LIST_RUNLOCK();
121		err = sysctl_handle_string(oidp, default_cc, 1, req);
122	} else {
123		/* Find algo with specified name and set it to default. */
124		CC_LIST_WLOCK();
125		STAILQ_FOREACH(funcs, &cc_list, entries) {
126			if (strncmp((char *)req->newptr, funcs->name,
127			    TCP_CA_NAME_MAX) == 0) {
128				found = 1;
129				cc_set_default(funcs);
130			}
131		}
132		CC_LIST_WUNLOCK();
133
134		if (!found)
135			err = ESRCH;
136	}
137
138	return (err);
139}
140
141/*
142 * Sysctl handler to display the list of available CC algorithms.
143 */
144static int
145cc_list_available(SYSCTL_HANDLER_ARGS)
146{
147	struct cc_algo *algo;
148	struct sbuf *s;
149	int err, first;
150
151	err = 0;
152	first = 1;
153	s = sbuf_new(NULL, NULL, TCP_CA_NAME_MAX, SBUF_AUTOEXTEND);
154
155	if (s == NULL)
156		return (ENOMEM);
157
158	CC_LIST_RLOCK();
159	STAILQ_FOREACH(algo, &cc_list, entries) {
160		err = sbuf_printf(s, first ? "%s" : ", %s", algo->name);
161		if (err)
162			break;
163		first = 0;
164	}
165	CC_LIST_RUNLOCK();
166
167	if (!err) {
168		sbuf_finish(s);
169		err = sysctl_handle_string(oidp, sbuf_data(s), 1, req);
170	}
171
172	sbuf_delete(s);
173	return (err);
174}
175
176/*
177 * Initialise CC subsystem on system boot.
178 */
179static void
180cc_init(void)
181{
182	CC_LIST_LOCK_INIT();
183	STAILQ_INIT(&cc_list);
184}
185
186/*
187 * Returns non-zero on success, 0 on failure.
188 */
189int
190cc_deregister_algo(struct cc_algo *remove_cc)
191{
192	struct cc_algo *funcs, *tmpfuncs;
193	struct tcpcb *tp;
194	struct inpcb *inp;
195	int err;
196	VNET_ITERATOR_DECL(vnet_iter);
197
198	err = ENOENT;
199
200	/* Never allow newreno to be deregistered. */
201	if (&newreno_cc_algo == remove_cc)
202		return (EPERM);
203
204	/* Remove algo from cc_list so that new connections can't use it. */
205	CC_LIST_WLOCK();
206	STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
207		if (funcs == remove_cc) {
208			/*
209			 * If we're removing the current system default,
210			 * reset the default to newreno.
211			 */
212			if (strncmp(CC_DEFAULT()->name, remove_cc->name,
213			    TCP_CA_NAME_MAX) == 0)
214				cc_set_default(&newreno_cc_algo);
215
216			STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
217			err = 0;
218			break;
219		}
220	}
221	CC_LIST_WUNLOCK();
222
223	if (!err) {
224		/*
225		 * Check all active control blocks across all network stacks and
226		 * change any that are using this algorithm back to newreno. If
227		 * the algorithm that was in use requires cleanup code to be
228		 * run, call it.
229		 *
230		 * New connections already part way through being initialised
231		 * with the CC algo we're removing will not race with this code
232		 * because the INP_INFO_WLOCK is held during initialisation.
233		 * We therefore don't enter the loop below until the connection
234		 * list has stabilised.
235		 */
236		VNET_LIST_RLOCK();
237		VNET_FOREACH(vnet_iter) {
238			CURVNET_SET(vnet_iter);
239			INP_INFO_RLOCK(&V_tcbinfo);
240			LIST_FOREACH(inp, &V_tcb, inp_list) {
241				INP_WLOCK(inp);
242				/* Important to skip tcptw structs. */
243				if (!(inp->inp_flags & INP_TIMEWAIT) &&
244				    (tp = intotcpcb(inp)) != NULL) {
245					/*
246					 * By holding INP_WLOCK here, we are
247					 * assured that the connection is not
248					 * currently executing inside the CC
249					 * module's functions i.e. it is safe
250					 * to make the switch back to newreno.
251					 */
252					if (CC_ALGO(tp) == remove_cc) {
253						tmpfuncs = CC_ALGO(tp);
254						/*
255						 * Newreno does not
256						 * require any init.
257						 */
258						CC_ALGO(tp) = &newreno_cc_algo;
259						if (tmpfuncs->cb_destroy != NULL)
260							tmpfuncs->cb_destroy(tp->ccv);
261					}
262				}
263				INP_WUNLOCK(inp);
264			}
265			INP_INFO_RUNLOCK(&V_tcbinfo);
266			CURVNET_RESTORE();
267		}
268		VNET_LIST_RUNLOCK();
269	}
270
271	return (err);
272}
273
274/*
275 * Returns 0 on success, non-zero on failure.
276 */
277int
278cc_register_algo(struct cc_algo *add_cc)
279{
280	struct cc_algo *funcs;
281	int err;
282
283	err = 0;
284
285	/*
286	 * Iterate over list of registered CC algorithms and make sure
287	 * we're not trying to add a duplicate.
288	 */
289	CC_LIST_WLOCK();
290	STAILQ_FOREACH(funcs, &cc_list, entries) {
291		if (funcs == add_cc || strncmp(funcs->name, add_cc->name,
292		    TCP_CA_NAME_MAX) == 0)
293			err = EEXIST;
294	}
295
296	if (!err)
297		STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
298
299	CC_LIST_WUNLOCK();
300
301	return (err);
302}
303
304/*
305 * Handles kld related events. Returns 0 on success, non-zero on failure.
306 */
307int
308cc_modevent(module_t mod, int event_type, void *data)
309{
310	struct cc_algo *algo;
311	int err;
312
313	err = 0;
314	algo = (struct cc_algo *)data;
315
316	switch(event_type) {
317	case MOD_LOAD:
318		if (algo->mod_init != NULL)
319			err = algo->mod_init();
320		if (!err)
321			err = cc_register_algo(algo);
322		break;
323
324	case MOD_QUIESCE:
325	case MOD_SHUTDOWN:
326	case MOD_UNLOAD:
327		err = cc_deregister_algo(algo);
328		if (!err && algo->mod_destroy != NULL)
329			algo->mod_destroy();
330		if (err == ENOENT)
331			err = 0;
332		break;
333
334	default:
335		err = EINVAL;
336		break;
337	}
338
339	return (err);
340}
341
342SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL);
343
344/* Declare sysctl tree and populate it. */
345SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL,
346    "congestion control related settings");
347
348SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW,
349    NULL, 0, cc_default_algo, "A", "default congestion control algorithm");
350
351SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD,
352    NULL, 0, cc_list_available, "A",
353    "list available congestion control algorithms");
354