cc_chd.c revision 218155
1/*-
2 * Copyright (c) 2009-2010
3 *	Swinburne University of Technology, Melbourne, Australia
4 * Copyright (c) 2010-2011 The FreeBSD Foundation
5 * All rights reserved.
6 *
7 * This software was developed at the Centre for Advanced Internet
8 * Architectures, Swinburne University, by David Hayes and Lawrence Stewart,
9 * made possible in part by a grant from the Cisco University Research Program
10 * Fund at Community Foundation Silicon Valley.
11 *
12 * Portions of this software were developed at the Centre for Advanced Internet
13 * Architectures, Swinburne University of Technology, Melbourne, Australia by
14 * David Hayes under sponsorship from the FreeBSD Foundation.
15 *
16 * Redistribution and use in source and binary forms, with or without
17 * modification, are permitted provided that the following conditions
18 * are met:
19 * 1. Redistributions of source code must retain the above copyright
20 *    notice, this list of conditions and the following disclaimer.
21 * 2. Redistributions in binary form must reproduce the above copyright
22 *    notice, this list of conditions and the following disclaimer in the
23 *    documentation and/or other materials provided with the distribution.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 */
37
38/*
39 * An implementation of the CAIA-Hamilton delay based congestion control
40 * algorithm, based on "Improved coexistence and loss tolerance for delay based
41 * TCP congestion control" by D. A. Hayes and G. Armitage., in 35th Annual IEEE
42 * Conference on Local Computer Networks (LCN 2010), Denver, Colorado, USA,
43 * 11-14 October 2010.
44 *
45 * Originally released as part of the NewTCP research project at Swinburne
46 * University's Centre for Advanced Internet Architectures, Melbourne,
47 * Australia, which was made possible in part by a grant from the Cisco
48 * University Research Program Fund at Community Foundation Silicon Valley. More
49 * details are available at:
50 *   http://caia.swin.edu.au/urp/newtcp/
51 */
52
53#include <sys/cdefs.h>
54__FBSDID("$FreeBSD: head/sys/netinet/cc/cc_chd.c 218155 2011-02-01 07:05:14Z lstewart $");
55
56#include <sys/param.h>
57#include <sys/kernel.h>
58#include <sys/khelp.h>
59#include <sys/limits.h>
60#include <sys/malloc.h>
61#include <sys/module.h>
62#include <sys/queue.h>
63#include <sys/socket.h>
64#include <sys/socketvar.h>
65#include <sys/sysctl.h>
66#include <sys/systm.h>
67
68#include <net/if.h>
69#include <net/vnet.h>
70
71#include <netinet/cc.h>
72#include <netinet/tcp_seq.h>
73#include <netinet/tcp_timer.h>
74#include <netinet/tcp_var.h>
75
76#include <netinet/cc/cc_module.h>
77
78#include <netinet/khelp/h_ertt.h>
79
80#define	CAST_PTR_INT(X)	(*((int*)(X)))
81
82/*
83 * Private signal type for rate based congestion signal.
84 * See <netinet/cc.h> for appropriate bit-range to use for private signals.
85 */
86#define	CC_CHD_DELAY	0x02000000
87
88/* Largest possible number returned by random(). */
89#define	RANDOM_MAX	INT_MAX
90
91static void	chd_ack_received(struct cc_var *ccv, uint16_t ack_type);
92static void	chd_cb_destroy(struct cc_var *ccv);
93static int	chd_cb_init(struct cc_var *ccv);
94static void	chd_cong_signal(struct cc_var *ccv, uint32_t signal_type);
95static void	chd_conn_init(struct cc_var *ccv);
96static int	chd_mod_init(void);
97
98struct chd {
99	/*
100	 * Shadow window - keeps track of what the NewReno congestion window
101	 * would have been if delay-based cwnd backoffs had not been made. This
102	 * functionality aids coexistence with loss-based TCP flows which may be
103	 * sharing links along the path.
104	 */
105	unsigned long shadow_w;
106	/*
107	 * Loss-based TCP compatibility flag - When set, it turns on the shadow
108	 * window functionality.
109	 */
110	int loss_compete;
111	 /* The maximum round trip time seen within a measured rtt period. */
112	int maxrtt_in_rtt;
113	/* The previous qdly that caused cwnd to backoff. */
114	int prev_backoff_qdly;
115};
116
117static int ertt_id;
118
119static VNET_DEFINE(uint32_t, chd_qmin) = 5;
120static VNET_DEFINE(uint32_t, chd_pmax) = 50;
121static VNET_DEFINE(uint32_t, chd_loss_fair) = 1;
122static VNET_DEFINE(uint32_t, chd_use_max) = 1;
123static VNET_DEFINE(uint32_t, chd_qthresh) = 20;
124#define	V_chd_qthresh	VNET(chd_qthresh)
125#define	V_chd_qmin	VNET(chd_qmin)
126#define	V_chd_pmax	VNET(chd_pmax)
127#define	V_chd_loss_fair	VNET(chd_loss_fair)
128#define	V_chd_use_max	VNET(chd_use_max)
129
130MALLOC_DECLARE(M_CHD);
131MALLOC_DEFINE(M_CHD, "chd data",
132    "Per connection data required for the CHD congestion control algorithm");
133
134struct cc_algo chd_cc_algo = {
135	.name = "chd",
136	.ack_received = chd_ack_received,
137	.cb_destroy = chd_cb_destroy,
138	.cb_init = chd_cb_init,
139	.cong_signal = chd_cong_signal,
140	.conn_init = chd_conn_init,
141	.mod_init = chd_mod_init
142};
143
144static __inline void
145chd_window_decrease(struct cc_var *ccv)
146{
147	unsigned long win;
148
149	win = min(CCV(ccv, snd_wnd), CCV(ccv, snd_cwnd)) / CCV(ccv, t_maxseg);
150	win -= max((win / 2), 1);
151	CCV(ccv, snd_ssthresh) = max(win, 2) * CCV(ccv, t_maxseg);
152}
153
154/*
155 * Probabilistic backoff function. Returns 1 if we should backoff or 0
156 * otherwise. The calculation of p is similar to the calculation of p in cc_hd.
157 */
158static __inline int
159should_backoff(int qdly, int maxqdly, struct chd *chd_data)
160{
161	unsigned long p, rand;
162
163	rand = random();
164
165	if (qdly < V_chd_qthresh) {
166		chd_data->loss_compete = 0;
167		p = (((RANDOM_MAX / 100) * V_chd_pmax) /
168		    (V_chd_qthresh - V_chd_qmin)) *
169		    (qdly - V_chd_qmin);
170	} else {
171		if (qdly > V_chd_qthresh) {
172			p = (((RANDOM_MAX / 100) * V_chd_pmax) /
173			    (maxqdly - V_chd_qthresh)) *
174			    (maxqdly - qdly);
175			if (V_chd_loss_fair && rand < p)
176				chd_data->loss_compete = 1;
177		} else {
178			p = (RANDOM_MAX / 100) * V_chd_pmax;
179			chd_data->loss_compete = 0;
180		}
181	}
182
183	return (rand < p);
184}
185
186static __inline void
187chd_window_increase(struct cc_var *ccv, int new_measurement)
188{
189	struct chd *chd_data;
190	int incr;
191
192	chd_data = ccv->cc_data;
193	incr = 0;
194
195	if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) {
196		/* Adapted from NewReno slow start. */
197		if (V_tcp_do_rfc3465) {
198			/* In slow-start with ABC enabled. */
199			if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) {
200				/* Not due to RTO. */
201				incr = min(ccv->bytes_this_ack,
202				    V_tcp_abc_l_var * CCV(ccv, t_maxseg));
203			} else {
204				/* Due to RTO. */
205				incr = min(ccv->bytes_this_ack,
206				    CCV(ccv, t_maxseg));
207			}
208		} else
209			incr = CCV(ccv, t_maxseg);
210
211	} else { /* Congestion avoidance. */
212		if (V_tcp_do_rfc3465) {
213			if (ccv->flags & CCF_ABC_SENTAWND) {
214				ccv->flags &= ~CCF_ABC_SENTAWND;
215				incr = CCV(ccv, t_maxseg);
216			}
217		} else if (new_measurement)
218			incr = CCV(ccv, t_maxseg);
219	}
220
221	if (chd_data->shadow_w > 0) {
222		/* Track NewReno window. */
223		chd_data->shadow_w = min(chd_data->shadow_w + incr,
224		    TCP_MAXWIN << CCV(ccv, snd_scale));
225	}
226
227	CCV(ccv,snd_cwnd) = min(CCV(ccv, snd_cwnd) + incr,
228	    TCP_MAXWIN << CCV(ccv, snd_scale));
229}
230
231/*
232 * All ACK signals are used for timing measurements to determine delay-based
233 * congestion. However, window increases are only performed when
234 * ack_type == CC_ACK.
235 */
236static void
237chd_ack_received(struct cc_var *ccv, uint16_t ack_type)
238{
239	struct chd *chd_data;
240	struct ertt *e_t;
241	int backoff, new_measurement, qdly, rtt;
242
243	e_t = khelp_get_osd(CCV(ccv, osd), ertt_id);
244	chd_data = ccv->cc_data;
245	new_measurement = e_t->flags & ERTT_NEW_MEASUREMENT;
246	backoff = qdly = 0;
247
248	chd_data->maxrtt_in_rtt = imax(e_t->rtt, chd_data->maxrtt_in_rtt);
249
250	if (new_measurement) {
251		/*
252		 * There is a new per RTT measurement, so check to see if there
253		 * is delay based congestion.
254		 */
255		rtt = V_chd_use_max ? chd_data->maxrtt_in_rtt : e_t->rtt;
256		chd_data->maxrtt_in_rtt = 0;
257
258		if (rtt && e_t->minrtt && !IN_RECOVERY(CCV(ccv, t_flags))) {
259			qdly = rtt - e_t->minrtt;
260			if (qdly > V_chd_qmin) {
261				/*
262				 * Probabilistic delay based congestion
263				 * indication.
264				 */
265				backoff = should_backoff(qdly,
266				    e_t->maxrtt - e_t->minrtt, chd_data);
267			} else
268				chd_data->loss_compete = 0;
269		}
270		/* Reset per RTT measurement flag to start a new measurement. */
271		e_t->flags &= ~ERTT_NEW_MEASUREMENT;
272	}
273
274	if (backoff) {
275		/*
276		 * Update shadow_w before delay based backoff.
277		 */
278		if (chd_data->loss_compete ||
279		    qdly > chd_data->prev_backoff_qdly) {
280			/*
281			 * Delay is higher than when we backed off previously,
282			 * so it is possible that this flow is competing with
283			 * loss based flows.
284			 */
285			chd_data->shadow_w = max(CCV(ccv, snd_cwnd),
286			    chd_data->shadow_w);
287		} else {
288			/*
289			 * Reset shadow_w, as it is probable that this flow is
290			 * not competing with loss based flows at the moment.
291			 */
292			chd_data->shadow_w = 0;
293		}
294
295		chd_data->prev_backoff_qdly = qdly;
296		/*
297		 * Send delay-based congestion signal to the congestion signal
298		 * handler.
299		 */
300		chd_cong_signal(ccv, CC_CHD_DELAY);
301
302	} else if (ack_type == CC_ACK)
303		chd_window_increase(ccv, new_measurement);
304}
305
306static void
307chd_cb_destroy(struct cc_var *ccv)
308{
309
310	if (ccv->cc_data != NULL)
311		free(ccv->cc_data, M_CHD);
312}
313
314static int
315chd_cb_init(struct cc_var *ccv)
316{
317	struct chd *chd_data;
318
319	chd_data = malloc(sizeof(struct chd), M_CHD, M_NOWAIT);
320	if (chd_data == NULL)
321		return (ENOMEM);
322
323	chd_data->shadow_w = 0;
324	ccv->cc_data = chd_data;
325
326	return (0);
327}
328
329static void
330chd_cong_signal(struct cc_var *ccv, uint32_t signal_type)
331{
332	struct ertt *e_t;
333	struct chd *chd_data;
334	int qdly;
335
336	e_t = khelp_get_osd(CCV(ccv, osd), ertt_id);
337	chd_data = ccv->cc_data;
338	qdly = imax(e_t->rtt, chd_data->maxrtt_in_rtt) - e_t->minrtt;
339
340	switch(signal_type) {
341	case CC_CHD_DELAY:
342		chd_window_decrease(ccv); /* Set new ssthresh. */
343		CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
344		CCV(ccv, snd_recover) = CCV(ccv, snd_max);
345		ENTER_CONGRECOVERY(CCV(ccv, t_flags));
346		break;
347
348	case CC_NDUPACK: /* Packet loss. */
349		/*
350		 * Only react to loss as a congestion signal if qdly >
351		 * V_chd_qthresh.  If qdly is less than qthresh, presume that
352		 * this is a non congestion related loss. If qdly is greater
353		 * than qthresh, assume that we are competing with loss based
354		 * tcp flows and restore window from any unnecessary backoffs,
355		 * before the decrease.
356		 */
357		if (!IN_RECOVERY(CCV(ccv, t_flags)) && qdly > V_chd_qthresh) {
358			if (chd_data->loss_compete) {
359				CCV(ccv, snd_cwnd) = max(CCV(ccv, snd_cwnd),
360				    chd_data->shadow_w);
361			}
362			chd_window_decrease(ccv);
363		} else {
364			 /*
365			  * This loss isn't congestion related, or already
366			  * recovering from congestion.
367			  */
368			CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd);
369			CCV(ccv, snd_recover) = CCV(ccv, snd_max);
370		}
371
372		if (chd_data->shadow_w > 0) {
373			chd_data->shadow_w = max(chd_data->shadow_w /
374			    CCV(ccv, t_maxseg) / 2, 2) * CCV(ccv, t_maxseg);
375		}
376		ENTER_FASTRECOVERY(CCV(ccv, t_flags));
377		break;
378
379	default:
380		newreno_cc_algo.cong_signal(ccv, signal_type);
381	}
382}
383
384static void
385chd_conn_init(struct cc_var *ccv)
386{
387	struct chd *chd_data;
388
389	chd_data = ccv->cc_data;
390	chd_data->prev_backoff_qdly = 0;
391	chd_data->maxrtt_in_rtt = 0;
392	chd_data->loss_compete = 0;
393	/*
394	 * Initialise the shadow_cwnd to be equal to snd_cwnd in case we are
395	 * competing with loss based flows from the start.
396	 */
397	chd_data->shadow_w = CCV(ccv, snd_cwnd);
398}
399
400static int
401chd_mod_init(void)
402{
403
404	ertt_id = khelp_get_id("ertt");
405	if (ertt_id <= 0) {
406		printf("%s: h_ertt module not found\n", __func__);
407		return (ENOENT);
408	}
409
410	chd_cc_algo.after_idle = newreno_cc_algo.after_idle;
411	chd_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
412
413	return (0);
414}
415
416static int
417chd_loss_fair_handler(SYSCTL_HANDLER_ARGS)
418{
419	int error;
420	uint32_t new;
421
422	new = V_chd_loss_fair;
423	error = sysctl_handle_int(oidp, &new, 0, req);
424	if (error == 0 && req->newptr != NULL) {
425		if (CAST_PTR_INT(req->newptr) > 1)
426			error = EINVAL;
427		else
428			V_chd_loss_fair = new;
429	}
430
431	return (error);
432}
433
434static int
435chd_pmax_handler(SYSCTL_HANDLER_ARGS)
436{
437	int error;
438	uint32_t new;
439
440	new = V_chd_pmax;
441	error = sysctl_handle_int(oidp, &new, 0, req);
442	if (error == 0 && req->newptr != NULL) {
443		if (CAST_PTR_INT(req->newptr) == 0 ||
444		    CAST_PTR_INT(req->newptr) > 100)
445			error = EINVAL;
446		else
447			V_chd_pmax = new;
448	}
449
450	return (error);
451}
452
453static int
454chd_qthresh_handler(SYSCTL_HANDLER_ARGS)
455{
456	int error;
457	uint32_t new;
458
459	new = V_chd_qthresh;
460	error = sysctl_handle_int(oidp, &new, 0, req);
461	if (error == 0 && req->newptr != NULL) {
462		if (CAST_PTR_INT(req->newptr) <= V_chd_qmin)
463			error = EINVAL;
464		else
465			V_chd_qthresh = new;
466	}
467
468	return (error);
469}
470
471SYSCTL_DECL(_net_inet_tcp_cc_chd);
472SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, chd, CTLFLAG_RW, NULL,
473    "CAIA Hamilton delay-based congestion control related settings");
474
475SYSCTL_VNET_PROC(_net_inet_tcp_cc_chd, OID_AUTO, loss_fair,
476    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(chd_loss_fair), 1, &chd_loss_fair_handler,
477    "IU", "Flag to enable shadow window functionality.");
478
479SYSCTL_VNET_PROC(_net_inet_tcp_cc_chd, OID_AUTO, pmax,
480    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(chd_pmax), 5, &chd_pmax_handler,
481    "IU", "Per RTT maximum backoff probability as a percentage");
482
483SYSCTL_VNET_PROC(_net_inet_tcp_cc_chd, OID_AUTO, queue_threshold,
484    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(chd_qthresh), 20, &chd_qthresh_handler,
485    "IU", "Queueing congestion threshold in ticks");
486
487SYSCTL_VNET_UINT(_net_inet_tcp_cc_chd, OID_AUTO, queue_min,
488    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(chd_qmin), 5,
489    "Minimum queueing delay threshold in ticks");
490
491SYSCTL_VNET_UINT(_net_inet_tcp_cc_chd,  OID_AUTO, use_max,
492    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(chd_use_max), 1,
493    "Use the maximum RTT seen within the measurement period (RTT) "
494    "as the basic delay measurement for the algorithm.");
495
496DECLARE_CC_MODULE(chd, &chd_cc_algo);
497MODULE_DEPEND(chd, ertt, 1, 1, 1);
498