Deleted Added
full compact
cc_cdg.c (294534) cc_cdg.c (294535)
1/*-
2 * Copyright (c) 2009-2013
3 * Swinburne University of Technology, Melbourne, Australia
4 * All rights reserved.
5 *
6 * This software was developed at the Centre for Advanced Internet
7 * Architectures, Swinburne University of Technology, by David Hayes, made
8 * possible in part by a gift from The Cisco University Research Program Fund,
9 * a corporate advised fund of Silicon Valley Community Foundation. Development
10 * and testing were further assisted by a grant from the FreeBSD Foundation.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 */
33
34/*
35 * CAIA Delay-Gradient (CDG) congestion control algorithm
36 *
37 * An implemention of the delay-gradient congestion control algorithm proposed
38 * in the following paper:
39 *
40 * D. A. Hayes and G. Armitage, "Revisiting TCP Congestion Control using Delay
41 * Gradients", in IFIP Networking, Valencia, Spain, 9-13 May 2011.
42 *
43 * Developed as part of the NewTCP research project at Swinburne University of
44 * Technology's Centre for Advanced Internet Architectures, Melbourne,
45 * Australia. More details are available at:
46 * http://caia.swin.edu.au/urp/newtcp/
47 */
48
49#include <sys/cdefs.h>
1/*-
2 * Copyright (c) 2009-2013
3 * Swinburne University of Technology, Melbourne, Australia
4 * All rights reserved.
5 *
6 * This software was developed at the Centre for Advanced Internet
7 * Architectures, Swinburne University of Technology, by David Hayes, made
8 * possible in part by a gift from The Cisco University Research Program Fund,
9 * a corporate advised fund of Silicon Valley Community Foundation. Development
10 * and testing were further assisted by a grant from the FreeBSD Foundation.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 */
33
34/*
35 * CAIA Delay-Gradient (CDG) congestion control algorithm
36 *
37 * An implemention of the delay-gradient congestion control algorithm proposed
38 * in the following paper:
39 *
40 * D. A. Hayes and G. Armitage, "Revisiting TCP Congestion Control using Delay
41 * Gradients", in IFIP Networking, Valencia, Spain, 9-13 May 2011.
42 *
43 * Developed as part of the NewTCP research project at Swinburne University of
44 * Technology's Centre for Advanced Internet Architectures, Melbourne,
45 * Australia. More details are available at:
46 * http://caia.swin.edu.au/urp/newtcp/
47 */
48
49#include <sys/cdefs.h>
50__FBSDID("$FreeBSD: head/sys/netinet/cc/cc_cdg.c 294534 2016-01-21 22:24:20Z glebius $");
50__FBSDID("$FreeBSD: head/sys/netinet/cc/cc_cdg.c 294535 2016-01-21 22:34:51Z glebius $");
51
52#include <sys/param.h>
53#include <sys/hhook.h>
54#include <sys/kernel.h>
55#include <sys/khelp.h>
56#include <sys/limits.h>
57#include <sys/lock.h>
58#include <sys/malloc.h>
59#include <sys/module.h>
60#include <sys/queue.h>
61#include <sys/socket.h>
62#include <sys/socketvar.h>
63#include <sys/sysctl.h>
64#include <sys/systm.h>
65
66#include <net/vnet.h>
67
51
52#include <sys/param.h>
53#include <sys/hhook.h>
54#include <sys/kernel.h>
55#include <sys/khelp.h>
56#include <sys/limits.h>
57#include <sys/lock.h>
58#include <sys/malloc.h>
59#include <sys/module.h>
60#include <sys/queue.h>
61#include <sys/socket.h>
62#include <sys/socketvar.h>
63#include <sys/sysctl.h>
64#include <sys/systm.h>
65
66#include <net/vnet.h>
67
68#include <netinet/cc.h>
68#include <netinet/tcp.h>
69#include <netinet/tcp_seq.h>
70#include <netinet/tcp_timer.h>
71#include <netinet/tcp_var.h>
69#include <netinet/tcp_seq.h>
70#include <netinet/tcp_timer.h>
71#include <netinet/tcp_var.h>
72
72#include <netinet/tcp_cc.h>
73#include <netinet/cc/cc_module.h>
74
75#include <netinet/khelp/h_ertt.h>
76
77#include <vm/uma.h>
78
79#define CDG_VERSION "0.1"
80
81#define CAST_PTR_INT(X) (*((int*)(X)))
82
83/* Private delay-gradient induced congestion control signal. */
84#define CC_CDG_DELAY 0x01000000
85
86/* NewReno window deflation factor on loss (as a percentage). */
87#define RENO_BETA 50
88
89/* Queue states. */
90#define CDG_Q_EMPTY 1
91#define CDG_Q_RISING 2
92#define CDG_Q_FALLING 3
93#define CDG_Q_FULL 4
94#define CDG_Q_UNKNOWN 9999
95
96/* Number of bit shifts used in probexp lookup table. */
97#define EXP_PREC 15
98
99/* Largest gradient represented in probexp lookup table. */
100#define MAXGRAD 5
101
102/*
103 * Delay Precision Enhance - number of bit shifts used for qtrend related
104 * integer arithmetic precision.
105 */
106#define D_P_E 7
107
108struct qdiff_sample {
109 long qdiff;
110 STAILQ_ENTRY(qdiff_sample) qdiff_lnk;
111};
112
113struct cdg {
114 long max_qtrend;
115 long min_qtrend;
116 STAILQ_HEAD(minrtts_head, qdiff_sample) qdiffmin_q;
117 STAILQ_HEAD(maxrtts_head, qdiff_sample) qdiffmax_q;
118 long window_incr;
119 /* rttcount for window increase when in congestion avoidance */
120 long rtt_count;
121 /* maximum measured rtt within an rtt period */
122 int maxrtt_in_rtt;
123 /* maximum measured rtt within prev rtt period */
124 int maxrtt_in_prevrtt;
125 /* minimum measured rtt within an rtt period */
126 int minrtt_in_rtt;
127 /* minimum measured rtt within prev rtt period */
128 int minrtt_in_prevrtt;
129 /* consecutive congestion episode counter */
130 uint32_t consec_cong_cnt;
131 /* when tracking a new reno type loss window */
132 uint32_t shadow_w;
133 /* maximum number of samples in the moving average queue */
134 int sample_q_size;
135 /* number of samples in the moving average queue */
136 int num_samples;
137 /* estimate of the queue state of the path */
138 int queue_state;
139};
140
141/*
142 * Lookup table for:
143 * (1 - exp(-x)) << EXP_PREC, where x = [0,MAXGRAD] in 2^-7 increments
144 *
145 * Note: probexp[0] is set to 10 (not 0) as a safety for very low increase
146 * gradients.
147 */
148static const int probexp[641] = {
149 10,255,508,759,1008,1255,1501,1744,1985,2225,2463,2698,2932,3165,3395,3624,
150 3850,4075,4299,4520,4740,4958,5175,5389,5602,5814,6024,6232,6438,6643,6846,
151 7048,7248,7447,7644,7839,8033,8226,8417,8606,8794,8981,9166,9350,9532,9713,
152 9892,10070,10247,10422,10596,10769,10940,11110,11278,11445,11611,11776,11939,
153 12101,12262,12422,12580,12737,12893,13048,13201,13354,13505,13655,13803,13951,
154 14097,14243,14387,14530,14672,14813,14952,15091,15229,15365,15500,15635,15768,
155 15900,16032,16162,16291,16419,16547,16673,16798,16922,17046,17168,17289,17410,
156 17529,17648,17766,17882,17998,18113,18227,18340,18453,18564,18675,18784,18893,
157 19001,19108,19215,19320,19425,19529,19632,19734,19835,19936,20036,20135,20233,
158 20331,20427,20523,20619,20713,20807,20900,20993,21084,21175,21265,21355,21444,
159 21532,21619,21706,21792,21878,21962,22046,22130,22213,22295,22376,22457,22537,
160 22617,22696,22774,22852,22929,23006,23082,23157,23232,23306,23380,23453,23525,
161 23597,23669,23739,23810,23879,23949,24017,24085,24153,24220,24286,24352,24418,
162 24483,24547,24611,24675,24738,24800,24862,24924,24985,25045,25106,25165,25224,
163 25283,25341,25399,25456,25513,25570,25626,25681,25737,25791,25846,25899,25953,
164 26006,26059,26111,26163,26214,26265,26316,26366,26416,26465,26514,26563,26611,
165 26659,26707,26754,26801,26847,26893,26939,26984,27029,27074,27118,27162,27206,
166 27249,27292,27335,27377,27419,27460,27502,27543,27583,27624,27664,27703,27743,
167 27782,27821,27859,27897,27935,27973,28010,28047,28084,28121,28157,28193,28228,
168 28263,28299,28333,28368,28402,28436,28470,28503,28536,28569,28602,28634,28667,
169 28699,28730,28762,28793,28824,28854,28885,28915,28945,28975,29004,29034,29063,
170 29092,29120,29149,29177,29205,29232,29260,29287,29314,29341,29368,29394,29421,
171 29447,29472,29498,29524,29549,29574,29599,29623,29648,29672,29696,29720,29744,
172 29767,29791,29814,29837,29860,29882,29905,29927,29949,29971,29993,30014,30036,
173 30057,30078,30099,30120,30141,30161,30181,30201,30221,30241,30261,30280,30300,
174 30319,30338,30357,30376,30394,30413,30431,30449,30467,30485,30503,30521,30538,
175 30555,30573,30590,30607,30624,30640,30657,30673,30690,30706,30722,30738,30753,
176 30769,30785,30800,30815,30831,30846,30861,30876,30890,30905,30919,30934,30948,
177 30962,30976,30990,31004,31018,31031,31045,31058,31072,31085,31098,31111,31124,
178 31137,31149,31162,31174,31187,31199,31211,31223,31235,31247,31259,31271,31283,
179 31294,31306,31317,31328,31339,31351,31362,31373,31383,31394,31405,31416,31426,
180 31436,31447,31457,31467,31477,31487,31497,31507,31517,31527,31537,31546,31556,
181 31565,31574,31584,31593,31602,31611,31620,31629,31638,31647,31655,31664,31673,
182 31681,31690,31698,31706,31715,31723,31731,31739,31747,31755,31763,31771,31778,
183 31786,31794,31801,31809,31816,31824,31831,31838,31846,31853,31860,31867,31874,
184 31881,31888,31895,31902,31908,31915,31922,31928,31935,31941,31948,31954,31960,
185 31967,31973,31979,31985,31991,31997,32003,32009,32015,32021,32027,32033,32038,
186 32044,32050,32055,32061,32066,32072,32077,32083,32088,32093,32098,32104,32109,
187 32114,32119,32124,32129,32134,32139,32144,32149,32154,32158,32163,32168,32173,
188 32177,32182,32186,32191,32195,32200,32204,32209,32213,32217,32222,32226,32230,
189 32234,32238,32242,32247,32251,32255,32259,32263,32267,32270,32274,32278,32282,
190 32286,32290,32293,32297,32301,32304,32308,32311,32315,32318,32322,32325,32329,
191 32332,32336,32339,32342,32346,32349,32352,32356,32359,32362,32365,32368,32371,
192 32374,32377,32381,32384,32387,32389,32392,32395,32398,32401,32404,32407,32410,
193 32412,32415,32418,32421,32423,32426,32429,32431,32434,32437,32439,32442,32444,
194 32447,32449,32452,32454,32457,32459,32461,32464,32466,32469,32471,32473,32476,
195 32478,32480,32482,32485,32487,32489,32491,32493,32495,32497,32500,32502,32504,
196 32506,32508,32510,32512,32514,32516,32518,32520,32522,32524,32526,32527,32529,
197 32531,32533,32535,32537,32538,32540,32542,32544,32545,32547};
198
199static uma_zone_t qdiffsample_zone;
200
201static MALLOC_DEFINE(M_CDG, "cdg data",
202 "Per connection data required for the CDG congestion control algorithm");
203
204static int ertt_id;
205
206static VNET_DEFINE(uint32_t, cdg_alpha_inc);
207static VNET_DEFINE(uint32_t, cdg_beta_delay);
208static VNET_DEFINE(uint32_t, cdg_beta_loss);
209static VNET_DEFINE(uint32_t, cdg_smoothing_factor);
210static VNET_DEFINE(uint32_t, cdg_exp_backoff_scale);
211static VNET_DEFINE(uint32_t, cdg_consec_cong);
212static VNET_DEFINE(uint32_t, cdg_hold_backoff);
213#define V_cdg_alpha_inc VNET(cdg_alpha_inc)
214#define V_cdg_beta_delay VNET(cdg_beta_delay)
215#define V_cdg_beta_loss VNET(cdg_beta_loss)
216#define V_cdg_smoothing_factor VNET(cdg_smoothing_factor)
217#define V_cdg_exp_backoff_scale VNET(cdg_exp_backoff_scale)
218#define V_cdg_consec_cong VNET(cdg_consec_cong)
219#define V_cdg_hold_backoff VNET(cdg_hold_backoff)
220
221/* Function prototypes. */
222static int cdg_mod_init(void);
223static int cdg_mod_destroy(void);
224static void cdg_conn_init(struct cc_var *ccv);
225static int cdg_cb_init(struct cc_var *ccv);
226static void cdg_cb_destroy(struct cc_var *ccv);
227static void cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type);
228static void cdg_ack_received(struct cc_var *ccv, uint16_t ack_type);
229
230struct cc_algo cdg_cc_algo = {
231 .name = "cdg",
232 .mod_init = cdg_mod_init,
233 .ack_received = cdg_ack_received,
234 .cb_destroy = cdg_cb_destroy,
235 .cb_init = cdg_cb_init,
236 .conn_init = cdg_conn_init,
237 .cong_signal = cdg_cong_signal,
238 .mod_destroy = cdg_mod_destroy
239};
240
241/* Vnet created and being initialised. */
242static void
243cdg_init_vnet(const void *unused __unused)
244{
245
246 V_cdg_alpha_inc = 0;
247 V_cdg_beta_delay = 70;
248 V_cdg_beta_loss = 50;
249 V_cdg_smoothing_factor = 8;
250 V_cdg_exp_backoff_scale = 3;
251 V_cdg_consec_cong = 5;
252 V_cdg_hold_backoff = 5;
253}
254
255static int
256cdg_mod_init(void)
257{
258 VNET_ITERATOR_DECL(v);
259
260 ertt_id = khelp_get_id("ertt");
261 if (ertt_id <= 0)
262 return (EINVAL);
263
264 qdiffsample_zone = uma_zcreate("cdg_qdiffsample",
265 sizeof(struct qdiff_sample), NULL, NULL, NULL, NULL, 0, 0);
266
267 VNET_LIST_RLOCK();
268 VNET_FOREACH(v) {
269 CURVNET_SET(v);
270 cdg_init_vnet(NULL);
271 CURVNET_RESTORE();
272 }
273 VNET_LIST_RUNLOCK();
274
275 cdg_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
276 cdg_cc_algo.after_idle = newreno_cc_algo.after_idle;
277
278 return (0);
279}
280
281static int
282cdg_mod_destroy(void)
283{
284
285 uma_zdestroy(qdiffsample_zone);
286 return (0);
287}
288
289static int
290cdg_cb_init(struct cc_var *ccv)
291{
292 struct cdg *cdg_data;
293
294 cdg_data = malloc(sizeof(struct cdg), M_CDG, M_NOWAIT);
295 if (cdg_data == NULL)
296 return (ENOMEM);
297
298 cdg_data->shadow_w = 0;
299 cdg_data->max_qtrend = 0;
300 cdg_data->min_qtrend = 0;
301 cdg_data->queue_state = CDG_Q_UNKNOWN;
302 cdg_data->maxrtt_in_rtt = 0;
303 cdg_data->maxrtt_in_prevrtt = 0;
304 cdg_data->minrtt_in_rtt = INT_MAX;
305 cdg_data->minrtt_in_prevrtt = 0;
306 cdg_data->window_incr = 0;
307 cdg_data->rtt_count = 0;
308 cdg_data->consec_cong_cnt = 0;
309 cdg_data->sample_q_size = V_cdg_smoothing_factor;
310 cdg_data->num_samples = 0;
311 STAILQ_INIT(&cdg_data->qdiffmin_q);
312 STAILQ_INIT(&cdg_data->qdiffmax_q);
313
314 ccv->cc_data = cdg_data;
315
316 return (0);
317}
318
319static void
320cdg_conn_init(struct cc_var *ccv)
321{
322 struct cdg *cdg_data = ccv->cc_data;
323
324 /*
325 * Initialise the shadow_cwnd in case we are competing with loss based
326 * flows from the start
327 */
328 cdg_data->shadow_w = CCV(ccv, snd_cwnd);
329}
330
331static void
332cdg_cb_destroy(struct cc_var *ccv)
333{
334 struct cdg *cdg_data;
335 struct qdiff_sample *qds, *qds_n;
336
337 cdg_data = ccv->cc_data;
338
339 qds = STAILQ_FIRST(&cdg_data->qdiffmin_q);
340 while (qds != NULL) {
341 qds_n = STAILQ_NEXT(qds, qdiff_lnk);
342 uma_zfree(qdiffsample_zone,qds);
343 qds = qds_n;
344 }
345
346 qds = STAILQ_FIRST(&cdg_data->qdiffmax_q);
347 while (qds != NULL) {
348 qds_n = STAILQ_NEXT(qds, qdiff_lnk);
349 uma_zfree(qdiffsample_zone,qds);
350 qds = qds_n;
351 }
352
353 free(ccv->cc_data, M_CDG);
354}
355
356static int
357cdg_beta_handler(SYSCTL_HANDLER_ARGS)
358{
359
360 if (req->newptr != NULL &&
361 (CAST_PTR_INT(req->newptr) == 0 || CAST_PTR_INT(req->newptr) > 100))
362 return (EINVAL);
363
364 return (sysctl_handle_int(oidp, arg1, arg2, req));
365}
366
367static int
368cdg_exp_backoff_scale_handler(SYSCTL_HANDLER_ARGS)
369{
370
371 if (req->newptr != NULL && CAST_PTR_INT(req->newptr) < 1)
372 return (EINVAL);
373
374 return (sysctl_handle_int(oidp, arg1, arg2, req));
375}
376
377static inline unsigned long
378cdg_window_decrease(struct cc_var *ccv, unsigned long owin, unsigned int beta)
379{
380
381 return ((ulmin(CCV(ccv, snd_wnd), owin) * beta) / 100);
382}
383
384/*
385 * Window increase function
386 * This window increase function is independent of the initial window size
387 * to ensure small window flows are not discriminated against (i.e. fairness).
388 * It increases at 1pkt/rtt like Reno for alpha_inc rtts, and then 2pkts/rtt for
389 * the next alpha_inc rtts, etc.
390 */
391static void
392cdg_window_increase(struct cc_var *ccv, int new_measurement)
393{
394 struct cdg *cdg_data;
395 int incr, s_w_incr;
396
397 cdg_data = ccv->cc_data;
398 incr = s_w_incr = 0;
399
400 if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) {
401 /* Slow start. */
402 incr = CCV(ccv, t_maxseg);
403 s_w_incr = incr;
404 cdg_data->window_incr = cdg_data->rtt_count = 0;
405 } else {
406 /* Congestion avoidance. */
407 if (new_measurement) {
408 s_w_incr = CCV(ccv, t_maxseg);
409 if (V_cdg_alpha_inc == 0) {
410 incr = CCV(ccv, t_maxseg);
411 } else {
412 if (++cdg_data->rtt_count >= V_cdg_alpha_inc) {
413 cdg_data->window_incr++;
414 cdg_data->rtt_count = 0;
415 }
416 incr = CCV(ccv, t_maxseg) *
417 cdg_data->window_incr;
418 }
419 }
420 }
421
422 if (cdg_data->shadow_w > 0)
423 cdg_data->shadow_w = ulmin(cdg_data->shadow_w + s_w_incr,
424 TCP_MAXWIN << CCV(ccv, snd_scale));
425
426 CCV(ccv, snd_cwnd) = ulmin(CCV(ccv, snd_cwnd) + incr,
427 TCP_MAXWIN << CCV(ccv, snd_scale));
428}
429
430static void
431cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type)
432{
433 struct cdg *cdg_data = ccv->cc_data;
434
435 switch(signal_type) {
436 case CC_CDG_DELAY:
437 CCV(ccv, snd_ssthresh) = cdg_window_decrease(ccv,
438 CCV(ccv, snd_cwnd), V_cdg_beta_delay);
439 CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
440 CCV(ccv, snd_recover) = CCV(ccv, snd_max);
441 cdg_data->window_incr = cdg_data->rtt_count = 0;
442 ENTER_CONGRECOVERY(CCV(ccv, t_flags));
443 break;
444 case CC_NDUPACK:
445 /*
446 * If already responding to congestion OR we have guessed no
447 * queue in the path is full.
448 */
449 if (IN_CONGRECOVERY(CCV(ccv, t_flags)) ||
450 cdg_data->queue_state < CDG_Q_FULL) {
451 CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd);
452 CCV(ccv, snd_recover) = CCV(ccv, snd_max);
453 } else {
454 /*
455 * Loss is likely to be congestion related. We have
456 * inferred a queue full state, so have shadow window
457 * react to loss as NewReno would.
458 */
459 if (cdg_data->shadow_w > 0)
460 cdg_data->shadow_w = cdg_window_decrease(ccv,
461 cdg_data->shadow_w, RENO_BETA);
462
463 CCV(ccv, snd_ssthresh) = ulmax(cdg_data->shadow_w,
464 cdg_window_decrease(ccv, CCV(ccv, snd_cwnd),
465 V_cdg_beta_loss));
466
467 cdg_data->window_incr = cdg_data->rtt_count = 0;
468 }
469 ENTER_RECOVERY(CCV(ccv, t_flags));
470 break;
471 default:
472 newreno_cc_algo.cong_signal(ccv, signal_type);
473 break;
474 }
475}
476
477/*
478 * Using a negative exponential probabilistic backoff so that sources with
479 * varying RTTs which share the same link will, on average, have the same
480 * probability of backoff over time.
481 *
482 * Prob_backoff = 1 - exp(-qtrend / V_cdg_exp_backoff_scale), where
483 * V_cdg_exp_backoff_scale is the average qtrend for the exponential backoff.
484 */
485static inline int
486prob_backoff(long qtrend)
487{
488 int backoff, idx, p;
489
490 backoff = (qtrend > ((MAXGRAD * V_cdg_exp_backoff_scale) << D_P_E));
491
492 if (!backoff) {
493 if (V_cdg_exp_backoff_scale > 1)
494 idx = (qtrend + V_cdg_exp_backoff_scale / 2) /
495 V_cdg_exp_backoff_scale;
496 else
497 idx = qtrend;
498
499 /* Backoff probability proportional to rate of queue growth. */
500 p = (INT_MAX / (1 << EXP_PREC)) * probexp[idx];
501 backoff = (random() < p);
502 }
503
504 return (backoff);
505}
506
507static inline void
508calc_moving_average(struct cdg *cdg_data, long qdiff_max, long qdiff_min)
509{
510 struct qdiff_sample *qds;
511
512 ++cdg_data->num_samples;
513 if (cdg_data->num_samples > cdg_data->sample_q_size) {
514 /* Minimum RTT. */
515 qds = STAILQ_FIRST(&cdg_data->qdiffmin_q);
516 cdg_data->min_qtrend = cdg_data->min_qtrend +
517 (qdiff_min - qds->qdiff) / cdg_data->sample_q_size;
518 STAILQ_REMOVE_HEAD(&cdg_data->qdiffmin_q, qdiff_lnk);
519 qds->qdiff = qdiff_min;
520 STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds, qdiff_lnk);
521
522 /* Maximum RTT. */
523 qds = STAILQ_FIRST(&cdg_data->qdiffmax_q);
524 cdg_data->max_qtrend = cdg_data->max_qtrend +
525 (qdiff_max - qds->qdiff) / cdg_data->sample_q_size;
526 STAILQ_REMOVE_HEAD(&cdg_data->qdiffmax_q, qdiff_lnk);
527 qds->qdiff = qdiff_max;
528 STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds, qdiff_lnk);
529 --cdg_data->num_samples;
530 } else {
531 qds = uma_zalloc(qdiffsample_zone, M_NOWAIT);
532 if (qds != NULL) {
533 cdg_data->min_qtrend = cdg_data->min_qtrend +
534 qdiff_min / cdg_data->sample_q_size;
535 qds->qdiff = qdiff_min;
536 STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds,
537 qdiff_lnk);
538 }
539
540 qds = uma_zalloc(qdiffsample_zone, M_NOWAIT);
541 if (qds) {
542 cdg_data->max_qtrend = cdg_data->max_qtrend +
543 qdiff_max / cdg_data->sample_q_size;
544 qds->qdiff = qdiff_max;
545 STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds,
546 qdiff_lnk);
547 }
548 }
549}
550
551static void
552cdg_ack_received(struct cc_var *ccv, uint16_t ack_type)
553{
554 struct cdg *cdg_data;
555 struct ertt *e_t;
556 long qdiff_max, qdiff_min;
557 int congestion, new_measurement, slowstart;
558
559 cdg_data = ccv->cc_data;
560 e_t = (struct ertt *)khelp_get_osd(CCV(ccv, osd), ertt_id);
561 new_measurement = e_t->flags & ERTT_NEW_MEASUREMENT;
562 congestion = 0;
563 cdg_data->maxrtt_in_rtt = imax(e_t->rtt, cdg_data->maxrtt_in_rtt);
564 cdg_data->minrtt_in_rtt = imin(e_t->rtt, cdg_data->minrtt_in_rtt);
565
566 if (new_measurement) {
567 slowstart = (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh));
568 /*
569 * Update smoothed gradient measurements. Since we are only
570 * using one measurement per RTT, use max or min rtt_in_rtt.
571 * This is also less noisy than a sample RTT measurement. Max
572 * RTT measurements can have trouble due to OS issues.
573 */
574 if (cdg_data->maxrtt_in_prevrtt) {
575 qdiff_max = ((long)(cdg_data->maxrtt_in_rtt -
576 cdg_data->maxrtt_in_prevrtt) << D_P_E );
577 qdiff_min = ((long)(cdg_data->minrtt_in_rtt -
578 cdg_data->minrtt_in_prevrtt) << D_P_E );
579
580 calc_moving_average(cdg_data, qdiff_max, qdiff_min);
581
582 /* Probabilistic backoff with respect to gradient. */
583 if (slowstart && qdiff_min > 0)
584 congestion = prob_backoff(qdiff_min);
585 else if (cdg_data->min_qtrend > 0)
586 congestion = prob_backoff(cdg_data->min_qtrend);
587 else if (slowstart && qdiff_max > 0)
588 congestion = prob_backoff(qdiff_max);
589 else if (cdg_data->max_qtrend > 0)
590 congestion = prob_backoff(cdg_data->max_qtrend);
591
592 /* Update estimate of queue state. */
593 if (cdg_data->min_qtrend > 0 &&
594 cdg_data->max_qtrend <= 0) {
595 cdg_data->queue_state = CDG_Q_FULL;
596 } else if (cdg_data->min_qtrend >= 0 &&
597 cdg_data->max_qtrend < 0) {
598 cdg_data->queue_state = CDG_Q_EMPTY;
599 cdg_data->shadow_w = 0;
600 } else if (cdg_data->min_qtrend > 0 &&
601 cdg_data->max_qtrend > 0) {
602 cdg_data->queue_state = CDG_Q_RISING;
603 } else if (cdg_data->min_qtrend < 0 &&
604 cdg_data->max_qtrend < 0) {
605 cdg_data->queue_state = CDG_Q_FALLING;
606 }
607
608 if (cdg_data->min_qtrend < 0 ||
609 cdg_data->max_qtrend < 0)
610 cdg_data->consec_cong_cnt = 0;
611 }
612
613 cdg_data->minrtt_in_prevrtt = cdg_data->minrtt_in_rtt;
614 cdg_data->minrtt_in_rtt = INT_MAX;
615 cdg_data->maxrtt_in_prevrtt = cdg_data->maxrtt_in_rtt;
616 cdg_data->maxrtt_in_rtt = 0;
617 e_t->flags &= ~ERTT_NEW_MEASUREMENT;
618 }
619
620 if (congestion) {
621 cdg_data->consec_cong_cnt++;
622 if (!IN_RECOVERY(CCV(ccv, t_flags))) {
623 if (cdg_data->consec_cong_cnt <= V_cdg_consec_cong)
624 cdg_cong_signal(ccv, CC_CDG_DELAY);
625 else
626 /*
627 * We have been backing off but the queue is not
628 * falling. Assume we are competing with
629 * loss-based flows and don't back off for the
630 * next V_cdg_hold_backoff RTT periods.
631 */
632 if (cdg_data->consec_cong_cnt >=
633 V_cdg_consec_cong + V_cdg_hold_backoff)
634 cdg_data->consec_cong_cnt = 0;
635
636 /* Won't see effect until 2nd RTT. */
637 cdg_data->maxrtt_in_prevrtt = 0;
638 /*
639 * Resync shadow window in case we are competing with a
640 * loss based flow
641 */
642 cdg_data->shadow_w = ulmax(CCV(ccv, snd_cwnd),
643 cdg_data->shadow_w);
644 }
645 } else if (ack_type == CC_ACK)
646 cdg_window_increase(ccv, new_measurement);
647}
648
649/* When a vnet is created and being initialised, init the per-stack CDG vars. */
650VNET_SYSINIT(cdg_init_vnet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
651 cdg_init_vnet, NULL);
652
653SYSCTL_DECL(_net_inet_tcp_cc_cdg);
654SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, cdg, CTLFLAG_RW, NULL,
655 "CAIA delay-gradient congestion control related settings");
656
657SYSCTL_STRING(_net_inet_tcp_cc_cdg, OID_AUTO, version,
658 CTLFLAG_RD, CDG_VERSION, sizeof(CDG_VERSION) - 1,
659 "Current algorithm/implementation version number");
660
661SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, alpha_inc,
662 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_alpha_inc), 0,
663 "Increment the window increase factor alpha by 1 MSS segment every "
664 "alpha_inc RTTs during congestion avoidance mode.");
665
666SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_delay,
667 CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(cdg_beta_delay), 70,
668 &cdg_beta_handler, "IU",
669 "Delay-based window decrease factor as a percentage "
670 "(on delay-based backoff, w = w * beta_delay / 100)");
671
672SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_loss,
673 CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(cdg_beta_loss), 50,
674 &cdg_beta_handler, "IU",
675 "Loss-based window decrease factor as a percentage "
676 "(on loss-based backoff, w = w * beta_loss / 100)");
677
678SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, exp_backoff_scale,
679 CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
680 &VNET_NAME(cdg_exp_backoff_scale), 2, &cdg_exp_backoff_scale_handler, "IU",
681 "Scaling parameter for the probabilistic exponential backoff");
682
683SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, smoothing_factor,
684 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_smoothing_factor), 8,
685 "Number of samples used for moving average smoothing (0 = no smoothing)");
686
687SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_consec_cong,
688 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_consec_cong), 5,
689 "Number of consecutive delay-gradient based congestion episodes which will "
690 "trigger loss based CC compatibility");
691
692SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_hold_backoff,
693 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_hold_backoff), 5,
694 "Number of consecutive delay-gradient based congestion episodes to hold "
695 "the window backoff for loss based CC compatibility");
696
697DECLARE_CC_MODULE(cdg, &cdg_cc_algo);
698
699MODULE_DEPEND(cdg, ertt, 1, 1, 1);
73#include <netinet/cc/cc_module.h>
74
75#include <netinet/khelp/h_ertt.h>
76
77#include <vm/uma.h>
78
79#define CDG_VERSION "0.1"
80
81#define CAST_PTR_INT(X) (*((int*)(X)))
82
83/* Private delay-gradient induced congestion control signal. */
84#define CC_CDG_DELAY 0x01000000
85
86/* NewReno window deflation factor on loss (as a percentage). */
87#define RENO_BETA 50
88
89/* Queue states. */
90#define CDG_Q_EMPTY 1
91#define CDG_Q_RISING 2
92#define CDG_Q_FALLING 3
93#define CDG_Q_FULL 4
94#define CDG_Q_UNKNOWN 9999
95
96/* Number of bit shifts used in probexp lookup table. */
97#define EXP_PREC 15
98
99/* Largest gradient represented in probexp lookup table. */
100#define MAXGRAD 5
101
102/*
103 * Delay Precision Enhance - number of bit shifts used for qtrend related
104 * integer arithmetic precision.
105 */
106#define D_P_E 7
107
108struct qdiff_sample {
109 long qdiff;
110 STAILQ_ENTRY(qdiff_sample) qdiff_lnk;
111};
112
113struct cdg {
114 long max_qtrend;
115 long min_qtrend;
116 STAILQ_HEAD(minrtts_head, qdiff_sample) qdiffmin_q;
117 STAILQ_HEAD(maxrtts_head, qdiff_sample) qdiffmax_q;
118 long window_incr;
119 /* rttcount for window increase when in congestion avoidance */
120 long rtt_count;
121 /* maximum measured rtt within an rtt period */
122 int maxrtt_in_rtt;
123 /* maximum measured rtt within prev rtt period */
124 int maxrtt_in_prevrtt;
125 /* minimum measured rtt within an rtt period */
126 int minrtt_in_rtt;
127 /* minimum measured rtt within prev rtt period */
128 int minrtt_in_prevrtt;
129 /* consecutive congestion episode counter */
130 uint32_t consec_cong_cnt;
131 /* when tracking a new reno type loss window */
132 uint32_t shadow_w;
133 /* maximum number of samples in the moving average queue */
134 int sample_q_size;
135 /* number of samples in the moving average queue */
136 int num_samples;
137 /* estimate of the queue state of the path */
138 int queue_state;
139};
140
141/*
142 * Lookup table for:
143 * (1 - exp(-x)) << EXP_PREC, where x = [0,MAXGRAD] in 2^-7 increments
144 *
145 * Note: probexp[0] is set to 10 (not 0) as a safety for very low increase
146 * gradients.
147 */
148static const int probexp[641] = {
149 10,255,508,759,1008,1255,1501,1744,1985,2225,2463,2698,2932,3165,3395,3624,
150 3850,4075,4299,4520,4740,4958,5175,5389,5602,5814,6024,6232,6438,6643,6846,
151 7048,7248,7447,7644,7839,8033,8226,8417,8606,8794,8981,9166,9350,9532,9713,
152 9892,10070,10247,10422,10596,10769,10940,11110,11278,11445,11611,11776,11939,
153 12101,12262,12422,12580,12737,12893,13048,13201,13354,13505,13655,13803,13951,
154 14097,14243,14387,14530,14672,14813,14952,15091,15229,15365,15500,15635,15768,
155 15900,16032,16162,16291,16419,16547,16673,16798,16922,17046,17168,17289,17410,
156 17529,17648,17766,17882,17998,18113,18227,18340,18453,18564,18675,18784,18893,
157 19001,19108,19215,19320,19425,19529,19632,19734,19835,19936,20036,20135,20233,
158 20331,20427,20523,20619,20713,20807,20900,20993,21084,21175,21265,21355,21444,
159 21532,21619,21706,21792,21878,21962,22046,22130,22213,22295,22376,22457,22537,
160 22617,22696,22774,22852,22929,23006,23082,23157,23232,23306,23380,23453,23525,
161 23597,23669,23739,23810,23879,23949,24017,24085,24153,24220,24286,24352,24418,
162 24483,24547,24611,24675,24738,24800,24862,24924,24985,25045,25106,25165,25224,
163 25283,25341,25399,25456,25513,25570,25626,25681,25737,25791,25846,25899,25953,
164 26006,26059,26111,26163,26214,26265,26316,26366,26416,26465,26514,26563,26611,
165 26659,26707,26754,26801,26847,26893,26939,26984,27029,27074,27118,27162,27206,
166 27249,27292,27335,27377,27419,27460,27502,27543,27583,27624,27664,27703,27743,
167 27782,27821,27859,27897,27935,27973,28010,28047,28084,28121,28157,28193,28228,
168 28263,28299,28333,28368,28402,28436,28470,28503,28536,28569,28602,28634,28667,
169 28699,28730,28762,28793,28824,28854,28885,28915,28945,28975,29004,29034,29063,
170 29092,29120,29149,29177,29205,29232,29260,29287,29314,29341,29368,29394,29421,
171 29447,29472,29498,29524,29549,29574,29599,29623,29648,29672,29696,29720,29744,
172 29767,29791,29814,29837,29860,29882,29905,29927,29949,29971,29993,30014,30036,
173 30057,30078,30099,30120,30141,30161,30181,30201,30221,30241,30261,30280,30300,
174 30319,30338,30357,30376,30394,30413,30431,30449,30467,30485,30503,30521,30538,
175 30555,30573,30590,30607,30624,30640,30657,30673,30690,30706,30722,30738,30753,
176 30769,30785,30800,30815,30831,30846,30861,30876,30890,30905,30919,30934,30948,
177 30962,30976,30990,31004,31018,31031,31045,31058,31072,31085,31098,31111,31124,
178 31137,31149,31162,31174,31187,31199,31211,31223,31235,31247,31259,31271,31283,
179 31294,31306,31317,31328,31339,31351,31362,31373,31383,31394,31405,31416,31426,
180 31436,31447,31457,31467,31477,31487,31497,31507,31517,31527,31537,31546,31556,
181 31565,31574,31584,31593,31602,31611,31620,31629,31638,31647,31655,31664,31673,
182 31681,31690,31698,31706,31715,31723,31731,31739,31747,31755,31763,31771,31778,
183 31786,31794,31801,31809,31816,31824,31831,31838,31846,31853,31860,31867,31874,
184 31881,31888,31895,31902,31908,31915,31922,31928,31935,31941,31948,31954,31960,
185 31967,31973,31979,31985,31991,31997,32003,32009,32015,32021,32027,32033,32038,
186 32044,32050,32055,32061,32066,32072,32077,32083,32088,32093,32098,32104,32109,
187 32114,32119,32124,32129,32134,32139,32144,32149,32154,32158,32163,32168,32173,
188 32177,32182,32186,32191,32195,32200,32204,32209,32213,32217,32222,32226,32230,
189 32234,32238,32242,32247,32251,32255,32259,32263,32267,32270,32274,32278,32282,
190 32286,32290,32293,32297,32301,32304,32308,32311,32315,32318,32322,32325,32329,
191 32332,32336,32339,32342,32346,32349,32352,32356,32359,32362,32365,32368,32371,
192 32374,32377,32381,32384,32387,32389,32392,32395,32398,32401,32404,32407,32410,
193 32412,32415,32418,32421,32423,32426,32429,32431,32434,32437,32439,32442,32444,
194 32447,32449,32452,32454,32457,32459,32461,32464,32466,32469,32471,32473,32476,
195 32478,32480,32482,32485,32487,32489,32491,32493,32495,32497,32500,32502,32504,
196 32506,32508,32510,32512,32514,32516,32518,32520,32522,32524,32526,32527,32529,
197 32531,32533,32535,32537,32538,32540,32542,32544,32545,32547};
198
199static uma_zone_t qdiffsample_zone;
200
201static MALLOC_DEFINE(M_CDG, "cdg data",
202 "Per connection data required for the CDG congestion control algorithm");
203
204static int ertt_id;
205
206static VNET_DEFINE(uint32_t, cdg_alpha_inc);
207static VNET_DEFINE(uint32_t, cdg_beta_delay);
208static VNET_DEFINE(uint32_t, cdg_beta_loss);
209static VNET_DEFINE(uint32_t, cdg_smoothing_factor);
210static VNET_DEFINE(uint32_t, cdg_exp_backoff_scale);
211static VNET_DEFINE(uint32_t, cdg_consec_cong);
212static VNET_DEFINE(uint32_t, cdg_hold_backoff);
213#define V_cdg_alpha_inc VNET(cdg_alpha_inc)
214#define V_cdg_beta_delay VNET(cdg_beta_delay)
215#define V_cdg_beta_loss VNET(cdg_beta_loss)
216#define V_cdg_smoothing_factor VNET(cdg_smoothing_factor)
217#define V_cdg_exp_backoff_scale VNET(cdg_exp_backoff_scale)
218#define V_cdg_consec_cong VNET(cdg_consec_cong)
219#define V_cdg_hold_backoff VNET(cdg_hold_backoff)
220
221/* Function prototypes. */
222static int cdg_mod_init(void);
223static int cdg_mod_destroy(void);
224static void cdg_conn_init(struct cc_var *ccv);
225static int cdg_cb_init(struct cc_var *ccv);
226static void cdg_cb_destroy(struct cc_var *ccv);
227static void cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type);
228static void cdg_ack_received(struct cc_var *ccv, uint16_t ack_type);
229
230struct cc_algo cdg_cc_algo = {
231 .name = "cdg",
232 .mod_init = cdg_mod_init,
233 .ack_received = cdg_ack_received,
234 .cb_destroy = cdg_cb_destroy,
235 .cb_init = cdg_cb_init,
236 .conn_init = cdg_conn_init,
237 .cong_signal = cdg_cong_signal,
238 .mod_destroy = cdg_mod_destroy
239};
240
241/* Vnet created and being initialised. */
242static void
243cdg_init_vnet(const void *unused __unused)
244{
245
246 V_cdg_alpha_inc = 0;
247 V_cdg_beta_delay = 70;
248 V_cdg_beta_loss = 50;
249 V_cdg_smoothing_factor = 8;
250 V_cdg_exp_backoff_scale = 3;
251 V_cdg_consec_cong = 5;
252 V_cdg_hold_backoff = 5;
253}
254
255static int
256cdg_mod_init(void)
257{
258 VNET_ITERATOR_DECL(v);
259
260 ertt_id = khelp_get_id("ertt");
261 if (ertt_id <= 0)
262 return (EINVAL);
263
264 qdiffsample_zone = uma_zcreate("cdg_qdiffsample",
265 sizeof(struct qdiff_sample), NULL, NULL, NULL, NULL, 0, 0);
266
267 VNET_LIST_RLOCK();
268 VNET_FOREACH(v) {
269 CURVNET_SET(v);
270 cdg_init_vnet(NULL);
271 CURVNET_RESTORE();
272 }
273 VNET_LIST_RUNLOCK();
274
275 cdg_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
276 cdg_cc_algo.after_idle = newreno_cc_algo.after_idle;
277
278 return (0);
279}
280
281static int
282cdg_mod_destroy(void)
283{
284
285 uma_zdestroy(qdiffsample_zone);
286 return (0);
287}
288
289static int
290cdg_cb_init(struct cc_var *ccv)
291{
292 struct cdg *cdg_data;
293
294 cdg_data = malloc(sizeof(struct cdg), M_CDG, M_NOWAIT);
295 if (cdg_data == NULL)
296 return (ENOMEM);
297
298 cdg_data->shadow_w = 0;
299 cdg_data->max_qtrend = 0;
300 cdg_data->min_qtrend = 0;
301 cdg_data->queue_state = CDG_Q_UNKNOWN;
302 cdg_data->maxrtt_in_rtt = 0;
303 cdg_data->maxrtt_in_prevrtt = 0;
304 cdg_data->minrtt_in_rtt = INT_MAX;
305 cdg_data->minrtt_in_prevrtt = 0;
306 cdg_data->window_incr = 0;
307 cdg_data->rtt_count = 0;
308 cdg_data->consec_cong_cnt = 0;
309 cdg_data->sample_q_size = V_cdg_smoothing_factor;
310 cdg_data->num_samples = 0;
311 STAILQ_INIT(&cdg_data->qdiffmin_q);
312 STAILQ_INIT(&cdg_data->qdiffmax_q);
313
314 ccv->cc_data = cdg_data;
315
316 return (0);
317}
318
319static void
320cdg_conn_init(struct cc_var *ccv)
321{
322 struct cdg *cdg_data = ccv->cc_data;
323
324 /*
325 * Initialise the shadow_cwnd in case we are competing with loss based
326 * flows from the start
327 */
328 cdg_data->shadow_w = CCV(ccv, snd_cwnd);
329}
330
331static void
332cdg_cb_destroy(struct cc_var *ccv)
333{
334 struct cdg *cdg_data;
335 struct qdiff_sample *qds, *qds_n;
336
337 cdg_data = ccv->cc_data;
338
339 qds = STAILQ_FIRST(&cdg_data->qdiffmin_q);
340 while (qds != NULL) {
341 qds_n = STAILQ_NEXT(qds, qdiff_lnk);
342 uma_zfree(qdiffsample_zone,qds);
343 qds = qds_n;
344 }
345
346 qds = STAILQ_FIRST(&cdg_data->qdiffmax_q);
347 while (qds != NULL) {
348 qds_n = STAILQ_NEXT(qds, qdiff_lnk);
349 uma_zfree(qdiffsample_zone,qds);
350 qds = qds_n;
351 }
352
353 free(ccv->cc_data, M_CDG);
354}
355
356static int
357cdg_beta_handler(SYSCTL_HANDLER_ARGS)
358{
359
360 if (req->newptr != NULL &&
361 (CAST_PTR_INT(req->newptr) == 0 || CAST_PTR_INT(req->newptr) > 100))
362 return (EINVAL);
363
364 return (sysctl_handle_int(oidp, arg1, arg2, req));
365}
366
367static int
368cdg_exp_backoff_scale_handler(SYSCTL_HANDLER_ARGS)
369{
370
371 if (req->newptr != NULL && CAST_PTR_INT(req->newptr) < 1)
372 return (EINVAL);
373
374 return (sysctl_handle_int(oidp, arg1, arg2, req));
375}
376
377static inline unsigned long
378cdg_window_decrease(struct cc_var *ccv, unsigned long owin, unsigned int beta)
379{
380
381 return ((ulmin(CCV(ccv, snd_wnd), owin) * beta) / 100);
382}
383
384/*
385 * Window increase function
386 * This window increase function is independent of the initial window size
387 * to ensure small window flows are not discriminated against (i.e. fairness).
388 * It increases at 1pkt/rtt like Reno for alpha_inc rtts, and then 2pkts/rtt for
389 * the next alpha_inc rtts, etc.
390 */
391static void
392cdg_window_increase(struct cc_var *ccv, int new_measurement)
393{
394 struct cdg *cdg_data;
395 int incr, s_w_incr;
396
397 cdg_data = ccv->cc_data;
398 incr = s_w_incr = 0;
399
400 if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) {
401 /* Slow start. */
402 incr = CCV(ccv, t_maxseg);
403 s_w_incr = incr;
404 cdg_data->window_incr = cdg_data->rtt_count = 0;
405 } else {
406 /* Congestion avoidance. */
407 if (new_measurement) {
408 s_w_incr = CCV(ccv, t_maxseg);
409 if (V_cdg_alpha_inc == 0) {
410 incr = CCV(ccv, t_maxseg);
411 } else {
412 if (++cdg_data->rtt_count >= V_cdg_alpha_inc) {
413 cdg_data->window_incr++;
414 cdg_data->rtt_count = 0;
415 }
416 incr = CCV(ccv, t_maxseg) *
417 cdg_data->window_incr;
418 }
419 }
420 }
421
422 if (cdg_data->shadow_w > 0)
423 cdg_data->shadow_w = ulmin(cdg_data->shadow_w + s_w_incr,
424 TCP_MAXWIN << CCV(ccv, snd_scale));
425
426 CCV(ccv, snd_cwnd) = ulmin(CCV(ccv, snd_cwnd) + incr,
427 TCP_MAXWIN << CCV(ccv, snd_scale));
428}
429
430static void
431cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type)
432{
433 struct cdg *cdg_data = ccv->cc_data;
434
435 switch(signal_type) {
436 case CC_CDG_DELAY:
437 CCV(ccv, snd_ssthresh) = cdg_window_decrease(ccv,
438 CCV(ccv, snd_cwnd), V_cdg_beta_delay);
439 CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
440 CCV(ccv, snd_recover) = CCV(ccv, snd_max);
441 cdg_data->window_incr = cdg_data->rtt_count = 0;
442 ENTER_CONGRECOVERY(CCV(ccv, t_flags));
443 break;
444 case CC_NDUPACK:
445 /*
446 * If already responding to congestion OR we have guessed no
447 * queue in the path is full.
448 */
449 if (IN_CONGRECOVERY(CCV(ccv, t_flags)) ||
450 cdg_data->queue_state < CDG_Q_FULL) {
451 CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd);
452 CCV(ccv, snd_recover) = CCV(ccv, snd_max);
453 } else {
454 /*
455 * Loss is likely to be congestion related. We have
456 * inferred a queue full state, so have shadow window
457 * react to loss as NewReno would.
458 */
459 if (cdg_data->shadow_w > 0)
460 cdg_data->shadow_w = cdg_window_decrease(ccv,
461 cdg_data->shadow_w, RENO_BETA);
462
463 CCV(ccv, snd_ssthresh) = ulmax(cdg_data->shadow_w,
464 cdg_window_decrease(ccv, CCV(ccv, snd_cwnd),
465 V_cdg_beta_loss));
466
467 cdg_data->window_incr = cdg_data->rtt_count = 0;
468 }
469 ENTER_RECOVERY(CCV(ccv, t_flags));
470 break;
471 default:
472 newreno_cc_algo.cong_signal(ccv, signal_type);
473 break;
474 }
475}
476
477/*
478 * Using a negative exponential probabilistic backoff so that sources with
479 * varying RTTs which share the same link will, on average, have the same
480 * probability of backoff over time.
481 *
482 * Prob_backoff = 1 - exp(-qtrend / V_cdg_exp_backoff_scale), where
483 * V_cdg_exp_backoff_scale is the average qtrend for the exponential backoff.
484 */
485static inline int
486prob_backoff(long qtrend)
487{
488 int backoff, idx, p;
489
490 backoff = (qtrend > ((MAXGRAD * V_cdg_exp_backoff_scale) << D_P_E));
491
492 if (!backoff) {
493 if (V_cdg_exp_backoff_scale > 1)
494 idx = (qtrend + V_cdg_exp_backoff_scale / 2) /
495 V_cdg_exp_backoff_scale;
496 else
497 idx = qtrend;
498
499 /* Backoff probability proportional to rate of queue growth. */
500 p = (INT_MAX / (1 << EXP_PREC)) * probexp[idx];
501 backoff = (random() < p);
502 }
503
504 return (backoff);
505}
506
507static inline void
508calc_moving_average(struct cdg *cdg_data, long qdiff_max, long qdiff_min)
509{
510 struct qdiff_sample *qds;
511
512 ++cdg_data->num_samples;
513 if (cdg_data->num_samples > cdg_data->sample_q_size) {
514 /* Minimum RTT. */
515 qds = STAILQ_FIRST(&cdg_data->qdiffmin_q);
516 cdg_data->min_qtrend = cdg_data->min_qtrend +
517 (qdiff_min - qds->qdiff) / cdg_data->sample_q_size;
518 STAILQ_REMOVE_HEAD(&cdg_data->qdiffmin_q, qdiff_lnk);
519 qds->qdiff = qdiff_min;
520 STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds, qdiff_lnk);
521
522 /* Maximum RTT. */
523 qds = STAILQ_FIRST(&cdg_data->qdiffmax_q);
524 cdg_data->max_qtrend = cdg_data->max_qtrend +
525 (qdiff_max - qds->qdiff) / cdg_data->sample_q_size;
526 STAILQ_REMOVE_HEAD(&cdg_data->qdiffmax_q, qdiff_lnk);
527 qds->qdiff = qdiff_max;
528 STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds, qdiff_lnk);
529 --cdg_data->num_samples;
530 } else {
531 qds = uma_zalloc(qdiffsample_zone, M_NOWAIT);
532 if (qds != NULL) {
533 cdg_data->min_qtrend = cdg_data->min_qtrend +
534 qdiff_min / cdg_data->sample_q_size;
535 qds->qdiff = qdiff_min;
536 STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds,
537 qdiff_lnk);
538 }
539
540 qds = uma_zalloc(qdiffsample_zone, M_NOWAIT);
541 if (qds) {
542 cdg_data->max_qtrend = cdg_data->max_qtrend +
543 qdiff_max / cdg_data->sample_q_size;
544 qds->qdiff = qdiff_max;
545 STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds,
546 qdiff_lnk);
547 }
548 }
549}
550
551static void
552cdg_ack_received(struct cc_var *ccv, uint16_t ack_type)
553{
554 struct cdg *cdg_data;
555 struct ertt *e_t;
556 long qdiff_max, qdiff_min;
557 int congestion, new_measurement, slowstart;
558
559 cdg_data = ccv->cc_data;
560 e_t = (struct ertt *)khelp_get_osd(CCV(ccv, osd), ertt_id);
561 new_measurement = e_t->flags & ERTT_NEW_MEASUREMENT;
562 congestion = 0;
563 cdg_data->maxrtt_in_rtt = imax(e_t->rtt, cdg_data->maxrtt_in_rtt);
564 cdg_data->minrtt_in_rtt = imin(e_t->rtt, cdg_data->minrtt_in_rtt);
565
566 if (new_measurement) {
567 slowstart = (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh));
568 /*
569 * Update smoothed gradient measurements. Since we are only
570 * using one measurement per RTT, use max or min rtt_in_rtt.
571 * This is also less noisy than a sample RTT measurement. Max
572 * RTT measurements can have trouble due to OS issues.
573 */
574 if (cdg_data->maxrtt_in_prevrtt) {
575 qdiff_max = ((long)(cdg_data->maxrtt_in_rtt -
576 cdg_data->maxrtt_in_prevrtt) << D_P_E );
577 qdiff_min = ((long)(cdg_data->minrtt_in_rtt -
578 cdg_data->minrtt_in_prevrtt) << D_P_E );
579
580 calc_moving_average(cdg_data, qdiff_max, qdiff_min);
581
582 /* Probabilistic backoff with respect to gradient. */
583 if (slowstart && qdiff_min > 0)
584 congestion = prob_backoff(qdiff_min);
585 else if (cdg_data->min_qtrend > 0)
586 congestion = prob_backoff(cdg_data->min_qtrend);
587 else if (slowstart && qdiff_max > 0)
588 congestion = prob_backoff(qdiff_max);
589 else if (cdg_data->max_qtrend > 0)
590 congestion = prob_backoff(cdg_data->max_qtrend);
591
592 /* Update estimate of queue state. */
593 if (cdg_data->min_qtrend > 0 &&
594 cdg_data->max_qtrend <= 0) {
595 cdg_data->queue_state = CDG_Q_FULL;
596 } else if (cdg_data->min_qtrend >= 0 &&
597 cdg_data->max_qtrend < 0) {
598 cdg_data->queue_state = CDG_Q_EMPTY;
599 cdg_data->shadow_w = 0;
600 } else if (cdg_data->min_qtrend > 0 &&
601 cdg_data->max_qtrend > 0) {
602 cdg_data->queue_state = CDG_Q_RISING;
603 } else if (cdg_data->min_qtrend < 0 &&
604 cdg_data->max_qtrend < 0) {
605 cdg_data->queue_state = CDG_Q_FALLING;
606 }
607
608 if (cdg_data->min_qtrend < 0 ||
609 cdg_data->max_qtrend < 0)
610 cdg_data->consec_cong_cnt = 0;
611 }
612
613 cdg_data->minrtt_in_prevrtt = cdg_data->minrtt_in_rtt;
614 cdg_data->minrtt_in_rtt = INT_MAX;
615 cdg_data->maxrtt_in_prevrtt = cdg_data->maxrtt_in_rtt;
616 cdg_data->maxrtt_in_rtt = 0;
617 e_t->flags &= ~ERTT_NEW_MEASUREMENT;
618 }
619
620 if (congestion) {
621 cdg_data->consec_cong_cnt++;
622 if (!IN_RECOVERY(CCV(ccv, t_flags))) {
623 if (cdg_data->consec_cong_cnt <= V_cdg_consec_cong)
624 cdg_cong_signal(ccv, CC_CDG_DELAY);
625 else
626 /*
627 * We have been backing off but the queue is not
628 * falling. Assume we are competing with
629 * loss-based flows and don't back off for the
630 * next V_cdg_hold_backoff RTT periods.
631 */
632 if (cdg_data->consec_cong_cnt >=
633 V_cdg_consec_cong + V_cdg_hold_backoff)
634 cdg_data->consec_cong_cnt = 0;
635
636 /* Won't see effect until 2nd RTT. */
637 cdg_data->maxrtt_in_prevrtt = 0;
638 /*
639 * Resync shadow window in case we are competing with a
640 * loss based flow
641 */
642 cdg_data->shadow_w = ulmax(CCV(ccv, snd_cwnd),
643 cdg_data->shadow_w);
644 }
645 } else if (ack_type == CC_ACK)
646 cdg_window_increase(ccv, new_measurement);
647}
648
649/* When a vnet is created and being initialised, init the per-stack CDG vars. */
650VNET_SYSINIT(cdg_init_vnet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
651 cdg_init_vnet, NULL);
652
653SYSCTL_DECL(_net_inet_tcp_cc_cdg);
654SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, cdg, CTLFLAG_RW, NULL,
655 "CAIA delay-gradient congestion control related settings");
656
657SYSCTL_STRING(_net_inet_tcp_cc_cdg, OID_AUTO, version,
658 CTLFLAG_RD, CDG_VERSION, sizeof(CDG_VERSION) - 1,
659 "Current algorithm/implementation version number");
660
661SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, alpha_inc,
662 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_alpha_inc), 0,
663 "Increment the window increase factor alpha by 1 MSS segment every "
664 "alpha_inc RTTs during congestion avoidance mode.");
665
666SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_delay,
667 CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(cdg_beta_delay), 70,
668 &cdg_beta_handler, "IU",
669 "Delay-based window decrease factor as a percentage "
670 "(on delay-based backoff, w = w * beta_delay / 100)");
671
672SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_loss,
673 CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(cdg_beta_loss), 50,
674 &cdg_beta_handler, "IU",
675 "Loss-based window decrease factor as a percentage "
676 "(on loss-based backoff, w = w * beta_loss / 100)");
677
678SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, exp_backoff_scale,
679 CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
680 &VNET_NAME(cdg_exp_backoff_scale), 2, &cdg_exp_backoff_scale_handler, "IU",
681 "Scaling parameter for the probabilistic exponential backoff");
682
683SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, smoothing_factor,
684 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_smoothing_factor), 8,
685 "Number of samples used for moving average smoothing (0 = no smoothing)");
686
687SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_consec_cong,
688 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_consec_cong), 5,
689 "Number of consecutive delay-gradient based congestion episodes which will "
690 "trigger loss based CC compatibility");
691
692SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_hold_backoff,
693 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_hold_backoff), 5,
694 "Number of consecutive delay-gradient based congestion episodes to hold "
695 "the window backoff for loss based CC compatibility");
696
697DECLARE_CC_MODULE(cdg, &cdg_cc_algo);
698
699MODULE_DEPEND(cdg, ertt, 1, 1, 1);