1252504Slstewart/*-
2252504Slstewart * Copyright (c) 2009-2013
3252504Slstewart * 	Swinburne University of Technology, Melbourne, Australia
4252504Slstewart * All rights reserved.
5252504Slstewart *
6252504Slstewart * This software was developed at the Centre for Advanced Internet
7252504Slstewart * Architectures, Swinburne University of Technology, by David Hayes, made
8252504Slstewart * possible in part by a gift from The Cisco University Research Program Fund,
9252504Slstewart * a corporate advised fund of Silicon Valley Community Foundation. Development
10252504Slstewart * and testing were further assisted by a grant from the FreeBSD Foundation.
11252504Slstewart *
12252504Slstewart * Redistribution and use in source and binary forms, with or without
13252504Slstewart * modification, are permitted provided that the following conditions
14252504Slstewart * are met:
15252504Slstewart * 1. Redistributions of source code must retain the above copyright
16252504Slstewart *    notice, this list of conditions and the following disclaimer.
17252504Slstewart * 2. Redistributions in binary form must reproduce the above copyright
18252504Slstewart *    notice, this list of conditions and the following disclaimer in the
19252504Slstewart *    documentation and/or other materials provided with the distribution.
20252504Slstewart *
21252504Slstewart * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
22252504Slstewart * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23252504Slstewart * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24252504Slstewart * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
25252504Slstewart * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26252504Slstewart * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27252504Slstewart * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28252504Slstewart * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29252504Slstewart * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30252504Slstewart * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31252504Slstewart * SUCH DAMAGE.
32252504Slstewart */
33252504Slstewart
34252504Slstewart/*
35252504Slstewart * CAIA Delay-Gradient (CDG) congestion control algorithm
36252504Slstewart *
37252504Slstewart * An implemention of the delay-gradient congestion control algorithm proposed
38252504Slstewart * in the following paper:
39252504Slstewart *
40252504Slstewart * D. A. Hayes and G. Armitage, "Revisiting TCP Congestion Control using Delay
41252504Slstewart * Gradients", in IFIP Networking, Valencia, Spain, 9-13 May 2011.
42252504Slstewart *
43252504Slstewart * Developed as part of the NewTCP research project at Swinburne University of
44252504Slstewart * Technology's Centre for Advanced Internet Architectures, Melbourne,
45252504Slstewart * Australia. More details are available at:
46252504Slstewart *   http://caia.swin.edu.au/urp/newtcp/
47252504Slstewart */
48252504Slstewart
49252504Slstewart#include <sys/cdefs.h>
50252504Slstewart__FBSDID("$FreeBSD$");
51252504Slstewart
52252504Slstewart#include <sys/param.h>
53252504Slstewart#include <sys/hhook.h>
54252504Slstewart#include <sys/kernel.h>
55252504Slstewart#include <sys/khelp.h>
56252504Slstewart#include <sys/limits.h>
57252504Slstewart#include <sys/lock.h>
58252504Slstewart#include <sys/malloc.h>
59252504Slstewart#include <sys/module.h>
60252504Slstewart#include <sys/queue.h>
61252504Slstewart#include <sys/socket.h>
62252504Slstewart#include <sys/socketvar.h>
63252504Slstewart#include <sys/sysctl.h>
64252504Slstewart#include <sys/systm.h>
65252504Slstewart
66252504Slstewart#include <net/if.h>
67252504Slstewart#include <net/vnet.h>
68252504Slstewart
69252504Slstewart#include <netinet/cc.h>
70252504Slstewart#include <netinet/tcp_seq.h>
71252504Slstewart#include <netinet/tcp_timer.h>
72252504Slstewart#include <netinet/tcp_var.h>
73252504Slstewart
74252504Slstewart#include <netinet/cc/cc_module.h>
75252504Slstewart
76252504Slstewart#include <netinet/khelp/h_ertt.h>
77252504Slstewart
78252504Slstewart#include <vm/uma.h>
79252504Slstewart
80252504Slstewart#define	CDG_VERSION "0.1"
81252504Slstewart
82252504Slstewart#define	CAST_PTR_INT(X) (*((int*)(X)))
83252504Slstewart
84252504Slstewart#ifndef	VIMAGE
85252504Slstewart#define	vnet_sysctl_handle_uint(oidp, arg1, arg2, req) \
86252504Slstewart    sysctl_handle_int(oidp, arg1, arg2, req)
87252504Slstewart#endif
88252504Slstewart
89252504Slstewart/* Private delay-gradient induced congestion control signal. */
90252504Slstewart#define	CC_CDG_DELAY 0x01000000
91252504Slstewart
92252504Slstewart/* NewReno window deflation factor on loss (as a percentage). */
93252504Slstewart#define	RENO_BETA 50
94252504Slstewart
95252504Slstewart/* Queue states. */
96252504Slstewart#define	CDG_Q_EMPTY	1
97252504Slstewart#define	CDG_Q_RISING	2
98252504Slstewart#define	CDG_Q_FALLING	3
99252504Slstewart#define	CDG_Q_FULL	4
100252504Slstewart#define	CDG_Q_UNKNOWN	9999
101252504Slstewart
102252504Slstewart/* Number of bit shifts used in probexp lookup table. */
103252504Slstewart#define	EXP_PREC 15
104252504Slstewart
105252504Slstewart/* Largest gradient represented in probexp lookup table. */
106252504Slstewart#define	MAXGRAD 5
107252504Slstewart
108252504Slstewart/*
109252504Slstewart * Delay Precision Enhance - number of bit shifts used for qtrend related
110252504Slstewart * integer arithmetic precision.
111252504Slstewart */
112252504Slstewart#define	D_P_E 7
113252504Slstewart
114252504Slstewartstruct qdiff_sample {
115252504Slstewart	long qdiff;
116252504Slstewart	STAILQ_ENTRY(qdiff_sample) qdiff_lnk;
117252504Slstewart};
118252504Slstewart
119252504Slstewartstruct cdg {
120252504Slstewart	long max_qtrend;
121252504Slstewart	long min_qtrend;
122252504Slstewart	STAILQ_HEAD(minrtts_head, qdiff_sample) qdiffmin_q;
123252504Slstewart	STAILQ_HEAD(maxrtts_head, qdiff_sample) qdiffmax_q;
124252504Slstewart	long window_incr;
125252504Slstewart	/* rttcount for window increase when in congestion avoidance */
126252504Slstewart	long rtt_count;
127252504Slstewart	/* maximum measured rtt within an rtt period */
128252504Slstewart	int maxrtt_in_rtt;
129252504Slstewart	/* maximum measured rtt within prev rtt period */
130252504Slstewart	int maxrtt_in_prevrtt;
131252504Slstewart	/* minimum measured rtt within an rtt period */
132252504Slstewart	int minrtt_in_rtt;
133252504Slstewart	/* minimum measured rtt within prev rtt period */
134252504Slstewart	int minrtt_in_prevrtt;
135252504Slstewart	/* consecutive congestion episode counter */
136252504Slstewart	uint32_t consec_cong_cnt;
137252504Slstewart	/* when tracking a new reno type loss window */
138252504Slstewart	uint32_t shadow_w;
139252504Slstewart	/* maximum number of samples in the moving average queue */
140252504Slstewart	int sample_q_size;
141252504Slstewart	/* number of samples in the moving average queue */
142252504Slstewart	int num_samples;
143252504Slstewart	/* estimate of the queue state of the path */
144252504Slstewart	int queue_state;
145252504Slstewart};
146252504Slstewart
147252504Slstewart/*
148252504Slstewart * Lookup table for:
149252504Slstewart *   (1 - exp(-x)) << EXP_PREC, where x = [0,MAXGRAD] in 2^-7 increments
150252504Slstewart *
151252504Slstewart * Note: probexp[0] is set to 10 (not 0) as a safety for very low increase
152252504Slstewart * gradients.
153252504Slstewart */
154252504Slstewartstatic const int probexp[641] = {
155252504Slstewart   10,255,508,759,1008,1255,1501,1744,1985,2225,2463,2698,2932,3165,3395,3624,
156252504Slstewart   3850,4075,4299,4520,4740,4958,5175,5389,5602,5814,6024,6232,6438,6643,6846,
157252504Slstewart   7048,7248,7447,7644,7839,8033,8226,8417,8606,8794,8981,9166,9350,9532,9713,
158252504Slstewart   9892,10070,10247,10422,10596,10769,10940,11110,11278,11445,11611,11776,11939,
159252504Slstewart   12101,12262,12422,12580,12737,12893,13048,13201,13354,13505,13655,13803,13951,
160252504Slstewart   14097,14243,14387,14530,14672,14813,14952,15091,15229,15365,15500,15635,15768,
161252504Slstewart   15900,16032,16162,16291,16419,16547,16673,16798,16922,17046,17168,17289,17410,
162252504Slstewart   17529,17648,17766,17882,17998,18113,18227,18340,18453,18564,18675,18784,18893,
163252504Slstewart   19001,19108,19215,19320,19425,19529,19632,19734,19835,19936,20036,20135,20233,
164252504Slstewart   20331,20427,20523,20619,20713,20807,20900,20993,21084,21175,21265,21355,21444,
165252504Slstewart   21532,21619,21706,21792,21878,21962,22046,22130,22213,22295,22376,22457,22537,
166252504Slstewart   22617,22696,22774,22852,22929,23006,23082,23157,23232,23306,23380,23453,23525,
167252504Slstewart   23597,23669,23739,23810,23879,23949,24017,24085,24153,24220,24286,24352,24418,
168252504Slstewart   24483,24547,24611,24675,24738,24800,24862,24924,24985,25045,25106,25165,25224,
169252504Slstewart   25283,25341,25399,25456,25513,25570,25626,25681,25737,25791,25846,25899,25953,
170252504Slstewart   26006,26059,26111,26163,26214,26265,26316,26366,26416,26465,26514,26563,26611,
171252504Slstewart   26659,26707,26754,26801,26847,26893,26939,26984,27029,27074,27118,27162,27206,
172252504Slstewart   27249,27292,27335,27377,27419,27460,27502,27543,27583,27624,27664,27703,27743,
173252504Slstewart   27782,27821,27859,27897,27935,27973,28010,28047,28084,28121,28157,28193,28228,
174252504Slstewart   28263,28299,28333,28368,28402,28436,28470,28503,28536,28569,28602,28634,28667,
175252504Slstewart   28699,28730,28762,28793,28824,28854,28885,28915,28945,28975,29004,29034,29063,
176252504Slstewart   29092,29120,29149,29177,29205,29232,29260,29287,29314,29341,29368,29394,29421,
177252504Slstewart   29447,29472,29498,29524,29549,29574,29599,29623,29648,29672,29696,29720,29744,
178252504Slstewart   29767,29791,29814,29837,29860,29882,29905,29927,29949,29971,29993,30014,30036,
179252504Slstewart   30057,30078,30099,30120,30141,30161,30181,30201,30221,30241,30261,30280,30300,
180252504Slstewart   30319,30338,30357,30376,30394,30413,30431,30449,30467,30485,30503,30521,30538,
181252504Slstewart   30555,30573,30590,30607,30624,30640,30657,30673,30690,30706,30722,30738,30753,
182252504Slstewart   30769,30785,30800,30815,30831,30846,30861,30876,30890,30905,30919,30934,30948,
183252504Slstewart   30962,30976,30990,31004,31018,31031,31045,31058,31072,31085,31098,31111,31124,
184252504Slstewart   31137,31149,31162,31174,31187,31199,31211,31223,31235,31247,31259,31271,31283,
185252504Slstewart   31294,31306,31317,31328,31339,31351,31362,31373,31383,31394,31405,31416,31426,
186252504Slstewart   31436,31447,31457,31467,31477,31487,31497,31507,31517,31527,31537,31546,31556,
187252504Slstewart   31565,31574,31584,31593,31602,31611,31620,31629,31638,31647,31655,31664,31673,
188252504Slstewart   31681,31690,31698,31706,31715,31723,31731,31739,31747,31755,31763,31771,31778,
189252504Slstewart   31786,31794,31801,31809,31816,31824,31831,31838,31846,31853,31860,31867,31874,
190252504Slstewart   31881,31888,31895,31902,31908,31915,31922,31928,31935,31941,31948,31954,31960,
191252504Slstewart   31967,31973,31979,31985,31991,31997,32003,32009,32015,32021,32027,32033,32038,
192252504Slstewart   32044,32050,32055,32061,32066,32072,32077,32083,32088,32093,32098,32104,32109,
193252504Slstewart   32114,32119,32124,32129,32134,32139,32144,32149,32154,32158,32163,32168,32173,
194252504Slstewart   32177,32182,32186,32191,32195,32200,32204,32209,32213,32217,32222,32226,32230,
195252504Slstewart   32234,32238,32242,32247,32251,32255,32259,32263,32267,32270,32274,32278,32282,
196252504Slstewart   32286,32290,32293,32297,32301,32304,32308,32311,32315,32318,32322,32325,32329,
197252504Slstewart   32332,32336,32339,32342,32346,32349,32352,32356,32359,32362,32365,32368,32371,
198252504Slstewart   32374,32377,32381,32384,32387,32389,32392,32395,32398,32401,32404,32407,32410,
199252504Slstewart   32412,32415,32418,32421,32423,32426,32429,32431,32434,32437,32439,32442,32444,
200252504Slstewart   32447,32449,32452,32454,32457,32459,32461,32464,32466,32469,32471,32473,32476,
201252504Slstewart   32478,32480,32482,32485,32487,32489,32491,32493,32495,32497,32500,32502,32504,
202252504Slstewart   32506,32508,32510,32512,32514,32516,32518,32520,32522,32524,32526,32527,32529,
203252504Slstewart   32531,32533,32535,32537,32538,32540,32542,32544,32545,32547};
204252504Slstewart
205252504Slstewartstatic uma_zone_t qdiffsample_zone;
206252504Slstewart
207252504Slstewartstatic MALLOC_DEFINE(M_CDG, "cdg data",
208252504Slstewart  "Per connection data required for the CDG congestion control algorithm");
209252504Slstewart
210252504Slstewartstatic int ertt_id;
211252504Slstewart
212252504Slstewartstatic VNET_DEFINE(uint32_t, cdg_alpha_inc);
213252504Slstewartstatic VNET_DEFINE(uint32_t, cdg_beta_delay);
214252504Slstewartstatic VNET_DEFINE(uint32_t, cdg_beta_loss);
215252504Slstewartstatic VNET_DEFINE(uint32_t, cdg_smoothing_factor);
216252504Slstewartstatic VNET_DEFINE(uint32_t, cdg_exp_backoff_scale);
217252504Slstewartstatic VNET_DEFINE(uint32_t, cdg_consec_cong);
218252504Slstewartstatic VNET_DEFINE(uint32_t, cdg_hold_backoff);
219252504Slstewart#define	V_cdg_alpha_inc		VNET(cdg_alpha_inc)
220252504Slstewart#define	V_cdg_beta_delay	VNET(cdg_beta_delay)
221252504Slstewart#define	V_cdg_beta_loss		VNET(cdg_beta_loss)
222252504Slstewart#define	V_cdg_smoothing_factor	VNET(cdg_smoothing_factor)
223252504Slstewart#define	V_cdg_exp_backoff_scale	VNET(cdg_exp_backoff_scale)
224252504Slstewart#define	V_cdg_consec_cong	VNET(cdg_consec_cong)
225252504Slstewart#define	V_cdg_hold_backoff	VNET(cdg_hold_backoff)
226252504Slstewart
227252504Slstewart/* Function prototypes. */
228252504Slstewartstatic int cdg_mod_init(void);
229252504Slstewartstatic void cdg_conn_init(struct cc_var *ccv);
230252504Slstewartstatic int cdg_cb_init(struct cc_var *ccv);
231252504Slstewartstatic void cdg_cb_destroy(struct cc_var *ccv);
232252504Slstewartstatic void cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type);
233252504Slstewartstatic void cdg_ack_received(struct cc_var *ccv, uint16_t ack_type);
234252504Slstewart
235252504Slstewartstruct cc_algo cdg_cc_algo = {
236252504Slstewart	.name = "cdg",
237252504Slstewart	.mod_init = cdg_mod_init,
238252504Slstewart	.ack_received = cdg_ack_received,
239252504Slstewart	.cb_destroy = cdg_cb_destroy,
240252504Slstewart	.cb_init = cdg_cb_init,
241252504Slstewart	.conn_init = cdg_conn_init,
242252504Slstewart	.cong_signal = cdg_cong_signal
243252504Slstewart};
244252504Slstewart
245252504Slstewart/* Vnet created and being initialised. */
246252504Slstewartstatic void
247252504Slstewartcdg_init_vnet(const void *unused __unused)
248252504Slstewart{
249252504Slstewart
250252504Slstewart	V_cdg_alpha_inc = 0;
251252504Slstewart	V_cdg_beta_delay = 70;
252252504Slstewart	V_cdg_beta_loss = 50;
253252504Slstewart	V_cdg_smoothing_factor = 8;
254252504Slstewart	V_cdg_exp_backoff_scale = 3;
255252504Slstewart	V_cdg_consec_cong = 5;
256252504Slstewart	V_cdg_hold_backoff = 5;
257252504Slstewart}
258252504Slstewart
259252504Slstewartstatic int
260252504Slstewartcdg_mod_init(void)
261252504Slstewart{
262252504Slstewart	VNET_ITERATOR_DECL(v);
263252504Slstewart
264252504Slstewart	ertt_id = khelp_get_id("ertt");
265252504Slstewart	if (ertt_id <= 0)
266252504Slstewart		return (EINVAL);
267252504Slstewart
268252504Slstewart	qdiffsample_zone = uma_zcreate("cdg_qdiffsample",
269252504Slstewart	    sizeof(struct qdiff_sample), NULL, NULL, NULL, NULL, 0, 0);
270252504Slstewart
271252504Slstewart	VNET_LIST_RLOCK();
272252504Slstewart	VNET_FOREACH(v) {
273252504Slstewart		CURVNET_SET(v);
274252504Slstewart		cdg_init_vnet(NULL);
275252504Slstewart		CURVNET_RESTORE();
276252504Slstewart	}
277252504Slstewart	VNET_LIST_RUNLOCK();
278252504Slstewart
279252504Slstewart	cdg_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
280252504Slstewart	cdg_cc_algo.after_idle = newreno_cc_algo.after_idle;
281252504Slstewart
282252504Slstewart	return (0);
283252504Slstewart}
284252504Slstewart
285252504Slstewartstatic int
286252504Slstewartcdg_cb_init(struct cc_var *ccv)
287252504Slstewart{
288252504Slstewart	struct cdg *cdg_data;
289252504Slstewart
290252504Slstewart	cdg_data = malloc(sizeof(struct cdg), M_CDG, M_NOWAIT);
291252504Slstewart	if (cdg_data == NULL)
292252504Slstewart		return (ENOMEM);
293252504Slstewart
294252504Slstewart	cdg_data->shadow_w = 0;
295252504Slstewart	cdg_data->max_qtrend = 0;
296252504Slstewart	cdg_data->min_qtrend = 0;
297252504Slstewart	cdg_data->queue_state = CDG_Q_UNKNOWN;
298252504Slstewart	cdg_data->maxrtt_in_rtt = 0;
299252504Slstewart	cdg_data->maxrtt_in_prevrtt = 0;
300252504Slstewart	cdg_data->minrtt_in_rtt = INT_MAX;
301252504Slstewart	cdg_data->minrtt_in_prevrtt = 0;
302252504Slstewart	cdg_data->window_incr = 0;
303252504Slstewart	cdg_data->rtt_count = 0;
304252504Slstewart	cdg_data->consec_cong_cnt = 0;
305252504Slstewart	cdg_data->sample_q_size = V_cdg_smoothing_factor;
306252504Slstewart	cdg_data->num_samples = 0;
307252504Slstewart	STAILQ_INIT(&cdg_data->qdiffmin_q);
308252504Slstewart	STAILQ_INIT(&cdg_data->qdiffmax_q);
309252504Slstewart
310252504Slstewart	ccv->cc_data = cdg_data;
311252504Slstewart
312252504Slstewart	return (0);
313252504Slstewart}
314252504Slstewart
315252504Slstewartstatic void
316252504Slstewartcdg_conn_init(struct cc_var *ccv)
317252504Slstewart{
318252504Slstewart	struct cdg *cdg_data = ccv->cc_data;
319252504Slstewart
320252504Slstewart	/*
321252504Slstewart	 * Initialise the shadow_cwnd in case we are competing with loss based
322252504Slstewart	 * flows from the start
323252504Slstewart	 */
324252504Slstewart	cdg_data->shadow_w = CCV(ccv, snd_cwnd);
325252504Slstewart}
326252504Slstewart
327252504Slstewartstatic void
328252504Slstewartcdg_cb_destroy(struct cc_var *ccv)
329252504Slstewart{
330252504Slstewart	struct cdg *cdg_data;
331252504Slstewart	struct qdiff_sample *qds, *qds_n;
332252504Slstewart
333252504Slstewart	cdg_data = ccv->cc_data;
334252504Slstewart
335252504Slstewart	qds = STAILQ_FIRST(&cdg_data->qdiffmin_q);
336252504Slstewart	while (qds != NULL) {
337252504Slstewart		qds_n = STAILQ_NEXT(qds, qdiff_lnk);
338252504Slstewart		uma_zfree(qdiffsample_zone,qds);
339252504Slstewart		qds = qds_n;
340252504Slstewart	}
341252504Slstewart
342252504Slstewart	qds = STAILQ_FIRST(&cdg_data->qdiffmax_q);
343252504Slstewart	while (qds != NULL) {
344252504Slstewart		qds_n = STAILQ_NEXT(qds, qdiff_lnk);
345252504Slstewart		uma_zfree(qdiffsample_zone,qds);
346252504Slstewart		qds = qds_n;
347252504Slstewart	}
348252504Slstewart
349252504Slstewart	free(ccv->cc_data, M_CDG);
350252504Slstewart}
351252504Slstewart
352252504Slstewartstatic int
353252504Slstewartcdg_beta_handler(SYSCTL_HANDLER_ARGS)
354252504Slstewart{
355252504Slstewart
356252504Slstewart	if (req->newptr != NULL &&
357252504Slstewart	    (CAST_PTR_INT(req->newptr) == 0 || CAST_PTR_INT(req->newptr) > 100))
358252504Slstewart		return (EINVAL);
359252504Slstewart
360252504Slstewart	return (vnet_sysctl_handle_uint(oidp, arg1, arg2, req));
361252504Slstewart}
362252504Slstewart
363252504Slstewartstatic int
364252504Slstewartcdg_exp_backoff_scale_handler(SYSCTL_HANDLER_ARGS)
365252504Slstewart{
366252504Slstewart
367252504Slstewart	if (req->newptr != NULL && CAST_PTR_INT(req->newptr) < 1)
368252504Slstewart		return (EINVAL);
369252504Slstewart
370252504Slstewart	return (vnet_sysctl_handle_uint(oidp, arg1, arg2, req));
371252504Slstewart}
372252504Slstewart
373252504Slstewartstatic inline unsigned long
374252504Slstewartcdg_window_decrease(struct cc_var *ccv, unsigned long owin, unsigned int beta)
375252504Slstewart{
376252504Slstewart
377252504Slstewart	return ((ulmin(CCV(ccv, snd_wnd), owin) * beta) / 100);
378252504Slstewart}
379252504Slstewart
380252504Slstewart/*
381252504Slstewart * Window increase function
382252504Slstewart * This window increase function is independent of the initial window size
383252504Slstewart * to ensure small window flows are not discriminated against (i.e. fairness).
384252504Slstewart * It increases at 1pkt/rtt like Reno for alpha_inc rtts, and then 2pkts/rtt for
385252504Slstewart * the next alpha_inc rtts, etc.
386252504Slstewart */
387252504Slstewartstatic void
388252504Slstewartcdg_window_increase(struct cc_var *ccv, int new_measurement)
389252504Slstewart{
390252504Slstewart	struct cdg *cdg_data;
391252504Slstewart	int incr, s_w_incr;
392252504Slstewart
393252504Slstewart	cdg_data = ccv->cc_data;
394252504Slstewart	incr = s_w_incr = 0;
395252504Slstewart
396252504Slstewart	if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) {
397252504Slstewart		/* Slow start. */
398252504Slstewart		incr = CCV(ccv, t_maxseg);
399252504Slstewart		s_w_incr = incr;
400252504Slstewart		cdg_data->window_incr = cdg_data->rtt_count = 0;
401252504Slstewart	} else {
402252504Slstewart		/* Congestion avoidance. */
403252504Slstewart		if (new_measurement) {
404252504Slstewart			s_w_incr = CCV(ccv, t_maxseg);
405252504Slstewart			if (V_cdg_alpha_inc == 0) {
406252504Slstewart				incr = CCV(ccv, t_maxseg);
407252504Slstewart			} else {
408252504Slstewart				if (++cdg_data->rtt_count >= V_cdg_alpha_inc) {
409252504Slstewart					cdg_data->window_incr++;
410252504Slstewart					cdg_data->rtt_count = 0;
411252504Slstewart				}
412252504Slstewart				incr = CCV(ccv, t_maxseg) *
413252504Slstewart				    cdg_data->window_incr;
414252504Slstewart			}
415252504Slstewart		}
416252504Slstewart	}
417252504Slstewart
418252504Slstewart	if (cdg_data->shadow_w > 0)
419252504Slstewart		cdg_data->shadow_w = ulmin(cdg_data->shadow_w + s_w_incr,
420252504Slstewart		    TCP_MAXWIN << CCV(ccv, snd_scale));
421252504Slstewart
422252504Slstewart	CCV(ccv, snd_cwnd) = ulmin(CCV(ccv, snd_cwnd) + incr,
423252504Slstewart	    TCP_MAXWIN << CCV(ccv, snd_scale));
424252504Slstewart}
425252504Slstewart
426252504Slstewartstatic void
427252504Slstewartcdg_cong_signal(struct cc_var *ccv, uint32_t signal_type)
428252504Slstewart{
429252504Slstewart	struct cdg *cdg_data = ccv->cc_data;
430252504Slstewart
431252504Slstewart	switch(signal_type) {
432252504Slstewart	case CC_CDG_DELAY:
433252504Slstewart		CCV(ccv, snd_ssthresh) = cdg_window_decrease(ccv,
434252504Slstewart		    CCV(ccv, snd_cwnd), V_cdg_beta_delay);
435252504Slstewart		CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
436252504Slstewart		CCV(ccv, snd_recover) = CCV(ccv, snd_max);
437252504Slstewart		cdg_data->window_incr = cdg_data->rtt_count = 0;
438252504Slstewart		ENTER_CONGRECOVERY(CCV(ccv, t_flags));
439252504Slstewart		break;
440252504Slstewart	case CC_NDUPACK:
441252504Slstewart		/*
442252504Slstewart		 * If already responding to congestion OR we have guessed no
443252504Slstewart		 * queue in the path is full.
444252504Slstewart		 */
445252504Slstewart		if (IN_CONGRECOVERY(CCV(ccv, t_flags)) ||
446252504Slstewart		    cdg_data->queue_state < CDG_Q_FULL) {
447252504Slstewart			CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd);
448252504Slstewart			CCV(ccv, snd_recover) = CCV(ccv, snd_max);
449252504Slstewart		} else {
450252504Slstewart			/*
451252504Slstewart			 * Loss is likely to be congestion related. We have
452252504Slstewart			 * inferred a queue full state, so have shadow window
453252504Slstewart			 * react to loss as NewReno would.
454252504Slstewart			 */
455252504Slstewart			if (cdg_data->shadow_w > 0)
456252504Slstewart				cdg_data->shadow_w = cdg_window_decrease(ccv,
457252504Slstewart				    cdg_data->shadow_w, RENO_BETA);
458252504Slstewart
459252504Slstewart			CCV(ccv, snd_ssthresh) = ulmax(cdg_data->shadow_w,
460252504Slstewart			    cdg_window_decrease(ccv, CCV(ccv, snd_cwnd),
461252504Slstewart			    V_cdg_beta_loss));
462252504Slstewart
463252504Slstewart			cdg_data->window_incr = cdg_data->rtt_count = 0;
464252504Slstewart		}
465252504Slstewart		ENTER_RECOVERY(CCV(ccv, t_flags));
466252504Slstewart		break;
467252504Slstewart	default:
468252504Slstewart		newreno_cc_algo.cong_signal(ccv, signal_type);
469252504Slstewart		break;
470252504Slstewart	}
471252504Slstewart}
472252504Slstewart
473252504Slstewart/*
474252504Slstewart * Using a negative exponential probabilistic backoff so that sources with
475252504Slstewart * varying RTTs which share the same link will, on average, have the same
476252504Slstewart * probability of backoff over time.
477252504Slstewart *
478252504Slstewart * Prob_backoff = 1 - exp(-qtrend / V_cdg_exp_backoff_scale), where
479252504Slstewart * V_cdg_exp_backoff_scale is the average qtrend for the exponential backoff.
480252504Slstewart */
481252504Slstewartstatic inline int
482252504Slstewartprob_backoff(long qtrend)
483252504Slstewart{
484252504Slstewart	int backoff, idx, p;
485252504Slstewart
486252504Slstewart	backoff = (qtrend > ((MAXGRAD * V_cdg_exp_backoff_scale) << D_P_E));
487252504Slstewart
488252504Slstewart	if (!backoff) {
489252504Slstewart		if (V_cdg_exp_backoff_scale > 1)
490252504Slstewart			idx = (qtrend + V_cdg_exp_backoff_scale / 2) /
491252504Slstewart			    V_cdg_exp_backoff_scale;
492252504Slstewart		else
493252504Slstewart			idx = qtrend;
494252504Slstewart
495252504Slstewart		/* Backoff probability proportional to rate of queue growth. */
496252504Slstewart		p = (INT_MAX / (1 << EXP_PREC)) * probexp[idx];
497252504Slstewart		backoff = (random() < p);
498252504Slstewart	}
499252504Slstewart
500252504Slstewart	return (backoff);
501252504Slstewart}
502252504Slstewart
503252504Slstewartstatic inline void
504252504Slstewartcalc_moving_average(struct cdg *cdg_data, long qdiff_max, long qdiff_min)
505252504Slstewart{
506252504Slstewart	struct qdiff_sample *qds;
507252504Slstewart
508252504Slstewart	++cdg_data->num_samples;
509252504Slstewart	if (cdg_data->num_samples > cdg_data->sample_q_size) {
510252504Slstewart		/* Minimum RTT. */
511252504Slstewart		qds = STAILQ_FIRST(&cdg_data->qdiffmin_q);
512252504Slstewart		cdg_data->min_qtrend =  cdg_data->min_qtrend +
513252504Slstewart		    (qdiff_min - qds->qdiff) / cdg_data->sample_q_size;
514252504Slstewart		STAILQ_REMOVE_HEAD(&cdg_data->qdiffmin_q, qdiff_lnk);
515252504Slstewart		qds->qdiff = qdiff_min;
516252504Slstewart		STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds, qdiff_lnk);
517252504Slstewart
518252504Slstewart		/* Maximum RTT. */
519252504Slstewart		qds = STAILQ_FIRST(&cdg_data->qdiffmax_q);
520252504Slstewart		cdg_data->max_qtrend =  cdg_data->max_qtrend +
521252504Slstewart		    (qdiff_max - qds->qdiff) / cdg_data->sample_q_size;
522252504Slstewart		STAILQ_REMOVE_HEAD(&cdg_data->qdiffmax_q, qdiff_lnk);
523252504Slstewart		qds->qdiff = qdiff_max;
524252504Slstewart		STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds, qdiff_lnk);
525252504Slstewart		--cdg_data->num_samples;
526252504Slstewart	} else {
527252504Slstewart		qds = uma_zalloc(qdiffsample_zone, M_NOWAIT);
528252504Slstewart		if (qds != NULL) {
529252504Slstewart			cdg_data->min_qtrend = cdg_data->min_qtrend +
530252504Slstewart			    qdiff_min / cdg_data->sample_q_size;
531252504Slstewart			qds->qdiff = qdiff_min;
532252504Slstewart			STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds,
533252504Slstewart			    qdiff_lnk);
534252504Slstewart		}
535252504Slstewart
536252504Slstewart		qds = uma_zalloc(qdiffsample_zone, M_NOWAIT);
537252504Slstewart		if (qds) {
538252504Slstewart			cdg_data->max_qtrend = cdg_data->max_qtrend +
539252504Slstewart			    qdiff_max / cdg_data->sample_q_size;
540252504Slstewart			qds->qdiff = qdiff_max;
541252504Slstewart			STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds,
542252504Slstewart			    qdiff_lnk);
543252504Slstewart		}
544252504Slstewart	}
545252504Slstewart}
546252504Slstewart
547252504Slstewartstatic void
548252504Slstewartcdg_ack_received(struct cc_var *ccv, uint16_t ack_type)
549252504Slstewart{
550252504Slstewart	struct cdg *cdg_data;
551252504Slstewart	struct ertt *e_t;
552252504Slstewart	long qdiff_max, qdiff_min;
553252504Slstewart	int congestion, new_measurement, slowstart;
554252504Slstewart
555252504Slstewart	cdg_data = ccv->cc_data;
556252504Slstewart	e_t = (struct ertt *)khelp_get_osd(CCV(ccv, osd), ertt_id);
557252504Slstewart	new_measurement = e_t->flags & ERTT_NEW_MEASUREMENT;
558252504Slstewart	congestion = 0;
559252504Slstewart	cdg_data->maxrtt_in_rtt = imax(e_t->rtt, cdg_data->maxrtt_in_rtt);
560252504Slstewart	cdg_data->minrtt_in_rtt = imin(e_t->rtt, cdg_data->minrtt_in_rtt);
561252504Slstewart
562252504Slstewart	if (new_measurement) {
563252504Slstewart		slowstart = (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh));
564252504Slstewart		/*
565252504Slstewart		 * Update smoothed gradient measurements. Since we are only
566252504Slstewart		 * using one measurement per RTT, use max or min rtt_in_rtt.
567252504Slstewart		 * This is also less noisy than a sample RTT measurement. Max
568252504Slstewart		 * RTT measurements can have trouble due to OS issues.
569252504Slstewart		 */
570252504Slstewart		if (cdg_data->maxrtt_in_prevrtt) {
571252504Slstewart			qdiff_max = ((long)(cdg_data->maxrtt_in_rtt -
572252504Slstewart			    cdg_data->maxrtt_in_prevrtt) << D_P_E );
573252504Slstewart			qdiff_min = ((long)(cdg_data->minrtt_in_rtt -
574252504Slstewart			    cdg_data->minrtt_in_prevrtt) << D_P_E );
575252504Slstewart
576252504Slstewart			calc_moving_average(cdg_data, qdiff_max, qdiff_min);
577252504Slstewart
578252504Slstewart			/* Probabilistic backoff with respect to gradient. */
579252504Slstewart			if (slowstart && qdiff_min > 0)
580252504Slstewart				congestion = prob_backoff(qdiff_min);
581252504Slstewart			else if (cdg_data->min_qtrend > 0)
582252504Slstewart				congestion = prob_backoff(cdg_data->min_qtrend);
583252504Slstewart			else if (slowstart && qdiff_max > 0)
584252504Slstewart				congestion = prob_backoff(qdiff_max);
585252504Slstewart			else if (cdg_data->max_qtrend > 0)
586252504Slstewart				congestion = prob_backoff(cdg_data->max_qtrend);
587252504Slstewart
588252504Slstewart			/* Update estimate of queue state. */
589252504Slstewart			if (cdg_data->min_qtrend > 0 &&
590252504Slstewart			    cdg_data->max_qtrend <= 0) {
591252504Slstewart				cdg_data->queue_state = CDG_Q_FULL;
592252504Slstewart			} else if (cdg_data->min_qtrend >= 0 &&
593252504Slstewart			    cdg_data->max_qtrend < 0) {
594252504Slstewart				cdg_data->queue_state = CDG_Q_EMPTY;
595252504Slstewart				cdg_data->shadow_w = 0;
596252504Slstewart			} else if (cdg_data->min_qtrend > 0 &&
597252504Slstewart			    cdg_data->max_qtrend > 0) {
598252504Slstewart				cdg_data->queue_state = CDG_Q_RISING;
599252504Slstewart			} else if (cdg_data->min_qtrend < 0 &&
600252504Slstewart			    cdg_data->max_qtrend < 0) {
601252504Slstewart				cdg_data->queue_state = CDG_Q_FALLING;
602252504Slstewart			}
603252504Slstewart
604252504Slstewart			if (cdg_data->min_qtrend < 0 ||
605252504Slstewart			    cdg_data->max_qtrend < 0)
606252504Slstewart				cdg_data->consec_cong_cnt = 0;
607252504Slstewart		}
608252504Slstewart
609252504Slstewart		cdg_data->minrtt_in_prevrtt = cdg_data->minrtt_in_rtt;
610252504Slstewart		cdg_data->minrtt_in_rtt = INT_MAX;
611252504Slstewart		cdg_data->maxrtt_in_prevrtt = cdg_data->maxrtt_in_rtt;
612252504Slstewart		cdg_data->maxrtt_in_rtt = 0;
613252504Slstewart		e_t->flags &= ~ERTT_NEW_MEASUREMENT;
614252504Slstewart	}
615252504Slstewart
616252504Slstewart	if (congestion) {
617252504Slstewart		cdg_data->consec_cong_cnt++;
618252504Slstewart		if (!IN_RECOVERY(CCV(ccv, t_flags))) {
619252504Slstewart			if (cdg_data->consec_cong_cnt <= V_cdg_consec_cong)
620252504Slstewart				cdg_cong_signal(ccv, CC_CDG_DELAY);
621252504Slstewart			else
622252504Slstewart				/*
623252504Slstewart				 * We have been backing off but the queue is not
624252504Slstewart				 * falling. Assume we are competing with
625252504Slstewart				 * loss-based flows and don't back off for the
626252504Slstewart				 * next V_cdg_hold_backoff RTT periods.
627252504Slstewart				 */
628252504Slstewart				if (cdg_data->consec_cong_cnt >=
629252504Slstewart				    V_cdg_consec_cong + V_cdg_hold_backoff)
630252504Slstewart					cdg_data->consec_cong_cnt = 0;
631252504Slstewart
632252504Slstewart			/* Won't see effect until 2nd RTT. */
633252504Slstewart			cdg_data->maxrtt_in_prevrtt = 0;
634252504Slstewart			/*
635252504Slstewart			 * Resync shadow window in case we are competing with a
636252504Slstewart			 * loss based flow
637252504Slstewart			 */
638252504Slstewart			cdg_data->shadow_w = ulmax(CCV(ccv, snd_cwnd),
639252504Slstewart			    cdg_data->shadow_w);
640252504Slstewart		}
641252504Slstewart	} else if (ack_type == CC_ACK)
642252504Slstewart		cdg_window_increase(ccv, new_measurement);
643252504Slstewart}
644252504Slstewart
645252504Slstewart/* When a vnet is created and being initialised, init the per-stack CDG vars. */
646252504SlstewartVNET_SYSINIT(cdg_init_vnet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
647252504Slstewart    cdg_init_vnet, NULL);
648252504Slstewart
649252504SlstewartSYSCTL_DECL(_net_inet_tcp_cc_cdg);
650252504SlstewartSYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, cdg, CTLFLAG_RW, NULL,
651252504Slstewart    "CAIA delay-gradient congestion control related settings");
652252504Slstewart
653252504SlstewartSYSCTL_STRING(_net_inet_tcp_cc_cdg, OID_AUTO, version,
654252504Slstewart    CTLFLAG_RD, CDG_VERSION, sizeof(CDG_VERSION) - 1,
655252504Slstewart    "Current algorithm/implementation version number");
656252504Slstewart
657252504SlstewartSYSCTL_VNET_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, alpha_inc,
658252504Slstewart    CTLFLAG_RW, &VNET_NAME(cdg_alpha_inc), 0,
659252504Slstewart    "Increment the window increase factor alpha by 1 MSS segment every "
660252504Slstewart    "alpha_inc RTTs during congestion avoidance mode.");
661252504Slstewart
662252504SlstewartSYSCTL_VNET_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_delay,
663252504Slstewart    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(cdg_beta_delay), 70,
664252504Slstewart    &cdg_beta_handler, "IU",
665252504Slstewart    "Delay-based window decrease factor as a percentage "
666252504Slstewart    "(on delay-based backoff, w = w * beta_delay / 100)");
667252504Slstewart
668252504SlstewartSYSCTL_VNET_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_loss,
669252504Slstewart    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(cdg_beta_loss), 50,
670252504Slstewart    &cdg_beta_handler, "IU",
671252504Slstewart    "Loss-based window decrease factor as a percentage "
672252504Slstewart    "(on loss-based backoff, w = w * beta_loss / 100)");
673252504Slstewart
674252504SlstewartSYSCTL_VNET_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, exp_backoff_scale,
675252504Slstewart    CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(cdg_exp_backoff_scale), 2,
676252504Slstewart    &cdg_exp_backoff_scale_handler, "IU",
677252504Slstewart    "Scaling parameter for the probabilistic exponential backoff");
678252504Slstewart
679252504SlstewartSYSCTL_VNET_UINT(_net_inet_tcp_cc_cdg,  OID_AUTO, smoothing_factor,
680252504Slstewart    CTLFLAG_RW, &VNET_NAME(cdg_smoothing_factor), 8,
681252504Slstewart    "Number of samples used for moving average smoothing (0 = no smoothing)");
682252504Slstewart
683252504SlstewartSYSCTL_VNET_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_consec_cong,
684252504Slstewart    CTLFLAG_RW, &VNET_NAME(cdg_consec_cong), 5,
685252504Slstewart    "Number of consecutive delay-gradient based congestion episodes which will "
686252504Slstewart    "trigger loss based CC compatibility");
687252504Slstewart
688252504SlstewartSYSCTL_VNET_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_hold_backoff,
689252504Slstewart    CTLFLAG_RW, &VNET_NAME(cdg_hold_backoff), 5,
690252504Slstewart    "Number of consecutive delay-gradient based congestion episodes to hold "
691252504Slstewart    "the window backoff for loss based CC compatibility");
692252504Slstewart
693252504SlstewartDECLARE_CC_MODULE(cdg, &cdg_cc_algo);
694252504Slstewart
695252504SlstewartMODULE_DEPEND(cdg, ertt, 1, 1, 1);
696