1/*-
2 * Copyright (c) 2009-2010
3 * 	Swinburne University of Technology, Melbourne, Australia
4 * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
5 * Copyright (c) 2010-2011 The FreeBSD Foundation
6 * All rights reserved.
7 *
8 * This software was developed at the Centre for Advanced Internet
9 * Architectures, Swinburne University of Technology, by David Hayes, made
10 * possible in part by a grant from the Cisco University Research Program Fund
11 * at Community Foundation Silicon Valley.
12 *
13 * Portions of this software were developed at the Centre for Advanced
14 * Internet Architectures, Swinburne University of Technology, Melbourne,
15 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 *    notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 *    notice, this list of conditions and the following disclaimer in the
24 *    documentation and/or other materials provided with the distribution.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39#include <sys/cdefs.h>
40__FBSDID("$FreeBSD$");
41
42#include <sys/param.h>
43#include <sys/kernel.h>
44#include <sys/mbuf.h>
45#include <sys/module.h>
46#include <sys/hhook.h>
47#include <sys/khelp.h>
48#include <sys/module_khelp.h>
49#include <sys/socket.h>
50#include <sys/sockopt.h>
51
52#include <net/vnet.h>
53
54#include <netinet/in.h>
55#include <netinet/in_pcb.h>
56#include <netinet/tcp_seq.h>
57#include <netinet/tcp_var.h>
58
59#include <netinet/khelp/h_ertt.h>
60
61#include <vm/uma.h>
62
/*
 * UMA zone that backs every struct txseginfo allocation in this file. It is
 * created in ertt_mod_init() and destroyed in ertt_mod_destroy().
 */
uma_zone_t txseginfo_zone;

/*
 * Smoothing factor for delayed ack guess. The dlyack_rx counter in struct
 * ertt is incremented only while it is below this value.
 */
#define	DLYACK_SMOOTH	5

/*
 * Max number of time stamp errors allowed in a session. Once
 * timestamp_errors reaches this value the input hook stops consulting TCP
 * time stamps when matching acks to transmitted segments.
 */
#define	MAX_TS_ERR	10

/* Helper hook functions; wired to their hook points in ertt_hooks[]. */
static int ertt_packet_measurement_hook(int hhook_type, int hhook_id,
    void *udata, void *ctx_data, void *hdata, struct osd *hosd);
static int ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id,
    void *udata, void *ctx_data, void *hdata, struct osd *hosd);
/* Module init/destroy handlers referenced from ertt_helper. */
static int ertt_mod_init(void);
static int ertt_mod_destroy(void);
/* Constructor/destructor for struct ertt, passed to KHELP_DECLARE_MOD_UMA. */
static int ertt_uma_ctor(void *mem, int size, void *arg, int flags);
static void ertt_uma_dtor(void *mem, int size, void *arg);
79
/*
 * Contains information about the sent segment for comparison with the
 * corresponding ack. Entries are allocated from txseginfo_zone by the output
 * hook, queued on the txsegi_q list of struct ertt, and freed by the input
 * hook (or by ertt_uma_dtor()) once they are no longer needed.
 */
struct txseginfo {
	/* Segment length. */
	long		len;
	/* Segment sequence number. */
	tcp_seq		seq;
	/* Time stamp indicating when the packet was sent. */
	uint32_t	tx_ts;
	/* Last received receiver ts (if the TCP option is used). */
	uint32_t	rx_ts;
	/* Bitwise OR of the TXSI_* flags defined below. */
	uint32_t	flags;
	/* Linkage for the txsegi_q tail queue in struct ertt. */
	TAILQ_ENTRY (txseginfo) txsegi_lnk;
};

/* Flags for struct txseginfo. */
#define	TXSI_TSO		0x01 /* TSO was used for this entry. */
#define	TXSI_RTT_MEASURE_START	0x02 /* Start a per RTT measurement. */
#define	TXSI_RX_MEASURE_END	0x04 /* Measure the rx rate until this txsi. */
101
/*
 * Helper descriptor for ERTT. It names the module init/destroy handlers and
 * is handed to the khelp framework by the KHELP_DECLARE_MOD_UMA() invocation
 * at the end of this file.
 * NOTE(review): HELPER_NEEDS_OSD presumably requests per-object storage (the
 * struct ertt passed to the hooks as hdata) and HELPER_CLASS_TCP presumably
 * restricts the helper to TCP objects -- confirm against sys/khelp.h.
 */
struct helper ertt_helper = {
	.mod_init = ertt_mod_init,
	.mod_destroy = ertt_mod_destroy,
	.h_flags = HELPER_NEEDS_OSD,
	.h_classes = HELPER_CLASS_TCP
};
108
/*
 * Define the helper hook info required by ERTT.
 *
 * Two TCP hook points are used, neither with private hook data:
 *  - HHOOK_TCP_EST_IN runs ertt_packet_measurement_hook(), which matches
 *    incoming acks against the queued txseginfo entries.
 *  - HHOOK_TCP_EST_OUT runs ertt_add_tx_segment_info_hook(), which queues a
 *    txseginfo entry for each transmitted segment that carries data.
 */
struct hookinfo ertt_hooks[] = {
	{
		.hook_type = HHOOK_TYPE_TCP,
		.hook_id = HHOOK_TCP_EST_IN,
		.hook_udata = NULL,
		.hook_func = &ertt_packet_measurement_hook
	},
	{
		.hook_type = HHOOK_TYPE_TCP,
		.hook_id = HHOOK_TCP_EST_OUT,
		.hook_udata = NULL,
		.hook_func = &ertt_add_tx_segment_info_hook
	}
};
124
/*
 * Flags to indicate how marked_packet_rtt should handle this txsi. They are
 * passed in its mflag argument; MULTI_ACK and OLD_TXSI make it defer the
 * measurement to a later txsi, the other two make it record a measurement.
 */
#define	MULTI_ACK		0x01 /* More than this txsi is acked. */
#define	OLD_TXSI		0x02 /* TXSI is old according to timestamps. */
#define	CORRECT_ACK		0x04 /* Acks this TXSI. */
#define	FORCED_MEASUREMENT	0x08 /* Force an RTT measurement. */
130
131/*
132 * This fuction measures the RTT of a particular segment/ack pair, or the next
133 * closest if this will yield an inaccurate result due to delayed acking or
134 * other issues.
135 */
136static void inline
137marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp,
138    uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust,
139    int mflag)
140{
141
142	/*
143	 * If we can't measure this one properly due to delayed acking adjust
144	 * byte counters and flag to measure next txsi. Note that since the
145	 * marked packet's transmitted bytes are measured we need to subtract the
146	 * transmitted bytes. Then pretend the next txsi was marked.
147	 */
148	if (mflag & (MULTI_ACK|OLD_TXSI)) {
149		*pmeasurenext = txsi->tx_ts;
150		*pmeasurenext_len = txsi->len;
151		*prtt_bytes_adjust += *pmeasurenext_len;
152	} else {
153		if (mflag & FORCED_MEASUREMENT) {
154			e_t->markedpkt_rtt = tcp_ts_getticks() -
155			    *pmeasurenext + 1;
156			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt +
157			    *pmeasurenext_len - *prtt_bytes_adjust;
158		} else {
159			e_t->markedpkt_rtt = tcp_ts_getticks() -
160			    txsi->tx_ts + 1;
161			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt -
162			    *prtt_bytes_adjust;
163		}
164		e_t->marked_snd_cwnd = tp->snd_cwnd;
165
166		/*
167		 * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to
168		 * add_tx_segment_info that a new measurement should be started.
169		 */
170		e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS;
171		/*
172		 * Set ERTT_NEW_MEASUREMENT to tell the congestion control
173		 * algorithm that a new marked RTT measurement has has been made
174		 * and is available for use.
175		 */
176		e_t->flags |= ERTT_NEW_MEASUREMENT;
177
178		if (tp->t_flags & TF_TSO) {
179			/* Temporarily disable TSO to aid a new measurment. */
180			tp->t_flags &= ~TF_TSO;
181			/* Keep track that we've disabled it. */
182			e_t->flags |= ERTT_TSO_DISABLED;
183		}
184	}
185}
186
187/*
188 * Ertt_packet_measurements uses a small amount of state kept on each packet
189 * sent to match incoming acknowledgements. This enables more accurate and
190 * secure round trip time measurements. The resulting measurement is used for
191 * congestion control algorithms which require a more accurate time.
192 * Ertt_packet_measurements is called via the helper hook in tcp_input.c
193 */
194static int
195ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
196    void *ctx_data, void *hdata, struct osd *hosd)
197{
198	struct ertt *e_t;
199	struct tcpcb *tp;
200	struct tcphdr *th;
201	struct tcpopt *to;
202	struct tcp_hhook_data *thdp;
203	struct txseginfo *txsi;
204	int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust;
205	uint32_t measurenext, rts;
206	tcp_seq ack;
207
208	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
209	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
210
211	e_t = (struct ertt *)hdata;
212	thdp = ctx_data;
213	tp = thdp->tp;
214	th = thdp->th;
215	to = thdp->to;
216	new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
217	measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
218	acked = th->th_ack - tp->snd_una;
219
220	INP_WLOCK_ASSERT(tp->t_inpcb);
221
222	/* Packet has provided new acknowledgements. */
223	if (acked > 0 || new_sacked_bytes) {
224		if (acked == 0 && new_sacked_bytes) {
225			/* Use last sacked data. */
226			ack = tp->sackhint.last_sack_ack;
227		} else
228			ack = th->th_ack;
229
230		txsi = TAILQ_FIRST(&e_t->txsegi_q);
231		while (txsi != NULL) {
232			rts = 0;
233
234			/* Acknowledgement is acking more than this txsi. */
235			if (SEQ_GT(ack, txsi->seq + txsi->len)) {
236				if (txsi->flags & TXSI_RTT_MEASURE_START ||
237				    measurenext) {
238					marked_packet_rtt(txsi, e_t, tp,
239					    &measurenext, &measurenext_len,
240					    &rtt_bytes_adjust, MULTI_ACK);
241				}
242				TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
243				uma_zfree(txseginfo_zone, txsi);
244				txsi = TAILQ_FIRST(&e_t->txsegi_q);
245				continue;
246			}
247
248			/*
249			 * Guess if delayed acks are being used by the receiver.
250			 *
251			 * XXXDH: A simple heuristic that could be improved
252			 */
253			if (!new_sacked_bytes) {
254				if (acked > tp->t_maxseg) {
255					e_t->dlyack_rx +=
256					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
257					    1 : 0;
258					multiack = 1;
259				} else if (acked > txsi->len) {
260					multiack = 1;
261					e_t->dlyack_rx +=
262					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
263					    1 : 0;
264				} else if (acked == tp->t_maxseg ||
265					   acked == txsi->len) {
266					e_t->dlyack_rx -=
267					    (e_t->dlyack_rx > 0) ? 1 : 0;
268				}
269				/* Otherwise leave dlyack_rx the way it was. */
270			}
271
272			/*
273			 * Time stamps are only to help match the txsi with the
274			 * received acknowledgements.
275			 */
276			if (e_t->timestamp_errors < MAX_TS_ERR &&
277			    (to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
278				/*
279				 * Note: All packets sent with the offload will
280				 * have the same time stamp. If we are sending
281				 * on a fast interface and the t_maxseg is much
282				 * smaller than one tick, this will be fine. The
283				 * time stamp would be the same whether we were
284				 * using tso or not. However, if the interface
285				 * is slow, this will cause problems with the
286				 * calculations. If the interface is slow, there
287				 * is not reason to be using tso, and it should
288				 * be turned off.
289				 */
290				/*
291				 * If there are too many time stamp errors, time
292				 * stamps won't be trusted
293				 */
294				rts = to->to_tsecr;
295				/* Before this packet. */
296				if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts))
297					/* When delayed acking is used, the
298					 * reflected time stamp is of the first
299					 * packet and thus may be before
300					 * txsi->tx_ts.
301					 */
302					break;
303				if (TSTMP_GT(rts, txsi->tx_ts)) {
304					/*
305					 * If reflected time stamp is later than
306					 * tx_tsi, then this txsi is old.
307					 */
308					if (txsi->flags & TXSI_RTT_MEASURE_START
309					    || measurenext) {
310						marked_packet_rtt(txsi, e_t, tp,
311						    &measurenext, &measurenext_len,
312						    &rtt_bytes_adjust, OLD_TXSI);
313					}
314					TAILQ_REMOVE(&e_t->txsegi_q, txsi,
315					    txsegi_lnk);
316					uma_zfree(txseginfo_zone, txsi);
317					txsi = TAILQ_FIRST(&e_t->txsegi_q);
318					continue;
319				}
320				if (rts == txsi->tx_ts &&
321				    TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
322					/*
323					 * Segment received before sent!
324					 * Something is wrong with the received
325					 * timestamps so increment errors. If
326					 * this keeps up we will ignore
327					 * timestamps.
328					 */
329					e_t->timestamp_errors++;
330				}
331			}
332			/*
333			 * Acknowledging a sequence number before this txsi.
334			 * If it is an old txsi that may have had the same seq
335			 * numbers, it should have been removed if time stamps
336			 * are being used.
337			 */
338			if (SEQ_LEQ(ack, txsi->seq))
339				break; /* Before first packet in txsi. */
340
341			/*
342			 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len
343			 * past this point.
344			 *
345			 * If delayed acks are being used, an acknowledgement
346			 * for a single segment will have been delayed by the
347			 * receiver and will yield an inaccurate measurement. In
348			 * this case, we only make the measurement if more than
349			 * one segment is being acknowledged or sack is
350			 * currently being used.
351			 */
352			if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
353				/* Make an accurate new measurement. */
354				e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;
355
356				if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
357					e_t->minrtt = e_t->rtt;
358
359				if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
360					e_t->maxrtt = e_t->rtt;
361			}
362
363			if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
364				marked_packet_rtt(txsi, e_t, tp,
365				    &measurenext, &measurenext_len,
366				    &rtt_bytes_adjust, CORRECT_ACK);
367
368			if (txsi->flags & TXSI_TSO) {
369				txsi->len -= acked;
370				if (txsi->len > 0) {
371					/*
372					 * This presumes ack for first bytes in
373					 * txsi, this may not be true but it
374					 * shouldn't cause problems for the
375					 * timing.
376					 *
377					 * We remeasure RTT even though we only
378					 * have a single txsi. The rationale
379					 * behind this is that it is better to
380					 * have a slightly inaccurate
381					 * measurement than no additional
382					 * measurement for the rest of the bulk
383					 * transfer. Since TSO is only used on
384					 * high speed interface cards, so the
385					 * packets should be transmitted at line
386					 * rate back to back with little
387					 * difference in transmission times (in
388					 * ticks).
389					 */
390					txsi->seq += acked;
391					/*
392					 * Reset txsi measure flag so we don't
393					 * use it for another RTT measurement.
394					 */
395					txsi->flags &= ~TXSI_RTT_MEASURE_START;
396					/*
397					 * There is still more data to be acked
398					 * from tso bulk transmission, so we
399					 * won't remove it from the TAILQ yet.
400					 */
401					break;
402				}
403			}
404
405			TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
406			uma_zfree(txseginfo_zone, txsi);
407			break;
408		}
409
410		if (measurenext) {
411			/*
412			 * We need to do a RTT measurement. It won't be the best
413			 * if we do it here.
414			 */
415			marked_packet_rtt(txsi, e_t, tp,
416			    &measurenext, &measurenext_len,
417			    &rtt_bytes_adjust, FORCED_MEASUREMENT);
418		}
419	}
420
421	return (0);
422}
423
424/*
425 * Add information about a transmitted segment to a list.
426 * This is called via the helper hook in tcp_output.c
427 */
428static int
429ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata,
430    void *ctx_data, void *hdata, struct osd *hosd)
431{
432	struct ertt *e_t;
433	struct tcpcb *tp;
434	struct tcphdr *th;
435	struct tcpopt *to;
436	struct tcp_hhook_data *thdp;
437	struct txseginfo *txsi;
438	long len;
439	int tso;
440
441	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
442	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
443
444	e_t = (struct ertt *)hdata;
445	thdp = ctx_data;
446	tp = thdp->tp;
447	th = thdp->th;
448	to = thdp->to;
449	len = thdp->len;
450	tso = thdp->tso;
451
452	INP_WLOCK_ASSERT(tp->t_inpcb);
453
454	if (len > 0) {
455		txsi = uma_zalloc(txseginfo_zone, M_NOWAIT);
456		if (txsi != NULL) {
457			/* Construct txsi setting the necessary flags. */
458			txsi->flags = 0; /* Needs to be initialised. */
459			txsi->seq = ntohl(th->th_seq);
460			txsi->len = len;
461			if (tso)
462				txsi->flags |= TXSI_TSO;
463			else if (e_t->flags & ERTT_TSO_DISABLED) {
464				tp->t_flags |= TF_TSO;
465				e_t->flags &= ~ERTT_TSO_DISABLED;
466			}
467
468			if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) {
469				e_t->bytes_tx_in_rtt += len;
470			} else {
471				txsi->flags |= TXSI_RTT_MEASURE_START;
472				e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS;
473				e_t->bytes_tx_in_rtt = len;
474			}
475
476			if (((tp->t_flags & TF_NOOPT) == 0) &&
477			    (to->to_flags & TOF_TS)) {
478				txsi->tx_ts = ntohl(to->to_tsval) -
479				    tp->ts_offset;
480				txsi->rx_ts = ntohl(to->to_tsecr);
481			} else {
482				txsi->tx_ts = tcp_ts_getticks();
483				txsi->rx_ts = 0; /* No received time stamp. */
484			}
485			TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk);
486		}
487	}
488
489	return (0);
490}
491
492static int
493ertt_mod_init(void)
494{
495
496	txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo),
497	    NULL, NULL, NULL, NULL, 0, 0);
498
499	return (0);
500}
501
502static int
503ertt_mod_destroy(void)
504{
505
506	uma_zdestroy(txseginfo_zone);
507
508	return (0);
509}
510
511static int
512ertt_uma_ctor(void *mem, int size, void *arg, int flags)
513{
514	struct ertt *e_t;
515
516	e_t = mem;
517
518	TAILQ_INIT(&e_t->txsegi_q);
519	e_t->timestamp_errors = 0;
520	e_t->minrtt = 0;
521	e_t->maxrtt = 0;
522	e_t->rtt = 0;
523	e_t->flags = 0;
524	e_t->dlyack_rx = 0;
525	e_t->bytes_tx_in_rtt = 0;
526	e_t->markedpkt_rtt = 0;
527
528	return (0);
529}
530
531static void
532ertt_uma_dtor(void *mem, int size, void *arg)
533{
534	struct ertt *e_t;
535	struct txseginfo *n_txsi, *txsi;
536
537	e_t = mem;
538	txsi = TAILQ_FIRST(&e_t->txsegi_q);
539	while (txsi != NULL) {
540		n_txsi = TAILQ_NEXT(txsi, txsegi_lnk);
541		uma_zfree(txseginfo_zone, txsi);
542		txsi = n_txsi;
543	}
544}
545
/*
 * Declare the "ertt" khelp module: hands the framework the helper
 * descriptor, its hook table, the size of the struct ertt data block and the
 * constructor/destructor used for it.
 * NOTE(review): the literal 1 is presumably the module version -- confirm
 * against the macro definition in sys/module_khelp.h.
 */
KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt),
    ertt_uma_ctor, ertt_uma_dtor);
548