1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2009-2010
5 * 	Swinburne University of Technology, Melbourne, Australia
6 * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
7 * Copyright (c) 2010-2011 The FreeBSD Foundation
8 * All rights reserved.
9 *
10 * This software was developed at the Centre for Advanced Internet
11 * Architectures, Swinburne University of Technology, by David Hayes, made
12 * possible in part by a grant from the Cisco University Research Program Fund
13 * at Community Foundation Silicon Valley.
14 *
15 * Portions of this software were developed at the Centre for Advanced
16 * Internet Architectures, Swinburne University of Technology, Melbourne,
17 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
18 *
19 * Redistribution and use in source and binary forms, with or without
20 * modification, are permitted provided that the following conditions
21 * are met:
22 * 1. Redistributions of source code must retain the above copyright
23 *    notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 *    notice, this list of conditions and the following disclaimer in the
26 *    documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 */
40
41#include <sys/param.h>
42#include <sys/kernel.h>
43#include <sys/mbuf.h>
44#include <sys/module.h>
45#include <sys/hhook.h>
46#include <sys/khelp.h>
47#include <sys/module_khelp.h>
48#include <sys/socket.h>
49#include <sys/sockopt.h>
50
51#include <net/vnet.h>
52
53#include <netinet/in.h>
54#include <netinet/in_pcb.h>
55#include <netinet/tcp_seq.h>
56#include <netinet/tcp_var.h>
57
58#include <netinet/khelp/h_ertt.h>
59
60#include <vm/uma.h>
61
62uma_zone_t txseginfo_zone;
63
64/* Smoothing factor for delayed ack guess. */
65#define	DLYACK_SMOOTH	5
66
67/* Max number of time stamp errors allowed in a session. */
68#define	MAX_TS_ERR	10
69
/* Helper hook handlers (see ertt_hooks[]). */
static int	ertt_packet_measurement_hook(int hhook_type, int hhook_id,
		    void *udata, void *ctx_data, void *hdata,
		    struct osd *hosd);
static int	ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id,
		    void *udata, void *ctx_data, void *hdata,
		    struct osd *hosd);

/* Module init/destroy callbacks (see ertt_helper). */
static int	ertt_mod_init(void);
static int	ertt_mod_destroy(void);

/* Constructor/destructor for the per-connection struct ertt. */
static int	ertt_uma_ctor(void *mem, int size, void *arg, int flags);
static void	ertt_uma_dtor(void *mem, int size, void *arg);
78
79/*
80 * Contains information about the sent segment for comparison with the
81 * corresponding ack.
82 */
83struct txseginfo {
84	/* Segment length. */
85	uint32_t	len;
86	/* Segment sequence number. */
87	tcp_seq		seq;
88	/* Time stamp indicating when the packet was sent. */
89	uint32_t	tx_ts;
90	/* Last received receiver ts (if the TCP option is used). */
91	uint32_t	rx_ts;
92	uint32_t	flags;
93	TAILQ_ENTRY (txseginfo) txsegi_lnk;
94};
95
/*
 * Bit flags stored in txseginfo.flags.
 */
/* TSO was used for this entry. */
#define	TXSI_TSO		0x01
/* Start a per RTT measurement. */
#define	TXSI_RTT_MEASURE_START	0x02
/* Measure the rx rate until this txsi. */
#define	TXSI_RX_MEASURE_END	0x04
100
101struct helper ertt_helper = {
102	.mod_init = ertt_mod_init,
103	.mod_destroy = ertt_mod_destroy,
104	.h_flags = HELPER_NEEDS_OSD,
105	.h_classes = HELPER_CLASS_TCP
106};
107
108/* Define the helper hook info required by ERTT. */
109struct hookinfo ertt_hooks[] = {
110	{
111		.hook_type = HHOOK_TYPE_TCP,
112		.hook_id = HHOOK_TCP_EST_IN,
113		.hook_udata = NULL,
114		.hook_func = &ertt_packet_measurement_hook
115	},
116	{
117		.hook_type = HHOOK_TYPE_TCP,
118		.hook_id = HHOOK_TCP_EST_OUT,
119		.hook_udata = NULL,
120		.hook_func = &ertt_add_tx_segment_info_hook
121	}
122};
123
/*
 * Values for the mflag argument of marked_packet_rtt(), telling it how the
 * txsi being processed relates to the current ack.
 */
/* More than this txsi is acked. */
#define	MULTI_ACK		0x01
/* TXSI is old according to timestamps. */
#define	OLD_TXSI		0x02
/* Acks this TXSI. */
#define	CORRECT_ACK		0x04
/* Force an RTT measurement. */
#define	FORCED_MEASUREMENT	0x08
129
130/*
131 * This fuction measures the RTT of a particular segment/ack pair, or the next
132 * closest if this will yield an inaccurate result due to delayed acking or
133 * other issues.
134 */
135static void inline
136marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp,
137    uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust,
138    int mflag)
139{
140
141	/*
142	 * If we can't measure this one properly due to delayed acking adjust
143	 * byte counters and flag to measure next txsi. Note that since the
144	 * marked packet's transmitted bytes are measured we need to subtract the
145	 * transmitted bytes. Then pretend the next txsi was marked.
146	 */
147	if (mflag & (MULTI_ACK|OLD_TXSI)) {
148		*pmeasurenext = txsi->tx_ts;
149		*pmeasurenext_len = txsi->len;
150		*prtt_bytes_adjust += *pmeasurenext_len;
151	} else {
152		if (mflag & FORCED_MEASUREMENT) {
153			e_t->markedpkt_rtt = tcp_ts_getticks() -
154			    *pmeasurenext + 1;
155			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt +
156			    *pmeasurenext_len - *prtt_bytes_adjust;
157		} else {
158			e_t->markedpkt_rtt = tcp_ts_getticks() -
159			    txsi->tx_ts + 1;
160			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt -
161			    *prtt_bytes_adjust;
162		}
163		e_t->marked_snd_cwnd = tp->snd_cwnd;
164
165		/*
166		 * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to
167		 * add_tx_segment_info that a new measurement should be started.
168		 */
169		e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS;
170		/*
171		 * Set ERTT_NEW_MEASUREMENT to tell the congestion control
172		 * algorithm that a new marked RTT measurement has has been made
173		 * and is available for use.
174		 */
175		e_t->flags |= ERTT_NEW_MEASUREMENT;
176
177		if (tp->t_flags & TF_TSO) {
178			/* Temporarily disable TSO to aid a new measurement. */
179			tp->t_flags &= ~TF_TSO;
180			/* Keep track that we've disabled it. */
181			e_t->flags |= ERTT_TSO_DISABLED;
182		}
183	}
184}
185
186/*
187 * Ertt_packet_measurements uses a small amount of state kept on each packet
188 * sent to match incoming acknowledgements. This enables more accurate and
189 * secure round trip time measurements. The resulting measurement is used for
190 * congestion control algorithms which require a more accurate time.
191 * Ertt_packet_measurements is called via the helper hook in tcp_input.c
192 */
193static int
194ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
195    void *ctx_data, void *hdata, struct osd *hosd)
196{
197	struct ertt *e_t;
198	struct tcpcb *tp;
199	struct tcphdr *th;
200	struct tcpopt *to;
201	struct tcp_hhook_data *thdp;
202	struct txseginfo *txsi;
203	int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust;
204	uint32_t measurenext, rts;
205	tcp_seq ack;
206
207	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
208	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
209
210	e_t = (struct ertt *)hdata;
211	thdp = ctx_data;
212	tp = thdp->tp;
213	th = thdp->th;
214	to = thdp->to;
215	new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
216	measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
217	acked = th->th_ack - tp->snd_una;
218
219	INP_WLOCK_ASSERT(tptoinpcb(tp));
220
221	/* Packet has provided new acknowledgements. */
222	if (acked > 0 || new_sacked_bytes) {
223		if (acked == 0 && new_sacked_bytes) {
224			/* Use last sacked data. */
225			ack = tp->sackhint.last_sack_ack;
226		} else
227			ack = th->th_ack;
228
229		txsi = TAILQ_FIRST(&e_t->txsegi_q);
230		while (txsi != NULL) {
231			rts = 0;
232
233			/* Acknowledgement is acking more than this txsi. */
234			if (SEQ_GT(ack, txsi->seq + txsi->len)) {
235				if (txsi->flags & TXSI_RTT_MEASURE_START ||
236				    measurenext) {
237					marked_packet_rtt(txsi, e_t, tp,
238					    &measurenext, &measurenext_len,
239					    &rtt_bytes_adjust, MULTI_ACK);
240				}
241				TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
242				uma_zfree(txseginfo_zone, txsi);
243				txsi = TAILQ_FIRST(&e_t->txsegi_q);
244				continue;
245			}
246
247			/*
248			 * Guess if delayed acks are being used by the receiver.
249			 *
250			 * XXXDH: A simple heuristic that could be improved
251			 */
252			if (!new_sacked_bytes) {
253				if (acked > tp->t_maxseg) {
254					e_t->dlyack_rx +=
255					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
256					    1 : 0;
257					multiack = 1;
258				} else if (acked > txsi->len) {
259					multiack = 1;
260					e_t->dlyack_rx +=
261					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
262					    1 : 0;
263				} else if (acked == tp->t_maxseg ||
264					   acked == txsi->len) {
265					e_t->dlyack_rx -=
266					    (e_t->dlyack_rx > 0) ? 1 : 0;
267				}
268				/* Otherwise leave dlyack_rx the way it was. */
269			}
270
271			/*
272			 * Time stamps are only to help match the txsi with the
273			 * received acknowledgements.
274			 */
275			if (e_t->timestamp_errors < MAX_TS_ERR &&
276			    (to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
277				/*
278				 * Note: All packets sent with the offload will
279				 * have the same time stamp. If we are sending
280				 * on a fast interface and the t_maxseg is much
281				 * smaller than one tick, this will be fine. The
282				 * time stamp would be the same whether we were
283				 * using tso or not. However, if the interface
284				 * is slow, this will cause problems with the
285				 * calculations. If the interface is slow, there
286				 * is not reason to be using tso, and it should
287				 * be turned off.
288				 */
289				/*
290				 * If there are too many time stamp errors, time
291				 * stamps won't be trusted
292				 */
293				rts = to->to_tsecr;
294				/* Before this packet. */
295				if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts))
296					/* When delayed acking is used, the
297					 * reflected time stamp is of the first
298					 * packet and thus may be before
299					 * txsi->tx_ts.
300					 */
301					break;
302				if (TSTMP_GT(rts, txsi->tx_ts)) {
303					/*
304					 * If reflected time stamp is later than
305					 * tx_tsi, then this txsi is old.
306					 */
307					if (txsi->flags & TXSI_RTT_MEASURE_START
308					    || measurenext) {
309						marked_packet_rtt(txsi, e_t, tp,
310						    &measurenext, &measurenext_len,
311						    &rtt_bytes_adjust, OLD_TXSI);
312					}
313					TAILQ_REMOVE(&e_t->txsegi_q, txsi,
314					    txsegi_lnk);
315					uma_zfree(txseginfo_zone, txsi);
316					txsi = TAILQ_FIRST(&e_t->txsegi_q);
317					continue;
318				}
319				if (rts == txsi->tx_ts &&
320				    TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
321					/*
322					 * Segment received before sent!
323					 * Something is wrong with the received
324					 * timestamps so increment errors. If
325					 * this keeps up we will ignore
326					 * timestamps.
327					 */
328					e_t->timestamp_errors++;
329				}
330			}
331			/*
332			 * Acknowledging a sequence number before this txsi.
333			 * If it is an old txsi that may have had the same seq
334			 * numbers, it should have been removed if time stamps
335			 * are being used.
336			 */
337			if (SEQ_LEQ(ack, txsi->seq))
338				break; /* Before first packet in txsi. */
339
340			/*
341			 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len
342			 * past this point.
343			 *
344			 * If delayed acks are being used, an acknowledgement
345			 * for a single segment will have been delayed by the
346			 * receiver and will yield an inaccurate measurement. In
347			 * this case, we only make the measurement if more than
348			 * one segment is being acknowledged or sack is
349			 * currently being used.
350			 */
351			if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
352				/* Make an accurate new measurement. */
353				e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;
354
355				if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
356					e_t->minrtt = e_t->rtt;
357
358				if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
359					e_t->maxrtt = e_t->rtt;
360			}
361
362			if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
363				marked_packet_rtt(txsi, e_t, tp,
364				    &measurenext, &measurenext_len,
365				    &rtt_bytes_adjust, CORRECT_ACK);
366
367			if (txsi->flags & TXSI_TSO) {
368				if (txsi->len > acked) {
369					txsi->len -= acked;
370					/*
371					 * This presumes ack for first bytes in
372					 * txsi, this may not be true but it
373					 * shouldn't cause problems for the
374					 * timing.
375					 *
376					 * We remeasure RTT even though we only
377					 * have a single txsi. The rationale
378					 * behind this is that it is better to
379					 * have a slightly inaccurate
380					 * measurement than no additional
381					 * measurement for the rest of the bulk
382					 * transfer. Since TSO is only used on
383					 * high speed interface cards, so the
384					 * packets should be transmitted at line
385					 * rate back to back with little
386					 * difference in transmission times (in
387					 * ticks).
388					 */
389					txsi->seq += acked;
390					/*
391					 * Reset txsi measure flag so we don't
392					 * use it for another RTT measurement.
393					 */
394					txsi->flags &= ~TXSI_RTT_MEASURE_START;
395					/*
396					 * There is still more data to be acked
397					 * from tso bulk transmission, so we
398					 * won't remove it from the TAILQ yet.
399					 */
400					break;
401				}
402				txsi->len = 0;
403			}
404
405			TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
406			uma_zfree(txseginfo_zone, txsi);
407			break;
408		}
409
410		if (measurenext) {
411			/*
412			 * We need to do a RTT measurement. It won't be the best
413			 * if we do it here.
414			 */
415			marked_packet_rtt(txsi, e_t, tp,
416			    &measurenext, &measurenext_len,
417			    &rtt_bytes_adjust, FORCED_MEASUREMENT);
418		}
419	}
420
421	return (0);
422}
423
424/*
425 * Add information about a transmitted segment to a list.
426 * This is called via the helper hook in tcp_output.c
427 */
428static int
429ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata,
430    void *ctx_data, void *hdata, struct osd *hosd)
431{
432	struct ertt *e_t;
433	struct tcpcb *tp;
434	struct tcphdr *th;
435	struct tcpopt *to;
436	struct tcp_hhook_data *thdp;
437	struct txseginfo *txsi;
438	uint32_t len;
439	int tso;
440
441	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
442	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
443
444	e_t = (struct ertt *)hdata;
445	thdp = ctx_data;
446	tp = thdp->tp;
447	th = thdp->th;
448	to = thdp->to;
449	len = thdp->len;
450	tso = thdp->tso;
451
452	INP_WLOCK_ASSERT(tptoinpcb(tp));
453
454	if (len > 0) {
455		txsi = uma_zalloc(txseginfo_zone, M_NOWAIT);
456		if (txsi != NULL) {
457			/* Construct txsi setting the necessary flags. */
458			txsi->flags = 0; /* Needs to be initialised. */
459			txsi->seq = ntohl(th->th_seq);
460			txsi->len = len;
461			if (tso)
462				txsi->flags |= TXSI_TSO;
463			else if (e_t->flags & ERTT_TSO_DISABLED) {
464				tp->t_flags |= TF_TSO;
465				e_t->flags &= ~ERTT_TSO_DISABLED;
466			}
467
468			if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) {
469				e_t->bytes_tx_in_rtt += len;
470			} else {
471				txsi->flags |= TXSI_RTT_MEASURE_START;
472				e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS;
473				e_t->bytes_tx_in_rtt = len;
474			}
475
476			if (((tp->t_flags & TF_NOOPT) == 0) &&
477			    (to->to_flags & TOF_TS)) {
478				txsi->tx_ts = ntohl(to->to_tsval) -
479				    tp->ts_offset;
480				txsi->rx_ts = ntohl(to->to_tsecr);
481			} else {
482				txsi->tx_ts = tcp_ts_getticks();
483				txsi->rx_ts = 0; /* No received time stamp. */
484			}
485			TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk);
486		}
487	}
488
489	return (0);
490}
491
492static int
493ertt_mod_init(void)
494{
495
496	txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo),
497	    NULL, NULL, NULL, NULL, 0, 0);
498
499	return (0);
500}
501
502static int
503ertt_mod_destroy(void)
504{
505
506	uma_zdestroy(txseginfo_zone);
507
508	return (0);
509}
510
511static int
512ertt_uma_ctor(void *mem, int size, void *arg, int flags)
513{
514	struct ertt *e_t;
515
516	e_t = mem;
517
518	TAILQ_INIT(&e_t->txsegi_q);
519	e_t->timestamp_errors = 0;
520	e_t->minrtt = 0;
521	e_t->maxrtt = 0;
522	e_t->rtt = 0;
523	e_t->flags = 0;
524	e_t->dlyack_rx = 0;
525	e_t->bytes_tx_in_rtt = 0;
526	e_t->markedpkt_rtt = 0;
527
528	return (0);
529}
530
531static void
532ertt_uma_dtor(void *mem, int size, void *arg)
533{
534	struct ertt *e_t;
535	struct txseginfo *n_txsi, *txsi;
536
537	e_t = mem;
538	txsi = TAILQ_FIRST(&e_t->txsegi_q);
539	while (txsi != NULL) {
540		n_txsi = TAILQ_NEXT(txsi, txsegi_lnk);
541		uma_zfree(txseginfo_zone, txsi);
542		txsi = n_txsi;
543	}
544}
545
546KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt),
547    ertt_uma_ctor, ertt_uma_dtor);
548