1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2009-2010
5 * 	Swinburne University of Technology, Melbourne, Australia
6 * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
7 * Copyright (c) 2010-2011 The FreeBSD Foundation
8 * All rights reserved.
9 *
10 * This software was developed at the Centre for Advanced Internet
11 * Architectures, Swinburne University of Technology, by David Hayes, made
12 * possible in part by a grant from the Cisco University Research Program Fund
13 * at Community Foundation Silicon Valley.
14 *
15 * Portions of this software were developed at the Centre for Advanced
16 * Internet Architectures, Swinburne University of Technology, Melbourne,
17 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
18 *
19 * Redistribution and use in source and binary forms, with or without
20 * modification, are permitted provided that the following conditions
21 * are met:
22 * 1. Redistributions of source code must retain the above copyright
23 *    notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 *    notice, this list of conditions and the following disclaimer in the
26 *    documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD$");
43
44#include <sys/param.h>
45#include <sys/kernel.h>
46#include <sys/mbuf.h>
47#include <sys/module.h>
48#include <sys/hhook.h>
49#include <sys/khelp.h>
50#include <sys/module_khelp.h>
51#include <sys/socket.h>
52#include <sys/sockopt.h>
53
54#include <net/vnet.h>
55
56#include <netinet/in.h>
57#include <netinet/in_pcb.h>
58#include <netinet/tcp_seq.h>
59#include <netinet/tcp_var.h>
60
61#include <netinet/khelp/h_ertt.h>
62
63#include <vm/uma.h>
64
/*
 * UMA zone from which struct txseginfo items are allocated. It is created in
 * ertt_mod_init() and destroyed in ertt_mod_destroy().
 */
uma_zone_t txseginfo_zone;

/*
 * Smoothing factor for delayed ack guess. The dlyack_rx counter is only
 * incremented while it is below this value.
 */
#define	DLYACK_SMOOTH	5

/*
 * Max number of time stamp errors allowed in a session. Once timestamp_errors
 * reaches this value the timestamp based matching is skipped.
 */
#define	MAX_TS_ERR	10

/* Helper hook callbacks; they are wired up through ertt_hooks[]. */
static int ertt_packet_measurement_hook(int hhook_type, int hhook_id,
    void *udata, void *ctx_data, void *hdata, struct osd *hosd);
static int ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id,
    void *udata, void *ctx_data, void *hdata, struct osd *hosd);
/* Module init/destroy callbacks; they are referenced from ertt_helper. */
static int ertt_mod_init(void);
static int ertt_mod_destroy(void);
/* Constructor/destructor for struct ertt, given to KHELP_DECLARE_MOD_UMA. */
static int ertt_uma_ctor(void *mem, int size, void *arg, int flags);
static void ertt_uma_dtor(void *mem, int size, void *arg);
81
/*
 * Contains information about the sent segment for comparison with the
 * corresponding ack. One entry is queued per transmitted segment that carries
 * data and is released again once the matching ack has been processed.
 */
struct txseginfo {
	/* Segment length. Reduced in place when a TSO entry is partly acked. */
	uint32_t	len;
	/* Segment sequence number. Advanced when a TSO entry is partly acked. */
	tcp_seq		seq;
	/* Time stamp indicating when the packet was sent. */
	uint32_t	tx_ts;
	/* Last received receiver ts (if the TCP option is used). */
	uint32_t	rx_ts;
	/* Bitwise OR of the TXSI_* flags defined below. */
	uint32_t	flags;
	/* Linkage for the txsegi_q tail queue kept in struct ertt. */
	TAILQ_ENTRY (txseginfo) txsegi_lnk;
};

/* Flags for struct txseginfo. */
#define	TXSI_TSO		0x01 /* TSO was used for this entry. */
#define	TXSI_RTT_MEASURE_START	0x02 /* Start a per RTT measurement. */
#define	TXSI_RX_MEASURE_END	0x04 /* Measure the rx rate until this txsi. */
103
/*
 * Helper description for ERTT: names the module init/destroy callbacks, asks
 * for object specific data (HELPER_NEEDS_OSD) and declares the helper as
 * belonging to the TCP class.
 */
struct helper ertt_helper = {
	.mod_init = ertt_mod_init,
	.mod_destroy = ertt_mod_destroy,
	.h_flags = HELPER_NEEDS_OSD,
	.h_classes = HELPER_CLASS_TCP
};
110
/* Define the helper hook info required by ERTT. */
struct hookinfo ertt_hooks[] = {
	/* HHOOK_TCP_EST_IN: match the incoming ack against queued txseginfo. */
	{
		.hook_type = HHOOK_TYPE_TCP,
		.hook_id = HHOOK_TCP_EST_IN,
		.hook_udata = NULL,
		.hook_func = &ertt_packet_measurement_hook
	},
	/* HHOOK_TCP_EST_OUT: queue a txseginfo for the segment being sent. */
	{
		.hook_type = HHOOK_TYPE_TCP,
		.hook_id = HHOOK_TCP_EST_OUT,
		.hook_udata = NULL,
		.hook_func = &ertt_add_tx_segment_info_hook
	}
};
126
/*
 * Flags to indicate how marked_packet_rtt should handle this txsi. They are
 * passed in its mflag argument.
 */
#define	MULTI_ACK		0x01 /* More than this txsi is acked. */
#define	OLD_TXSI		0x02 /* TXSI is old according to timestamps. */
#define	CORRECT_ACK		0x04 /* Acks this TXSI. */
#define	FORCED_MEASUREMENT	0x08 /* Force an RTT measurement. */
132
133/*
134 * This fuction measures the RTT of a particular segment/ack pair, or the next
135 * closest if this will yield an inaccurate result due to delayed acking or
136 * other issues.
137 */
138static void inline
139marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp,
140    uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust,
141    int mflag)
142{
143
144	/*
145	 * If we can't measure this one properly due to delayed acking adjust
146	 * byte counters and flag to measure next txsi. Note that since the
147	 * marked packet's transmitted bytes are measured we need to subtract the
148	 * transmitted bytes. Then pretend the next txsi was marked.
149	 */
150	if (mflag & (MULTI_ACK|OLD_TXSI)) {
151		*pmeasurenext = txsi->tx_ts;
152		*pmeasurenext_len = txsi->len;
153		*prtt_bytes_adjust += *pmeasurenext_len;
154	} else {
155		if (mflag & FORCED_MEASUREMENT) {
156			e_t->markedpkt_rtt = tcp_ts_getticks() -
157			    *pmeasurenext + 1;
158			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt +
159			    *pmeasurenext_len - *prtt_bytes_adjust;
160		} else {
161			e_t->markedpkt_rtt = tcp_ts_getticks() -
162			    txsi->tx_ts + 1;
163			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt -
164			    *prtt_bytes_adjust;
165		}
166		e_t->marked_snd_cwnd = tp->snd_cwnd;
167
168		/*
169		 * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to
170		 * add_tx_segment_info that a new measurement should be started.
171		 */
172		e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS;
173		/*
174		 * Set ERTT_NEW_MEASUREMENT to tell the congestion control
175		 * algorithm that a new marked RTT measurement has has been made
176		 * and is available for use.
177		 */
178		e_t->flags |= ERTT_NEW_MEASUREMENT;
179
180		if (tp->t_flags & TF_TSO) {
181			/* Temporarily disable TSO to aid a new measurment. */
182			tp->t_flags &= ~TF_TSO;
183			/* Keep track that we've disabled it. */
184			e_t->flags |= ERTT_TSO_DISABLED;
185		}
186	}
187}
188
189/*
190 * Ertt_packet_measurements uses a small amount of state kept on each packet
191 * sent to match incoming acknowledgements. This enables more accurate and
192 * secure round trip time measurements. The resulting measurement is used for
193 * congestion control algorithms which require a more accurate time.
194 * Ertt_packet_measurements is called via the helper hook in tcp_input.c
195 */
196static int
197ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
198    void *ctx_data, void *hdata, struct osd *hosd)
199{
200	struct ertt *e_t;
201	struct tcpcb *tp;
202	struct tcphdr *th;
203	struct tcpopt *to;
204	struct tcp_hhook_data *thdp;
205	struct txseginfo *txsi;
206	int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust;
207	uint32_t measurenext, rts;
208	tcp_seq ack;
209
210	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
211	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
212
213	e_t = (struct ertt *)hdata;
214	thdp = ctx_data;
215	tp = thdp->tp;
216	th = thdp->th;
217	to = thdp->to;
218	new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
219	measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
220	acked = th->th_ack - tp->snd_una;
221
222	INP_WLOCK_ASSERT(tp->t_inpcb);
223
224	/* Packet has provided new acknowledgements. */
225	if (acked > 0 || new_sacked_bytes) {
226		if (acked == 0 && new_sacked_bytes) {
227			/* Use last sacked data. */
228			ack = tp->sackhint.last_sack_ack;
229		} else
230			ack = th->th_ack;
231
232		txsi = TAILQ_FIRST(&e_t->txsegi_q);
233		while (txsi != NULL) {
234			rts = 0;
235
236			/* Acknowledgement is acking more than this txsi. */
237			if (SEQ_GT(ack, txsi->seq + txsi->len)) {
238				if (txsi->flags & TXSI_RTT_MEASURE_START ||
239				    measurenext) {
240					marked_packet_rtt(txsi, e_t, tp,
241					    &measurenext, &measurenext_len,
242					    &rtt_bytes_adjust, MULTI_ACK);
243				}
244				TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
245				uma_zfree(txseginfo_zone, txsi);
246				txsi = TAILQ_FIRST(&e_t->txsegi_q);
247				continue;
248			}
249
250			/*
251			 * Guess if delayed acks are being used by the receiver.
252			 *
253			 * XXXDH: A simple heuristic that could be improved
254			 */
255			if (!new_sacked_bytes) {
256				if (acked > tp->t_maxseg) {
257					e_t->dlyack_rx +=
258					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
259					    1 : 0;
260					multiack = 1;
261				} else if (acked > txsi->len) {
262					multiack = 1;
263					e_t->dlyack_rx +=
264					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
265					    1 : 0;
266				} else if (acked == tp->t_maxseg ||
267					   acked == txsi->len) {
268					e_t->dlyack_rx -=
269					    (e_t->dlyack_rx > 0) ? 1 : 0;
270				}
271				/* Otherwise leave dlyack_rx the way it was. */
272			}
273
274			/*
275			 * Time stamps are only to help match the txsi with the
276			 * received acknowledgements.
277			 */
278			if (e_t->timestamp_errors < MAX_TS_ERR &&
279			    (to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
280				/*
281				 * Note: All packets sent with the offload will
282				 * have the same time stamp. If we are sending
283				 * on a fast interface and the t_maxseg is much
284				 * smaller than one tick, this will be fine. The
285				 * time stamp would be the same whether we were
286				 * using tso or not. However, if the interface
287				 * is slow, this will cause problems with the
288				 * calculations. If the interface is slow, there
289				 * is not reason to be using tso, and it should
290				 * be turned off.
291				 */
292				/*
293				 * If there are too many time stamp errors, time
294				 * stamps won't be trusted
295				 */
296				rts = to->to_tsecr;
297				/* Before this packet. */
298				if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts))
299					/* When delayed acking is used, the
300					 * reflected time stamp is of the first
301					 * packet and thus may be before
302					 * txsi->tx_ts.
303					 */
304					break;
305				if (TSTMP_GT(rts, txsi->tx_ts)) {
306					/*
307					 * If reflected time stamp is later than
308					 * tx_tsi, then this txsi is old.
309					 */
310					if (txsi->flags & TXSI_RTT_MEASURE_START
311					    || measurenext) {
312						marked_packet_rtt(txsi, e_t, tp,
313						    &measurenext, &measurenext_len,
314						    &rtt_bytes_adjust, OLD_TXSI);
315					}
316					TAILQ_REMOVE(&e_t->txsegi_q, txsi,
317					    txsegi_lnk);
318					uma_zfree(txseginfo_zone, txsi);
319					txsi = TAILQ_FIRST(&e_t->txsegi_q);
320					continue;
321				}
322				if (rts == txsi->tx_ts &&
323				    TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
324					/*
325					 * Segment received before sent!
326					 * Something is wrong with the received
327					 * timestamps so increment errors. If
328					 * this keeps up we will ignore
329					 * timestamps.
330					 */
331					e_t->timestamp_errors++;
332				}
333			}
334			/*
335			 * Acknowledging a sequence number before this txsi.
336			 * If it is an old txsi that may have had the same seq
337			 * numbers, it should have been removed if time stamps
338			 * are being used.
339			 */
340			if (SEQ_LEQ(ack, txsi->seq))
341				break; /* Before first packet in txsi. */
342
343			/*
344			 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len
345			 * past this point.
346			 *
347			 * If delayed acks are being used, an acknowledgement
348			 * for a single segment will have been delayed by the
349			 * receiver and will yield an inaccurate measurement. In
350			 * this case, we only make the measurement if more than
351			 * one segment is being acknowledged or sack is
352			 * currently being used.
353			 */
354			if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
355				/* Make an accurate new measurement. */
356				e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;
357
358				if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
359					e_t->minrtt = e_t->rtt;
360
361				if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
362					e_t->maxrtt = e_t->rtt;
363			}
364
365			if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
366				marked_packet_rtt(txsi, e_t, tp,
367				    &measurenext, &measurenext_len,
368				    &rtt_bytes_adjust, CORRECT_ACK);
369
370			if (txsi->flags & TXSI_TSO) {
371				if (txsi->len > acked) {
372					txsi->len -= acked;
373					/*
374					 * This presumes ack for first bytes in
375					 * txsi, this may not be true but it
376					 * shouldn't cause problems for the
377					 * timing.
378					 *
379					 * We remeasure RTT even though we only
380					 * have a single txsi. The rationale
381					 * behind this is that it is better to
382					 * have a slightly inaccurate
383					 * measurement than no additional
384					 * measurement for the rest of the bulk
385					 * transfer. Since TSO is only used on
386					 * high speed interface cards, so the
387					 * packets should be transmitted at line
388					 * rate back to back with little
389					 * difference in transmission times (in
390					 * ticks).
391					 */
392					txsi->seq += acked;
393					/*
394					 * Reset txsi measure flag so we don't
395					 * use it for another RTT measurement.
396					 */
397					txsi->flags &= ~TXSI_RTT_MEASURE_START;
398					/*
399					 * There is still more data to be acked
400					 * from tso bulk transmission, so we
401					 * won't remove it from the TAILQ yet.
402					 */
403					break;
404				}
405				txsi->len = 0;
406			}
407
408			TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
409			uma_zfree(txseginfo_zone, txsi);
410			break;
411		}
412
413		if (measurenext) {
414			/*
415			 * We need to do a RTT measurement. It won't be the best
416			 * if we do it here.
417			 */
418			marked_packet_rtt(txsi, e_t, tp,
419			    &measurenext, &measurenext_len,
420			    &rtt_bytes_adjust, FORCED_MEASUREMENT);
421		}
422	}
423
424	return (0);
425}
426
427/*
428 * Add information about a transmitted segment to a list.
429 * This is called via the helper hook in tcp_output.c
430 */
431static int
432ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata,
433    void *ctx_data, void *hdata, struct osd *hosd)
434{
435	struct ertt *e_t;
436	struct tcpcb *tp;
437	struct tcphdr *th;
438	struct tcpopt *to;
439	struct tcp_hhook_data *thdp;
440	struct txseginfo *txsi;
441	uint32_t len;
442	int tso;
443
444	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
445	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
446
447	e_t = (struct ertt *)hdata;
448	thdp = ctx_data;
449	tp = thdp->tp;
450	th = thdp->th;
451	to = thdp->to;
452	len = thdp->len;
453	tso = thdp->tso;
454
455	INP_WLOCK_ASSERT(tp->t_inpcb);
456
457	if (len > 0) {
458		txsi = uma_zalloc(txseginfo_zone, M_NOWAIT);
459		if (txsi != NULL) {
460			/* Construct txsi setting the necessary flags. */
461			txsi->flags = 0; /* Needs to be initialised. */
462			txsi->seq = ntohl(th->th_seq);
463			txsi->len = len;
464			if (tso)
465				txsi->flags |= TXSI_TSO;
466			else if (e_t->flags & ERTT_TSO_DISABLED) {
467				tp->t_flags |= TF_TSO;
468				e_t->flags &= ~ERTT_TSO_DISABLED;
469			}
470
471			if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) {
472				e_t->bytes_tx_in_rtt += len;
473			} else {
474				txsi->flags |= TXSI_RTT_MEASURE_START;
475				e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS;
476				e_t->bytes_tx_in_rtt = len;
477			}
478
479			if (((tp->t_flags & TF_NOOPT) == 0) &&
480			    (to->to_flags & TOF_TS)) {
481				txsi->tx_ts = ntohl(to->to_tsval) -
482				    tp->ts_offset;
483				txsi->rx_ts = ntohl(to->to_tsecr);
484			} else {
485				txsi->tx_ts = tcp_ts_getticks();
486				txsi->rx_ts = 0; /* No received time stamp. */
487			}
488			TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk);
489		}
490	}
491
492	return (0);
493}
494
495static int
496ertt_mod_init(void)
497{
498
499	txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo),
500	    NULL, NULL, NULL, NULL, 0, 0);
501
502	return (0);
503}
504
505static int
506ertt_mod_destroy(void)
507{
508
509	uma_zdestroy(txseginfo_zone);
510
511	return (0);
512}
513
514static int
515ertt_uma_ctor(void *mem, int size, void *arg, int flags)
516{
517	struct ertt *e_t;
518
519	e_t = mem;
520
521	TAILQ_INIT(&e_t->txsegi_q);
522	e_t->timestamp_errors = 0;
523	e_t->minrtt = 0;
524	e_t->maxrtt = 0;
525	e_t->rtt = 0;
526	e_t->flags = 0;
527	e_t->dlyack_rx = 0;
528	e_t->bytes_tx_in_rtt = 0;
529	e_t->markedpkt_rtt = 0;
530
531	return (0);
532}
533
534static void
535ertt_uma_dtor(void *mem, int size, void *arg)
536{
537	struct ertt *e_t;
538	struct txseginfo *n_txsi, *txsi;
539
540	e_t = mem;
541	txsi = TAILQ_FIRST(&e_t->txsegi_q);
542	while (txsi != NULL) {
543		n_txsi = TAILQ_NEXT(txsi, txsegi_lnk);
544		uma_zfree(txseginfo_zone, txsi);
545		txsi = n_txsi;
546	}
547}
548
/*
 * Declare the ertt helper module: ties ertt_helper to the hooks listed in
 * ertt_hooks[] and supplies the size, constructor and destructor to use for
 * struct ertt.
 * NOTE(review): the literal 1 is presumably the module version; confirm
 * against the definition of KHELP_DECLARE_MOD_UMA.
 */
KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt),
    ertt_uma_ctor, ertt_uma_dtor);
551