1217806Slstewart/*-
2217806Slstewart * Copyright (c) 2009-2010
3217806Slstewart * 	Swinburne University of Technology, Melbourne, Australia
4217806Slstewart * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
5217806Slstewart * Copyright (c) 2010-2011 The FreeBSD Foundation
6217806Slstewart * All rights reserved.
7217806Slstewart *
8217806Slstewart * This software was developed at the Centre for Advanced Internet
9220560Slstewart * Architectures, Swinburne University of Technology, by David Hayes, made
10220560Slstewart * possible in part by a grant from the Cisco University Research Program Fund
11220560Slstewart * at Community Foundation Silicon Valley.
12217806Slstewart *
13217806Slstewart * Portions of this software were developed at the Centre for Advanced
14217806Slstewart * Internet Architectures, Swinburne University of Technology, Melbourne,
15217806Slstewart * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
16217806Slstewart *
17217806Slstewart * Redistribution and use in source and binary forms, with or without
18217806Slstewart * modification, are permitted provided that the following conditions
19217806Slstewart * are met:
20217806Slstewart * 1. Redistributions of source code must retain the above copyright
21217806Slstewart *    notice, this list of conditions and the following disclaimer.
22217806Slstewart * 2. Redistributions in binary form must reproduce the above copyright
23217806Slstewart *    notice, this list of conditions and the following disclaimer in the
24217806Slstewart *    documentation and/or other materials provided with the distribution.
25217806Slstewart *
26217806Slstewart * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
27217806Slstewart * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28217806Slstewart * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29217806Slstewart * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
30217806Slstewart * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31217806Slstewart * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32217806Slstewart * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33217806Slstewart * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34217806Slstewart * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35217806Slstewart * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36217806Slstewart * SUCH DAMAGE.
37217806Slstewart */
38217806Slstewart
39217806Slstewart#include <sys/cdefs.h>
40217806Slstewart__FBSDID("$FreeBSD$");
41217806Slstewart
42217806Slstewart#include <sys/param.h>
43217806Slstewart#include <sys/kernel.h>
44217806Slstewart#include <sys/mbuf.h>
45217806Slstewart#include <sys/module.h>
46217806Slstewart#include <sys/hhook.h>
47217806Slstewart#include <sys/khelp.h>
48217806Slstewart#include <sys/module_khelp.h>
49217806Slstewart#include <sys/socket.h>
50217806Slstewart#include <sys/sockopt.h>
51217806Slstewart
52217806Slstewart#include <net/vnet.h>
53217806Slstewart
54217806Slstewart#include <netinet/in.h>
55217806Slstewart#include <netinet/in_pcb.h>
56217806Slstewart#include <netinet/tcp_seq.h>
57217806Slstewart#include <netinet/tcp_var.h>
58217806Slstewart
59217806Slstewart#include <netinet/khelp/h_ertt.h>
60217806Slstewart
61217806Slstewart#include <vm/uma.h>
62217806Slstewart
63217806Slstewartuma_zone_t txseginfo_zone;
64217806Slstewart
/*
 * Smoothing factor for the delayed ack guess: the dlyack_rx counter is never
 * incremented beyond this value.
 */
#define	DLYACK_SMOOTH	5

/*
 * Max number of time stamp errors allowed in a session. Once this many have
 * been counted, time stamps are no longer used to match acks to segments.
 */
#define	MAX_TS_ERR	10
70217806Slstewart
/* Helper hook functions registered through ertt_hooks[]. */
static int ertt_packet_measurement_hook(int hhook_type, int hhook_id,
    void *udata, void *ctx_data, void *hdata, struct osd *hosd);
static int ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id,
    void *udata, void *ctx_data, void *hdata, struct osd *hosd);

/* Module load/unload callbacks referenced by ertt_helper. */
static int ertt_mod_init(void);
static int ertt_mod_destroy(void);

/* Constructor/destructor for the per-connection struct ertt. */
static int ertt_uma_ctor(void *mem, int size, void *arg, int flags);
static void ertt_uma_dtor(void *mem, int size, void *arg);
79217806Slstewart
80217806Slstewart/*
81217806Slstewart * Contains information about the sent segment for comparison with the
82217806Slstewart * corresponding ack.
83217806Slstewart */
84217806Slstewartstruct txseginfo {
85217806Slstewart	/* Segment length. */
86217806Slstewart	long		len;
87217806Slstewart	/* Segment sequence number. */
88217806Slstewart	tcp_seq		seq;
89217806Slstewart	/* Time stamp indicating when the packet was sent. */
90217806Slstewart	uint32_t	tx_ts;
91217806Slstewart	/* Last received receiver ts (if the TCP option is used). */
92217806Slstewart	uint32_t	rx_ts;
93217806Slstewart	uint32_t	flags;
94217806Slstewart	TAILQ_ENTRY (txseginfo) txsegi_lnk;
95217806Slstewart};
96217806Slstewart
/* Bit values kept in the flags member of struct txseginfo. */
#define	TXSI_TSO		0x01 /* This entry was sent using TSO. */
#define	TXSI_RTT_MEASURE_START	0x02 /* Entry starts a per RTT measurement. */
#define	TXSI_RX_MEASURE_END	0x04 /* Rx rate is measured until this txsi. */
101217806Slstewart
102217806Slstewartstruct helper ertt_helper = {
103217806Slstewart	.mod_init = ertt_mod_init,
104217806Slstewart	.mod_destroy = ertt_mod_destroy,
105217806Slstewart	.h_flags = HELPER_NEEDS_OSD,
106217806Slstewart	.h_classes = HELPER_CLASS_TCP
107217806Slstewart};
108217806Slstewart
109217806Slstewart/* Define the helper hook info required by ERTT. */
110217806Slstewartstruct hookinfo ertt_hooks[] = {
111217806Slstewart	{
112217806Slstewart		.hook_type = HHOOK_TYPE_TCP,
113217806Slstewart		.hook_id = HHOOK_TCP_EST_IN,
114217806Slstewart		.hook_udata = NULL,
115217806Slstewart		.hook_func = &ertt_packet_measurement_hook
116217806Slstewart	},
117217806Slstewart	{
118217806Slstewart		.hook_type = HHOOK_TYPE_TCP,
119217806Slstewart		.hook_id = HHOOK_TCP_EST_OUT,
120217806Slstewart		.hook_udata = NULL,
121217806Slstewart		.hook_func = &ertt_add_tx_segment_info_hook
122217806Slstewart	}
123217806Slstewart};
124217806Slstewart
/* Values for the mflag argument of marked_packet_rtt(). */
#define	MULTI_ACK		0x01 /* The ack covers more than this txsi. */
#define	OLD_TXSI		0x02 /* Time stamps say this txsi is old. */
#define	CORRECT_ACK		0x04 /* The ack is for this txsi. */
#define	FORCED_MEASUREMENT	0x08 /* Make an RTT measurement regardless. */
130217806Slstewart
131217806Slstewart/*
132217806Slstewart * This fuction measures the RTT of a particular segment/ack pair, or the next
133217806Slstewart * closest if this will yield an inaccurate result due to delayed acking or
134217806Slstewart * other issues.
135217806Slstewart */
136217806Slstewartstatic void inline
137217806Slstewartmarked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp,
138217806Slstewart    uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust,
139217806Slstewart    int mflag)
140217806Slstewart{
141217806Slstewart
142217806Slstewart	/*
143217806Slstewart	 * If we can't measure this one properly due to delayed acking adjust
144217806Slstewart	 * byte counters and flag to measure next txsi. Note that since the
145217806Slstewart	 * marked packet's transmitted bytes are measured we need to subtract the
146217806Slstewart	 * transmitted bytes. Then pretend the next txsi was marked.
147217806Slstewart	 */
148217806Slstewart	if (mflag & (MULTI_ACK|OLD_TXSI)) {
149217806Slstewart		*pmeasurenext = txsi->tx_ts;
150217806Slstewart		*pmeasurenext_len = txsi->len;
151217806Slstewart		*prtt_bytes_adjust += *pmeasurenext_len;
152217806Slstewart	} else {
153217806Slstewart		if (mflag & FORCED_MEASUREMENT) {
154239474Slstewart			e_t->markedpkt_rtt = tcp_ts_getticks() -
155239474Slstewart			    *pmeasurenext + 1;
156217806Slstewart			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt +
157217806Slstewart			    *pmeasurenext_len - *prtt_bytes_adjust;
158217806Slstewart		} else {
159239474Slstewart			e_t->markedpkt_rtt = tcp_ts_getticks() -
160239474Slstewart			    txsi->tx_ts + 1;
161217806Slstewart			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt -
162217806Slstewart			    *prtt_bytes_adjust;
163217806Slstewart		}
164217806Slstewart		e_t->marked_snd_cwnd = tp->snd_cwnd;
165217806Slstewart
166217806Slstewart		/*
167217806Slstewart		 * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to
168217806Slstewart		 * add_tx_segment_info that a new measurement should be started.
169217806Slstewart		 */
170217806Slstewart		e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS;
171217806Slstewart		/*
172217806Slstewart		 * Set ERTT_NEW_MEASUREMENT to tell the congestion control
173217806Slstewart		 * algorithm that a new marked RTT measurement has has been made
174217806Slstewart		 * and is available for use.
175217806Slstewart		 */
176217806Slstewart		e_t->flags |= ERTT_NEW_MEASUREMENT;
177217806Slstewart
178217806Slstewart		if (tp->t_flags & TF_TSO) {
179217806Slstewart			/* Temporarily disable TSO to aid a new measurment. */
180217806Slstewart			tp->t_flags &= ~TF_TSO;
181217806Slstewart			/* Keep track that we've disabled it. */
182217806Slstewart			e_t->flags |= ERTT_TSO_DISABLED;
183217806Slstewart		}
184217806Slstewart	}
185217806Slstewart}
186217806Slstewart
187217806Slstewart/*
188217806Slstewart * Ertt_packet_measurements uses a small amount of state kept on each packet
189217806Slstewart * sent to match incoming acknowledgements. This enables more accurate and
190217806Slstewart * secure round trip time measurements. The resulting measurement is used for
191217806Slstewart * congestion control algorithms which require a more accurate time.
192217806Slstewart * Ertt_packet_measurements is called via the helper hook in tcp_input.c
193217806Slstewart */
194217806Slstewartstatic int
195217806Slstewartertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
196217806Slstewart    void *ctx_data, void *hdata, struct osd *hosd)
197217806Slstewart{
198217806Slstewart	struct ertt *e_t;
199217806Slstewart	struct tcpcb *tp;
200217806Slstewart	struct tcphdr *th;
201217806Slstewart	struct tcpopt *to;
202217806Slstewart	struct tcp_hhook_data *thdp;
203217806Slstewart	struct txseginfo *txsi;
204217806Slstewart	int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust;
205217806Slstewart	uint32_t measurenext, rts;
206217806Slstewart	tcp_seq ack;
207217806Slstewart
208217806Slstewart	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
209217806Slstewart	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
210217806Slstewart
211217806Slstewart	e_t = (struct ertt *)hdata;
212217806Slstewart	thdp = ctx_data;
213217806Slstewart	tp = thdp->tp;
214217806Slstewart	th = thdp->th;
215217806Slstewart	to = thdp->to;
216217806Slstewart	new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
217217806Slstewart	measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
218217806Slstewart	acked = th->th_ack - tp->snd_una;
219217806Slstewart
220217806Slstewart	INP_WLOCK_ASSERT(tp->t_inpcb);
221217806Slstewart
222217806Slstewart	/* Packet has provided new acknowledgements. */
223217806Slstewart	if (acked > 0 || new_sacked_bytes) {
224217806Slstewart		if (acked == 0 && new_sacked_bytes) {
225217806Slstewart			/* Use last sacked data. */
226217806Slstewart			ack = tp->sackhint.last_sack_ack;
227217806Slstewart		} else
228217806Slstewart			ack = th->th_ack;
229217806Slstewart
230217806Slstewart		txsi = TAILQ_FIRST(&e_t->txsegi_q);
231217806Slstewart		while (txsi != NULL) {
232217806Slstewart			rts = 0;
233217806Slstewart
234217806Slstewart			/* Acknowledgement is acking more than this txsi. */
235217806Slstewart			if (SEQ_GT(ack, txsi->seq + txsi->len)) {
236217806Slstewart				if (txsi->flags & TXSI_RTT_MEASURE_START ||
237217806Slstewart				    measurenext) {
238217806Slstewart					marked_packet_rtt(txsi, e_t, tp,
239217806Slstewart					    &measurenext, &measurenext_len,
240217806Slstewart					    &rtt_bytes_adjust, MULTI_ACK);
241217806Slstewart				}
242217806Slstewart				TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
243217806Slstewart				uma_zfree(txseginfo_zone, txsi);
244217806Slstewart				txsi = TAILQ_FIRST(&e_t->txsegi_q);
245217806Slstewart				continue;
246217806Slstewart			}
247217806Slstewart
248217806Slstewart			/*
249217806Slstewart			 * Guess if delayed acks are being used by the receiver.
250217806Slstewart			 *
251217806Slstewart			 * XXXDH: A simple heuristic that could be improved
252217806Slstewart			 */
253217806Slstewart			if (!new_sacked_bytes) {
254217806Slstewart				if (acked > tp->t_maxseg) {
255217806Slstewart					e_t->dlyack_rx +=
256217806Slstewart					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
257217806Slstewart					    1 : 0;
258217806Slstewart					multiack = 1;
259217806Slstewart				} else if (acked > txsi->len) {
260217806Slstewart					multiack = 1;
261217806Slstewart					e_t->dlyack_rx +=
262217806Slstewart					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
263217806Slstewart					    1 : 0;
264217806Slstewart				} else if (acked == tp->t_maxseg ||
265217806Slstewart					   acked == txsi->len) {
266217806Slstewart					e_t->dlyack_rx -=
267217806Slstewart					    (e_t->dlyack_rx > 0) ? 1 : 0;
268217806Slstewart				}
269217806Slstewart				/* Otherwise leave dlyack_rx the way it was. */
270217806Slstewart			}
271217806Slstewart
272217806Slstewart			/*
273217806Slstewart			 * Time stamps are only to help match the txsi with the
274217806Slstewart			 * received acknowledgements.
275217806Slstewart			 */
276217806Slstewart			if (e_t->timestamp_errors < MAX_TS_ERR &&
277217806Slstewart			    (to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
278217806Slstewart				/*
279217806Slstewart				 * Note: All packets sent with the offload will
280217806Slstewart				 * have the same time stamp. If we are sending
281217806Slstewart				 * on a fast interface and the t_maxseg is much
282217806Slstewart				 * smaller than one tick, this will be fine. The
283217806Slstewart				 * time stamp would be the same whether we were
284217806Slstewart				 * using tso or not. However, if the interface
285217806Slstewart				 * is slow, this will cause problems with the
286217806Slstewart				 * calculations. If the interface is slow, there
287217806Slstewart				 * is not reason to be using tso, and it should
288217806Slstewart				 * be turned off.
289217806Slstewart				 */
290217806Slstewart				/*
291217806Slstewart				 * If there are too many time stamp errors, time
292217806Slstewart				 * stamps won't be trusted
293217806Slstewart				 */
294217806Slstewart				rts = to->to_tsecr;
295217806Slstewart				/* Before this packet. */
296217806Slstewart				if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts))
297217806Slstewart					/* When delayed acking is used, the
298217806Slstewart					 * reflected time stamp is of the first
299217806Slstewart					 * packet and thus may be before
300217806Slstewart					 * txsi->tx_ts.
301217806Slstewart					 */
302217806Slstewart					break;
303217806Slstewart				if (TSTMP_GT(rts, txsi->tx_ts)) {
304217806Slstewart					/*
305217806Slstewart					 * If reflected time stamp is later than
306217806Slstewart					 * tx_tsi, then this txsi is old.
307217806Slstewart					 */
308217806Slstewart					if (txsi->flags & TXSI_RTT_MEASURE_START
309217806Slstewart					    || measurenext) {
310217806Slstewart						marked_packet_rtt(txsi, e_t, tp,
311217806Slstewart						    &measurenext, &measurenext_len,
312217806Slstewart						    &rtt_bytes_adjust, OLD_TXSI);
313217806Slstewart					}
314217806Slstewart					TAILQ_REMOVE(&e_t->txsegi_q, txsi,
315217806Slstewart					    txsegi_lnk);
316217806Slstewart					uma_zfree(txseginfo_zone, txsi);
317217806Slstewart					txsi = TAILQ_FIRST(&e_t->txsegi_q);
318217806Slstewart					continue;
319217806Slstewart				}
320217806Slstewart				if (rts == txsi->tx_ts &&
321217806Slstewart				    TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
322217806Slstewart					/*
323217806Slstewart					 * Segment received before sent!
324217806Slstewart					 * Something is wrong with the received
325217806Slstewart					 * timestamps so increment errors. If
326217806Slstewart					 * this keeps up we will ignore
327217806Slstewart					 * timestamps.
328217806Slstewart					 */
329217806Slstewart					e_t->timestamp_errors++;
330217806Slstewart				}
331217806Slstewart			}
332217806Slstewart			/*
333217806Slstewart			 * Acknowledging a sequence number before this txsi.
334217806Slstewart			 * If it is an old txsi that may have had the same seq
335217806Slstewart			 * numbers, it should have been removed if time stamps
336217806Slstewart			 * are being used.
337217806Slstewart			 */
338217806Slstewart			if (SEQ_LEQ(ack, txsi->seq))
339217806Slstewart				break; /* Before first packet in txsi. */
340217806Slstewart
341217806Slstewart			/*
342217806Slstewart			 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len
343217806Slstewart			 * past this point.
344217806Slstewart			 *
345217806Slstewart			 * If delayed acks are being used, an acknowledgement
346217806Slstewart			 * for a single segment will have been delayed by the
347217806Slstewart			 * receiver and will yield an inaccurate measurement. In
348217806Slstewart			 * this case, we only make the measurement if more than
349217806Slstewart			 * one segment is being acknowledged or sack is
350217806Slstewart			 * currently being used.
351217806Slstewart			 */
352217806Slstewart			if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
353217806Slstewart				/* Make an accurate new measurement. */
354239474Slstewart				e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;
355217806Slstewart
356217806Slstewart				if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
357217806Slstewart					e_t->minrtt = e_t->rtt;
358217806Slstewart
359217806Slstewart				if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
360217806Slstewart					e_t->maxrtt = e_t->rtt;
361217806Slstewart			}
362217806Slstewart
363217806Slstewart			if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
364217806Slstewart				marked_packet_rtt(txsi, e_t, tp,
365217806Slstewart				    &measurenext, &measurenext_len,
366217806Slstewart				    &rtt_bytes_adjust, CORRECT_ACK);
367217806Slstewart
368217806Slstewart			if (txsi->flags & TXSI_TSO) {
369217806Slstewart				txsi->len -= acked;
370217806Slstewart				if (txsi->len > 0) {
371217806Slstewart					/*
372217806Slstewart					 * This presumes ack for first bytes in
373217806Slstewart					 * txsi, this may not be true but it
374217806Slstewart					 * shouldn't cause problems for the
375217806Slstewart					 * timing.
376217806Slstewart					 *
377217806Slstewart					 * We remeasure RTT even though we only
378217806Slstewart					 * have a single txsi. The rationale
379217806Slstewart					 * behind this is that it is better to
380217806Slstewart					 * have a slightly inaccurate
381217806Slstewart					 * measurement than no additional
382217806Slstewart					 * measurement for the rest of the bulk
383217806Slstewart					 * transfer. Since TSO is only used on
384217806Slstewart					 * high speed interface cards, so the
385217806Slstewart					 * packets should be transmitted at line
386217806Slstewart					 * rate back to back with little
387217806Slstewart					 * difference in transmission times (in
388217806Slstewart					 * ticks).
389217806Slstewart					 */
390217806Slstewart					txsi->seq += acked;
391217806Slstewart					/*
392217806Slstewart					 * Reset txsi measure flag so we don't
393217806Slstewart					 * use it for another RTT measurement.
394217806Slstewart					 */
395217806Slstewart					txsi->flags &= ~TXSI_RTT_MEASURE_START;
396217806Slstewart					/*
397217806Slstewart					 * There is still more data to be acked
398217806Slstewart					 * from tso bulk transmission, so we
399217806Slstewart					 * won't remove it from the TAILQ yet.
400217806Slstewart					 */
401217806Slstewart					break;
402217806Slstewart				}
403217806Slstewart			}
404217806Slstewart
405217806Slstewart			TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
406217806Slstewart			uma_zfree(txseginfo_zone, txsi);
407217806Slstewart			break;
408217806Slstewart		}
409217806Slstewart
410217806Slstewart		if (measurenext) {
411217806Slstewart			/*
412217806Slstewart			 * We need to do a RTT measurement. It won't be the best
413217806Slstewart			 * if we do it here.
414217806Slstewart			 */
415217806Slstewart			marked_packet_rtt(txsi, e_t, tp,
416217806Slstewart			    &measurenext, &measurenext_len,
417217806Slstewart			    &rtt_bytes_adjust, FORCED_MEASUREMENT);
418217806Slstewart		}
419217806Slstewart	}
420217806Slstewart
421217806Slstewart	return (0);
422217806Slstewart}
423217806Slstewart
424217806Slstewart/*
425217806Slstewart * Add information about a transmitted segment to a list.
426217806Slstewart * This is called via the helper hook in tcp_output.c
427217806Slstewart */
428217806Slstewartstatic int
429217806Slstewartertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata,
430217806Slstewart    void *ctx_data, void *hdata, struct osd *hosd)
431217806Slstewart{
432217806Slstewart	struct ertt *e_t;
433217806Slstewart	struct tcpcb *tp;
434217806Slstewart	struct tcphdr *th;
435217806Slstewart	struct tcpopt *to;
436217806Slstewart	struct tcp_hhook_data *thdp;
437217806Slstewart	struct txseginfo *txsi;
438217806Slstewart	long len;
439217806Slstewart	int tso;
440217806Slstewart
441217806Slstewart	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
442217806Slstewart	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
443217806Slstewart
444217806Slstewart	e_t = (struct ertt *)hdata;
445217806Slstewart	thdp = ctx_data;
446217806Slstewart	tp = thdp->tp;
447217806Slstewart	th = thdp->th;
448217806Slstewart	to = thdp->to;
449217806Slstewart	len = thdp->len;
450217806Slstewart	tso = thdp->tso;
451217806Slstewart
452217806Slstewart	INP_WLOCK_ASSERT(tp->t_inpcb);
453217806Slstewart
454217806Slstewart	if (len > 0) {
455217806Slstewart		txsi = uma_zalloc(txseginfo_zone, M_NOWAIT);
456217806Slstewart		if (txsi != NULL) {
457217806Slstewart			/* Construct txsi setting the necessary flags. */
458217806Slstewart			txsi->flags = 0; /* Needs to be initialised. */
459217806Slstewart			txsi->seq = ntohl(th->th_seq);
460217806Slstewart			txsi->len = len;
461217806Slstewart			if (tso)
462217806Slstewart				txsi->flags |= TXSI_TSO;
463217806Slstewart			else if (e_t->flags & ERTT_TSO_DISABLED) {
464217806Slstewart				tp->t_flags |= TF_TSO;
465217806Slstewart				e_t->flags &= ~ERTT_TSO_DISABLED;
466217806Slstewart			}
467217806Slstewart
468217806Slstewart			if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) {
469217806Slstewart				e_t->bytes_tx_in_rtt += len;
470217806Slstewart			} else {
471217806Slstewart				txsi->flags |= TXSI_RTT_MEASURE_START;
472217806Slstewart				e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS;
473217806Slstewart				e_t->bytes_tx_in_rtt = len;
474217806Slstewart			}
475217806Slstewart
476217806Slstewart			if (((tp->t_flags & TF_NOOPT) == 0) &&
477217806Slstewart			    (to->to_flags & TOF_TS)) {
478217806Slstewart				txsi->tx_ts = ntohl(to->to_tsval) -
479217806Slstewart				    tp->ts_offset;
480217806Slstewart				txsi->rx_ts = ntohl(to->to_tsecr);
481217806Slstewart			} else {
482239474Slstewart				txsi->tx_ts = tcp_ts_getticks();
483217806Slstewart				txsi->rx_ts = 0; /* No received time stamp. */
484217806Slstewart			}
485217806Slstewart			TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk);
486217806Slstewart		}
487217806Slstewart	}
488217806Slstewart
489217806Slstewart	return (0);
490217806Slstewart}
491217806Slstewart
492217806Slstewartstatic int
493217806Slstewartertt_mod_init(void)
494217806Slstewart{
495217806Slstewart
496217806Slstewart	txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo),
497217806Slstewart	    NULL, NULL, NULL, NULL, 0, 0);
498217806Slstewart
499217806Slstewart	return (0);
500217806Slstewart}
501217806Slstewart
502217806Slstewartstatic int
503217806Slstewartertt_mod_destroy(void)
504217806Slstewart{
505217806Slstewart
506217806Slstewart	uma_zdestroy(txseginfo_zone);
507217806Slstewart
508217806Slstewart	return (0);
509217806Slstewart}
510217806Slstewart
511217806Slstewartstatic int
512217806Slstewartertt_uma_ctor(void *mem, int size, void *arg, int flags)
513217806Slstewart{
514217806Slstewart	struct ertt *e_t;
515217806Slstewart
516217806Slstewart	e_t = mem;
517217806Slstewart
518217806Slstewart	TAILQ_INIT(&e_t->txsegi_q);
519217806Slstewart	e_t->timestamp_errors = 0;
520217806Slstewart	e_t->minrtt = 0;
521217806Slstewart	e_t->maxrtt = 0;
522217806Slstewart	e_t->rtt = 0;
523217806Slstewart	e_t->flags = 0;
524217806Slstewart	e_t->dlyack_rx = 0;
525217806Slstewart	e_t->bytes_tx_in_rtt = 0;
526217806Slstewart	e_t->markedpkt_rtt = 0;
527217806Slstewart
528217806Slstewart	return (0);
529217806Slstewart}
530217806Slstewart
531217806Slstewartstatic void
532217806Slstewartertt_uma_dtor(void *mem, int size, void *arg)
533217806Slstewart{
534217806Slstewart	struct ertt *e_t;
535217806Slstewart	struct txseginfo *n_txsi, *txsi;
536217806Slstewart
537217806Slstewart	e_t = mem;
538217806Slstewart	txsi = TAILQ_FIRST(&e_t->txsegi_q);
539217806Slstewart	while (txsi != NULL) {
540217806Slstewart		n_txsi = TAILQ_NEXT(txsi, txsegi_lnk);
541217806Slstewart		uma_zfree(txseginfo_zone, txsi);
542217806Slstewart		txsi = n_txsi;
543217806Slstewart	}
544217806Slstewart}
545217806Slstewart
546217806SlstewartKHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt),
547217806Slstewart    ertt_uma_ctor, ertt_uma_dtor);
548