1/*-
2 * Copyright (c) 2017 Netflix, Inc.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 */
26#include <sys/cdefs.h>
27__FBSDID("$FreeBSD$");
28#include <sys/types.h>
29#include <sys/queue.h>
30#include <sys/socket.h>
31#include <sys/mbuf.h>
32#include <sys/sockopt.h>
33#include <netinet/tcp.h>
34#include <netinet/tcp_var.h>
35#include <netinet/tcp_seq.h>
36#ifndef _KERNEL
37#include <stdio.h>
38#include <unistd.h>
39#include <string.h>
40#include <strings.h>
41#include <stdlib.h>
42#include <limits.h>
43#include <getopt.h>
44#endif
45#include "sack_filter.h"
46
47/*
48 * Sack filter is used to filter out sacks
49 * that have already been processed. The idea
50 * is pretty simple really, consider two sacks
51 *
52 * SACK 1
53 *   cum-ack A
54 *     sack B - C
55 * SACK 2
56 *   cum-ack A
57 *     sack D - E
58 *     sack B - C
59 *
60 * The previous sack information (B-C) is repeated
61 * in SACK 2. If the receiver gets SACK 1 and then
62 * SACK 2 then any work associated with B-C as already
63 * been completed. This only effects where we may have
64 * (as in bbr or rack) cases where we walk a linked list.
65 *
66 * Now the utility trys to keep everything in a single
67 * cache line. This means that its not perfect and
68 * it could be that so big of sack's come that a
69 * "remembered" processed sack falls off the list and
70 * so gets re-processed. Thats ok, it just means we
71 * did some extra work. We could of course take more
72 * cache line hits by expanding the size of this
73 * structure, but then that would cost more.
74 */
75
76#ifndef _KERNEL
77int detailed_dump = 0;
78uint64_t cnt_skipped_oldsack = 0;
79uint64_t cnt_used_oldsack = 0;
80int highest_used=0;
81int over_written=0;
82int empty_avail=0;
83int no_collapse = 0;
84FILE *out = NULL;
85FILE *in = NULL;
86#endif
87
88#define sack_blk_used(sf, i) ((1 << i) & sf->sf_bits)
89#define sack_blk_set(sf, i) ((1 << i) | sf->sf_bits)
90#define sack_blk_clr(sf, i) (~(1 << i) & sf->sf_bits)
91
92#ifndef _KERNEL
93static
94#endif
95void
96sack_filter_clear(struct sack_filter *sf, tcp_seq seq)
97{
98	sf->sf_ack = seq;
99	sf->sf_bits = 0;
100	sf->sf_cur = 0;
101	sf->sf_used = 0;
102}
103/*
104 * Given a previous sack filter block, filter out
105 * any entries where the cum-ack moves over them
106 * fully or partially.
107 */
108static void
109sack_filter_prune(struct sack_filter *sf, tcp_seq th_ack)
110{
111	int32_t i;
112	/* start with the oldest */
113	for (i = 0; i < SACK_FILTER_BLOCKS; i++) {
114		if (sack_blk_used(sf, i)) {
115			if (SEQ_GT(th_ack, sf->sf_blks[i].end)) {
116				/* This block is consumed */
117				sf->sf_bits = sack_blk_clr(sf, i);
118				sf->sf_used--;
119			} else if (SEQ_GT(th_ack, sf->sf_blks[i].start)) {
120				/* Some of it is acked */
121				sf->sf_blks[i].start = th_ack;
122				/* We could in theory break here, but
123				 * there are some broken implementations
124				 * that send multiple blocks. We want
125				 * to catch them all with similar seq's.
126				 */
127			}
128		}
129	}
130	sf->sf_ack = th_ack;
131}
132
133/*
134 * Return true if you find that
135 * the sackblock b is on the score
136 * board. Update it along the way
137 * if part of it is on the board.
138 */
139static int32_t
140is_sack_on_board(struct sack_filter *sf, struct sackblk *b)
141{
142	int32_t i, cnt;
143	for (i = sf->sf_cur, cnt=0; cnt < SACK_FILTER_BLOCKS; cnt++) {
144		if (sack_blk_used(sf, i)) {
145			if (SEQ_LT(b->start, sf->sf_ack)) {
146				/* Behind cum-ack update */
147				b->start = sf->sf_ack;
148			}
149			if (SEQ_LT(b->end, sf->sf_ack)) {
150				/* End back behind too */
151				b->end = sf->sf_ack;
152			}
153			if (b->start == b->end)
154				return(1);
155			/* Jonathans Rule 1 */
156			if (SEQ_LEQ(sf->sf_blks[i].start, b->start) &&
157			    SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
158				/**
159				 * Our board has this entirely in
160				 * whole or in part:
161				 *
162				 * board  |-------------|
163				 * sack   |-------------|
164				 * <or>
165				 * board  |-------------|
166				 * sack       |----|
167				 *
168				 */
169				return(1);
170			}
171			/* Jonathans Rule 2 */
172			if(SEQ_LT(sf->sf_blks[i].end, b->start)) {
173				/**
174				 * Not near each other:
175				 *
176				 * board   |---|
177				 * sack           |---|
178				 */
179				goto nxt_blk;
180			}
181			/* Jonathans Rule 3 */
182			if (SEQ_GT(sf->sf_blks[i].start, b->end)) {
183				/**
184				 * Not near each other:
185				 *
186				 * board         |---|
187				 * sack  |---|
188				 */
189				goto nxt_blk;
190			}
191			if (SEQ_LEQ(sf->sf_blks[i].start, b->start)) {
192				/**
193				 * The board block partial meets:
194				 *
195				 *  board   |--------|
196				 *  sack        |----------|
197				 *    <or>
198				 *  board   |--------|
199				 *  sack    |--------------|
200				 *
201				 * up with this one (we have part of it).
202				 * 1) Update the board block to the new end
203				 *      and
204				 * 2) Update the start of this block to my end.
205				 */
206				b->start = sf->sf_blks[i].end;
207				sf->sf_blks[i].end = b->end;
208				goto nxt_blk;
209			}
210			if (SEQ_GEQ(sf->sf_blks[i].end, b->end)) {
211				/**
212				 * The board block partial meets:
213				 *
214				 *  board       |--------|
215				 *  sack  |----------|
216				 *     <or>
217				 *  board       |----|
218				 *  sack  |----------|
219				 * 1) Update the board block to the new start
220				 *      and
221				 * 2) Update the start of this block to my end.
222				 */
223				b->end = sf->sf_blks[i].start;
224				sf->sf_blks[i].start = b->start;
225				goto nxt_blk;
226			}
227		}
228	nxt_blk:
229		i++;
230		i %= SACK_FILTER_BLOCKS;
231	}
232	/* Did we totally consume it in pieces? */
233	if (b->start != b->end)
234		return(0);
235	else
236		return(1);
237}
238
239static int32_t
240sack_filter_old(struct sack_filter *sf, struct sackblk *in, int  numblks)
241{
242	int32_t num, i;
243	struct sackblk blkboard[TCP_MAX_SACK];
244	/*
245	 * An old sack has arrived. It may contain data
246	 * we do not have. We might not have it since
247	 * we could have had a lost ack <or> we might have the
248	 * entire thing on our current board. We want to prune
249	 * off anything we have. With this function though we
250	 * won't add to the board.
251	 */
252	for( i = 0, num = 0; i<numblks; i++ ) {
253		if (is_sack_on_board(sf, &in[i])) {
254#ifndef _KERNEL
255			cnt_skipped_oldsack++;
256#endif
257			continue;
258		}
259		/* Did not find it (or found only
260		 * a piece of it). Copy it to
261		 * our outgoing board.
262		 */
263		memcpy(&blkboard[num], &in[i], sizeof(struct sackblk));
264#ifndef _KERNEL
265		cnt_used_oldsack++;
266#endif
267		num++;
268	}
269	if (num) {
270		memcpy(in, blkboard, (num * sizeof(struct sackblk)));
271	}
272	return (num);
273}
274
275/*
276 * Given idx its used but there is space available
277 * move the entry to the next free slot
278 */
279static void
280sack_move_to_empty(struct sack_filter *sf, uint32_t idx)
281{
282	int32_t i, cnt;
283
284	i = (idx + 1) % SACK_FILTER_BLOCKS;
285	for (cnt=0; cnt <(SACK_FILTER_BLOCKS-1); cnt++) {
286		if (sack_blk_used(sf, i) == 0) {
287			memcpy(&sf->sf_blks[i], &sf->sf_blks[idx], sizeof(struct sackblk));
288			sf->sf_bits = sack_blk_clr(sf, idx);
289			sf->sf_bits = sack_blk_set(sf, i);
290			return;
291		}
292		i++;
293		i %= SACK_FILTER_BLOCKS;
294	}
295}
296
297static int32_t
298sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack)
299{
300	struct sackblk blkboard[TCP_MAX_SACK];
301	int32_t num, i;
302	/*
303	 * First lets trim the old and possibly
304	 * throw any away we have.
305	 */
306	for(i=0, num=0; i<numblks; i++) {
307		if (is_sack_on_board(sf, &in[i]))
308			continue;
309		memcpy(&blkboard[num], &in[i], sizeof(struct sackblk));
310		num++;
311	}
312	if (num == 0)
313		return(num);
314
315	/* Now what we are left is either
316	 * completely merged on to the board
317	 * from the above steps, or are new
318	 * and need to be added to the board
319	 * with the last one updated to current.
320	 *
321	 * First copy it out we want to return that
322	 * to our caller for processing.
323	 */
324	memcpy(in, blkboard, (num * sizeof(struct sackblk)));
325	numblks = num;
326	/* Now go through and add to our board as needed */
327	for(i=(num-1); i>=0; i--) {
328		if (is_sack_on_board(sf, &blkboard[i]))
329			continue;
330		/* Add this guy its not listed */
331		sf->sf_cur++;
332		sf->sf_cur %= SACK_FILTER_BLOCKS;
333		if ((sack_blk_used(sf, sf->sf_cur)) &&
334		    (sf->sf_used < SACK_FILTER_BLOCKS)) {
335			sack_move_to_empty(sf, sf->sf_cur);
336		}
337#ifndef _KERNEL
338		if (sack_blk_used(sf, sf->sf_cur)) {
339			over_written++;
340			if (sf->sf_used < SACK_FILTER_BLOCKS)
341				empty_avail++;
342		}
343#endif
344		memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
345		if (sack_blk_used(sf, sf->sf_cur) == 0) {
346			sf->sf_used++;
347#ifndef _KERNEL
348			if (sf->sf_used > highest_used)
349				highest_used = sf->sf_used;
350#endif
351			sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
352		}
353	}
354	return(numblks);
355}
356
357/*
358 * Given a sack block on the board (the skip index) see if
359 * any other used entries overlap or meet, if so return the index.
360 */
361static int32_t
362sack_blocks_overlap_or_meet(struct sack_filter *sf, struct sackblk *sb, uint32_t skip)
363{
364	int32_t i;
365
366	for(i=0; i<SACK_FILTER_BLOCKS; i++) {
367		if (sack_blk_used(sf, i) == 0)
368			continue;
369		if (i == skip)
370			continue;
371		if (SEQ_GEQ(sf->sf_blks[i].end, sb->start) &&
372		    SEQ_LEQ(sf->sf_blks[i].end, sb->end) &&
373		    SEQ_LEQ(sf->sf_blks[i].start, sb->start)) {
374			/**
375			 * The two board blocks meet:
376			 *
377			 *  board1   |--------|
378			 *  board2       |----------|
379			 *    <or>
380			 *  board1   |--------|
381			 *  board2   |--------------|
382			 *    <or>
383			 *  board1   |--------|
384			 *  board2   |--------|
385			 */
386			return(i);
387		}
388		if (SEQ_LEQ(sf->sf_blks[i].start, sb->end) &&
389		    SEQ_GEQ(sf->sf_blks[i].start, sb->start) &&
390		    SEQ_GEQ(sf->sf_blks[i].end, sb->end)) {
391			/**
392			 * The board block partial meets:
393			 *
394			 *  board       |--------|
395			 *  sack  |----------|
396			 *     <or>
397			 *  board       |----|
398			 *  sack  |----------|
399			 * 1) Update the board block to the new start
400			 *      and
401			 * 2) Update the start of this block to my end.
402			 */
403			return(i);
404		}
405	}
406	return (-1);
407}
408
409/*
410 * Collapse entry src into entry into
411 * and free up the src entry afterwards.
412 */
413static void
414sack_collapse(struct sack_filter *sf, int32_t src, int32_t into)
415{
416	if (SEQ_LT(sf->sf_blks[src].start, sf->sf_blks[into].start)) {
417		/* src has a lower starting point */
418		sf->sf_blks[into].start = sf->sf_blks[src].start;
419	}
420	if (SEQ_GT(sf->sf_blks[src].end, sf->sf_blks[into].end)) {
421		/* src has a higher ending point */
422		sf->sf_blks[into].end = sf->sf_blks[src].end;
423	}
424	sf->sf_bits = sack_blk_clr(sf, src);
425	sf->sf_used--;
426}
427
428static void
429sack_board_collapse(struct sack_filter *sf)
430{
431	int32_t i, j, i_d, j_d;
432
433	for(i=0; i<SACK_FILTER_BLOCKS; i++) {
434		if (sack_blk_used(sf, i) == 0)
435			continue;
436		/*
437		 * Look at all other blocks but this guy
438		 * to see if they overlap. If so we collapse
439		 * the two blocks together.
440		 */
441		j = sack_blocks_overlap_or_meet(sf, &sf->sf_blks[i], i);
442		if (j == -1) {
443			/* No overlap */
444			continue;
445		}
446		/*
447		 * Ok j and i overlap with each other, collapse the
448		 * one out furthest away from the current position.
449		 */
450		if (sf->sf_cur > i)
451			i_d = sf->sf_cur - i;
452		else
453			i_d = i - sf->sf_cur;
454		if (sf->sf_cur > j)
455			j_d = sf->sf_cur - j;
456		else
457			j_d = j - sf->sf_cur;
458		if (j_d > i_d) {
459			sack_collapse(sf, j, i);
460		} else
461			sack_collapse(sf, i, j);
462	}
463}
464
465#ifndef _KERNEL
466static
467#endif
468int
469sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack)
470{
471	int32_t i, ret;
472
473	if (numblks > TCP_MAX_SACK) {
474		panic("sf:%p sb:%p Impossible number of sack blocks %d > 4\n",
475		      sf, in,
476		      numblks);
477		return(numblks);
478	}
479	if ((sf->sf_used == 0) && numblks) {
480		/*
481		 * We are brand new add the blocks in
482		 * reverse order. Note we can see more
483		 * than one in new, since ack's could be lost.
484		 */
485		sf->sf_ack = th_ack;
486		for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) {
487			memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk));
488			sf->sf_bits = sack_blk_set(sf, sf->sf_cur);
489			sf->sf_cur++;
490			sf->sf_cur %= SACK_FILTER_BLOCKS;
491			sf->sf_used++;
492#ifndef _KERNEL
493			if (sf->sf_used > highest_used)
494				highest_used = sf->sf_used;
495#endif
496		}
497		if (sf->sf_cur)
498			sf->sf_cur--;
499		return(numblks);
500	}
501	if (SEQ_GT(th_ack, sf->sf_ack)) {
502		sack_filter_prune(sf, th_ack);
503	}
504	if (numblks) {
505		if (SEQ_GEQ(th_ack, sf->sf_ack)) {
506			ret = sack_filter_new(sf, in, numblks, th_ack);
507		} else {
508			ret = sack_filter_old(sf, in, numblks);
509		}
510	} else
511		ret = 0;
512#ifndef _KERNEL
513	if ((sf->sf_used > 1) && (no_collapse == 0))
514		sack_board_collapse(sf);
515
516#else
517	if (sf->sf_used > 1)
518		sack_board_collapse(sf);
519
520#endif
521	return (ret);
522}
523
524#ifndef _KERNEL
525uint64_t saved=0;
526uint64_t tot_sack_blks=0;
527
528static void
529sack_filter_dump(FILE *out, struct sack_filter *sf)
530{
531	int i;
532	fprintf(out, "	sf_ack:%u sf_bits:0x%x c:%d used:%d\n",
533		sf->sf_ack, sf->sf_bits,
534		sf->sf_cur, sf->sf_used);
535
536	for(i=0; i<SACK_FILTER_BLOCKS; i++) {
537		if (sack_blk_used(sf, i)) {
538			fprintf(out, "Entry:%d start:%u end:%u\n", i,
539			       sf->sf_blks[i].start,
540			       sf->sf_blks[i].end);
541		}
542	}
543}
544
545int
546main(int argc, char **argv)
547{
548	char buffer[512];
549	struct sackblk blks[TCP_MAX_SACK];
550	FILE *err;
551	tcp_seq th_ack, snd_una;
552	struct sack_filter sf;
553	int32_t numblks,i;
554	int snd_una_set=0;
555	double a, b, c;
556	int invalid_sack_print = 0;
557	uint32_t chg_remembered=0;
558	uint32_t sack_chg=0;
559	char line_buf[10][256];
560	int line_buf_at=0;
561
562	in = stdin;
563	out = stdout;
564	while ((i = getopt(argc, argv, "ndIi:o:?h")) != -1) {
565		switch (i) {
566		case 'n':
567			no_collapse = 1;
568			break;
569		case 'd':
570			detailed_dump = 1;
571			break;
572		case'I':
573			invalid_sack_print = 1;
574			break;
575		case 'i':
576			in = fopen(optarg, "r");
577			if (in == NULL) {
578				fprintf(stderr, "Fatal error can't open %s for input\n", optarg);
579				exit(-1);
580			}
581			break;
582		case 'o':
583			out = fopen(optarg, "w");
584			if (out == NULL) {
585				fprintf(stderr, "Fatal error can't open %s for output\n", optarg);
586				exit(-1);
587			}
588			break;
589		default:
590		case '?':
591		case 'h':
592			fprintf(stderr, "Use %s [ -i infile -o outfile -I]\n", argv[0]);
593			return(0);
594			break;
595		};
596	}
597	sack_filter_clear(&sf, 0);
598	memset(buffer, 0, sizeof(buffer));
599	memset(blks, 0, sizeof(blks));
600	numblks = 0;
601	fprintf(out, "************************************\n");
602	while (fgets(buffer, sizeof(buffer), in) != NULL) {
603		sprintf(line_buf[line_buf_at], "%s", buffer);
604		line_buf_at++;
605		if (strncmp(buffer, "QUIT", 4) == 0) {
606			break;
607		} else if (strncmp(buffer, "DONE", 4) == 0) {
608			int nn, ii;
609			if (numblks) {
610				uint32_t szof, tot_chg;
611				for(ii=0; ii<line_buf_at; ii++) {
612					fprintf(out, "%s", line_buf[ii]);
613				}
614				fprintf(out, "------------------------------------\n");
615				nn = sack_filter_blks(&sf, blks, numblks, th_ack);
616				saved += numblks - nn;
617				tot_sack_blks += numblks;
618				fprintf(out, "ACK:%u\n", sf.sf_ack);
619				for(ii=0, tot_chg=0; ii<nn; ii++) {
620					szof = blks[ii].end - blks[ii].start;
621					tot_chg += szof;
622					fprintf(out, "SACK:%u:%u [%u]\n",
623					       blks[ii].start,
624						blks[ii].end, szof);
625				}
626				fprintf(out,"************************************\n");
627				chg_remembered = tot_chg;
628				if (detailed_dump) {
629					sack_filter_dump(out, &sf);
630					fprintf(out,"************************************\n");
631				}
632			}
633			memset(blks, 0, sizeof(blks));
634			memset(line_buf, 0, sizeof(line_buf));
635			line_buf_at=0;
636			numblks = 0;
637		} else if (strncmp(buffer, "CHG:", 4) == 0) {
638			sack_chg = strtoul(&buffer[4], NULL, 0);
639			if ((sack_chg != chg_remembered) &&
640			    (sack_chg > chg_remembered)){
641				fprintf(out,"***WARNING WILL RODGERS DANGER!! sack_chg:%u last:%u\n",
642					sack_chg, chg_remembered
643					);
644			}
645			sack_chg = chg_remembered = 0;
646		} else if (strncmp(buffer, "RXT", 3) == 0) {
647			sack_filter_clear(&sf, snd_una);
648		} else if (strncmp(buffer, "ACK:", 4) == 0) {
649			th_ack = strtoul(&buffer[4], NULL, 0);
650			if (snd_una_set == 0) {
651				snd_una = th_ack;
652				snd_una_set = 1;
653			} else if (SEQ_GT(th_ack, snd_una)) {
654				snd_una = th_ack;
655			}
656		} else if (strncmp(buffer, "EXIT", 4) == 0) {
657			sack_filter_clear(&sf, snd_una);
658			sack_chg = chg_remembered = 0;
659		} else if (strncmp(buffer, "SACK:", 5) == 0) {
660			char *end=NULL;
661			uint32_t start;
662			uint32_t endv;
663			start = strtoul(&buffer[5], &end, 0);
664			if (end) {
665				endv = strtoul(&end[1], NULL, 0);
666			} else {
667				fprintf(out, "--Sack invalid skip 0 start:%u : ??\n", start);
668				continue;
669			}
670			if (SEQ_LT(endv, start)) {
671				fprintf(out, "--Sack invalid skip 1 endv:%u < start:%u\n", endv, start);
672				continue;
673			}
674			if (numblks == TCP_MAX_SACK) {
675				fprintf(out, "--Exceeded max %d\n", numblks);
676				exit(0);
677			}
678			blks[numblks].start = start;
679			blks[numblks].end = endv;
680			numblks++;
681		}
682		memset(buffer, 0, sizeof(buffer));
683	}
684	if (in != stdin) {
685		fclose(in);
686	}
687	if (out != stdout) {
688		fclose(out);
689	}
690	a = saved * 100.0;
691	b = tot_sack_blks * 1.0;
692	if (b > 0.0)
693		c = a/b;
694	else
695		c = 0.0;
696	if (out != stdout)
697		err = stdout;
698	else
699		err = stderr;
700	fprintf(err, "Saved %lu sack blocks out of %lu (%2.3f%%) old_skip:%lu old_usd:%lu high_cnt:%d ow:%d ea:%d\n",
701		saved, tot_sack_blks, c, cnt_skipped_oldsack, cnt_used_oldsack, highest_used, over_written, empty_avail);
702	return(0);
703}
704#endif
705