/*-
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2018-2020
 *	Netflix Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
/**
 * Author: Randall Stewart <rrs@netflix.com>
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/eventhandler.h>
#include <sys/mutex.h>
#include <sys/ck.h>
#include <net/if.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#define TCPSTATES		/* for logging */
#include <netinet/tcp_var.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_ratelimit.h>
#ifndef USECS_IN_SECOND
#define USECS_IN_SECOND 1000000
#endif
/*
 * For the purposes of each send, what is the size
 * of an ethernet frame.
 */
MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
#ifdef RATELIMIT

/*
 * The following preferred table will seem weird to
 * the casual viewer. Why do we not have any rates below
 * 1Mbps? Why do we have a rate at 1.44Mbps called common?
 * Why do the rates cluster in the 1-100Mbps range more
 * than others? Why does the table jump around at the beginning
 * and then rise more consistently?
 *
 * Let me try to answer those questions. A lot of
 * this is dependent on the hardware. We have three basic
 * supporters of rate limiting:
 *
 * Chelsio - Supporting 16 configurable rates.
 * Mlx  - c4 supporting 13 fixed rates.
 * Mlx  - c5 & c6 supporting 127 configurable rates.
 *
 * The c4 is why we have a common rate that is available
 * in all rate tables. This is a selected rate from the
 * c4 table and we assure it is available in all ratelimit
 * tables. This way the tcp_ratelimit code has an assured
 * rate it should always be able to get. This answers a
 * couple of the questions above.
 *
 * So what about the rest? Well, the table is built to
 * try to get the most out of a joint hardware/software
 * pacing system.  The software pacer will always pick
 * a rate higher than the b/w that it is estimating
 * on the path. This is done for two reasons:
 * a) So we can discover more b/w
 * and
 * b) So we can send a block of MSS's down and then
 *    have the software timer go off after the previous
 *    send is completely out of the hardware.
 *
 * But when we do <b> we don't want the delay
 * between the last packet sent by the hardware and
 * the next send to be excessively long (to reach our desired rate).
 *
 * So let me give an example for clarity.
 *
 * Lets assume that the tcp stack sees that 29,110,000 bps is
 * what the bw of the path is. The stack would select the
 * rate 31Mbps. 31Mbps means that each send that is done
 * by the hardware will cause a 390 micro-second gap between
 * the packets sent at that rate. For 29,110,000 bps we
 * would need a 416 micro-second gap between each send.
 *
 * Note that we are calculating a complete time for pacing
 * which includes the ethernet, IP and TCP overhead. So
 * a full 1514 bytes is used for the above calculations.
 * My testing has shown that both cards are also using this
 * as their basis i.e. full payload size of the ethernet frame.
 * The TCP stack caller needs to be aware of this and make the
 * appropriate overhead calculations be included in its choices.
 *
 * Now, continuing our example, we pick a MSS size based on the
 * delta between the two rates (416 - 390) divided into the rate
 * we really wish to send at rounded up.  That results in a
 * send of 17 mss's at once. The hardware then will
 * run out of data in a single 17MSS send in 6,630 micro-seconds.
 *
 * On the other hand the software pacer will send more data
 * in 7,072 micro-seconds. This means that we will refill
 * the hardware 442 micro-seconds after it would have sent
 * next if it had not run out of data. This is a win since we are
 * only sending every 7ms or so and yet all the packets are spaced on
 * the wire with 94% of what they should be and only
 * the last packet is delayed extra to make up for the
 * difference.
 *
 * Note that the above formula has two important caveats.
 * If we are above (b/w wise) 100Mbps we double the result
 * of the MSS calculation. The second caveat is if we are 500Mbps
 * or more we just send the maximum MSS at once i.e. 45MSS. At
 * the higher b/w's even the cards have limits to what times (timer granularity)
 * they can insert between packets and start to send more than one
 * packet at a time on the wire.
 */
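
/*
 * To make the worked example above concrete, here is a minimal
 * sketch of the burst-size arithmetic the comment describes. It is
 * illustrative only (not compiled into the kernel); the helper name
 * and the exact rounding are assumptions drawn from the comment,
 * not the kernel's formula, which lives in
 * tcp_get_pacing_burst_size() below.
 */
#if 0
static uint32_t
example_burst_mss(uint64_t est_bps, uint64_t hw_bps)
{
	/* Micro-seconds a full 1514 byte frame occupies at each rate. */
	uint64_t est_gap = (1514ULL * 8 * 1000000) / est_bps; /* 416 at 29.11Mbps */
	uint64_t hw_gap = (1514ULL * 8 * 1000000) / hw_bps;   /* 390 at 31Mbps */
	uint64_t delta = est_gap - hw_gap;		      /* 26 */
	/*
	 * MSS's per burst so the software timer refills the card
	 * just after it drains: 416/26 + 1 = 17.
	 */
	uint32_t mss = (uint32_t)(est_gap / delta) + 1;

	if (est_bps >= 500000000ULL)	/* 500Mbps or more: just send the max */
		return (45);
	if (est_bps > 100000000ULL)	/* over 100Mbps: double the result */
		mss *= 2;
	return (mss);
}
#endif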
#define COMMON_RATE 180500
const uint64_t desired_rates[] = {
	122500,			/* 1Mbps    - rate 1 */
	180500,			/* 1.44Mbps - rate 2  common rate */
	375000,			/* 3Mbps    - rate 3 */
	625000,			/* 5Mbps    - rate 4 */
	1250000,		/* 10Mbps   - rate 5 */
	1875000,		/* 15Mbps   - rate 6 */
	2500000,		/* 20Mbps   - rate 7 */
	3125000,		/* 25Mbps   - rate 8 */
	3750000,		/* 30Mbps   - rate 9 */
	4375000,		/* 35Mbps   - rate 10 */
	5000000,		/* 40Mbps   - rate 11 */
	6250000,		/* 50Mbps   - rate 12 */
	12500000,		/* 100Mbps  - rate 13 */
	25000000,		/* 200Mbps  - rate 14 */
	50000000,		/* 400Mbps  - rate 15 */
	100000000,		/* 800Mbps  - rate 16 */
	5625000,		/* 45Mbps   - rate 17 */
	6875000,		/* 55Mbps   - rate 18 */
	7500000,		/* 60Mbps   - rate 19 */
	8125000,		/* 65Mbps   - rate 20 */
	8750000,		/* 70Mbps   - rate 21 */
	9375000,		/* 75Mbps   - rate 22 */
	10000000,		/* 80Mbps   - rate 23 */
	10625000,		/* 85Mbps   - rate 24 */
	11250000,		/* 90Mbps   - rate 25 */
	11875000,		/* 95Mbps   - rate 26 */
	12500000,		/* 100Mbps  - rate 27 */
	13750000,		/* 110Mbps  - rate 28 */
	15000000,		/* 120Mbps  - rate 29 */
	16250000,		/* 130Mbps  - rate 30 */
	17500000,		/* 140Mbps  - rate 31 */
	18750000,		/* 150Mbps  - rate 32 */
	20000000,		/* 160Mbps  - rate 33 */
	21250000,		/* 170Mbps  - rate 34 */
	22500000,		/* 180Mbps  - rate 35 */
	23750000,		/* 190Mbps  - rate 36 */
	26250000,		/* 210Mbps  - rate 37 */
	27500000,		/* 220Mbps  - rate 38 */
	28750000,		/* 230Mbps  - rate 39 */
	30000000,		/* 240Mbps  - rate 40 */
	31250000,		/* 250Mbps  - rate 41 */
	34375000,		/* 275Mbps  - rate 42 */
	37500000,		/* 300Mbps  - rate 43 */
	40625000,		/* 325Mbps  - rate 44 */
	43750000,		/* 350Mbps  - rate 45 */
	46875000,		/* 375Mbps  - rate 46 */
	53125000,		/* 425Mbps  - rate 47 */
	56250000,		/* 450Mbps  - rate 48 */
	59375000,		/* 475Mbps  - rate 49 */
	62500000,		/* 500Mbps  - rate 50 */
	68750000,		/* 550Mbps  - rate 51 */
	75000000,		/* 600Mbps  - rate 52 */
	81250000,		/* 650Mbps  - rate 53 */
	87500000,		/* 700Mbps  - rate 54 */
	93750000,		/* 750Mbps  - rate 55 */
	106250000,		/* 850Mbps  - rate 56 */
	112500000,		/* 900Mbps  - rate 57 */
	125000000,		/* 1Gbps    - rate 58 */
	156250000,		/* 1.25Gbps - rate 59 */
	187500000,		/* 1.5Gbps  - rate 60 */
	218750000,		/* 1.75Gbps - rate 61 */
	250000000,		/* 2Gbps    - rate 62 */
	281250000,		/* 2.25Gbps - rate 63 */
	312500000,		/* 2.5Gbps  - rate 64 */
	343750000,		/* 2.75Gbps - rate 65 */
	375000000,		/* 3Gbps    - rate 66 */
	500000000,		/* 4Gbps    - rate 67 */
	625000000,		/* 5Gbps    - rate 68 */
	750000000,		/* 6Gbps    - rate 69 */
	875000000,		/* 7Gbps    - rate 70 */
	1000000000,		/* 8Gbps    - rate 71 */
	1125000000,		/* 9Gbps    - rate 72 */
	1250000000,		/* 10Gbps   - rate 73 */
	1875000000,		/* 15Gbps   - rate 74 */
	2500000000		/* 20Gbps   - rate 75 */
};

#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
#define RS_ORDERED_COUNT 16	/*
				 * Number that are in order
				 * at the beginning of the table,
				 * over this a sort is required.
				 */
#define RS_NEXT_ORDER_GROUP 16	/*
				 * The index in our table where
				 * the second ordered group begins.
				 */
#define ALL_HARDWARE_RATES 1004 /*
				 * 1Meg - 1Gig in 1 Meg steps
				 * plus 100k, 200k and 500k and
				 * 10Gig
				 */

#define RS_ONE_MEGABIT_PERSEC 1000000
#define RS_ONE_GIGABIT_PERSEC 1000000000
#define RS_TEN_GIGABIT_PERSEC 10000000000
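
/*
 * Note on the ALL_HARDWARE_RATES table shape (built in
 * rt_setup_new_rs() below): slots 0-2 hold 100k, 200k and 500k,
 * slot i for i in 3..1002 holds (i - 2) Mbps, and the final slot
 * holds 10Gbps. A sketch of the index math that the "+ 2" offsets
 * in tcp_int_find_suitable_rate() rely on (illustrative only, not
 * compiled into the kernel):
 */
#if 0
static int
example_index_for_mbps(uint64_t mbps)
{
	int idx = (int)mbps + 2;	/* 1Mbps -> slot 3, 1000Mbps -> slot 1002 */

	if (idx > (ALL_HARDWARE_RATES - 1))
		idx = ALL_HARDWARE_RATES - 1;	/* clamp to the 10Gbps slot */
	return (idx);
}
#endif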

static struct head_tcp_rate_set int_rs;
static struct mtx rs_mtx;
uint32_t rs_number_alive;
uint32_t rs_number_dead;
static uint32_t rs_floor_mss = 0;
static uint32_t wait_time_floor = 8000;	/* 8 ms */
static uint32_t rs_hw_floor_mss = 16;
static uint32_t num_of_waits_allowed = 1; /* How many time blocks are we willing to wait */

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Ratelimit stats");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
    &rs_number_alive, 0,
    "Number of interfaces initialized for ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
    &rs_number_dead, 0,
    "Number of interfaces departing from ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, floor_mss, CTLFLAG_RW,
    &rs_floor_mss, 0,
    "Number of MSS that will override the normal minimums (0 means don't enforce)");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, wait_floor, CTLFLAG_RW,
    &wait_time_floor, 2000,
    "As b/w increases, what is the wait floor we are willing to wait at the end?");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, time_blocks, CTLFLAG_RW,
    &num_of_waits_allowed, 1,
    "How many time blocks on the end should software pacing be willing to wait?");

SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, hw_floor_mss, CTLFLAG_RW,
    &rs_hw_floor_mss, 16,
    "Number of mss that are a minimum for hardware pacing?");

static void
rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
{
	/*
	 * Add sysctl entries for this interface.
	 */
	if (rs->rs_flags & RS_INTF_NO_SUP) {
		SYSCTL_ADD_S32(&rs->sysctl_ctx,
		   SYSCTL_CHILDREN(rl_sysctl_root),
		   OID_AUTO, "disable", CTLFLAG_RD,
		   &rs->rs_disable, 0,
		   "Disable this interface from new hdwr limiting?");
	} else {
		SYSCTL_ADD_S32(&rs->sysctl_ctx,
		   SYSCTL_CHILDREN(rl_sysctl_root),
		   OID_AUTO, "disable", CTLFLAG_RW,
		   &rs->rs_disable, 0,
		   "Disable this interface from new hdwr limiting?");
	}
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "minseg", CTLFLAG_RW,
	    &rs->rs_min_seg, 0,
	    "What is the minimum we need to send on this interface?");
	SYSCTL_ADD_U64(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flow_limit", CTLFLAG_RW,
	    &rs->rs_flow_limit, 0,
	    "What is the limit for number of flows (0=unlimited)?");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "highest", CTLFLAG_RD,
	    &rs->rs_highest_valid, 0,
	    "Highest valid rate");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "lowest", CTLFLAG_RD,
	    &rs->rs_lowest_valid, 0,
	    "Lowest valid rate");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flags", CTLFLAG_RD,
	    &rs->rs_flags, 0,
	    "What flags are on the entry?");
	SYSCTL_ADD_S32(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "numrates", CTLFLAG_RD,
	    &rs->rs_rate_cnt, 0,
	    "How many rates are there?");
	SYSCTL_ADD_U64(&rs->sysctl_ctx,
	    SYSCTL_CHILDREN(rl_sysctl_root),
	    OID_AUTO, "flows_using", CTLFLAG_RD,
	    &rs->rs_flows_using, 0,
	    "How many flows are using this interface now?");
#ifdef DETAILED_RATELIMIT_SYSCTL
	if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
		/* Lets display the rates */
		int i;
		struct sysctl_oid *rl_rates;
		struct sysctl_oid *rl_rate_num;
		char rate_num[16];

		rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
					    SYSCTL_CHILDREN(rl_sysctl_root),
					    OID_AUTO,
					    "rate",
					    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
					    "Ratelist");
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			sprintf(rate_num, "%d", i);
			rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
					    SYSCTL_CHILDREN(rl_rates),
					    OID_AUTO,
					    rate_num,
					    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
					    "Individual Rate");
			SYSCTL_ADD_U32(&rs->sysctl_ctx,
				       SYSCTL_CHILDREN(rl_rate_num),
				       OID_AUTO, "flags", CTLFLAG_RD,
				       &rs->rs_rlt[i].flags, 0,
				       "Flags on this rate");
			SYSCTL_ADD_U32(&rs->sysctl_ctx,
				       SYSCTL_CHILDREN(rl_rate_num),
				       OID_AUTO, "pacetime", CTLFLAG_RD,
				       &rs->rs_rlt[i].time_between, 0,
				       "Time hardware inserts between 1500 byte sends");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
				       SYSCTL_CHILDREN(rl_rate_num),
				       OID_AUTO, "rate", CTLFLAG_RD,
				       &rs->rs_rlt[i].rate,
				       "Rate in bytes per second");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
				       SYSCTL_CHILDREN(rl_rate_num),
				       OID_AUTO, "using", CTLFLAG_RD,
				       &rs->rs_rlt[i].using,
				       "Number of flows using");
			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
				       SYSCTL_CHILDREN(rl_rate_num),
				       OID_AUTO, "enobufs", CTLFLAG_RD,
				       &rs->rs_rlt[i].rs_num_enobufs,
				       "Number of enobufs logged on this rate");
		}
	}
#endif
}

static void
rs_destroy(epoch_context_t ctx)
{
	struct tcp_rate_set *rs;
	bool do_free_rs;

	rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);

	mtx_lock(&rs_mtx);
	rs->rs_flags &= ~RS_FUNERAL_SCHD;
	/*
	 * In theory it's possible (but unlikely)
	 * that while the delete was occurring
	 * and we were applying the DEAD flag
	 * someone slipped in and found the
	 * interface in a lookup. While we
	 * decided rs_flows_using was 0 and were
	 * scheduling the epoch_call, the other
	 * thread incremented rs_flows_using. This
	 * is because users have a pointer and
	 * we only use the rs_flows_using in an
	 * atomic fashion, i.e. the other entities
	 * are not protected. To assure this did
	 * not occur, we check rs_flows_using here
	 * before deleting.
	 */
	do_free_rs = (rs->rs_flows_using == 0);
	rs_number_dead--;
	mtx_unlock(&rs_mtx);

	if (do_free_rs) {
		sysctl_ctx_free(&rs->sysctl_ctx);
		free(rs->rs_rlt, M_TCPPACE);
		free(rs, M_TCPPACE);
	}
}

static void
rs_defer_destroy(struct tcp_rate_set *rs)
{

	mtx_assert(&rs_mtx, MA_OWNED);

	/* Check if already pending. */
	if (rs->rs_flags & RS_FUNERAL_SCHD)
		return;

	rs_number_dead++;

	/* Set flag to only defer once. */
	rs->rs_flags |= RS_FUNERAL_SCHD;
	NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx);
}

#ifdef INET
extern counter_u64_t rate_limit_new;
extern counter_u64_t rate_limit_chg;
extern counter_u64_t rate_limit_set_ok;
extern counter_u64_t rate_limit_active;
extern counter_u64_t rate_limit_alloc_fail;
#endif

static int
rl_attach_txrtlmt(struct ifnet *ifp,
    uint32_t flowtype,
    int flowid,
    uint64_t cfg_rate,
    struct m_snd_tag **tag)
{
	int error;
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = flowid,
		.rate_limit.hdr.flowtype = flowtype,
		.rate_limit.max_rate = cfg_rate,
		.rate_limit.flags = M_NOWAIT,
	};

	error = m_snd_tag_alloc(ifp, &params, tag);
#ifdef INET
	if (error == 0) {
		counter_u64_add(rate_limit_set_ok, 1);
		counter_u64_add(rate_limit_active, 1);
	} else if (error != EOPNOTSUPP)
		counter_u64_add(rate_limit_alloc_fail, 1);
#endif
	return (error);
}

static void
populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
{
	/*
	 * The internal table is "special", it
	 * is two separate ordered tables that
	 * must be merged. We get here when the
	 * adapter specifies a number of rates that
	 * covers both ranges in the table in some
	 * form.
	 */
	int i, at_low, at_high;
	uint8_t low_disabled = 0, high_disabled = 0;

	for (i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
		rs->rs_rlt[i].flags = 0;
		rs->rs_rlt[i].time_between = 0;
		if ((low_disabled == 0) &&
		    (high_disabled ||
		     (rate_table_act[at_low] < rate_table_act[at_high]))) {
			rs->rs_rlt[i].rate = rate_table_act[at_low];
			at_low++;
			if (at_low == RS_NEXT_ORDER_GROUP)
				low_disabled = 1;
		} else if (high_disabled == 0) {
			rs->rs_rlt[i].rate = rate_table_act[at_high];
			at_high++;
			if (at_high == MAX_HDWR_RATES)
				high_disabled = 1;
		}
	}
}
512
513static struct tcp_rate_set *
514rt_setup_new_rs(struct ifnet *ifp, int *error)
515{
516	struct tcp_rate_set *rs;
517	const uint64_t *rate_table_act;
518	uint64_t lentim, res;
519	size_t sz;
520	uint32_t hash_type;
521	int i;
522	struct if_ratelimit_query_results rl;
523	struct sysctl_oid *rl_sysctl_root;
524	struct epoch_tracker et;
525	/*
526	 * We expect to enter with the
527	 * mutex locked.
528	 */
529
530	if (ifp->if_ratelimit_query == NULL) {
531		/*
532		 * We can do nothing if we cannot
533		 * get a query back from the driver.
534		 */
535		printf("Warning:No query functions for %s:%d-- failed\n",
536		       ifp->if_dname, ifp->if_dunit);
537		return (NULL);
538	}
539	rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
540	if (rs == NULL) {
541		if (error)
542			*error = ENOMEM;
543		printf("Warning:No memory for malloc of tcp_rate_set\n");
544		return (NULL);
545	}
546	memset(&rl, 0, sizeof(rl));
547	rl.flags = RT_NOSUPPORT;
548	ifp->if_ratelimit_query(ifp, &rl);
549	if (rl.flags & RT_IS_UNUSABLE) {
550		/*
551		 * The interface does not really support
552		 * the rate-limiting.
553		 */
554		memset(rs, 0, sizeof(struct tcp_rate_set));
555		rs->rs_ifp = ifp;
556		rs->rs_if_dunit = ifp->if_dunit;
557		rs->rs_flags = RS_INTF_NO_SUP;
558		rs->rs_disable = 1;
559		rs_number_alive++;
560		sysctl_ctx_init(&rs->sysctl_ctx);
561		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
562		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
563		    OID_AUTO,
564		    rs->rs_ifp->if_xname,
565		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
566		    "");
567		rl_add_syctl_entries(rl_sysctl_root, rs);
568		NET_EPOCH_ENTER(et);
569		mtx_lock(&rs_mtx);
570		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
571		mtx_unlock(&rs_mtx);
572		NET_EPOCH_EXIT(et);
573		return (rs);
574	} else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
575		memset(rs, 0, sizeof(struct tcp_rate_set));
576		rs->rs_ifp = ifp;
577		rs->rs_if_dunit = ifp->if_dunit;
578		rs->rs_flags = RS_IS_DEFF;
579		rs_number_alive++;
580		sysctl_ctx_init(&rs->sysctl_ctx);
581		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
582		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
583		    OID_AUTO,
584		    rs->rs_ifp->if_xname,
585		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
586		    "");
587		rl_add_syctl_entries(rl_sysctl_root, rs);
588		NET_EPOCH_ENTER(et);
589		mtx_lock(&rs_mtx);
590		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
591		mtx_unlock(&rs_mtx);
592		NET_EPOCH_EXIT(et);
593		return (rs);
594	} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
595		/* Mellanox C4 likely */
596		rs->rs_ifp = ifp;
597		rs->rs_if_dunit = ifp->if_dunit;
598		rs->rs_rate_cnt = rl.number_of_rates;
599		rs->rs_min_seg = rl.min_segment_burst;
600		rs->rs_highest_valid = 0;
601		rs->rs_flow_limit = rl.max_flows;
602		rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
603		rs->rs_disable = 0;
604		rate_table_act = rl.rate_table;
605	} else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
606		/* Chelsio, C5 and C6 of Mellanox? */
607		rs->rs_ifp = ifp;
608		rs->rs_if_dunit = ifp->if_dunit;
609		rs->rs_rate_cnt = rl.number_of_rates;
610		rs->rs_min_seg = rl.min_segment_burst;
611		rs->rs_disable = 0;
612		rs->rs_flow_limit = rl.max_flows;
613		rate_table_act = desired_rates;
614		if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
615		    (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
616			/*
617			 * Our desired table is not big
618			 * enough, do what we can.
619			 */
620			rs->rs_rate_cnt = MAX_HDWR_RATES;
621		 }
622		if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
623			rs->rs_flags = RS_IS_INTF;
624		else
625			rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
626		if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
627			rs->rs_rate_cnt = ALL_HARDWARE_RATES;
628	} else {
629		free(rs, M_TCPPACE);
630		return (NULL);
631	}
632	sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
633	rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
634	if (rs->rs_rlt == NULL) {
635		if (error)
636			*error = ENOMEM;
637bail:
638		free(rs, M_TCPPACE);
639		return (NULL);
640	}
641	if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
642		/*
643		 * The interface supports all
644		 * the rates we could possibly want.
645		 */
646		uint64_t rat;
647
648		rs->rs_rlt[0].rate = 12500;	/* 100k */
649		rs->rs_rlt[1].rate = 25000;	/* 200k */
650		rs->rs_rlt[2].rate = 62500;	/* 500k */
651		/* Note 125000 == 1Megabit
652		 * populate 1Meg - 1000meg.
653		 */
654		for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) {
655			rs->rs_rlt[i].rate = rat;
656			rat += 125000;
657		}
658		rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
659	} else if (rs->rs_flags & RS_INT_TBL) {
660		/* We populate this in a special way */
661		populate_canned_table(rs, rate_table_act);
662	} else {
663		/*
664		 * Just copy in the rates from
665		 * the table, it is in order.
666		 */
667		for (i=0; i<rs->rs_rate_cnt; i++) {
668			rs->rs_rlt[i].rate = rate_table_act[i];
669			rs->rs_rlt[i].time_between = 0;
670			rs->rs_rlt[i].flags = 0;
671		}
672	}
673	for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
674		/*
675		 * We go backwards through the list so that if we can't get
676		 * a rate and fail to init one, we have at least a chance of
677		 * getting the highest one.
678		 */
679		rs->rs_rlt[i].ptbl = rs;
680		rs->rs_rlt[i].tag = NULL;
681		rs->rs_rlt[i].using = 0;
682		rs->rs_rlt[i].rs_num_enobufs = 0;
683		/*
684		 * Calculate the time between.
685		 */
686		lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
687		res = lentim / rs->rs_rlt[i].rate;
688		if (res > 0)
689			rs->rs_rlt[i].time_between = res;
690		else
691			rs->rs_rlt[i].time_between = 1;
692		if (rs->rs_flags & RS_NO_PRE) {
693			rs->rs_rlt[i].flags = HDWRPACE_INITED;
694			rs->rs_lowest_valid = i;
695		} else {
696			int err;
697
698			if ((rl.flags & RT_IS_SETUP_REQ)  &&
699			    (ifp->if_ratelimit_query)) {
700				err = ifp->if_ratelimit_setup(ifp,
701  				         rs->rs_rlt[i].rate, i);
702				if (err)
703					goto handle_err;
704			}
705#ifdef RSS
706			hash_type = M_HASHTYPE_RSS_TCP_IPV4;
707#else
708			hash_type = M_HASHTYPE_OPAQUE_HASH;
709#endif
710			err = rl_attach_txrtlmt(ifp,
711			    hash_type,
712			    (i + 1),
713			    rs->rs_rlt[i].rate,
714			    &rs->rs_rlt[i].tag);
715			if (err) {
716handle_err:
717				if (i == (rs->rs_rate_cnt - 1)) {
718					/*
719					 * Huh - first rate and we can't get
720					 * it?
721					 */
722					free(rs->rs_rlt, M_TCPPACE);
723					if (error)
724						*error = err;
725					goto bail;
726				} else {
727					if (error)
728						*error = err;
729				}
730				break;
731			} else {
732				rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
733				rs->rs_lowest_valid = i;
734			}
735		}
736	}
737	/* Did we get at least 1 rate? */
738	if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
739		rs->rs_highest_valid = rs->rs_rate_cnt - 1;
740	else {
741		free(rs->rs_rlt, M_TCPPACE);
742		goto bail;
743	}
744	rs_number_alive++;
745	sysctl_ctx_init(&rs->sysctl_ctx);
746	rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
747	    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
748	    OID_AUTO,
749	    rs->rs_ifp->if_xname,
750	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
751	    "");
752	rl_add_syctl_entries(rl_sysctl_root, rs);
753	NET_EPOCH_ENTER(et);
754	mtx_lock(&rs_mtx);
755	CK_LIST_INSERT_HEAD(&int_rs, rs, next);
756	mtx_unlock(&rs_mtx);
757	NET_EPOCH_EXIT(et);
758	return (rs);
759}

/*
 * For an explanation of why the argument is volatile please
 * look at the comments around rt_setup_rate().
 */
static const struct tcp_hwrate_limit_table *
tcp_int_find_suitable_rate(const volatile struct tcp_rate_set *rs,
    uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
{
	struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
	uint64_t mbits_per_sec, ind_calc, previous_rate = 0;
	int i;

	mbits_per_sec = (bytes_per_sec * 8);
	if (flags & RS_PACING_LT) {
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)){
			/*
			 * Smaller than 1Meg, only
			 * 3 entries can match it.
			 */
			previous_rate = 0;
			for (i = rs->rs_lowest_valid; i < 3; i++) {
				if (bytes_per_sec <= rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				} else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
					arte = &rs->rs_rlt[i];
				}
				previous_rate = rs->rs_rlt[i].rate;
			}
			goto done;
		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
			   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
			/*
			 * Larger than 1G (the majority of
			 * our table).
			 */
			if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			else
				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
			goto done;
		}
		/*
		 * If we reach here it's in our table (between 1Meg - 1000Meg),
		 * just take the rounded down mbits per second, and add
		 * 1Megabit to it, from this we can calculate
		 * the index in the table.
		 */
		ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
		if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
			ind_calc++;
		/* our table is offset by 3, we add 2 */
		ind_calc += 2;
		if (ind_calc > (ALL_HARDWARE_RATES-1)) {
			/* This should not happen */
			ind_calc = ALL_HARDWARE_RATES-1;
		}
		if ((ind_calc >= rs->rs_lowest_valid) &&
		    (ind_calc <= rs->rs_highest_valid)) {
			rte = &rs->rs_rlt[ind_calc];
			if (ind_calc >= 1)
				previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
		}
	} else if (flags & RS_PACING_EXACT_MATCH) {
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)){
			for (i = rs->rs_lowest_valid; i < 3; i++) {
				if (bytes_per_sec == rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				}
			}
		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
			   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
			/* > 1Gbps only one rate */
			if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
				/* It's 10G, wow */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			}
		} else {
			/* Ok it must be an exact meg (it's between 1Meg and 1Gig) */
			ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
			if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
				/* its an exact Mbps */
				ind_calc += 2;
				if (ind_calc > (ALL_HARDWARE_RATES-1)) {
					/* This should not happen */
					ind_calc = ALL_HARDWARE_RATES-1;
				}
				if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
					rte = &rs->rs_rlt[ind_calc];
			}
		}
	} else {
		/* we want greater than the requested rate */
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)){
			arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
			for (i = 2; i >= rs->rs_lowest_valid; i--) {
				if (bytes_per_sec < rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					if (i >= 1) {
						previous_rate = rs->rs_rlt[(i-1)].rate;
					}
					break;
				} else if ((flags & RS_PACING_GEQ) &&
					   (bytes_per_sec == rs->rs_rlt[i].rate)) {
					rte = &rs->rs_rlt[i];
					if (i >= 1) {
						previous_rate = rs->rs_rlt[(i-1)].rate;
					}
					break;
				} else {
					arte = &rs->rs_rlt[i]; /* new alternate */
				}
			}
		} else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
			if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
				/* Our top rate is larger than the request */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			} else if ((flags & RS_PACING_GEQ) &&
				   (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
				   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
				/* It matches our top rate */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			} else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
				/* The top rate is an alternative */
				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			}
			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
		} else {
			/* It's in our range 1Meg - 1Gig */
			if (flags & RS_PACING_GEQ) {
				ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
				if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
					/* our table is offset by 3, we add 2 */
					ind_calc += 2;
					if (ind_calc > (ALL_HARDWARE_RATES-1)) {
						/* This should not happen */
						ind_calc = (ALL_HARDWARE_RATES-1);
					}
					rte = &rs->rs_rlt[ind_calc];
					if (ind_calc >= 1)
						previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
				}
				goto done;
			}
			ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
			ind_calc += 2;
			if (ind_calc > (ALL_HARDWARE_RATES-1)) {
				/* This should not happen */
				ind_calc = ALL_HARDWARE_RATES-1;
			}
			if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) {
				rte = &rs->rs_rlt[ind_calc];
				if (ind_calc >= 1)
					previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
			}
		}
	}
done:
	if ((rte == NULL) &&
	    (arte != NULL) &&
	    (flags & RS_PACING_SUB_OK)) {
		/* We can use the substitute */
		rte = arte;
	}
	if (lower_rate)
		*lower_rate = previous_rate;
	return (rte);
}

/*
 * For an explanation of why the argument is volatile please
 * look at the comments around rt_setup_rate().
 */
static const struct tcp_hwrate_limit_table *
tcp_find_suitable_rate(const volatile struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
{
	/**
	 * Hunt the rate table with the restrictions in flags and find a
	 * suitable rate if possible.
	 * RS_PACING_EXACT_MATCH - look for an exact match to rate.
	 * RS_PACING_GT     - must be greater than.
	 * RS_PACING_GEQ    - must be greater than or equal.
	 * RS_PACING_LT     - must be less than.
	 * RS_PACING_SUB_OK - If we don't meet criteria a
	 *                    substitute is ok.
	 */
	int i, matched;
	struct tcp_hwrate_limit_table *rte = NULL;
	uint64_t previous_rate = 0;

	if ((rs->rs_flags & RS_INT_TBL) &&
	    (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
		/*
		 * Here we don't want to paw through
		 * a big table, we have everything
		 * from 1Meg - 1000Meg in 1Meg increments.
		 * Use an alternate method to "lookup".
		 */
		return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate));
	}
	if ((flags & RS_PACING_LT) ||
	    (flags & RS_PACING_EXACT_MATCH)) {
		/*
		 * For exact and less than we go forward through the table.
		 * This way when we find one larger we stop (exact was a
		 * toss up).
		 */
		for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
			if ((flags & RS_PACING_EXACT_MATCH) &&
			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
				rte = &rs->rs_rlt[i];
				matched = 1;
				if (lower_rate != NULL)
					*lower_rate = previous_rate;
				break;
			} else if ((flags & RS_PACING_LT) &&
			    (bytes_per_sec <= rs->rs_rlt[i].rate)) {
				rte = &rs->rs_rlt[i];
				matched = 1;
				if (lower_rate != NULL)
					*lower_rate = previous_rate;
				break;
			}
			previous_rate = rs->rs_rlt[i].rate;
			if (bytes_per_sec < rs->rs_rlt[i].rate)
				break;
		}
		if ((matched == 0) &&
		    (flags & RS_PACING_LT) &&
		    (flags & RS_PACING_SUB_OK)) {
			/* Kick in a substitute (the lowest) */
			rte = &rs->rs_rlt[rs->rs_lowest_valid];
		}
	} else {
		/*
		 * Here we go backward through the table so that we can find
		 * the one greater in theory faster (but it's probably a
		 * wash).
		 */
		for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
			if (rs->rs_rlt[i].rate > bytes_per_sec) {
				/* A possible candidate */
				rte = &rs->rs_rlt[i];
			}
			if ((flags & RS_PACING_GEQ) &&
			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
				/* An exact match and we want equal */
				matched = 1;
				rte = &rs->rs_rlt[i];
				break;
			} else if (rte) {
				/*
				 * Found one that is larger, but don't
				 * stop, there may be a closer match.
				 */
				matched = 1;
			}
			if (rs->rs_rlt[i].rate < bytes_per_sec) {
				/*
				 * We found a table entry that is smaller,
				 * stop, there will be none greater or equal.
				 */
				if (lower_rate != NULL)
					*lower_rate = rs->rs_rlt[i].rate;
				break;
			}
		}
		if ((matched == 0) &&
		    (flags & RS_PACING_SUB_OK)) {
			/* Kick in a substitute (the highest) */
			rte = &rs->rs_rlt[rs->rs_highest_valid];
		}
	}
	return (rte);
}

static struct ifnet *
rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
{
	struct ifnet *tifp;
	struct m_snd_tag *tag, *ntag;
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = inp->inp_flowid,
		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
		.rate_limit.max_rate = COMMON_RATE,
		.rate_limit.flags = M_NOWAIT,
	};
	int err;
#ifdef RSS
	params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
#else
	params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
#endif
	err = m_snd_tag_alloc(ifp, &params, &tag);
	if (err) {
		/* Failed to setup a tag? */
		if (error)
			*error = err;
		return (NULL);
	}
	ntag = tag;
	while (ntag->ifp->if_next_snd_tag != NULL) {
		ntag = ntag->ifp->if_next_snd_tag(ntag);
	}
	tifp = ntag->ifp;
	m_snd_tag_rele(tag);
	return (tifp);
}

static void
rl_increment_using(const struct tcp_hwrate_limit_table *rte)
{
	struct tcp_hwrate_limit_table *decon_rte;

	decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
	atomic_add_long(&decon_rte->using, 1);
}

static void
rl_decrement_using(const struct tcp_hwrate_limit_table *rte)
{
	struct tcp_hwrate_limit_table *decon_rte;

	decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
	atomic_subtract_long(&decon_rte->using, 1);
}

void
tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
{
	struct tcp_hwrate_limit_table *decon_rte;

	decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
	atomic_add_long(&decon_rte->rs_num_enobufs, 1);
}

/*
 * Do NOT take the __noinline out of the
 * find_rs_for_ifp() function. If you do, inlining
 * it into rt_setup_rate() will show you a
 * compiler bug. For some reason the compiler thinks
 * the list can never be empty. The consequence of
 * this will be a crash when we dereference NULL
 * if an ifp is removed just as a hw rate limit
 * is attempted. If you are working on the compiler
 * and want to "test" this go ahead and take the noinline
 * out; otherwise let sleeping dogs lie until such time
 * as we get a compiler fix 10/2/20 -- RRS
 */
static __noinline struct tcp_rate_set *
find_rs_for_ifp(struct ifnet *ifp)
{
	struct tcp_rate_set *rs;

	CK_LIST_FOREACH(rs, &int_rs, next) {
		if ((rs->rs_ifp == ifp) &&
		    (rs->rs_if_dunit == ifp->if_dunit)) {
			/* Ok we found it */
			return (rs);
		}
	}
	return (NULL);
}

static const struct tcp_hwrate_limit_table *
rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
    uint32_t flags, int *error, uint64_t *lower_rate)
{
	/* First lets find the interface if it exists */
	const struct tcp_hwrate_limit_table *rte;
	/*
	 * So why is rs volatile? This is to defeat a
	 * compiler bug wherein the compiler is convinced
	 * that rs can never be NULL (which is not true). Because
	 * of its conviction it nicely optimizes out the (rs == NULL)
	 * check below, which means if you get a NULL back you dereference it.
	 */
	volatile struct tcp_rate_set *rs;
	struct epoch_tracker et;
	struct ifnet *oifp = ifp;
	int err;

	NET_EPOCH_ENTER(et);
use_real_interface:
	rs = find_rs_for_ifp(ifp);
	if ((rs == NULL) ||
	    (rs->rs_flags & RS_INTF_NO_SUP) ||
	    (rs->rs_flags & RS_IS_DEAD)) {
		/*
		 * This means we got a packet *before*
		 * the IF-UP was processed below, <or>
		 * while or after we already received an interface
		 * departed event. In either case we really don't
		 * want to do anything with pacing, in
		 * the departing case the packet is not
		 * going to go very far. The new case
		 * might be arguable, but it's impossible
		 * to tell from the departing case.
		 */
		if (error)
			*error = ENODEV;
		NET_EPOCH_EXIT(et);
		return (NULL);
	}

	if ((rs == NULL) || (rs->rs_disable != 0)) {
		if (error)
			*error = ENOSPC;
		NET_EPOCH_EXIT(et);
		return (NULL);
	}
	if (rs->rs_flags & RS_IS_DEFF) {
		/* We need to find the real interface */
		struct ifnet *tifp;

		tifp = rt_find_real_interface(ifp, inp, error);
		if (tifp == NULL) {
			if (rs->rs_disable && error)
				*error = ENOTSUP;
			NET_EPOCH_EXIT(et);
			return (NULL);
		}
		KASSERT((tifp != ifp),
			("Lookup failure ifp:%p inp:%p rt_find_real_interface() returns the same interface tifp:%p?\n",
			 ifp, inp, tifp));
		ifp = tifp;
		goto use_real_interface;
	}
	if (rs->rs_flow_limit &&
	    ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
		if (error)
			*error = ENOSPC;
		NET_EPOCH_EXIT(et);
		return (NULL);
	}
	rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
	if (rte) {
		err = in_pcbattach_txrtlmt(inp, oifp,
		    inp->inp_flowtype,
		    inp->inp_flowid,
		    rte->rate,
		    &inp->inp_snd_tag);
		if (err) {
			/* Failed to attach */
			if (error)
				*error = err;
			rte = NULL;
		} else {
			KASSERT((inp->inp_snd_tag != NULL),
				("Setup rate has no snd_tag inp:%p rte:%p rate:%llu rs:%p",
				 inp, rte, (unsigned long long)rte->rate, rs));
#ifdef INET
			counter_u64_add(rate_limit_new, 1);
#endif
		}
	}
	if (rte) {
		/*
		 * We use an atomic here for accounting so we don't have to
		 * use locks when freeing.
		 */
		atomic_add_64(&rs->rs_flows_using, 1);
	}
	NET_EPOCH_EXIT(et);
	return (rte);
}

static void
tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
{
	int error;
	struct tcp_rate_set *rs;
	struct epoch_tracker et;

	if (((ifp->if_capenable & IFCAP_TXRTLMT) == 0) ||
	    (link_state != LINK_STATE_UP)) {
		/*
		 * We only care on an interface going up that is rate-limit
		 * capable.
		 */
		return;
	}
	NET_EPOCH_ENTER(et);
	mtx_lock(&rs_mtx);
	rs = find_rs_for_ifp(ifp);
	if (rs) {
		/* We already have initialized this guy */
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
		return;
	}
	mtx_unlock(&rs_mtx);
	NET_EPOCH_EXIT(et);
	rt_setup_new_rs(ifp, &error);
}

static void
tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
{
	struct tcp_rate_set *rs;
	struct epoch_tracker et;
	int i;

	NET_EPOCH_ENTER(et);
	mtx_lock(&rs_mtx);
	rs = find_rs_for_ifp(ifp);
	if (rs) {
		CK_LIST_REMOVE(rs, next);
		rs_number_alive--;
		rs->rs_flags |= RS_IS_DEAD;
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
				in_pcbdetach_tag(rs->rs_rlt[i].tag);
				rs->rs_rlt[i].tag = NULL;
			}
			rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
		}
		if (rs->rs_flows_using == 0)
			rs_defer_destroy(rs);
	}
	mtx_unlock(&rs_mtx);
	NET_EPOCH_EXIT(et);
}

static void
tcp_rl_shutdown(void *arg __unused, int howto __unused)
{
	struct tcp_rate_set *rs, *nrs;
	struct epoch_tracker et;
	int i;

	NET_EPOCH_ENTER(et);
	mtx_lock(&rs_mtx);
	CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
		CK_LIST_REMOVE(rs, next);
		rs_number_alive--;
		rs->rs_flags |= RS_IS_DEAD;
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
				in_pcbdetach_tag(rs->rs_rlt[i].tag);
				rs->rs_rlt[i].tag = NULL;
			}
			rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
		}
		if (rs->rs_flows_using == 0)
			rs_defer_destroy(rs);
	}
	mtx_unlock(&rs_mtx);
	NET_EPOCH_EXIT(et);
}

const struct tcp_hwrate_limit_table *
tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
{
	const struct tcp_hwrate_limit_table *rte;
#ifdef KERN_TLS
	struct ktls_session *tls;
#endif

	INP_WLOCK_ASSERT(tp->t_inpcb);

	if (tp->t_inpcb->inp_snd_tag == NULL) {
		/*
		 * We are setting up a rate for the first time.
		 */
		if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) {
			/* Not supported by the egress */
			if (error)
				*error = ENODEV;
			return (NULL);
		}
#ifdef KERN_TLS
		tls = NULL;
		if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
			tls = tp->t_inpcb->inp_socket->so_snd.sb_tls_info;

			if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 ||
			    tls->mode != TCP_TLS_MODE_IFNET) {
				if (error)
					*error = ENODEV;
				return (NULL);
			}
		}
#endif
		rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error, lower_rate);
		if (rte)
			rl_increment_using(rte);
#ifdef KERN_TLS
		if (rte != NULL && tls != NULL && tls->snd_tag != NULL) {
			/*
			 * Fake a route change error to reset the TLS
			 * send tag.  This will convert the existing
			 * tag to a TLS ratelimit tag.
			 */
			MPASS(tls->snd_tag->type == IF_SND_TAG_TYPE_TLS);
			ktls_output_eagain(tp->t_inpcb, tls);
		}
#endif
	} else {
		/*
		 * We are modifying a rate, wrong interface?
		 */
		if (error)
			*error = EINVAL;
		rte = NULL;
	}
	if (rte != NULL) {
		tp->t_pacing_rate = rte->rate;
		if (error)
			*error = 0;
	}
	return (rte);
}
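
/*
 * Illustrative only (not compiled into the kernel): a sketch of how
 * a pacing stack might use the API above, assuming tp and ifp are in
 * scope. The 50Mbps figure and the flag choice are assumptions made
 * for the example, not a recommendation.
 */
#if 0
	const struct tcp_hwrate_limit_table *rte;
	uint64_t bw_est = 6250000;	/* 50Mbps in bytes/sec (assumed) */
	uint64_t lower_rate;
	uint32_t burst, segsiz = 1448;	/* assumed MSS */
	int err;

	rte = tcp_set_pacing_rate(tp, ifp, bw_est,
	    RS_PACING_GEQ | RS_PACING_SUB_OK, &err, &lower_rate);
	if (rte != NULL) {
		/*
		 * Hardware now inserts rte->time_between micro-seconds
		 * between 1514 byte frames; size software bursts to match.
		 */
		burst = tcp_get_pacing_burst_size(tp, bw_est, segsiz, 1,
		    rte, &err);
	}
#endif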

const struct tcp_hwrate_limit_table *
tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
    struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
{
	const struct tcp_hwrate_limit_table *nrte;
	const struct tcp_rate_set *rs;
#ifdef KERN_TLS
	struct ktls_session *tls = NULL;
#endif
	int err;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	if (crte == NULL) {
		/* Wrong interface */
		if (error)
			*error = EINVAL;
		return (NULL);
	}

#ifdef KERN_TLS
	if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
		tls = tp->t_inpcb->inp_socket->so_snd.sb_tls_info;
		MPASS(tls->mode == TCP_TLS_MODE_IFNET);
		if (tls->snd_tag != NULL &&
		    tls->snd_tag->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT) {
			/*
			 * NIC probably doesn't support ratelimit TLS
			 * tags if it didn't allocate one when an
			 * existing rate was present, so ignore.
			 */
			if (error)
				*error = EOPNOTSUPP;
			return (NULL);
		}
	}
#endif
	if (tp->t_inpcb->inp_snd_tag == NULL) {
		/* Wrong interface */
		if (error)
			*error = EINVAL;
		return (NULL);
	}
	rs = crte->ptbl;
	if ((rs->rs_flags & RS_IS_DEAD) ||
	    (crte->flags & HDWRPACE_IFPDEPARTED)) {
		/* Release the rate, and try anew */

		tcp_rel_pacing_rate(crte, tp);
		nrte = tcp_set_pacing_rate(tp, ifp,
		    bytes_per_sec, flags, error, lower_rate);
		return (nrte);
	}
	nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
	if (nrte == crte) {
		/* No change */
		if (error)
			*error = 0;
		return (crte);
	}
	if (nrte == NULL) {
		/* Release the old rate */
		if (error)
			*error = ENOENT;
		tcp_rel_pacing_rate(crte, tp);
		return (NULL);
	}
	rl_decrement_using(crte);
	rl_increment_using(nrte);
	/* Change rates to our new entry */
#ifdef KERN_TLS
	if (tls != NULL)
		err = ktls_modify_txrtlmt(tls, nrte->rate);
	else
#endif
		err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate);
	if (err) {
		rl_decrement_using(nrte);
		/* Do we still have a snd-tag attached? */
		if (tp->t_inpcb->inp_snd_tag)
			in_pcbdetach_txrtlmt(tp->t_inpcb);
		if (error)
			*error = err;
		return (NULL);
	} else {
#ifdef INET
		counter_u64_add(rate_limit_chg, 1);
#endif
	}
	if (error)
		*error = 0;
	tp->t_pacing_rate = nrte->rate;
	return (nrte);
}

void
tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
{
	const struct tcp_rate_set *crs;
	struct tcp_rate_set *rs;
	uint64_t pre;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	tp->t_pacing_rate = -1;
	crs = crte->ptbl;
	/*
	 * Now we must break the const
	 * in order to release our refcount.
	 */
	rs = __DECONST(struct tcp_rate_set *, crs);
	rl_decrement_using(crte);
	pre = atomic_fetchadd_64(&rs->rs_flows_using, -1);
	if (pre == 1) {
		struct epoch_tracker et;

		NET_EPOCH_ENTER(et);
		mtx_lock(&rs_mtx);
		/*
		 * Is it dead?
		 */
		if (rs->rs_flags & RS_IS_DEAD)
			rs_defer_destroy(rs);
		mtx_unlock(&rs_mtx);
		NET_EPOCH_EXIT(et);
	}

	/*
	 * XXX: If this connection is using ifnet TLS, should we
	 * switch it to using an unlimited rate, or perhaps use
	 * ktls_output_eagain() to reset the send tag to a plain
	 * TLS tag?
	 */
	in_pcbdetach_txrtlmt(tp->t_inpcb);
}

#define ONE_POINT_TWO_MEG 150000	/* 1.2Mbps in bytes per second */
#define ONE_HUNDRED_MBPS 12500000	/* 100Mbps in bytes per second */
#define FIVE_HUNDRED_MBPS 62500000	/* 500Mbps in bytes per second */
#define MAX_MSS_SENT 43	/* 43 mss = 43 x 1500 = 64,500 bytes */

static void
tcp_log_pacing_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, uint32_t new_tso,
		    uint64_t hw_rate, uint32_t time_between, uint32_t calc_time_between,
		    uint32_t segs, uint32_t res_div, uint16_t mult, uint8_t mod)
{
	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;
		struct timeval tv;
		uint32_t cts;

		memset(&log, 0, sizeof(log));
		cts = tcp_get_usecs(&tv);
		log.u_bbr.flex1 = segsiz;
		log.u_bbr.flex2 = new_tso;
		log.u_bbr.flex3 = time_between;
		log.u_bbr.flex4 = calc_time_between;
		log.u_bbr.flex5 = segs;
		log.u_bbr.flex6 = res_div;
		log.u_bbr.flex7 = mult;
		log.u_bbr.flex8 = mod;
		log.u_bbr.timeStamp = cts;
		log.u_bbr.cur_del_rate = bw;
		log.u_bbr.delRate = hw_rate;
		TCP_LOG_EVENTP(tp, NULL,
		    &tp->t_inpcb->inp_socket->so_rcv,
		    &tp->t_inpcb->inp_socket->so_snd,
		    TCP_HDWR_PACE_SIZE, 0,
		    0, &log, false, &tv);
	}
}

uint32_t
tcp_get_pacing_burst_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
   const struct tcp_hwrate_limit_table *te, int *err)
{
	/*
	 * We use the Google formula to calculate the
	 * TSO size, i.e.:
	 * bw < 24Meg
	 *   tso = 2mss
	 * else
	 *   tso = min(bw/1000, 64k)
	 *
	 * Note for these calculations we ignore the
	 * packet overhead (enet hdr, ip hdr and tcp hdr).
	 */
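	/*
	 * Worked example (illustrative numbers): at bw = 12,500,000
	 * bytes/sec (100Mbps) with segsiz = 1448, bytes = 12,500 and
	 * new_tso = (12500 + 1447) / 1448 = 9 segments, before the
	 * floors and ceilings below are applied.
	 */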
	uint64_t lentim, res, bytes;
	uint32_t new_tso, min_tso_segs;

	bytes = bw / 1000;
	if (bytes > (64 * 1000))
		bytes = 64 * 1000;
	/* Round up */
	new_tso = (bytes + segsiz - 1) / segsiz;
	if (can_use_1mss && (bw < ONE_POINT_TWO_MEG))
		min_tso_segs = 1;
	else
		min_tso_segs = 2;
	if (rs_floor_mss && (new_tso < rs_floor_mss))
		new_tso = rs_floor_mss;
	else if (new_tso < min_tso_segs)
		new_tso = min_tso_segs;
	if (new_tso > MAX_MSS_SENT)
		new_tso = MAX_MSS_SENT;
	new_tso *= segsiz;
	tcp_log_pacing_size(tp, bw, segsiz, new_tso,
			    0, 0, 0, 0, 0, 0, 1);
	/*
	 * If we are not doing hardware pacing
	 * then we are done.
	 */
	if (te == NULL) {
		if (err)
			*err = 0;
		return (new_tso);
	}
	/*
	 * For hardware pacing we look at the
	 * rate you are sending at and compare
	 * that to the rate you have in hardware.
	 *
	 * If the hardware rate is slower than your
	 * software rate then you are in error and
	 * we will build a queue in our hardware which
	 * is probably not desired, in such a case
	 * just return the non-hardware TSO size.
	 *
	 * If the rate in hardware is faster (which
	 * it should be) then look at how long it
	 * takes to send one ethernet segment size at
	 * your b/w and compare that to the time it
	 * takes to send at the rate you had selected.
	 *
	 * If your time is greater (which we hope it is)
	 * we get the delta between the two, and then
	 * divide that into your pacing time. This tells
	 * us how many MSS you can send down at once (rounded up).
	 *
	 * Note we also double this value if the b/w is over
	 * 100Mbps. If it's over 500meg we just set you to the
	 * max (43 segments).
	 */
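	/*
	 * Worked example (illustrative, with the default
	 * wait_time_floor of 8000 and num_of_waits_allowed of 1):
	 * pacing bw = 6,250,000 bytes/sec (50Mbps) over a 100Mbps
	 * hardware rate (time_between = 121): res = (1514 * 1000000)
	 * / 6250000 = 242, delta = 242 - 121 = 121, res_div = 242 +
	 * 8000 = 8242, so segs = (8242 + 120) / 121 = 69, which is
	 * then clamped to MAX_MSS_SENT (43).
	 */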
	if (te->rate > FIVE_HUNDRED_MBPS)
		goto max;
	if (te->rate == bw) {
		/* We are pacing at exactly the hdwr rate */
max:
		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
				    te->rate, te->time_between, (uint32_t)0,
				    (segsiz * MAX_MSS_SENT), 0, 0, 3);
		return (segsiz * MAX_MSS_SENT);
	}
	lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
	res = lentim / bw;
	if (res > te->time_between) {
		uint32_t delta, segs, res_div;

		res_div = ((res * num_of_waits_allowed) + wait_time_floor);
		delta = res - te->time_between;
		segs = (res_div + delta - 1) / delta;
		if (segs < min_tso_segs)
			segs = min_tso_segs;
		if (segs < rs_hw_floor_mss)
			segs = rs_hw_floor_mss;
		if (segs > MAX_MSS_SENT)
			segs = MAX_MSS_SENT;
		segs *= segsiz;
		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
				    te->rate, te->time_between, (uint32_t)res,
				    segs, res_div, 1, 3);
		if (err)
			*err = 0;
		if (segs < new_tso) {
			/* unexpected ? */
			return (new_tso);
		} else {
			return (segs);
		}
	} else {
		/*
		 * Your time is smaller which means
		 * we will grow a queue on our
		 * hardware. Send back the non-hardware
		 * rate.
		 */
		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
				    te->rate, te->time_between, (uint32_t)res,
				    0, 0, 0, 4);
		if (err)
			*err = -1;
		return (new_tso);
	}
}

uint64_t
tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp)
{
	struct epoch_tracker et;
	struct tcp_rate_set *rs;
	uint64_t rate_ret;

	NET_EPOCH_ENTER(et);
use_next_interface:
	rs = find_rs_for_ifp(ifp);
	if (rs == NULL) {
		/* This interface does not do ratelimiting */
		rate_ret = 0;
	} else if (rs->rs_flags & RS_IS_DEFF) {
		/* We need to find the real interface */
		struct ifnet *tifp;

		tifp = rt_find_real_interface(ifp, inp, NULL);
		if (tifp == NULL) {
			NET_EPOCH_EXIT(et);
			return (0);
		}
		ifp = tifp;
		goto use_next_interface;
	} else {
		/* Let's return the highest rate this guy has */
		rate_ret = rs->rs_rlt[rs->rs_highest_valid].rate;
	}
	NET_EPOCH_EXIT(et);
	return (rate_ret);
}

static eventhandler_tag rl_ifnet_departs;
static eventhandler_tag rl_ifnet_arrives;
static eventhandler_tag rl_shutdown_start;

static void
tcp_rs_init(void *st __unused)
{
	CK_LIST_INIT(&int_rs);
	rs_number_alive = 0;
	rs_number_dead = 0;
	mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
	rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
	    tcp_rl_ifnet_departure,
	    NULL, EVENTHANDLER_PRI_ANY);
	rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
	    tcp_rl_ifnet_link,
	    NULL, EVENTHANDLER_PRI_ANY);
	rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    tcp_rl_shutdown, NULL,
	    SHUTDOWN_PRI_FIRST);
	printf("TCP_ratelimit: Is now initialized\n");
}

SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
#endif