--- flowtable.c (revision 215317)
+++ flowtable.c (revision 215701)
1/**************************************************************************
2
3Copyright (c) 2008-2010, BitGravity Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
11
12 2. Neither the name of the BitGravity Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include "opt_route.h"
31#include "opt_mpath.h"
32#include "opt_ddb.h"
33#include "opt_inet.h"
34#include "opt_inet6.h"
35
36#include <sys/cdefs.h>
-37 __FBSDID("$FreeBSD: head/sys/net/flowtable.c 215317 2010-11-14 20:38:11Z dim $");
+37 __FBSDID("$FreeBSD: head/sys/net/flowtable.c 215701 2010-11-22 19:32:54Z dim $");
38
39#include <sys/param.h>
40#include <sys/types.h>
41#include <sys/bitstring.h>
42#include <sys/condvar.h>
43#include <sys/callout.h>
44#include <sys/kernel.h>
45#include <sys/kthread.h>
46#include <sys/limits.h>
47#include <sys/malloc.h>
48#include <sys/mbuf.h>
49#include <sys/proc.h>
50#include <sys/sbuf.h>
51#include <sys/sched.h>
52#include <sys/smp.h>
53#include <sys/socket.h>
54#include <sys/syslog.h>
55#include <sys/sysctl.h>
56
57#include <net/if.h>
58#include <net/if_llatbl.h>
59#include <net/if_var.h>
60#include <net/route.h>
61#include <net/flowtable.h>
62#include <net/vnet.h>
63
64#include <netinet/in.h>
65#include <netinet/in_systm.h>
66#include <netinet/in_var.h>
67#include <netinet/if_ether.h>
68#include <netinet/ip.h>
69#ifdef INET6
70#include <netinet/ip6.h>
71#endif
72#include <netinet/tcp.h>
73#include <netinet/udp.h>
74#include <netinet/sctp.h>
75
76#include <libkern/jenkins.h>
77#include <ddb/ddb.h>
78
79struct ipv4_tuple {
80 uint16_t ip_sport; /* source port */
81 uint16_t ip_dport; /* destination port */
82 in_addr_t ip_saddr; /* source address */
83 in_addr_t ip_daddr; /* destination address */
84};
85
86union ipv4_flow {
87 struct ipv4_tuple ipf_ipt;
88 uint32_t ipf_key[3];
89};
90
91struct ipv6_tuple {
92 uint16_t ip_sport; /* source port */
93 uint16_t ip_dport; /* destination port */
94 struct in6_addr ip_saddr; /* source address */
95 struct in6_addr ip_daddr; /* destination address */
96};
97
98union ipv6_flow {
99 struct ipv6_tuple ipf_ipt;
100 uint32_t ipf_key[9];
101};
102
103struct flentry {
104 volatile uint32_t f_fhash; /* hash flowing forward */
105 uint16_t f_flags; /* flow flags */
106 uint8_t f_pad;
107 uint8_t f_proto; /* protocol */
108 uint32_t f_fibnum; /* fib index */
109 uint32_t f_uptime; /* uptime at last access */
110 struct flentry *f_next; /* pointer to collision entry */
111 volatile struct rtentry *f_rt; /* rtentry for flow */
112 volatile struct llentry *f_lle; /* llentry for flow */
113};
114
115struct flentry_v4 {
116 struct flentry fl_entry;
117 union ipv4_flow fl_flow;
118};
119
120struct flentry_v6 {
121 struct flentry fl_entry;
122 union ipv6_flow fl_flow;
123};
124
125#define fl_fhash fl_entry.fl_fhash
126#define fl_flags fl_entry.fl_flags
127#define fl_proto fl_entry.fl_proto
128#define fl_uptime fl_entry.fl_uptime
129#define fl_rt fl_entry.fl_rt
130#define fl_lle fl_entry.fl_lle
131
132#define SECS_PER_HOUR 3600
133#define SECS_PER_DAY (24*SECS_PER_HOUR)
134
135#define SYN_IDLE 300
136#define UDP_IDLE 300
137#define FIN_WAIT_IDLE 600
138#define TCP_IDLE SECS_PER_DAY
139
140
141typedef void fl_lock_t(struct flowtable *, uint32_t);
142typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);
143
144union flentryp {
145 struct flentry **global;
146 struct flentry **pcpu[MAXCPU];
147};
148
149struct flowtable_stats {
150 uint64_t ft_collisions;
151 uint64_t ft_allocated;
152 uint64_t ft_misses;
153 uint64_t ft_max_depth;
154 uint64_t ft_free_checks;
155 uint64_t ft_frees;
156 uint64_t ft_hits;
157 uint64_t ft_lookups;
158} __aligned(CACHE_LINE_SIZE);
159
160struct flowtable {
161 struct flowtable_stats ft_stats[MAXCPU];
162 int ft_size;
163 int ft_lock_count;
164 uint32_t ft_flags;
165 char *ft_name;
166 fl_lock_t *ft_lock;
167 fl_lock_t *ft_unlock;
168 fl_rtalloc_t *ft_rtalloc;
169 /*
170 * XXX need to pad out
171 */
172 struct mtx *ft_locks;
173 union flentryp ft_table;
174 bitstr_t *ft_masks[MAXCPU];
175 bitstr_t *ft_tmpmask;
176 struct flowtable *ft_next;
177
178 uint32_t ft_count __aligned(CACHE_LINE_SIZE);
179 uint32_t ft_udp_idle __aligned(CACHE_LINE_SIZE);
180 uint32_t ft_fin_wait_idle;
181 uint32_t ft_syn_idle;
182 uint32_t ft_tcp_idle;
183 boolean_t ft_full;
184} __aligned(CACHE_LINE_SIZE);
185
186static struct proc *flowcleanerproc;
-187 STATIC_VNET_DEFINE(struct flowtable *, flow_list_head);
-188 STATIC_VNET_DEFINE(uint32_t, flow_hashjitter);
-189 STATIC_VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
-190 STATIC_VNET_DEFINE(uma_zone_t, flow_ipv6_zone);
+187 static VNET_DEFINE(struct flowtable *, flow_list_head);
+188 static VNET_DEFINE(uint32_t, flow_hashjitter);
+189 static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
+190 static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);
191
192#define V_flow_list_head VNET(flow_list_head)
193#define V_flow_hashjitter VNET(flow_hashjitter)
194#define V_flow_ipv4_zone VNET(flow_ipv4_zone)
195#define V_flow_ipv6_zone VNET(flow_ipv6_zone)
196
197
198static struct cv flowclean_cv;
199static struct mtx flowclean_lock;
200static uint32_t flowclean_cycles;
201static uint32_t flowclean_freq;
202
203#ifdef FLOWTABLE_DEBUG
204#define FLDPRINTF(ft, flags, fmt, ...) \
205do { \
206 if ((ft)->ft_flags & (flags)) \
207 printf((fmt), __VA_ARGS__); \
208} while (0); \
209
210#else
211#define FLDPRINTF(ft, flags, fmt, ...)
212
213#endif
214
215
216/*
217 * TODO:
218 * - Make flowtable stats per-cpu, aggregated at sysctl call time,
219 * to avoid extra cache evictions caused by incrementing a shared
220 * counter
221 * - add sysctls to resize && flush flow tables
222 * - Add per flowtable sysctls for statistics and configuring timeouts
223 * - add saturation counter to rtentry to support per-packet load-balancing
224 * add flag to indicate round-robin flow, add list lookup from head
225 *   for flows
226 * - add sysctl / device node / syscall to support exporting and importing
227 * of flows with flag to indicate that a flow was imported so should
228 * not be considered for auto-cleaning
229 * - support explicit connection state (currently only ad-hoc for DSR)
230 * - idetach() cleanup for options VIMAGE builds.
231 */
232VNET_DEFINE(int, flowtable_enable) = 1;
-233 STATIC_VNET_DEFINE(int, flowtable_debug);
-234 STATIC_VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
-235 STATIC_VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
-236 STATIC_VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
-237 STATIC_VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
-238 STATIC_VNET_DEFINE(int, flowtable_nmbflows);
-239 STATIC_VNET_DEFINE(int, flowtable_ready) = 0;
+233 static VNET_DEFINE(int, flowtable_debug);
+234 static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
+235 static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
+236 static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
+237 static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
+238 static VNET_DEFINE(int, flowtable_nmbflows);
+239 static VNET_DEFINE(int, flowtable_ready) = 0;
240
241#define V_flowtable_enable VNET(flowtable_enable)
242#define V_flowtable_debug VNET(flowtable_debug)
243#define V_flowtable_syn_expire VNET(flowtable_syn_expire)
244#define V_flowtable_udp_expire VNET(flowtable_udp_expire)
245#define V_flowtable_fin_wait_expire VNET(flowtable_fin_wait_expire)
246#define V_flowtable_tcp_expire VNET(flowtable_tcp_expire)
247#define V_flowtable_nmbflows VNET(flowtable_nmbflows)
248#define V_flowtable_ready VNET(flowtable_ready)
249
250SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
251SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
252 &VNET_NAME(flowtable_debug), 0, "print debug info.");
253SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
254 &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
255
256/*
257 * XXX This does not end up updating timeouts at runtime
258 * and only reflects the value for the last table added :-/
259 */
260SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
261 &VNET_NAME(flowtable_syn_expire), 0,
262 "seconds after which to remove syn allocated flow.");
263SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
264 &VNET_NAME(flowtable_udp_expire), 0,
265 "seconds after which to remove flow allocated to UDP.");
266SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
267 &VNET_NAME(flowtable_fin_wait_expire), 0,
268 "seconds after which to remove a flow in FIN_WAIT.");
269SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
270 &VNET_NAME(flowtable_tcp_expire), 0,
271 "seconds after which to remove flow allocated to a TCP connection.");
272
273
274/*
275 * Maximum number of flows that can be allocated of a given type.
276 *
277 * The table is allocated at boot time (for the pure caching case
278 * there is no reason why this could not be changed at runtime)
279 * and thus (currently) needs to be set with a tunable.
280 */
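/*
 * Note that the handler below only accepts increases: raising the cap
 * merely lifts the UMA zone limits, while shrinking would require
 * evicting live flows, so smaller values are rejected with EINVAL.
 */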
281static int
282sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
283{
284 int error, newnmbflows;
285
286 newnmbflows = V_flowtable_nmbflows;
287 error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
288 if (error == 0 && req->newptr) {
289 if (newnmbflows > V_flowtable_nmbflows) {
290 V_flowtable_nmbflows = newnmbflows;
291 uma_zone_set_max(V_flow_ipv4_zone,
292 V_flowtable_nmbflows);
293 uma_zone_set_max(V_flow_ipv6_zone,
294 V_flowtable_nmbflows);
295 } else
296 error = EINVAL;
297 }
298 return (error);
299}
300SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
301 CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
302 "Maximum number of flows allowed");
303
304
305
306#define FS_PRINT(sb, field) sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field)
307
308static void
309fs_print(struct sbuf *sb, struct flowtable_stats *fs)
310{
311
312 FS_PRINT(sb, collisions);
313 FS_PRINT(sb, allocated);
314 FS_PRINT(sb, misses);
315 FS_PRINT(sb, max_depth);
316 FS_PRINT(sb, free_checks);
317 FS_PRINT(sb, frees);
318 FS_PRINT(sb, hits);
319 FS_PRINT(sb, lookups);
320}
321
322static void
323flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
324{
325 int i;
326 struct flowtable_stats fs, *pfs;
327
328 if (ft->ft_flags & FL_PCPU) {
329 bzero(&fs, sizeof(fs));
330 pfs = &fs;
331 CPU_FOREACH(i) {
332 pfs->ft_collisions += ft->ft_stats[i].ft_collisions;
333 pfs->ft_allocated += ft->ft_stats[i].ft_allocated;
334 pfs->ft_misses += ft->ft_stats[i].ft_misses;
335 pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
336 pfs->ft_frees += ft->ft_stats[i].ft_frees;
337 pfs->ft_hits += ft->ft_stats[i].ft_hits;
338 pfs->ft_lookups += ft->ft_stats[i].ft_lookups;
339 if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
340 pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
341 }
342 } else {
343 pfs = &ft->ft_stats[0];
344 }
345 fs_print(sb, pfs);
346}
347
348static int
349sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
350{
351 struct flowtable *ft;
352 struct sbuf *sb;
353 int error;
354
355 sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
356
357 ft = V_flow_list_head;
358 while (ft != NULL) {
359 sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
360 flowtable_show_stats(sb, ft);
361 ft = ft->ft_next;
362 }
363 sbuf_finish(sb);
364 error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
365 sbuf_delete(sb);
366
367 return (error);
368}
369SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
370 NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
371
372
373#ifndef RADIX_MPATH
374static void
375in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
376{
377
378 rtalloc_ign_fib(ro, 0, fibnum);
379}
380#endif
381
382static void
383flowtable_global_lock(struct flowtable *table, uint32_t hash)
384{
385 int lock_index = (hash)&(table->ft_lock_count - 1);
386
387 mtx_lock(&table->ft_locks[lock_index]);
388}
389
390static void
391flowtable_global_unlock(struct flowtable *table, uint32_t hash)
392{
393 int lock_index = (hash)&(table->ft_lock_count - 1);
394
395 mtx_unlock(&table->ft_locks[lock_index]);
396}
397
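/*
 * The per-cpu variants take no mutex: each CPU owns a private table
 * indexed by curcpu, so pinning the thread with critical_enter() /
 * critical_exit() is enough to keep a bucket consistent.
 */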
398static void
399flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
400{
401
402 critical_enter();
403}
404
405static void
406flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
407{
408
409 critical_exit();
410}
411
412#define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size)
413#define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
414#define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash))
415#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))
416
417#define FL_STALE (1<<8)
418#define FL_OVERWRITE (1<<10)
419
420void
421flow_invalidate(struct flentry *fle)
422{
423
424 fle->f_flags |= FL_STALE;
425}
426
427static __inline int
428proto_to_flags(uint8_t proto)
429{
430 int flag;
431
432 switch (proto) {
433 case IPPROTO_TCP:
434 flag = FL_TCP;
435 break;
436 case IPPROTO_SCTP:
437 flag = FL_SCTP;
438 break;
439 case IPPROTO_UDP:
440 flag = FL_UDP;
441 break;
442 default:
443 flag = 0;
444 break;
445 }
446
447 return (flag);
448}
449
450static __inline int
451flags_to_proto(int flags)
452{
453 int proto, protoflags;
454
455 protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
456 switch (protoflags) {
457 case FL_TCP:
458 proto = IPPROTO_TCP;
459 break;
460 case FL_SCTP:
461 proto = IPPROTO_SCTP;
462 break;
463 case FL_UDP:
464 proto = IPPROTO_UDP;
465 break;
466 default:
467 proto = 0;
468 break;
469 }
470 return (proto);
471}
472
473#ifdef INET
474#ifdef FLOWTABLE_DEBUG
475static void
476ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
477 struct sockaddr_in *dsin)
478{
479 char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
480
481 if (flags & FL_HASH_ALL) {
482 inet_ntoa_r(ssin->sin_addr, saddr);
483 inet_ntoa_r(dsin->sin_addr, daddr);
484 printf("proto=%d %s:%d->%s:%d\n",
485 proto, saddr, ntohs(ssin->sin_port), daddr,
486 ntohs(dsin->sin_port));
487 } else {
488 inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
489 printf("proto=%d %s\n", proto, daddr);
490 }
491
492}
493#endif
494
495static int
496ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
497 struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
498{
499 struct ip *ip;
500 uint8_t proto;
501 int iphlen;
502 struct tcphdr *th;
503 struct udphdr *uh;
504 struct sctphdr *sh;
505 uint16_t sport, dport;
506
507 proto = sport = dport = 0;
508 ip = mtod(m, struct ip *);
509 dsin->sin_family = AF_INET;
510 dsin->sin_len = sizeof(*dsin);
511 dsin->sin_addr = ip->ip_dst;
512 ssin->sin_family = AF_INET;
513 ssin->sin_len = sizeof(*ssin);
514 ssin->sin_addr = ip->ip_src;
515
516 proto = ip->ip_p;
517 if ((*flags & FL_HASH_ALL) == 0) {
518 FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
519 *flags);
520 goto skipports;
521 }
522
523 iphlen = ip->ip_hl << 2; /* XXX options? */
524
525 switch (proto) {
526 case IPPROTO_TCP:
527 th = (struct tcphdr *)((caddr_t)ip + iphlen);
528 sport = th->th_sport;
529 dport = th->th_dport;
530 if ((*flags & FL_HASH_ALL) &&
531 (th->th_flags & (TH_RST|TH_FIN)))
532 *flags |= FL_STALE;
533 break;
534 case IPPROTO_UDP:
535 uh = (struct udphdr *)((caddr_t)ip + iphlen);
536 sport = uh->uh_sport;
537 dport = uh->uh_dport;
538 break;
539 case IPPROTO_SCTP:
540 sh = (struct sctphdr *)((caddr_t)ip + iphlen);
541 sport = sh->src_port;
542 dport = sh->dest_port;
543 break;
544 default:
545 FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
546 return (ENOTSUP);
547 /* no port - hence not a protocol we care about */
548 break;
549
550 }
551
552skipports:
553 *flags |= proto_to_flags(proto);
554 ssin->sin_port = sport;
555 dsin->sin_port = dport;
556 return (0);
557}
558
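/*
 * IPv4 key layout, as packed below: key[0] carries the 16-bit source
 * and destination ports when FL_HASH_ALL is set, key[1] the source
 * address and key[2] the destination address.  Without FL_HASH_ALL
 * only key[2] is populated and the per-boot jitter plus the protocol
 * number is folded in as the jenkins hash offset instead.
 */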
559static uint32_t
560ipv4_flow_lookup_hash_internal(
561 struct sockaddr_in *ssin, struct sockaddr_in *dsin,
562 uint32_t *key, uint16_t flags)
563{
564 uint16_t sport, dport;
565 uint8_t proto;
566 int offset = 0;
567
568 if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
569 return (0);
570 proto = flags_to_proto(flags);
571 sport = dport = key[2] = key[1] = key[0] = 0;
572 if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
573 key[1] = ssin->sin_addr.s_addr;
574 sport = ssin->sin_port;
575 }
576 if (dsin != NULL) {
577 key[2] = dsin->sin_addr.s_addr;
578 dport = dsin->sin_port;
579 }
580 if (flags & FL_HASH_ALL) {
581 ((uint16_t *)key)[0] = sport;
582 ((uint16_t *)key)[1] = dport;
583 } else
584 offset = V_flow_hashjitter + proto;
585
586 return (jenkins_hashword(key, 3, offset));
587}
588
589static struct flentry *
590flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
591{
592 struct sockaddr_storage ssa, dsa;
593 uint16_t flags;
594 struct sockaddr_in *dsin, *ssin;
595
596 dsin = (struct sockaddr_in *)&dsa;
597 ssin = (struct sockaddr_in *)&ssa;
598 bzero(dsin, sizeof(*dsin));
599 bzero(ssin, sizeof(*ssin));
600 flags = ft->ft_flags;
601 if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
602 return (NULL);
603
604 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
605}
606
607void
608flow_to_route(struct flentry *fle, struct route *ro)
609{
610 uint32_t *hashkey = NULL;
611 struct sockaddr_in *sin;
612
613 sin = (struct sockaddr_in *)&ro->ro_dst;
614 sin->sin_family = AF_INET;
615 sin->sin_len = sizeof(*sin);
616 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
617 sin->sin_addr.s_addr = hashkey[2];
618 ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
619 ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
620}
621#endif /* INET */
622
623#ifdef INET6
624/*
625 * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
626 * then it sets p to point at the offset "len" in the mbuf. WARNING: the
627 * pointer might become stale after other pullups (but we never use it
628 * this way).
629 */
630#define PULLUP_TO(_len, p, T) \
631do { \
632 int x = (_len) + sizeof(T); \
633 if ((m)->m_len < x) { \
634 goto receive_failed; \
635 } \
636 p = (mtod(m, char *) + (_len)); \
637} while (0)
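/*
 * Note that despite the comment above this variant never calls
 * m_pullup(): if the header is not already contiguous the code jumps
 * to receive_failed and the packet is simply not cached.
 */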
638
639#define TCP(p) ((struct tcphdr *)(p))
640#define SCTP(p) ((struct sctphdr *)(p))
641#define UDP(p) ((struct udphdr *)(p))
642
643static int
644ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
645 struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
646{
647 struct ip6_hdr *ip6;
648 uint8_t proto;
649 int hlen;
650 uint16_t src_port, dst_port;
651 u_short offset;
652 void *ulp;
653
654 offset = hlen = src_port = dst_port = 0;
655 ulp = NULL;
656 ip6 = mtod(m, struct ip6_hdr *);
657 hlen = sizeof(struct ip6_hdr);
658 proto = ip6->ip6_nxt;
659
660 if ((*flags & FL_HASH_ALL) == 0)
661 goto skipports;
662
663 while (ulp == NULL) {
664 switch (proto) {
665 case IPPROTO_ICMPV6:
666 case IPPROTO_OSPFIGP:
667 case IPPROTO_PIM:
668 case IPPROTO_CARP:
669 case IPPROTO_ESP:
670 case IPPROTO_NONE:
671 ulp = ip6;
672 break;
673 case IPPROTO_TCP:
674 PULLUP_TO(hlen, ulp, struct tcphdr);
675 dst_port = TCP(ulp)->th_dport;
676 src_port = TCP(ulp)->th_sport;
677 if ((*flags & FL_HASH_ALL) &&
678 (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
679 *flags |= FL_STALE;
680 break;
681 case IPPROTO_SCTP:
682 PULLUP_TO(hlen, ulp, struct sctphdr);
683 src_port = SCTP(ulp)->src_port;
684 dst_port = SCTP(ulp)->dest_port;
685 break;
686 case IPPROTO_UDP:
687 PULLUP_TO(hlen, ulp, struct udphdr);
688 dst_port = UDP(ulp)->uh_dport;
689 src_port = UDP(ulp)->uh_sport;
690 break;
691 case IPPROTO_HOPOPTS: /* RFC 2460 */
692 PULLUP_TO(hlen, ulp, struct ip6_hbh);
693 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
694 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
695 ulp = NULL;
696 break;
697 case IPPROTO_ROUTING: /* RFC 2460 */
698 PULLUP_TO(hlen, ulp, struct ip6_rthdr);
699 hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
700 proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
701 ulp = NULL;
702 break;
703 case IPPROTO_FRAGMENT: /* RFC 2460 */
704 PULLUP_TO(hlen, ulp, struct ip6_frag);
705 hlen += sizeof (struct ip6_frag);
706 proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
707 offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
708 IP6F_OFF_MASK;
709 ulp = NULL;
710 break;
711 case IPPROTO_DSTOPTS: /* RFC 2460 */
712 PULLUP_TO(hlen, ulp, struct ip6_hbh);
713 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
714 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
715 ulp = NULL;
716 break;
717 case IPPROTO_AH: /* RFC 2402 */
718 PULLUP_TO(hlen, ulp, struct ip6_ext);
719 hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
720 proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
721 ulp = NULL;
722 break;
723 default:
724 PULLUP_TO(hlen, ulp, struct ip6_ext);
725 break;
726 }
727 }
728
729 if (src_port == 0) {
730 receive_failed:
731 return (ENOTSUP);
732 }
733
734skipports:
735 dsin6->sin6_family = AF_INET6;
736 dsin6->sin6_len = sizeof(*dsin6);
737 dsin6->sin6_port = dst_port;
738 memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
739
740 ssin6->sin6_family = AF_INET6;
741 ssin6->sin6_len = sizeof(*ssin6);
742 ssin6->sin6_port = src_port;
743 memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
744 *flags |= proto_to_flags(proto);
745
746 return (0);
747}
748
749#define zero_key(key) \
750do { \
751 key[0] = 0; \
752 key[1] = 0; \
753 key[2] = 0; \
754 key[3] = 0; \
755 key[4] = 0; \
756 key[5] = 0; \
757 key[6] = 0; \
758 key[7] = 0; \
759 key[8] = 0; \
760} while (0)
761
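/*
 * IPv6 key layout, as packed below: key[0] carries the ports when
 * FL_HASH_ALL is set, key[1]..key[4] the destination address and
 * key[5]..key[8] the source address.
 */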
762static uint32_t
763ipv6_flow_lookup_hash_internal(
764 struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6,
765 uint32_t *key, uint16_t flags)
766{
767 uint16_t sport, dport;
768 uint8_t proto;
769 int offset = 0;
770
771 if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
772 return (0);
773
774 proto = flags_to_proto(flags);
775 zero_key(key);
776 sport = dport = 0;
777 if (dsin6 != NULL) {
778 memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
779 dport = dsin6->sin6_port;
780 }
781 if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
782 memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
783 sport = ssin6->sin6_port;
784 }
785 if (flags & FL_HASH_ALL) {
786 ((uint16_t *)key)[0] = sport;
787 ((uint16_t *)key)[1] = dport;
788 } else
789 offset = V_flow_hashjitter + proto;
790
791 return (jenkins_hashword(key, 9, offset));
792}
793
794static struct flentry *
795flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
796{
797 struct sockaddr_storage ssa, dsa;
798 struct sockaddr_in6 *dsin6, *ssin6;
799 uint16_t flags;
800
801 dsin6 = (struct sockaddr_in6 *)&dsa;
802 ssin6 = (struct sockaddr_in6 *)&ssa;
803 bzero(dsin6, sizeof(*dsin6));
804 bzero(ssin6, sizeof(*ssin6));
805 flags = ft->ft_flags;
806
807 if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
808 return (NULL);
809
810 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
811}
812
813void
814flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
815{
816 uint32_t *hashkey = NULL;
817 struct sockaddr_in6 *sin6;
818
819 sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
820
821 sin6->sin6_family = AF_INET6;
822 sin6->sin6_len = sizeof(*sin6);
823 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
824 memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
825 ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
826 ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
827
828}
829#endif /* INET6 */
830
831static bitstr_t *
832flowtable_mask(struct flowtable *ft)
833{
834 bitstr_t *mask;
835
836 if (ft->ft_flags & FL_PCPU)
837 mask = ft->ft_masks[curcpu];
838 else
839 mask = ft->ft_masks[0];
840
841 return (mask);
842}
843
844static struct flentry **
845flowtable_entry(struct flowtable *ft, uint32_t hash)
846{
847 struct flentry **fle;
848 int index = (hash % ft->ft_size);
849
850 if (ft->ft_flags & FL_PCPU) {
851 KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
852 fle = &ft->ft_table.pcpu[curcpu][index];
853 } else {
854 KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
855 fle = &ft->ft_table.global[index];
856 }
857
858 return (fle);
859}
860
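/*
 * f_flags doubles as crude per-flow state: TH_SYN/TH_ACK/TH_FIN bits,
 * when present, select the idle timeout class below (none => UDP
 * timeout, SYN alone => half-open, SYN|ACK => established TCP,
 * FIN => FIN_WAIT).
 */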
861static int
862flow_stale(struct flowtable *ft, struct flentry *fle)
863{
864 time_t idle_time;
865
866 if ((fle->f_fhash == 0)
867 || ((fle->f_rt->rt_flags & RTF_HOST) &&
868 ((fle->f_rt->rt_flags & (RTF_UP))
869 != (RTF_UP)))
870 || (fle->f_rt->rt_ifp == NULL)
871 || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
872 return (1);
873
874 idle_time = time_uptime - fle->f_uptime;
875
876 if ((fle->f_flags & FL_STALE) ||
877 ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
878 && (idle_time > ft->ft_udp_idle)) ||
879 ((fle->f_flags & TH_FIN)
880 && (idle_time > ft->ft_fin_wait_idle)) ||
881 ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
882 && (idle_time > ft->ft_syn_idle)) ||
883 ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
884 && (idle_time > ft->ft_tcp_idle)) ||
885 ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
886 (fle->f_rt->rt_ifp == NULL)))
887 return (1);
888
889 return (0);
890}
891
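/*
 * XXX the v4/v6 casts below are transposed; this happens to be
 * harmless only because ipf_key sits at the same offset within
 * flentry_v4 and flentry_v6.
 */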
892static void
893flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
894{
895 uint32_t *hashkey;
896 int i, nwords;
897
898 if (fle->f_flags & FL_IPV6) {
899 nwords = 9;
900 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
901 } else {
902 nwords = 3;
903 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
904 }
905
906 for (i = 0; i < nwords; i++)
907 hashkey[i] = key[i];
908}
909
910static struct flentry *
911flow_alloc(struct flowtable *ft)
912{
913 struct flentry *newfle;
914 uma_zone_t zone;
915
916 newfle = NULL;
917 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
918
919 newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
920 if (newfle != NULL)
921 atomic_add_int(&ft->ft_count, 1);
922 return (newfle);
923}
924
925static void
926flow_free(struct flentry *fle, struct flowtable *ft)
927{
928 uma_zone_t zone;
929
930 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
931 atomic_add_int(&ft->ft_count, -1);
932 uma_zfree(zone, fle);
933}
934
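/*
 * Hysteresis on the full flag: with nmbflows = N the flag is set once
 * the count exceeds N - N/32 (~97% of the limit) and cleared only
 * after it falls back below N - N/8 (~87.5%), so it cannot flap
 * around a single threshold.  Transitions also retune the cleaner
 * wakeup frequency and, for transmit-cache tables, the idle timeouts.
 */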
935static int
936flow_full(struct flowtable *ft)
937{
938 boolean_t full;
939 uint32_t count;
940
941 full = ft->ft_full;
942 count = ft->ft_count;
943
944 if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
945 ft->ft_full = FALSE;
946 else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
947 ft->ft_full = TRUE;
948
949 if (full && !ft->ft_full) {
950 flowclean_freq = 4*hz;
951 if ((ft->ft_flags & FL_HASH_ALL) == 0)
952 ft->ft_udp_idle = ft->ft_fin_wait_idle =
953 ft->ft_syn_idle = ft->ft_tcp_idle = 5;
954 cv_broadcast(&flowclean_cv);
955 } else if (!full && ft->ft_full) {
956 flowclean_freq = 20*hz;
957 if ((ft->ft_flags & FL_HASH_ALL) == 0)
958 ft->ft_udp_idle = ft->ft_fin_wait_idle =
959 ft->ft_syn_idle = ft->ft_tcp_idle = 30;
960 }
961
962 return (ft->ft_full);
963}
964
965static int
966flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
967 uint32_t fibnum, struct route *ro, uint16_t flags)
968{
969 struct flentry *fle, *fletail, *newfle, **flep;
970 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
971 int depth;
972 bitstr_t *mask;
973 uint8_t proto;
974
975 newfle = flow_alloc(ft);
976 if (newfle == NULL)
977 return (ENOMEM);
978
979 newfle->f_flags |= (flags & FL_IPV6);
980 proto = flags_to_proto(flags);
981
982 FL_ENTRY_LOCK(ft, hash);
983 mask = flowtable_mask(ft);
984 flep = flowtable_entry(ft, hash);
985 fletail = fle = *flep;
986
987 if (fle == NULL) {
988 bit_set(mask, FL_ENTRY_INDEX(ft, hash));
989 *flep = fle = newfle;
990 goto skip;
991 }
992
993 depth = 0;
994 fs->ft_collisions++;
995 /*
996 * find end of list and make sure that we were not
997 * preempted by another thread handling this flow
998 */
999 while (fle != NULL) {
1000 if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
1001 /*
1002 * there was either a hash collision
1003 * or we lost a race to insert
1004 */
1005 FL_ENTRY_UNLOCK(ft, hash);
1006 flow_free(newfle, ft);
1007
1008 if (flags & FL_OVERWRITE)
1009 goto skip;
1010 return (EEXIST);
1011 }
1012 /*
1013 * re-visit this double condition XXX
1014 */
1015 if (fletail->f_next != NULL)
1016 fletail = fle->f_next;
1017
1018 depth++;
1019 fle = fle->f_next;
1020 }
1021
1022 if (depth > fs->ft_max_depth)
1023 fs->ft_max_depth = depth;
1024 fletail->f_next = newfle;
1025 fle = newfle;
1026skip:
1027 flowtable_set_hashkey(fle, key);
1028
1029 fle->f_proto = proto;
1030 fle->f_rt = ro->ro_rt;
1031 fle->f_lle = ro->ro_lle;
1032 fle->f_fhash = hash;
1033 fle->f_fibnum = fibnum;
1034 fle->f_uptime = time_uptime;
1035 FL_ENTRY_UNLOCK(ft, hash);
1036 return (0);
1037}
1038
1039int
1040kern_flowtable_insert(struct flowtable *ft,
1041 struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
1042 struct route *ro, uint32_t fibnum, int flags)
1043{
1044 uint32_t key[9], hash;
1045
1046 flags = (ft->ft_flags | flags | FL_OVERWRITE);
1047 hash = 0;
1048
1049#ifdef INET
1050 if (ssa->ss_family == AF_INET)
1051 hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
1052 (struct sockaddr_in *)dsa, key, flags);
1053#endif
1054#ifdef INET6
1055 if (ssa->ss_family == AF_INET6)
1056 hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
1057 (struct sockaddr_in6 *)dsa, key, flags);
1058#endif
1059 if (ro->ro_rt == NULL || ro->ro_lle == NULL)
1060 return (EINVAL);
1061
1062 FLDPRINTF(ft, FL_DEBUG,
1063 "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
1064 key[0], key[1], key[2], hash, fibnum, flags);
1065 return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
1066}
1067
1068static int
1069flowtable_key_equal(struct flentry *fle, uint32_t *key)
1070{
1071 uint32_t *hashkey;
1072 int i, nwords;
1073
1074 if (fle->f_flags & FL_IPV6) {
1075 nwords = 9;
1076 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1077 } else {
1078 nwords = 3;
1079 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1080 }
1081
1082 for (i = 0; i < nwords; i++)
1083 if (hashkey[i] != key[i])
1084 return (0);
1085
1086 return (1);
1087}
1088
1089struct flentry *
1090flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
1091{
1092 struct flentry *fle = NULL;
1093
1094#ifdef INET
1095 if (af == AF_INET)
1096 fle = flowtable_lookup_mbuf4(ft, m);
1097#endif
1098#ifdef INET6
1099 if (af == AF_INET6)
1100 fle = flowtable_lookup_mbuf6(ft, m);
1101#endif
1102 if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
1103 m->m_flags |= M_FLOWID;
1104 m->m_pkthdr.flowid = fle->f_fhash;
1105 }
1106 return (fle);
1107}
1108
1109struct flentry *
1110flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
1111 struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
1112{
1113 uint32_t key[9], hash;
1114 struct flentry *fle;
1115 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
1116 uint8_t proto = 0;
1117 int error = 0;
1118 struct rtentry *rt;
1119 struct llentry *lle;
1120 struct route sro, *ro;
1121 struct route_in6 sro6;
1122
1123 sro.ro_rt = sro6.ro_rt = NULL;
1124 sro.ro_lle = sro6.ro_lle = NULL;
1125 ro = NULL;
1126 hash = 0;
1127 flags |= ft->ft_flags;
1128 proto = flags_to_proto(flags);
1129#ifdef INET
1130 if (ssa->ss_family == AF_INET) {
1131 struct sockaddr_in *ssin, *dsin;
1132
1133 ro = &sro;
1134 memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
1135 /*
1136 * The harvested source and destination addresses
1137 * may contain port information if the packet is
1138 * from a transport protocol (e.g. TCP/UDP). The
1139 * port field must be cleared before performing
1140 * a route lookup.
1141 */
1142 ((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
1143 dsin = (struct sockaddr_in *)dsa;
1144 ssin = (struct sockaddr_in *)ssa;
1145 if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
1146 (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1147 (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
1148 return (NULL);
1149
1150 hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
1151 }
1152#endif
1153#ifdef INET6
1154 if (ssa->ss_family == AF_INET6) {
1155 struct sockaddr_in6 *ssin6, *dsin6;
1156
1157 ro = (struct route *)&sro6;
1158 memcpy(&sro6.ro_dst, dsa,
1159 sizeof(struct sockaddr_in6));
1160 ((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0;
1161 dsin6 = (struct sockaddr_in6 *)dsa;
1162 ssin6 = (struct sockaddr_in6 *)ssa;
1163
1164 flags |= FL_IPV6;
1165 hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
1166 }
1167#endif
1168 /*
1169 * Ports are zero and this isn't a transmit cache
1170 * - thus not a protocol for which we need to keep
1171 * state
1172 * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
1173 */
1174 if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
1175 return (NULL);
1176
1177 fs->ft_lookups++;
1178 FL_ENTRY_LOCK(ft, hash);
1179 if ((fle = FL_ENTRY(ft, hash)) == NULL) {
1180 FL_ENTRY_UNLOCK(ft, hash);
1181 goto uncached;
1182 }
1183keycheck:
1184 rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1185 lle = __DEVOLATILE(struct llentry *, fle->f_lle);
1186 if ((rt != NULL)
1187 && fle->f_fhash == hash
1188 && flowtable_key_equal(fle, key)
1189 && (proto == fle->f_proto)
1190 && (fibnum == fle->f_fibnum)
1191 && (rt->rt_flags & RTF_UP)
1192 && (rt->rt_ifp != NULL)) {
1193 fs->ft_hits++;
1194 fle->f_uptime = time_uptime;
1195 fle->f_flags |= flags;
1196 FL_ENTRY_UNLOCK(ft, hash);
1197 return (fle);
1198 } else if (fle->f_next != NULL) {
1199 fle = fle->f_next;
1200 goto keycheck;
1201 }
1202 FL_ENTRY_UNLOCK(ft, hash);
1203uncached:
1204 if (flags & FL_NOAUTO || flow_full(ft))
1205 return (NULL);
1206
1207 fs->ft_misses++;
1208 /*
1209 * This bit of code ends up locking the
1210 * same route 3 times (just like ip_output + ether_output)
1211 * - at lookup
1212 * - in rt_check when called by arpresolve
1213 * - dropping the refcount for the rtentry
1214 *
1215 * This could be consolidated to one if we wrote a variant
1216 * of arpresolve with an rt_check variant that expected to
1217 * receive the route locked
1218 */
1219
1220#ifdef INVARIANTS
1221 if ((ro->ro_dst.sa_family != AF_INET) &&
1222 (ro->ro_dst.sa_family != AF_INET6))
1223 panic("sa_family == %d\n", ro->ro_dst.sa_family);
1224#endif
1225
1226 ft->ft_rtalloc(ro, hash, fibnum);
1227 if (ro->ro_rt == NULL)
1228 error = ENETUNREACH;
1229 else {
1230 struct llentry *lle = NULL;
1231 struct sockaddr_storage *l3addr;
1232 struct rtentry *rt = ro->ro_rt;
1233 struct ifnet *ifp = rt->rt_ifp;
1234
1235 if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
1236 RTFREE(rt);
1237 ro->ro_rt = NULL;
1238 return (NULL);
1239 }
1240#ifdef INET6
1241 if (ssa->ss_family == AF_INET6) {
1242 struct sockaddr_in6 *dsin6;
1243
1244 dsin6 = (struct sockaddr_in6 *)dsa;
1245 if (in6_localaddr(&dsin6->sin6_addr)) {
1246 RTFREE(rt);
1247 ro->ro_rt = NULL;
1248 return (NULL);
1249 }
1250
1251 if (rt->rt_flags & RTF_GATEWAY)
1252 l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1253
1254 else
1255 l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1256 llentry_update(&lle, LLTABLE6(ifp), l3addr, ifp);
1257 }
1258#endif
1259#ifdef INET
1260 if (ssa->ss_family == AF_INET) {
1261 if (rt->rt_flags & RTF_GATEWAY)
1262 l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1263 else
1264 l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1265 llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);
1266 }
1267
1268#endif
1269 ro->ro_lle = lle;
1270
1271 if (lle == NULL) {
1272 RTFREE(rt);
1273 ro->ro_rt = NULL;
1274 return (NULL);
1275 }
1276 error = flowtable_insert(ft, hash, key, fibnum, ro, flags);
1277
1278 if (error) {
1279 RTFREE(rt);
1280 LLE_FREE(lle);
1281 ro->ro_rt = NULL;
1282 ro->ro_lle = NULL;
1283 }
1284 }
1285
1286 return ((error) ? NULL : fle);
1287}
1288
1289/*
1290 * used by the bit_alloc macro, which expands to a calloc() call
1291 */
1292#define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
1293
1294struct flowtable *
1295flowtable_alloc(char *name, int nentry, int flags)
1296{
1297 struct flowtable *ft, *fttail;
1298 int i;
1299
1300 if (V_flow_hashjitter == 0)
1301 V_flow_hashjitter = arc4random();
1302
1303 KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));
1304
1305 ft = malloc(sizeof(struct flowtable),
1306 M_RTABLE, M_WAITOK | M_ZERO);
1307
1308 ft->ft_name = name;
1309 ft->ft_flags = flags;
1310 ft->ft_size = nentry;
1311#ifdef RADIX_MPATH
1312 ft->ft_rtalloc = rtalloc_mpath_fib;
1313#else
1314 ft->ft_rtalloc = in_rtalloc_ign_wrapper;
1315#endif
1316 if (flags & FL_PCPU) {
1317 ft->ft_lock = flowtable_pcpu_lock;
1318 ft->ft_unlock = flowtable_pcpu_unlock;
1319
1320 for (i = 0; i <= mp_maxid; i++) {
1321 ft->ft_table.pcpu[i] =
1322 malloc(nentry*sizeof(struct flentry *),
1323 M_RTABLE, M_WAITOK | M_ZERO);
1324 ft->ft_masks[i] = bit_alloc(nentry);
1325 }
1326 } else {
1327 ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
1328 (fls(mp_maxid + 1) << 1));
1329
1330 ft->ft_lock = flowtable_global_lock;
1331 ft->ft_unlock = flowtable_global_unlock;
1332 ft->ft_table.global =
1333 malloc(nentry*sizeof(struct flentry *),
1334 M_RTABLE, M_WAITOK | M_ZERO);
1335 ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
1336 M_RTABLE, M_WAITOK | M_ZERO);
1337 for (i = 0; i < ft->ft_lock_count; i++)
1338 mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);
1339
1340 ft->ft_masks[0] = bit_alloc(nentry);
1341 }
1342 ft->ft_tmpmask = bit_alloc(nentry);
1343
1344 /*
1345 * In the local transmit case the table truly is
1346 * just a cache - so everything is eligible for
1347 * replacement after 5s of non-use
1348 */
1349 if (flags & FL_HASH_ALL) {
1350 ft->ft_udp_idle = V_flowtable_udp_expire;
1351 ft->ft_syn_idle = V_flowtable_syn_expire;
1352 ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
1353 ft->ft_tcp_idle = V_flowtable_fin_wait_expire;
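		/* XXX: should this not be V_flowtable_tcp_expire? */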
1354 } else {
1355 ft->ft_udp_idle = ft->ft_fin_wait_idle =
1356 ft->ft_syn_idle = ft->ft_tcp_idle = 30;
1357
1358 }
1359
1360 /*
1361 * hook in to the cleaner list
1362 */
1363 if (V_flow_list_head == NULL)
1364 V_flow_list_head = ft;
1365 else {
1366 fttail = V_flow_list_head;
1367 while (fttail->ft_next != NULL)
1368 fttail = fttail->ft_next;
1369 fttail->ft_next = ft;
1370 }
1371
1372 return (ft);
1373}
1374
1375/*
1376 * The rest of the code is devoted to garbage collection of expired entries.
1377 * It is a new addition made necessary by the switch to dynamically allocating
1378 * flow tables.
1379 *
1380 */
1381static void
1382fle_free(struct flentry *fle, struct flowtable *ft)
1383{
1384 struct rtentry *rt;
1385 struct llentry *lle;
1386
1387 rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1388 lle = __DEVOLATILE(struct llentry *, fle->f_lle);
1389 if (rt != NULL)
1390 RTFREE(rt);
1391 if (lle != NULL)
1392 LLE_FREE(lle);
1393 flow_free(fle, ft);
1394}
1395
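/*
 * Stale entries are unlinked under the bucket lock but collected on a
 * private list and only freed afterwards: for per-cpu tables the
 * "lock" is a critical section, inside which the locks taken by
 * RTFREE() and LLE_FREE() must not be acquired.
 */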
1396static void
1397flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
1398{
1399 int curbit = 0, count;
1400 struct flentry *fle, **flehead, *fleprev;
1401 struct flentry *flefreehead, *flefreetail, *fletmp;
1402 bitstr_t *mask, *tmpmask;
1403 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
1404
1405 flefreehead = flefreetail = NULL;
1406 mask = flowtable_mask(ft);
1407 tmpmask = ft->ft_tmpmask;
1408 memcpy(tmpmask, mask, ft->ft_size/8);
1409 /*
1410 * XXX Note to self, bit_ffs operates at the byte level
1411 * and thus adds gratuitous overhead
1412 */
1413 bit_ffs(tmpmask, ft->ft_size, &curbit);
1414 while (curbit != -1) {
1415 if (curbit >= ft->ft_size || curbit < -1) {
1416 log(LOG_ALERT,
1417 "warning: bad curbit value %d \n",
1418 curbit);
1419 break;
1420 }
1421
1422 FL_ENTRY_LOCK(ft, curbit);
1423 flehead = flowtable_entry(ft, curbit);
1424 fle = fleprev = *flehead;
1425
1426 fs->ft_free_checks++;
1427#ifdef DIAGNOSTIC
1428 if (fle == NULL && curbit > 0) {
1429 log(LOG_ALERT,
1430 "warning bit=%d set, but no fle found\n",
1431 curbit);
1432 }
1433#endif
1434 while (fle != NULL) {
1435 if (rt != NULL) {
1436 if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
1437 fleprev = fle;
1438 fle = fle->f_next;
1439 continue;
1440 }
1441 } else if (!flow_stale(ft, fle)) {
1442 fleprev = fle;
1443 fle = fle->f_next;
1444 continue;
1445 }
1446 /*
1447 * delete head of the list
1448 */
1449 if (fleprev == *flehead) {
1450 fletmp = fleprev;
1451 if (fle == fleprev) {
1452 fleprev = *flehead = fle->f_next;
1453 } else
1454 fleprev = *flehead = fle;
1455 fle = fle->f_next;
1456 } else {
1457 /*
1458 * don't advance fleprev
1459 */
1460 fletmp = fle;
1461 fleprev->f_next = fle->f_next;
1462 fle = fleprev->f_next;
1463 }
1464
1465 if (flefreehead == NULL)
1466 flefreehead = flefreetail = fletmp;
1467 else {
1468 flefreetail->f_next = fletmp;
1469 flefreetail = fletmp;
1470 }
1471 fletmp->f_next = NULL;
1472 }
1473 if (*flehead == NULL)
1474 bit_clear(mask, curbit);
1475 FL_ENTRY_UNLOCK(ft, curbit);
1476 bit_clear(tmpmask, curbit);
1477 bit_ffs(tmpmask, ft->ft_size, &curbit);
1478 }
1479 count = 0;
1480 while ((fle = flefreehead) != NULL) {
1481 flefreehead = fle->f_next;
1482 count++;
1483 fs->ft_frees++;
1484 fle_free(fle, ft);
1485 }
1486 if (V_flowtable_debug && count)
1487 log(LOG_DEBUG, "freed %d flow entries\n", count);
1488}
1489
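/*
 * For per-cpu tables a cleaning pass cannot touch another CPU's
 * private buckets directly; the thread is instead bound to each CPU
 * in turn (sched_bind) so that flowtable_free_stale() always runs on
 * the CPU owning the table it scans.
 */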
1490void
1491flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
1492{
1493 int i;
1494
1495 if (ft->ft_flags & FL_PCPU) {
1496 CPU_FOREACH(i) {
1497 if (smp_started == 1) {
1498 thread_lock(curthread);
1499 sched_bind(curthread, i);
1500 thread_unlock(curthread);
1501 }
1502
1503 flowtable_free_stale(ft, rt);
1504
1505 if (smp_started == 1) {
1506 thread_lock(curthread);
1507 sched_unbind(curthread);
1508 thread_unlock(curthread);
1509 }
1510 }
1511 } else {
1512 flowtable_free_stale(ft, rt);
1513 }
1514}
1515
1516static void
1517flowtable_clean_vnet(void)
1518{
1519 struct flowtable *ft;
1520 int i;
1521
1522 ft = V_flow_list_head;
1523 while (ft != NULL) {
1524 if (ft->ft_flags & FL_PCPU) {
1525 CPU_FOREACH(i) {
1526 if (smp_started == 1) {
1527 thread_lock(curthread);
1528 sched_bind(curthread, i);
1529 thread_unlock(curthread);
1530 }
1531
1532 flowtable_free_stale(ft, NULL);
1533
1534 if (smp_started == 1) {
1535 thread_lock(curthread);
1536 sched_unbind(curthread);
1537 thread_unlock(curthread);
1538 }
1539 }
1540 } else {
1541 flowtable_free_stale(ft, NULL);
1542 }
1543 ft = ft->ft_next;
1544 }
1545}
1546
1547static void
1548flowtable_cleaner(void)
1549{
1550 VNET_ITERATOR_DECL(vnet_iter);
1551
1552 if (bootverbose)
1553 log(LOG_INFO, "flowtable cleaner started\n");
1554 while (1) {
1555 VNET_LIST_RLOCK();
1556 VNET_FOREACH(vnet_iter) {
1557 CURVNET_SET(vnet_iter);
1558 flowtable_clean_vnet();
1559 CURVNET_RESTORE();
1560 }
1561 VNET_LIST_RUNLOCK();
1562
1563 flowclean_cycles++;
1564 /*
1565 * The 10 second interval between cleaning checks
1566 * is arbitrary
1567 */
1568 mtx_lock(&flowclean_lock);
1569 cv_broadcast(&flowclean_cv);
1570 cv_timedwait(&flowclean_cv, &flowclean_lock, flowclean_freq);
1571 mtx_unlock(&flowclean_lock);
1572 }
1573}
1574
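/*
 * Wait for one complete cleaning cycle: record the current cycle
 * count, wake the cleaner and sleep until the counter advances.
 * Registered below on ifnet departure so that flows referencing a
 * vanishing interface are purged before it goes away.
 */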
1575static void
1576flowtable_flush(void *unused __unused)
1577{
1578 uint64_t start;
1579
1580 mtx_lock(&flowclean_lock);
1581 start = flowclean_cycles;
1582 while (start == flowclean_cycles) {
1583 cv_broadcast(&flowclean_cv);
1584 cv_wait(&flowclean_cv, &flowclean_lock);
1585 }
1586 mtx_unlock(&flowclean_lock);
1587}
1588
1589static struct kproc_desc flow_kp = {
1590 "flowcleaner",
1591 flowtable_cleaner,
1592 &flowcleanerproc
1593};
1594SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
1595
1596static void
1597flowtable_init_vnet(const void *unused __unused)
1598{
1599
1600 V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus;
1601 V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
1602 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
1603 V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
1604 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
1605 uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
1606 uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
1607 V_flowtable_ready = 1;
1608}
1609VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY,
1610 flowtable_init_vnet, NULL);
1611
1612static void
1613flowtable_init(const void *unused __unused)
1614{
1615
1616 cv_init(&flowclean_cv, "flowcleanwait");
1617 mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
1618 EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
1619 EVENTHANDLER_PRI_ANY);
1620 flowclean_freq = 20*hz;
1621}
1622SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST,
1623 flowtable_init, NULL);
1624
1625
1626#ifdef VIMAGE
1627static void
1628flowtable_uninit(const void *unused __unused)
1629{
1630
1631 V_flowtable_ready = 0;
1632 uma_zdestroy(V_flow_ipv4_zone);
1633 uma_zdestroy(V_flow_ipv6_zone);
1634}
1635
1636VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
1637 flowtable_uninit, NULL);
1638#endif
1639
1640#ifdef DDB
1641static uint32_t *
1642flowtable_get_hashkey(struct flentry *fle)
1643{
1644 uint32_t *hashkey;
1645
1646 if (fle->f_flags & FL_IPV6)
1647 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1648 else
1649 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1650
1651 return (hashkey);
1652}
1653
1654static bitstr_t *
1655flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1656{
1657 bitstr_t *mask;
1658
1659 if (ft->ft_flags & FL_PCPU)
1660 mask = ft->ft_masks[cpuid];
1661 else
1662 mask = ft->ft_masks[0];
1663
1664 return (mask);
1665}
1666
1667static struct flentry **
1668flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1669{
1670 struct flentry **fle;
1671 int index = (hash % ft->ft_size);
1672
1673 if (ft->ft_flags & FL_PCPU) {
1674 fle = &ft->ft_table.pcpu[cpuid][index];
1675 } else {
1676 fle = &ft->ft_table.global[index];
1677 }
1678
1679 return (fle);
1680}
1681
1682static void
1683flow_show(struct flowtable *ft, struct flentry *fle)
1684{
1685 int idle_time;
1686 int rt_valid, ifp_valid;
1687 uint16_t sport, dport;
1688 uint32_t *hashkey;
1689 char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
1690 volatile struct rtentry *rt;
1691 struct ifnet *ifp = NULL;
1692
1693 idle_time = (int)(time_uptime - fle->f_uptime);
1694 rt = fle->f_rt;
1695 rt_valid = rt != NULL;
1696 if (rt_valid)
1697 ifp = rt->rt_ifp;
1698 ifp_valid = ifp != NULL;
1699 hashkey = flowtable_get_hashkey(fle);
1700 if (fle->f_flags & FL_IPV6)
1701 goto skipaddr;
1702
1703 inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
1704 if (ft->ft_flags & FL_HASH_ALL) {
1705 inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
1706 sport = ntohs(((uint16_t *)hashkey)[0]);
1707 dport = ntohs(((uint16_t *)hashkey)[1]);
1708 db_printf("%s:%d->%s:%d",
1709 saddr, sport, daddr,
1710 dport);
1711 } else
1712 db_printf("%s ", daddr);
1713
1714skipaddr:
1715 if (fle->f_flags & FL_STALE)
1716 db_printf(" FL_STALE ");
1717 if (fle->f_flags & FL_TCP)
1718 db_printf(" FL_TCP ");
1719 if (fle->f_flags & FL_UDP)
1720 db_printf(" FL_UDP ");
1721 if (rt_valid) {
1722 if (rt->rt_flags & RTF_UP)
1723 db_printf(" RTF_UP ");
1724 }
1725 if (ifp_valid) {
1726 if (ifp->if_flags & IFF_LOOPBACK)
1727 db_printf(" IFF_LOOPBACK ");
1728 if (ifp->if_flags & IFF_UP)
1729 db_printf(" IFF_UP ");
1730 if (ifp->if_flags & IFF_POINTOPOINT)
1731 db_printf(" IFF_POINTOPOINT ");
1732 }
1733 if (fle->f_flags & FL_IPV6)
1734 db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
1735 hashkey[0], hashkey[1], hashkey[2],
1736 hashkey[3], hashkey[4], hashkey[5],
1737 hashkey[6], hashkey[7], hashkey[8]);
1738 else
1739 db_printf("\n\tkey=%08x:%08x:%08x ",
1740 hashkey[0], hashkey[1], hashkey[2]);
1741 db_printf("hash=%08x idle_time=%03d"
1742 "\n\tfibnum=%02d rt=%p",
1743 fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
1744 db_printf("\n");
1745}
1746
1747static void
1748flowtable_show(struct flowtable *ft, int cpuid)
1749{
1750 int curbit = 0;
1751 struct flentry *fle, **flehead;
1752 bitstr_t *mask, *tmpmask;
1753
1754 if (cpuid != -1)
1755 db_printf("cpu: %d\n", cpuid);
1756 mask = flowtable_mask_pcpu(ft, cpuid);
1757 tmpmask = ft->ft_tmpmask;
1758 memcpy(tmpmask, mask, ft->ft_size/8);
1759 /*
1760 * XXX Note to self, bit_ffs operates at the byte level
1761 * and thus adds gratuitous overhead
1762 */
1763 bit_ffs(tmpmask, ft->ft_size, &curbit);
1764 while (curbit != -1) {
1765 if (curbit >= ft->ft_size || curbit < -1) {
1766 db_printf("warning: bad curbit value %d \n",
1767 curbit);
1768 break;
1769 }
1770
1771 flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
1772 fle = *flehead;
1773
1774 while (fle != NULL) {
1775 flow_show(ft, fle);
1776 fle = fle->f_next;
1777 continue;
1778 }
1779 bit_clear(tmpmask, curbit);
1780 bit_ffs(tmpmask, ft->ft_size, &curbit);
1781 }
1782}
1783
1784static void
1785flowtable_show_vnet(void)
1786{
1787 struct flowtable *ft;
1788 int i;
1789
1790 ft = V_flow_list_head;
1791 while (ft != NULL) {
1792 printf("name: %s\n", ft->ft_name);
1793 if (ft->ft_flags & FL_PCPU) {
1794 CPU_FOREACH(i) {
1795 flowtable_show(ft, i);
1796 }
1797 } else {
1798 flowtable_show(ft, -1);
1799 }
1800 ft = ft->ft_next;
1801 }
1802}
1803
1804DB_SHOW_COMMAND(flowtables, db_show_flowtables)
1805{
1806 VNET_ITERATOR_DECL(vnet_iter);
1807
1808 VNET_FOREACH(vnet_iter) {
1809 CURVNET_SET(vnet_iter);
1810 flowtable_show_vnet();
1811 CURVNET_RESTORE();
1812 }
1813}
1814#endif
240
241#define V_flowtable_enable VNET(flowtable_enable)
242#define V_flowtable_debug VNET(flowtable_debug)
243#define V_flowtable_syn_expire VNET(flowtable_syn_expire)
244#define V_flowtable_udp_expire VNET(flowtable_udp_expire)
245#define V_flowtable_fin_wait_expire VNET(flowtable_fin_wait_expire)
246#define V_flowtable_tcp_expire VNET(flowtable_tcp_expire)
247#define V_flowtable_nmbflows VNET(flowtable_nmbflows)
248#define V_flowtable_ready VNET(flowtable_ready)
249
250SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
251SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
252 &VNET_NAME(flowtable_debug), 0, "print debug info.");
253SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
254 &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
255
256/*
257 * XXX These sysctls do not update timeouts at runtime; each value is
258 * copied into a table at allocation, affecting only tables added later :-/
259 */
260SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
261 &VNET_NAME(flowtable_syn_expire), 0,
262 "seconds after which to remove syn allocated flow.");
263SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
264 &VNET_NAME(flowtable_udp_expire), 0,
265 "seconds after which to remove flow allocated to UDP.");
266SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
267 &VNET_NAME(flowtable_fin_wait_expire), 0,
268 "seconds after which to remove a flow in FIN_WAIT.");
269SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
270 &VNET_NAME(flowtable_tcp_expire), 0,
271 "seconds after which to remove flow allocated to a TCP connection.");
272
273
274/*
275 * Maximum number of flows that can be allocated of a given type.
276 *
277 * The table is allocated at boot time (for the pure caching case
278 * there is no reason why this could not be changed at runtime)
279 * and thus (currently) needs to be set with a tunable.
280 */
281static int
282sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
283{
284 int error, newnmbflows;
285
286 newnmbflows = V_flowtable_nmbflows;
287 error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
288 if (error == 0 && req->newptr) {
289 if (newnmbflows > V_flowtable_nmbflows) {
290 V_flowtable_nmbflows = newnmbflows;
291 uma_zone_set_max(V_flow_ipv4_zone,
292 V_flowtable_nmbflows);
293 uma_zone_set_max(V_flow_ipv6_zone,
294 V_flowtable_nmbflows);
295 } else
296 error = EINVAL;
297 }
298 return (error);
299}
300SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
301 CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
302 "Maximum number of flows allowed");
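/*
 * Userland example (illustrative only; excluded from the kernel build):
 * reading and raising net.inet.flowtable.nmbflows with sysctlbyname(3).
 * Note that the handler above only accepts increases - shrinking the
 * limit returns EINVAL.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
	int nmbflows;
	size_t len = sizeof(nmbflows);

	if (sysctlbyname("net.inet.flowtable.nmbflows", &nmbflows, &len,
	    NULL, 0) == -1)
		err(1, "read nmbflows");
	printf("current limit: %d\n", nmbflows);

	nmbflows *= 2;		/* growing the limit is permitted */
	if (sysctlbyname("net.inet.flowtable.nmbflows", NULL, NULL,
	    &nmbflows, sizeof(nmbflows)) == -1)
		err(1, "raise nmbflows");	/* a decrease gets EINVAL */
	return (0);
}
#endif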
303
304
305
306#define FS_PRINT(sb, field) sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field)
307
308static void
309fs_print(struct sbuf *sb, struct flowtable_stats *fs)
310{
311
312 FS_PRINT(sb, collisions);
313 FS_PRINT(sb, allocated);
314 FS_PRINT(sb, misses);
315 FS_PRINT(sb, max_depth);
316 FS_PRINT(sb, free_checks);
317 FS_PRINT(sb, frees);
318 FS_PRINT(sb, hits);
319 FS_PRINT(sb, lookups);
320}
321
322static void
323flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
324{
325 int i;
326 struct flowtable_stats fs, *pfs;
327
328 if (ft->ft_flags & FL_PCPU) {
329 bzero(&fs, sizeof(fs));
330 pfs = &fs;
331 CPU_FOREACH(i) {
332 pfs->ft_collisions += ft->ft_stats[i].ft_collisions;
333 pfs->ft_allocated += ft->ft_stats[i].ft_allocated;
334 pfs->ft_misses += ft->ft_stats[i].ft_misses;
335 pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
336 pfs->ft_frees += ft->ft_stats[i].ft_frees;
337 pfs->ft_hits += ft->ft_stats[i].ft_hits;
338 pfs->ft_lookups += ft->ft_stats[i].ft_lookups;
339 if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
340 pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
341 }
342 } else {
343 pfs = &ft->ft_stats[0];
344 }
345 fs_print(sb, pfs);
346}
347
348static int
349sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
350{
351 struct flowtable *ft;
352 struct sbuf *sb;
353 int error;
354
355 sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
356
357 ft = V_flow_list_head;
358 while (ft != NULL) {
359 sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
360 flowtable_show_stats(sb, ft);
361 ft = ft->ft_next;
362 }
363 sbuf_finish(sb);
364 error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
365 sbuf_delete(sb);
366
367 return (error);
368}
369SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
370 NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
371
372
373#ifndef RADIX_MPATH
374static void
375in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
376{
377
378 rtalloc_ign_fib(ro, 0, fibnum);
379}
380#endif
381
382static void
383flowtable_global_lock(struct flowtable *table, uint32_t hash)
384{
385 int lock_index = (hash)&(table->ft_lock_count - 1);
386
387 mtx_lock(&table->ft_locks[lock_index]);
388}
389
390static void
391flowtable_global_unlock(struct flowtable *table, uint32_t hash)
392{
393 int lock_index = (hash)&(table->ft_lock_count - 1);
394
395 mtx_unlock(&table->ft_locks[lock_index]);
396}
397
398static void
399flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
400{
401
402 critical_enter();
403}
404
405static void
406flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
407{
408
409 critical_exit();
410}
411
412#define	FL_ENTRY_INDEX(table, hash)	((hash) % (table)->ft_size)
413#define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
414#define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash))
415#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))
416
417#define FL_STALE (1<<8)
418#define FL_OVERWRITE (1<<10)
419
420void
421flow_invalidate(struct flentry *fle)
422{
423
424 fle->f_flags |= FL_STALE;
425}
426
427static __inline int
428proto_to_flags(uint8_t proto)
429{
430 int flag;
431
432 switch (proto) {
433 case IPPROTO_TCP:
434 flag = FL_TCP;
435 break;
436 case IPPROTO_SCTP:
437 flag = FL_SCTP;
438 break;
439 case IPPROTO_UDP:
440 flag = FL_UDP;
441 break;
442 default:
443 flag = 0;
444 break;
445 }
446
447 return (flag);
448}
449
450static __inline int
451flags_to_proto(int flags)
452{
453 int proto, protoflags;
454
455 protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
456 switch (protoflags) {
457 case FL_TCP:
458 proto = IPPROTO_TCP;
459 break;
460 case FL_SCTP:
461 proto = IPPROTO_SCTP;
462 break;
463 case FL_UDP:
464 proto = IPPROTO_UDP;
465 break;
466 default:
467 proto = 0;
468 break;
469 }
470 return (proto);
471}
472
473#ifdef INET
474#ifdef FLOWTABLE_DEBUG
475static void
476ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
477 struct sockaddr_in *dsin)
478{
479 char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
480
481 if (flags & FL_HASH_ALL) {
482 inet_ntoa_r(ssin->sin_addr, saddr);
483 inet_ntoa_r(dsin->sin_addr, daddr);
484 printf("proto=%d %s:%d->%s:%d\n",
485 proto, saddr, ntohs(ssin->sin_port), daddr,
486 ntohs(dsin->sin_port));
487 } else {
488 inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
489 printf("proto=%d %s\n", proto, daddr);
490 }
491
492}
493#endif
494
495static int
496ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
497 struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
498{
499 struct ip *ip;
500 uint8_t proto;
501 int iphlen;
502 struct tcphdr *th;
503 struct udphdr *uh;
504 struct sctphdr *sh;
505 uint16_t sport, dport;
506
507 proto = sport = dport = 0;
508 ip = mtod(m, struct ip *);
509 dsin->sin_family = AF_INET;
510 dsin->sin_len = sizeof(*dsin);
511 dsin->sin_addr = ip->ip_dst;
512 ssin->sin_family = AF_INET;
513 ssin->sin_len = sizeof(*ssin);
514 ssin->sin_addr = ip->ip_src;
515
516 proto = ip->ip_p;
517 if ((*flags & FL_HASH_ALL) == 0) {
518 FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
519 *flags);
520 goto skipports;
521 }
522
523 iphlen = ip->ip_hl << 2; /* XXX options? */
524
525 switch (proto) {
526 case IPPROTO_TCP:
527 th = (struct tcphdr *)((caddr_t)ip + iphlen);
528 sport = th->th_sport;
529 dport = th->th_dport;
530 if ((*flags & FL_HASH_ALL) &&
531 (th->th_flags & (TH_RST|TH_FIN)))
532 *flags |= FL_STALE;
533 break;
534 case IPPROTO_UDP:
535 uh = (struct udphdr *)((caddr_t)ip + iphlen);
536 sport = uh->uh_sport;
537 dport = uh->uh_dport;
538 break;
539 case IPPROTO_SCTP:
540 sh = (struct sctphdr *)((caddr_t)ip + iphlen);
541 sport = sh->src_port;
542 dport = sh->dest_port;
543 break;
544	default:
545		FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
546		/* no port - hence not a protocol we care about */
547		return (ENOTSUP);
550	}
551
552skipports:
553 *flags |= proto_to_flags(proto);
554 ssin->sin_port = sport;
555 dsin->sin_port = dport;
556 return (0);
557}
558
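/*
 * Layout of the 3-word IPv4 key built below (see flowtable_set_hashkey()
 * for how the same words are stored in the flentry):
 *
 *	key[0] = sport and dport packed as two 16-bit halves (FL_HASH_ALL)
 *	key[1] = source address                              (FL_HASH_ALL)
 *	key[2] = destination address
 *
 * In the transmit-cache case (!FL_HASH_ALL) only key[2] is filled in and
 * the hash is offset by the per-boot jitter plus the protocol number.
 */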
559static uint32_t
560ipv4_flow_lookup_hash_internal(
561 struct sockaddr_in *ssin, struct sockaddr_in *dsin,
562 uint32_t *key, uint16_t flags)
563{
564 uint16_t sport, dport;
565 uint8_t proto;
566 int offset = 0;
567
568 if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
569 return (0);
570 proto = flags_to_proto(flags);
571 sport = dport = key[2] = key[1] = key[0] = 0;
572 if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
573 key[1] = ssin->sin_addr.s_addr;
574 sport = ssin->sin_port;
575 }
576 if (dsin != NULL) {
577 key[2] = dsin->sin_addr.s_addr;
578 dport = dsin->sin_port;
579 }
580 if (flags & FL_HASH_ALL) {
581 ((uint16_t *)key)[0] = sport;
582 ((uint16_t *)key)[1] = dport;
583 } else
584 offset = V_flow_hashjitter + proto;
585
586 return (jenkins_hashword(key, 3, offset));
587}
588
589static struct flentry *
590flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
591{
592 struct sockaddr_storage ssa, dsa;
593 uint16_t flags;
594 struct sockaddr_in *dsin, *ssin;
595
596 dsin = (struct sockaddr_in *)&dsa;
597 ssin = (struct sockaddr_in *)&ssa;
598 bzero(dsin, sizeof(*dsin));
599 bzero(ssin, sizeof(*ssin));
600 flags = ft->ft_flags;
601 if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
602 return (NULL);
603
604 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
605}
606
607void
608flow_to_route(struct flentry *fle, struct route *ro)
609{
610 uint32_t *hashkey = NULL;
611 struct sockaddr_in *sin;
612
613 sin = (struct sockaddr_in *)&ro->ro_dst;
614 sin->sin_family = AF_INET;
615 sin->sin_len = sizeof(*sin);
616 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
617 sin->sin_addr.s_addr = hashkey[2];
618 ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
619 ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
620}
621#endif /* INET */
622
623#ifdef INET6
624/*
625 * PULLUP_TO(len, p, T) checks that len + sizeof(T) bytes are contiguous
626 * (bailing to receive_failed if not - no pullup is actually done), then
627 * sets p to point at the offset "len" in the mbuf. WARNING: the pointer
628 * might become stale after other pullups (but we never use it this way).
629 */
630#define PULLUP_TO(_len, p, T) \
631do { \
632 int x = (_len) + sizeof(T); \
633 if ((m)->m_len < x) { \
634 goto receive_failed; \
635 } \
636 p = (mtod(m, char *) + (_len)); \
637} while (0)
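/*
 * For example, PULLUP_TO(hlen, ulp, struct tcphdr) below just verifies
 * that m->m_len >= hlen + sizeof(struct tcphdr) before pointing ulp at
 * the transport header; a packet whose header straddles mbufs simply
 * does not get flow-cached.
 */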
638
639#define TCP(p) ((struct tcphdr *)(p))
640#define SCTP(p) ((struct sctphdr *)(p))
641#define UDP(p) ((struct udphdr *)(p))
642
643static int
644ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
645 struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
646{
647 struct ip6_hdr *ip6;
648 uint8_t proto;
649 int hlen;
650 uint16_t src_port, dst_port;
651 u_short offset;
652 void *ulp;
653
654 offset = hlen = src_port = dst_port = 0;
655 ulp = NULL;
656 ip6 = mtod(m, struct ip6_hdr *);
657 hlen = sizeof(struct ip6_hdr);
658 proto = ip6->ip6_nxt;
659
660 if ((*flags & FL_HASH_ALL) == 0)
661 goto skipports;
662
663 while (ulp == NULL) {
664 switch (proto) {
665 case IPPROTO_ICMPV6:
666 case IPPROTO_OSPFIGP:
667 case IPPROTO_PIM:
668 case IPPROTO_CARP:
669 case IPPROTO_ESP:
670 case IPPROTO_NONE:
671 ulp = ip6;
672 break;
673 case IPPROTO_TCP:
674 PULLUP_TO(hlen, ulp, struct tcphdr);
675 dst_port = TCP(ulp)->th_dport;
676 src_port = TCP(ulp)->th_sport;
677 if ((*flags & FL_HASH_ALL) &&
678 (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
679 *flags |= FL_STALE;
680 break;
681 case IPPROTO_SCTP:
682 PULLUP_TO(hlen, ulp, struct sctphdr);
683 src_port = SCTP(ulp)->src_port;
684 dst_port = SCTP(ulp)->dest_port;
685 break;
686 case IPPROTO_UDP:
687 PULLUP_TO(hlen, ulp, struct udphdr);
688 dst_port = UDP(ulp)->uh_dport;
689 src_port = UDP(ulp)->uh_sport;
690 break;
691 case IPPROTO_HOPOPTS: /* RFC 2460 */
692 PULLUP_TO(hlen, ulp, struct ip6_hbh);
693 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
694 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
695 ulp = NULL;
696 break;
697 case IPPROTO_ROUTING: /* RFC 2460 */
698 PULLUP_TO(hlen, ulp, struct ip6_rthdr);
699 hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
700 proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
701 ulp = NULL;
702 break;
703 case IPPROTO_FRAGMENT: /* RFC 2460 */
704 PULLUP_TO(hlen, ulp, struct ip6_frag);
705 hlen += sizeof (struct ip6_frag);
706 proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
707 offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
708 IP6F_OFF_MASK;
709 ulp = NULL;
710 break;
711 case IPPROTO_DSTOPTS: /* RFC 2460 */
712 PULLUP_TO(hlen, ulp, struct ip6_hbh);
713 hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
714 proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
715 ulp = NULL;
716 break;
717 case IPPROTO_AH: /* RFC 2402 */
718 PULLUP_TO(hlen, ulp, struct ip6_ext);
719 hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
720 proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
721 ulp = NULL;
722 break;
723 default:
724 PULLUP_TO(hlen, ulp, struct ip6_ext);
725 break;
726 }
727 }
728
729 if (src_port == 0) {
730 receive_failed:
731 return (ENOTSUP);
732 }
733
734skipports:
735 dsin6->sin6_family = AF_INET6;
736 dsin6->sin6_len = sizeof(*dsin6);
737 dsin6->sin6_port = dst_port;
738 memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
739
740 ssin6->sin6_family = AF_INET6;
741 ssin6->sin6_len = sizeof(*ssin6);
742 ssin6->sin6_port = src_port;
743 memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
744 *flags |= proto_to_flags(proto);
745
746 return (0);
747}
748
749#define zero_key(key) \
750do { \
751 key[0] = 0; \
752 key[1] = 0; \
753 key[2] = 0; \
754 key[3] = 0; \
755 key[4] = 0; \
756 key[5] = 0; \
757 key[6] = 0; \
758 key[7] = 0; \
759 key[8] = 0; \
760} while (0)
761
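/*
 * Layout of the 9-word IPv6 key built below - note that, unlike the
 * IPv4 key above, the destination address comes first:
 *
 *	key[0]    = sport and dport packed as two 16-bit halves (FL_HASH_ALL)
 *	key[1..4] = destination address
 *	key[5..8] = source address                              (FL_HASH_ALL)
 *
 * flow_to_route_in6() depends on the destination living at key[1].
 */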
762static uint32_t
763ipv6_flow_lookup_hash_internal(
764 struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6,
765 uint32_t *key, uint16_t flags)
766{
767 uint16_t sport, dport;
768 uint8_t proto;
769 int offset = 0;
770
771 if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
772 return (0);
773
774 proto = flags_to_proto(flags);
775 zero_key(key);
776 sport = dport = 0;
777 if (dsin6 != NULL) {
778 memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
779 dport = dsin6->sin6_port;
780 }
781 if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
782 memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
783 sport = ssin6->sin6_port;
784 }
785 if (flags & FL_HASH_ALL) {
786 ((uint16_t *)key)[0] = sport;
787 ((uint16_t *)key)[1] = dport;
788 } else
789 offset = V_flow_hashjitter + proto;
790
791 return (jenkins_hashword(key, 9, offset));
792}
793
794static struct flentry *
795flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
796{
797 struct sockaddr_storage ssa, dsa;
798 struct sockaddr_in6 *dsin6, *ssin6;
799 uint16_t flags;
800
801 dsin6 = (struct sockaddr_in6 *)&dsa;
802 ssin6 = (struct sockaddr_in6 *)&ssa;
803 bzero(dsin6, sizeof(*dsin6));
804 bzero(ssin6, sizeof(*ssin6));
805 flags = ft->ft_flags;
806
807 if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
808 return (NULL);
809
810 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
811}
812
813void
814flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
815{
816	uint32_t *hashkey = NULL;
817	struct sockaddr_in6 *sin6;
818
819	sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
820
821	sin6->sin6_family = AF_INET6;
822	sin6->sin6_len = sizeof(*sin6);
823	hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
	/* the destination address is stored in key[1..4], not key[5..8] */
824	memcpy(&sin6->sin6_addr, &hashkey[1], sizeof (struct in6_addr));
825	ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
826	ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
828}
829#endif /* INET6 */
830
831static bitstr_t *
832flowtable_mask(struct flowtable *ft)
833{
834 bitstr_t *mask;
835
836 if (ft->ft_flags & FL_PCPU)
837 mask = ft->ft_masks[curcpu];
838 else
839 mask = ft->ft_masks[0];
840
841 return (mask);
842}
843
844static struct flentry **
845flowtable_entry(struct flowtable *ft, uint32_t hash)
846{
847 struct flentry **fle;
848 int index = (hash % ft->ft_size);
849
850 if (ft->ft_flags & FL_PCPU) {
851 KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
852 fle = &ft->ft_table.pcpu[curcpu][index];
853 } else {
854 KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
855 fle = &ft->ft_table.global[index];
856 }
857
858 return (fle);
859}
860
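/*
 * Staleness rules, summarized: an entry is stale once its route is gone
 * or down, once it has been explicitly marked FL_STALE, or once it has
 * idled past the threshold for its state:
 *
 *	no TCP flags recorded		-> ft_udp_idle
 *	TH_FIN seen			-> ft_fin_wait_idle
 *	TH_SYN without TH_ACK		-> ft_syn_idle (half-open)
 *	TH_SYN|TH_ACK			-> ft_tcp_idle (established)
 */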
861static int
862flow_stale(struct flowtable *ft, struct flentry *fle)
863{
864 time_t idle_time;
865
866	if (fle->f_fhash == 0
867	    || ((fle->f_rt->rt_flags & RTF_HOST) &&
868	    (fle->f_rt->rt_flags & RTF_UP) == 0)
870	    || fle->f_rt->rt_ifp == NULL
871	    || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
872		return (1);
873
874 idle_time = time_uptime - fle->f_uptime;
875
876 if ((fle->f_flags & FL_STALE) ||
877 ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
878 && (idle_time > ft->ft_udp_idle)) ||
879 ((fle->f_flags & TH_FIN)
880 && (idle_time > ft->ft_fin_wait_idle)) ||
881 ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
882 && (idle_time > ft->ft_syn_idle)) ||
883 ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
884 && (idle_time > ft->ft_tcp_idle)) ||
885 ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
886 (fle->f_rt->rt_ifp == NULL)))
887 return (1);
888
889 return (0);
890}
891
892static void
893flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
894{
895 uint32_t *hashkey;
896 int i, nwords;
897
898	if (fle->f_flags & FL_IPV6) {
899		nwords = 9;
900		hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
901	} else {
902		nwords = 3;
903		hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
904	}
905
906 for (i = 0; i < nwords; i++)
907 hashkey[i] = key[i];
908}
909
910static struct flentry *
911flow_alloc(struct flowtable *ft)
912{
913 struct flentry *newfle;
914 uma_zone_t zone;
915
916 newfle = NULL;
917 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
918
919 newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
920 if (newfle != NULL)
921 atomic_add_int(&ft->ft_count, 1);
922 return (newfle);
923}
924
925static void
926flow_free(struct flentry *fle, struct flowtable *ft)
927{
928 uma_zone_t zone;
929
930 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
931 atomic_add_int(&ft->ft_count, -1);
932 uma_zfree(zone, fle);
933}
934
935static int
936flow_full(struct flowtable *ft)
937{
938 boolean_t full;
939 uint32_t count;
940
941 full = ft->ft_full;
942 count = ft->ft_count;
943
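	/*
	 * Hysteresis: report full once ft_count climbs to within 1/32nd
	 * of the limit, and not-full again only after it drains below
	 * 7/8ths of it; the transitions below adjust the cleaner
	 * frequency and, for transmit caches, the idle timeouts.
	 */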
944 if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
945 ft->ft_full = FALSE;
946 else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
947 ft->ft_full = TRUE;
948
949 if (full && !ft->ft_full) {
950 flowclean_freq = 4*hz;
951 if ((ft->ft_flags & FL_HASH_ALL) == 0)
952 ft->ft_udp_idle = ft->ft_fin_wait_idle =
953 ft->ft_syn_idle = ft->ft_tcp_idle = 5;
954 cv_broadcast(&flowclean_cv);
955 } else if (!full && ft->ft_full) {
956 flowclean_freq = 20*hz;
957 if ((ft->ft_flags & FL_HASH_ALL) == 0)
958 ft->ft_udp_idle = ft->ft_fin_wait_idle =
959 ft->ft_syn_idle = ft->ft_tcp_idle = 30;
960 }
961
962 return (ft->ft_full);
963}
964
965static int
966flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
967 uint32_t fibnum, struct route *ro, uint16_t flags)
968{
969 struct flentry *fle, *fletail, *newfle, **flep;
970 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
971 int depth;
972 bitstr_t *mask;
973 uint8_t proto;
974
975 newfle = flow_alloc(ft);
976 if (newfle == NULL)
977 return (ENOMEM);
978
979 newfle->f_flags |= (flags & FL_IPV6);
980 proto = flags_to_proto(flags);
981
982 FL_ENTRY_LOCK(ft, hash);
983 mask = flowtable_mask(ft);
984 flep = flowtable_entry(ft, hash);
985 fletail = fle = *flep;
986
987 if (fle == NULL) {
988 bit_set(mask, FL_ENTRY_INDEX(ft, hash));
989 *flep = fle = newfle;
990 goto skip;
991 }
992
993 depth = 0;
994 fs->ft_collisions++;
995 /*
996 * find end of list and make sure that we were not
997 * preempted by another thread handling this flow
998 */
999 while (fle != NULL) {
1000 if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
1001 /*
1002 * there was either a hash collision
1003 * or we lost a race to insert
1004 */
1005			FL_ENTRY_UNLOCK(ft, hash);
1006			flow_free(newfle, ft);
1007
1008			if (flags & FL_OVERWRITE) {
				/*
				 * re-take the entry lock dropped above so
				 * that the update at skip: and the final
				 * unlock are balanced
				 */
				FL_ENTRY_LOCK(ft, hash);
1009				goto skip;
			}
1010			return (EEXIST);
1011 }
1012		/*
1013		 * this entry is live - it is the tail of the list so far
1014		 */
1015		fletail = fle;
1017
1018		depth++;
1019		fle = fle->f_next;
1020 }
1021
1022 if (depth > fs->ft_max_depth)
1023 fs->ft_max_depth = depth;
1024 fletail->f_next = newfle;
1025 fle = newfle;
1026skip:
1027 flowtable_set_hashkey(fle, key);
1028
1029 fle->f_proto = proto;
1030 fle->f_rt = ro->ro_rt;
1031 fle->f_lle = ro->ro_lle;
1032 fle->f_fhash = hash;
1033 fle->f_fibnum = fibnum;
1034 fle->f_uptime = time_uptime;
1035 FL_ENTRY_UNLOCK(ft, hash);
1036 return (0);
1037}
1038
1039int
1040kern_flowtable_insert(struct flowtable *ft,
1041 struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
1042 struct route *ro, uint32_t fibnum, int flags)
1043{
1044 uint32_t key[9], hash;
1045
1046 flags = (ft->ft_flags | flags | FL_OVERWRITE);
1047 hash = 0;
1048
1049#ifdef INET
1050 if (ssa->ss_family == AF_INET)
1051 hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
1052 (struct sockaddr_in *)dsa, key, flags);
1053#endif
1054#ifdef INET6
1055 if (ssa->ss_family == AF_INET6)
1056 hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
1057 (struct sockaddr_in6 *)dsa, key, flags);
1058#endif
1059 if (ro->ro_rt == NULL || ro->ro_lle == NULL)
1060 return (EINVAL);
1061
1062 FLDPRINTF(ft, FL_DEBUG,
1063 "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
1064 key[0], key[1], key[2], hash, fibnum, flags);
1065 return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
1066}
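/*
 * Illustrative use of the exported insertion interface (a sketch - the
 * surrounding setup is hypothetical): a caller that already holds
 * referenced ro_rt and ro_lle entries can pre-seed a flow directly.
 */
#if 0
	struct sockaddr_storage ssa, dsa;
	struct route ro;
	int error;

	/*
	 * ... fill ssa/dsa with AF_INET (or AF_INET6) endpoints and ro
	 * with referenced ro_rt/ro_lle - both must be non-NULL or the
	 * call fails with EINVAL ...
	 */
	error = kern_flowtable_insert(ft, &ssa, &dsa, &ro, 0 /* fibnum */,
	    FL_HASH_ALL);
#endif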
1067
1068static int
1069flowtable_key_equal(struct flentry *fle, uint32_t *key)
1070{
1071 uint32_t *hashkey;
1072 int i, nwords;
1073
1074	if (fle->f_flags & FL_IPV6) {
1075		nwords = 9;
1076		hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1077	} else {
1078		nwords = 3;
1079		hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1080	}
1081
1082 for (i = 0; i < nwords; i++)
1083 if (hashkey[i] != key[i])
1084 return (0);
1085
1086 return (1);
1087}
1088
1089struct flentry *
1090flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
1091{
1092 struct flentry *fle = NULL;
1093
1094#ifdef INET
1095 if (af == AF_INET)
1096 fle = flowtable_lookup_mbuf4(ft, m);
1097#endif
1098#ifdef INET6
1099 if (af == AF_INET6)
1100 fle = flowtable_lookup_mbuf6(ft, m);
1101#endif
1102 if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
1103 m->m_flags |= M_FLOWID;
1104 m->m_pkthdr.flowid = fle->f_fhash;
1105 }
1106 return (fle);
1107}
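/*
 * Illustrative transmit-path caller (a sketch - "V_ip_ft" and "m" stand
 * in for the caller's own flowtable and packet): consult the flow cache
 * first and fall back to an ordinary routing lookup on a miss.
 */
#if 0
	struct flentry *fle;
	struct route iproute;

	bzero(&iproute, sizeof(iproute));
	fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET);
	if (fle != NULL) {
		/* hit: flow_to_route() fills ro_dst, ro_rt and ro_lle */
		flow_to_route(fle, &iproute);
	} else {
		/* miss: ... fill iproute.ro_dst from the packet ... */
		rtalloc_ign_fib(&iproute, 0, M_GETFIB(m));
	}
#endif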
1108
1109struct flentry *
1110flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
1111 struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
1112{
1113 uint32_t key[9], hash;
1114 struct flentry *fle;
1115 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
1116 uint8_t proto = 0;
1117 int error = 0;
1118	struct rtentry *rt;
1120 struct route sro, *ro;
1121 struct route_in6 sro6;
1122
1123 sro.ro_rt = sro6.ro_rt = NULL;
1124 sro.ro_lle = sro6.ro_lle = NULL;
1125 ro = NULL;
1126 hash = 0;
1127 flags |= ft->ft_flags;
1128 proto = flags_to_proto(flags);
1129#ifdef INET
1130 if (ssa->ss_family == AF_INET) {
1131 struct sockaddr_in *ssin, *dsin;
1132
1133 ro = &sro;
1134 memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
1135 /*
1136 * The harvested source and destination addresses
1137 * may contain port information if the packet is
1138 * from a transport protocol (e.g. TCP/UDP). The
1139 * port field must be cleared before performing
1140 * a route lookup.
1141 */
1142 ((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
1143 dsin = (struct sockaddr_in *)dsa;
1144 ssin = (struct sockaddr_in *)ssa;
1145 if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
1146 (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1147 (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
1148 return (NULL);
1149
1150 hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
1151 }
1152#endif
1153#ifdef INET6
1154 if (ssa->ss_family == AF_INET6) {
1155 struct sockaddr_in6 *ssin6, *dsin6;
1156
1157 ro = (struct route *)&sro6;
1158 memcpy(&sro6.ro_dst, dsa,
1159 sizeof(struct sockaddr_in6));
1160 ((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0;
1161 dsin6 = (struct sockaddr_in6 *)dsa;
1162 ssin6 = (struct sockaddr_in6 *)ssa;
1163
1164 flags |= FL_IPV6;
1165 hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
1166 }
1167#endif
1168	/*
1169	 * hash == 0 means the flowtable is disabled or not yet ready.
1170	 * With FL_HASH_ALL set, key[0] carries the ports, so key[0] == 0
1171	 * means this was not TCP, UDP or SCTP - not a protocol for which
1172	 * we need to keep state.
1173	 */
1174 if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
1175 return (NULL);
1176
1177 fs->ft_lookups++;
1178 FL_ENTRY_LOCK(ft, hash);
1179 if ((fle = FL_ENTRY(ft, hash)) == NULL) {
1180 FL_ENTRY_UNLOCK(ft, hash);
1181 goto uncached;
1182 }
1183keycheck:
1184 rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1186 if ((rt != NULL)
1187 && fle->f_fhash == hash
1188 && flowtable_key_equal(fle, key)
1189 && (proto == fle->f_proto)
1190 && (fibnum == fle->f_fibnum)
1191 && (rt->rt_flags & RTF_UP)
1192 && (rt->rt_ifp != NULL)) {
1193 fs->ft_hits++;
1194 fle->f_uptime = time_uptime;
1195 fle->f_flags |= flags;
1196 FL_ENTRY_UNLOCK(ft, hash);
1197 return (fle);
1198 } else if (fle->f_next != NULL) {
1199 fle = fle->f_next;
1200 goto keycheck;
1201 }
1202	FL_ENTRY_UNLOCK(ft, hash);
	fle = NULL;	/* no match in this chain - treat it as a miss */
1203uncached:
1204 if (flags & FL_NOAUTO || flow_full(ft))
1205 return (NULL);
1206
1207 fs->ft_misses++;
1208	/*
1209	 * This bit of code ends up locking the same route 3 times
1210	 * (just like ip_output + ether_output):
1211	 * - at lookup
1212	 * - in rt_check when called by arpresolve
1213	 * - when dropping the refcount on the rtentry
1214	 *
1215	 * It could be consolidated to a single lock operation if we
1216	 * wrote a variant of arpresolve, with a matching rt_check
1217	 * variant, that expected to receive the route already locked
1218	 */
1219
1220#ifdef INVARIANTS
1221 if ((ro->ro_dst.sa_family != AF_INET) &&
1222 (ro->ro_dst.sa_family != AF_INET6))
1223 panic("sa_family == %d\n", ro->ro_dst.sa_family);
1224#endif
1225
1226 ft->ft_rtalloc(ro, hash, fibnum);
1227 if (ro->ro_rt == NULL)
1228 error = ENETUNREACH;
1229 else {
1230 struct llentry *lle = NULL;
1231 struct sockaddr_storage *l3addr;
1232 struct rtentry *rt = ro->ro_rt;
1233 struct ifnet *ifp = rt->rt_ifp;
1234
1235 if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
1236 RTFREE(rt);
1237 ro->ro_rt = NULL;
1238 return (NULL);
1239 }
1240#ifdef INET6
1241 if (ssa->ss_family == AF_INET6) {
1242 struct sockaddr_in6 *dsin6;
1243
1244 dsin6 = (struct sockaddr_in6 *)dsa;
1245 if (in6_localaddr(&dsin6->sin6_addr)) {
1246 RTFREE(rt);
1247 ro->ro_rt = NULL;
1248 return (NULL);
1249 }
1250
1251			if (rt->rt_flags & RTF_GATEWAY)
1252				l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1254			else
1255				l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1256 llentry_update(&lle, LLTABLE6(ifp), l3addr, ifp);
1257 }
1258#endif
1259#ifdef INET
1260 if (ssa->ss_family == AF_INET) {
1261 if (rt->rt_flags & RTF_GATEWAY)
1262 l3addr = (struct sockaddr_storage *)rt->rt_gateway;
1263 else
1264 l3addr = (struct sockaddr_storage *)&ro->ro_dst;
1265 llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);
1266 }
1267
1268#endif
1269 ro->ro_lle = lle;
1270
1271 if (lle == NULL) {
1272 RTFREE(rt);
1273 ro->ro_rt = NULL;
1274 return (NULL);
1275 }
1276 error = flowtable_insert(ft, hash, key, fibnum, ro, flags);
1277
1278 if (error) {
1279 RTFREE(rt);
1280 LLE_FREE(lle);
1281 ro->ro_rt = NULL;
1282 ro->ro_lle = NULL;
1283 }
1284 }
1285
1286 return ((error) ? NULL : fle);
1287}
1288
1289/*
1290 * used by the bit_alloc macro
1291 */
1292#define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
1293
1294struct flowtable *
1295flowtable_alloc(char *name, int nentry, int flags)
1296{
1297 struct flowtable *ft, *fttail;
1298 int i;
1299
1300 if (V_flow_hashjitter == 0)
1301 V_flow_hashjitter = arc4random();
1302
1303 KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));
1304
1305 ft = malloc(sizeof(struct flowtable),
1306 M_RTABLE, M_WAITOK | M_ZERO);
1307
1308 ft->ft_name = name;
1309 ft->ft_flags = flags;
1310 ft->ft_size = nentry;
1311#ifdef RADIX_MPATH
1312 ft->ft_rtalloc = rtalloc_mpath_fib;
1313#else
1314 ft->ft_rtalloc = in_rtalloc_ign_wrapper;
1315#endif
1316 if (flags & FL_PCPU) {
1317 ft->ft_lock = flowtable_pcpu_lock;
1318 ft->ft_unlock = flowtable_pcpu_unlock;
1319
1320 for (i = 0; i <= mp_maxid; i++) {
1321 ft->ft_table.pcpu[i] =
1322 malloc(nentry*sizeof(struct flentry *),
1323 M_RTABLE, M_WAITOK | M_ZERO);
1324 ft->ft_masks[i] = bit_alloc(nentry);
1325 }
1326 } else {
1327		/* round up to a power of two so the hash can be masked */
1328		ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
		    (1 << fls(mp_maxid + 1)));
1329
1330 ft->ft_lock = flowtable_global_lock;
1331 ft->ft_unlock = flowtable_global_unlock;
1332 ft->ft_table.global =
1333 malloc(nentry*sizeof(struct flentry *),
1334 M_RTABLE, M_WAITOK | M_ZERO);
1335 ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
1336 M_RTABLE, M_WAITOK | M_ZERO);
1337 for (i = 0; i < ft->ft_lock_count; i++)
1338 mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);
1339
1340 ft->ft_masks[0] = bit_alloc(nentry);
1341 }
1342 ft->ft_tmpmask = bit_alloc(nentry);
1343
1344	/*
1345	 * In the local transmit case the table truly is just a cache -
1346	 * so everything is eligible for replacement after a short period
1347	 * of non-use: 30s here, cut to 5s by flow_full() under pressure
1348	 */
1349	if (flags & FL_HASH_ALL) {
1350		ft->ft_udp_idle = V_flowtable_udp_expire;
1351		ft->ft_syn_idle = V_flowtable_syn_expire;
1352		ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
1353		ft->ft_tcp_idle = V_flowtable_tcp_expire;
1354	} else {
1355		ft->ft_udp_idle = ft->ft_fin_wait_idle =
1356		    ft->ft_syn_idle = ft->ft_tcp_idle = 30;
1358	}
1359
1360 /*
1361 * hook in to the cleaner list
1362 */
1363 if (V_flow_list_head == NULL)
1364 V_flow_list_head = ft;
1365 else {
1366 fttail = V_flow_list_head;
1367 while (fttail->ft_next != NULL)
1368 fttail = fttail->ft_next;
1369 fttail->ft_next = ft;
1370 }
1371
1372 return (ft);
1373}
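/*
 * Illustrative allocation (a sketch - "ip_ft" and the bucket count are
 * made up): a protocol init routine creates a per-cpu transmit cache
 * and keeps the pointer in a vnet-local variable.
 */
#if 0
static VNET_DEFINE(struct flowtable *, ip_ft);
#define	V_ip_ft	VNET(ip_ft)

static void
example_flowtable_init(void)
{

	V_ip_ft = flowtable_alloc("ipv4", 2048, FL_PCPU);
}
#endif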
1374
1375/*
1376 * The rest of the code is devoted to garbage collection of expired
1377 * entries.  It is a new addition made necessary by the switch to
1378 * dynamically allocating flow tables.
1379 */
1381static void
1382fle_free(struct flentry *fle, struct flowtable *ft)
1383{
1384 struct rtentry *rt;
1385 struct llentry *lle;
1386
1387 rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1388 lle = __DEVOLATILE(struct llentry *, fle->f_lle);
1389 if (rt != NULL)
1390 RTFREE(rt);
1391 if (lle != NULL)
1392 LLE_FREE(lle);
1393 flow_free(fle, ft);
1394}
1395
1396static void
1397flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
1398{
1399 int curbit = 0, count;
1400 struct flentry *fle, **flehead, *fleprev;
1401 struct flentry *flefreehead, *flefreetail, *fletmp;
1402 bitstr_t *mask, *tmpmask;
1403 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
1404
1405 flefreehead = flefreetail = NULL;
1406 mask = flowtable_mask(ft);
1407 tmpmask = ft->ft_tmpmask;
1408 memcpy(tmpmask, mask, ft->ft_size/8);
1409 /*
1410 * XXX Note to self, bit_ffs operates at the byte level
1411 * and thus adds gratuitous overhead
1412 */
1413 bit_ffs(tmpmask, ft->ft_size, &curbit);
1414 while (curbit != -1) {
1415 if (curbit >= ft->ft_size || curbit < -1) {
1416 log(LOG_ALERT,
1417 "warning: bad curbit value %d \n",
1418 curbit);
1419 break;
1420 }
1421
1422 FL_ENTRY_LOCK(ft, curbit);
1423 flehead = flowtable_entry(ft, curbit);
1424 fle = fleprev = *flehead;
1425
1426 fs->ft_free_checks++;
1427#ifdef DIAGNOSTIC
1428 if (fle == NULL && curbit > 0) {
1429 log(LOG_ALERT,
1430 "warning bit=%d set, but no fle found\n",
1431 curbit);
1432 }
1433#endif
1434 while (fle != NULL) {
1435 if (rt != NULL) {
1436 if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
1437 fleprev = fle;
1438 fle = fle->f_next;
1439 continue;
1440 }
1441 } else if (!flow_stale(ft, fle)) {
1442 fleprev = fle;
1443 fle = fle->f_next;
1444 continue;
1445 }
1446 /*
1447 * delete head of the list
1448 */
1449 if (fleprev == *flehead) {
1450 fletmp = fleprev;
1451 if (fle == fleprev) {
1452 fleprev = *flehead = fle->f_next;
1453 } else
1454 fleprev = *flehead = fle;
1455 fle = fle->f_next;
1456 } else {
1457 /*
1458 * don't advance fleprev
1459 */
1460 fletmp = fle;
1461 fleprev->f_next = fle->f_next;
1462 fle = fleprev->f_next;
1463 }
1464
1465 if (flefreehead == NULL)
1466 flefreehead = flefreetail = fletmp;
1467 else {
1468 flefreetail->f_next = fletmp;
1469 flefreetail = fletmp;
1470 }
1471 fletmp->f_next = NULL;
1472 }
1473 if (*flehead == NULL)
1474 bit_clear(mask, curbit);
1475 FL_ENTRY_UNLOCK(ft, curbit);
1476 bit_clear(tmpmask, curbit);
1477 bit_ffs(tmpmask, ft->ft_size, &curbit);
1478 }
1479 count = 0;
1480 while ((fle = flefreehead) != NULL) {
1481 flefreehead = fle->f_next;
1482 count++;
1483 fs->ft_frees++;
1484 fle_free(fle, ft);
1485 }
1486 if (V_flowtable_debug && count)
1487 log(LOG_DEBUG, "freed %d flow entries\n", count);
1488}
1489
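/*
 * Per-cpu tables may only be manipulated safely from the owning cpu,
 * so both the flush below and the periodic cleaner bind the current
 * thread to each cpu in turn (once the scheduler is up) before
 * scanning that cpu's portion of the table.
 */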
1490void
1491flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
1492{
1493 int i;
1494
1495 if (ft->ft_flags & FL_PCPU) {
1496 CPU_FOREACH(i) {
1497 if (smp_started == 1) {
1498 thread_lock(curthread);
1499 sched_bind(curthread, i);
1500 thread_unlock(curthread);
1501 }
1502
1503 flowtable_free_stale(ft, rt);
1504
1505 if (smp_started == 1) {
1506 thread_lock(curthread);
1507 sched_unbind(curthread);
1508 thread_unlock(curthread);
1509 }
1510 }
1511 } else {
1512 flowtable_free_stale(ft, rt);
1513 }
1514}
1515
1516static void
1517flowtable_clean_vnet(void)
1518{
1519 struct flowtable *ft;
1520 int i;
1521
1522 ft = V_flow_list_head;
1523 while (ft != NULL) {
1524 if (ft->ft_flags & FL_PCPU) {
1525 CPU_FOREACH(i) {
1526 if (smp_started == 1) {
1527 thread_lock(curthread);
1528 sched_bind(curthread, i);
1529 thread_unlock(curthread);
1530 }
1531
1532 flowtable_free_stale(ft, NULL);
1533
1534 if (smp_started == 1) {
1535 thread_lock(curthread);
1536 sched_unbind(curthread);
1537 thread_unlock(curthread);
1538 }
1539 }
1540 } else {
1541 flowtable_free_stale(ft, NULL);
1542 }
1543 ft = ft->ft_next;
1544 }
1545}
1546
1547static void
1548flowtable_cleaner(void)
1549{
1550 VNET_ITERATOR_DECL(vnet_iter);
1551
1552 if (bootverbose)
1553 log(LOG_INFO, "flowtable cleaner started\n");
1554 while (1) {
1555 VNET_LIST_RLOCK();
1556 VNET_FOREACH(vnet_iter) {
1557 CURVNET_SET(vnet_iter);
1558 flowtable_clean_vnet();
1559 CURVNET_RESTORE();
1560 }
1561 VNET_LIST_RUNLOCK();
1562
1563 flowclean_cycles++;
1564		/*
1565		 * The interval between cleaning passes (flowclean_freq:
1566		 * 20s by default, 4s while a table is full) is arbitrary
1567		 */
1568 mtx_lock(&flowclean_lock);
1569 cv_broadcast(&flowclean_cv);
1570 cv_timedwait(&flowclean_cv, &flowclean_lock, flowclean_freq);
1571 mtx_unlock(&flowclean_lock);
1572 }
1573}
1574
1575static void
1576flowtable_flush(void *unused __unused)
1577{
1578 uint64_t start;
1579
1580 mtx_lock(&flowclean_lock);
1581 start = flowclean_cycles;
1582 while (start == flowclean_cycles) {
1583 cv_broadcast(&flowclean_cv);
1584 cv_wait(&flowclean_cv, &flowclean_lock);
1585 }
1586 mtx_unlock(&flowclean_lock);
1587}
1588
1589static struct kproc_desc flow_kp = {
1590 "flowcleaner",
1591 flowtable_cleaner,
1592 &flowcleanerproc
1593};
1594SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
1595
1596static void
1597flowtable_init_vnet(const void *unused __unused)
1598{
1599
1600 V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus;
1601 V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
1602 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
1603 V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
1604 NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
1605 uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
1606 uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
1607 V_flowtable_ready = 1;
1608}
1609VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY,
1610 flowtable_init_vnet, NULL);
1611
1612static void
1613flowtable_init(const void *unused __unused)
1614{
1615
1616 cv_init(&flowclean_cv, "flowcleanwait");
1617 mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
1618 EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
1619 EVENTHANDLER_PRI_ANY);
1620 flowclean_freq = 20*hz;
1621}
1622SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST,
1623 flowtable_init, NULL);
1624
1625
1626#ifdef VIMAGE
1627static void
1628flowtable_uninit(const void *unused __unused)
1629{
1630
1631 V_flowtable_ready = 0;
1632 uma_zdestroy(V_flow_ipv4_zone);
1633 uma_zdestroy(V_flow_ipv6_zone);
1634}
1635
1636VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
1637 flowtable_uninit, NULL);
1638#endif
1639
1640#ifdef DDB
1641static uint32_t *
1642flowtable_get_hashkey(struct flentry *fle)
1643{
1644 uint32_t *hashkey;
1645
1646	if (fle->f_flags & FL_IPV6)
1647		hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1648	else
1649		hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1650
1651 return (hashkey);
1652}
1653
1654static bitstr_t *
1655flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1656{
1657 bitstr_t *mask;
1658
1659 if (ft->ft_flags & FL_PCPU)
1660 mask = ft->ft_masks[cpuid];
1661 else
1662 mask = ft->ft_masks[0];
1663
1664 return (mask);
1665}
1666
1667static struct flentry **
1668flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1669{
1670 struct flentry **fle;
1671 int index = (hash % ft->ft_size);
1672
1673 if (ft->ft_flags & FL_PCPU) {
1674 fle = &ft->ft_table.pcpu[cpuid][index];
1675 } else {
1676 fle = &ft->ft_table.global[index];
1677 }
1678
1679 return (fle);
1680}
1681
1682static void
1683flow_show(struct flowtable *ft, struct flentry *fle)
1684{
1685 int idle_time;
1686 int rt_valid, ifp_valid;
1687 uint16_t sport, dport;
1688 uint32_t *hashkey;
1689 char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
1690 volatile struct rtentry *rt;
1691 struct ifnet *ifp = NULL;
1692
1693 idle_time = (int)(time_uptime - fle->f_uptime);
1694 rt = fle->f_rt;
1695 rt_valid = rt != NULL;
1696 if (rt_valid)
1697 ifp = rt->rt_ifp;
1698 ifp_valid = ifp != NULL;
1699 hashkey = flowtable_get_hashkey(fle);
1700 if (fle->f_flags & FL_IPV6)
1701 goto skipaddr;
1702
1703 inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
1704 if (ft->ft_flags & FL_HASH_ALL) {
1705 inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
1706 sport = ntohs(((uint16_t *)hashkey)[0]);
1707 dport = ntohs(((uint16_t *)hashkey)[1]);
1708 db_printf("%s:%d->%s:%d",
1709 saddr, sport, daddr,
1710 dport);
1711 } else
1712 db_printf("%s ", daddr);
1713
1714skipaddr:
1715 if (fle->f_flags & FL_STALE)
1716 db_printf(" FL_STALE ");
1717 if (fle->f_flags & FL_TCP)
1718 db_printf(" FL_TCP ");
1719 if (fle->f_flags & FL_UDP)
1720 db_printf(" FL_UDP ");
1721 if (rt_valid) {
1722 if (rt->rt_flags & RTF_UP)
1723 db_printf(" RTF_UP ");
1724 }
1725 if (ifp_valid) {
1726 if (ifp->if_flags & IFF_LOOPBACK)
1727 db_printf(" IFF_LOOPBACK ");
1728 if (ifp->if_flags & IFF_UP)
1729 db_printf(" IFF_UP ");
1730 if (ifp->if_flags & IFF_POINTOPOINT)
1731 db_printf(" IFF_POINTOPOINT ");
1732 }
1733 if (fle->f_flags & FL_IPV6)
1734		db_printf("\n\tkey=%08x:%08x:%08x:%08x:%08x:%08x:%08x:%08x:%08x",
1735 hashkey[0], hashkey[1], hashkey[2],
1736 hashkey[3], hashkey[4], hashkey[5],
1737 hashkey[6], hashkey[7], hashkey[8]);
1738 else
1739 db_printf("\n\tkey=%08x:%08x:%08x ",
1740 hashkey[0], hashkey[1], hashkey[2]);
1741 db_printf("hash=%08x idle_time=%03d"
1742 "\n\tfibnum=%02d rt=%p",
1743 fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
1744 db_printf("\n");
1745}
1746
1747static void
1748flowtable_show(struct flowtable *ft, int cpuid)
1749{
1750 int curbit = 0;
1751 struct flentry *fle, **flehead;
1752 bitstr_t *mask, *tmpmask;
1753
1754 if (cpuid != -1)
1755 db_printf("cpu: %d\n", cpuid);
1756 mask = flowtable_mask_pcpu(ft, cpuid);
1757 tmpmask = ft->ft_tmpmask;
1758 memcpy(tmpmask, mask, ft->ft_size/8);
1759 /*
1760 * XXX Note to self, bit_ffs operates at the byte level
1761 * and thus adds gratuitous overhead
1762 */
1763 bit_ffs(tmpmask, ft->ft_size, &curbit);
1764 while (curbit != -1) {
1765 if (curbit >= ft->ft_size || curbit < -1) {
1766 db_printf("warning: bad curbit value %d \n",
1767 curbit);
1768 break;
1769 }
1770
1771 flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
1772 fle = *flehead;
1773
1774 while (fle != NULL) {
1775 flow_show(ft, fle);
1776 fle = fle->f_next;
1778 }
1779 bit_clear(tmpmask, curbit);
1780 bit_ffs(tmpmask, ft->ft_size, &curbit);
1781 }
1782}
1783
1784static void
1785flowtable_show_vnet(void)
1786{
1787 struct flowtable *ft;
1788 int i;
1789
1790 ft = V_flow_list_head;
1791 while (ft != NULL) {
1792 printf("name: %s\n", ft->ft_name);
1793 if (ft->ft_flags & FL_PCPU) {
1794 CPU_FOREACH(i) {
1795 flowtable_show(ft, i);
1796 }
1797 } else {
1798 flowtable_show(ft, -1);
1799 }
1800 ft = ft->ft_next;
1801 }
1802}
1803
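/*
 * Debugger entry point; from the ddb prompt the tables of every vnet
 * can be dumped with:
 *
 *	db> show flowtables
 */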
1804DB_SHOW_COMMAND(flowtables, db_show_flowtables)
1805{
1806 VNET_ITERATOR_DECL(vnet_iter);
1807
1808 VNET_FOREACH(vnet_iter) {
1809 CURVNET_SET(vnet_iter);
1810 flowtable_show_vnet();
1811 CURVNET_RESTORE();
1812 }
1813}
1814#endif