tcp_hostcache.c (r215317 → r215701)
1/*-
2 * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. The name of the author may not be used to endorse or promote
14 * products derived from this software without specific prior written
15 * permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30/*
31 * The tcp_hostcache moves the tcp-specific cached metrics from the routing
32 * table to a dedicated structure indexed by the remote IP address. It keeps
33 * information on the measured TCP parameters of past TCP sessions to allow
34 * better initial start values to be used with later connections to/from the
35 * same source. Depending on the network parameters (delay, bandwidth, max
36 * MTU, congestion window) between local and remote sites, this can lead to
37 * significant speed-ups for new TCP connections after the first one.
38 *
39 * Due to the tcp_hostcache, all TCP-specific metrics information in the
40 * routing table have been removed. The inpcb no longer keeps a pointer to
41 * the routing entry, and protocol-initiated route cloning has been removed
42 * as well. With these changes, the routing table has gone back to being
43 * more lightweight and only carries information related to packet forwarding.
44 *
45 * tcp_hostcache is designed for multiple concurrent access in SMP
46 * environments and high contention. All bucket rows have their own lock and
47 * thus multiple lookups and modifications can be done at the same time as long as
48 * they are in different bucket rows. If a request for insertion of a new
49 * record can't be satisfied, it simply returns an empty structure. Nobody
50 * and nothing outside of tcp_hostcache.c will ever point directly to any
51 * entry in the tcp_hostcache. All communication is done in an
52 * object-oriented way and only functions of tcp_hostcache will manipulate
53 * hostcache entries. Otherwise, we are unable to achieve good behaviour in
54 * concurrent access situations. Since tcp_hostcache is only caching
55 * information, there are no fatal consequences if we either can't satisfy
56 * any particular request or have to drop/overwrite an existing entry because
57 * of bucket limit or memory constraints.
58 */
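/*
 * Illustrative call pattern (editorial sketch, not part of this file): the
 * real consumers live elsewhere in the TCP stack; the function below is
 * invented purely to show how the API exported here is meant to be used.
 */
static void
example_connection_setup(struct in_conninfo *inc)
{
	struct hc_metrics_lite metrics;

	tcp_hc_get(inc, &metrics);		/* zeroed on a cache miss */
	if (metrics.rmx_mtu != 0)
		;	/* seed path MTU discovery from the cached value */
	if (metrics.rmx_rtt != 0)
		;	/* seed srtt instead of the conservative default */
}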
59
60/*
61 * Many thanks to jlemon for the basic structure of tcp_syncache, which is
62 * being followed here.
63 */
64
65#include <sys/cdefs.h>
66 -__FBSDID("$FreeBSD: head/sys/netinet/tcp_hostcache.c 215317 2010-11-14 20:38:11Z dim $");
66 +__FBSDID("$FreeBSD: head/sys/netinet/tcp_hostcache.c 215701 2010-11-22 19:32:54Z dim $");
67
68#include "opt_inet6.h"
69
70#include <sys/param.h>
71#include <sys/systm.h>
72#include <sys/kernel.h>
73#include <sys/lock.h>
74#include <sys/mutex.h>
75#include <sys/malloc.h>
76#include <sys/socket.h>
77#include <sys/socketvar.h>
78#include <sys/sysctl.h>
79
80#include <net/if.h>
81#include <net/route.h>
82#include <net/vnet.h>
83
84#include <netinet/in.h>
85#include <netinet/in_systm.h>
86#include <netinet/ip.h>
87#include <netinet/in_var.h>
88#include <netinet/in_pcb.h>
89#include <netinet/ip_var.h>
90#ifdef INET6
91#include <netinet/ip6.h>
92#include <netinet6/ip6_var.h>
93#endif
94#include <netinet/tcp.h>
95#include <netinet/tcp_var.h>
96#include <netinet/tcp_hostcache.h>
97#ifdef INET6
98#include <netinet6/tcp6_var.h>
99#endif
100
101#include <vm/uma.h>
102
103/* Arbitrary values */
104#define TCP_HOSTCACHE_HASHSIZE 512
105#define TCP_HOSTCACHE_BUCKETLIMIT 30
106#define TCP_HOSTCACHE_EXPIRE 60*60 /* one hour */
107#define TCP_HOSTCACHE_PRUNE 5*60 /* every 5 minutes */
108
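/*
 * Editorial note: with the defaults above, the overall limit computed in
 * tcp_hc_init() is hashsize * bucket_limit = 512 * 30 = 15360 entries.
 * All three values are boot-time tunables (fetched via TUNABLE_INT_FETCH
 * below), e.g. in /boot/loader.conf (values here purely illustrative;
 * hashsize must be a power of 2):
 *
 *	net.inet.tcp.hostcache.hashsize="1024"
 *	net.inet.tcp.hostcache.bucketlimit="30"
 *	net.inet.tcp.hostcache.cachelimit="30720"
 */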
109 -STATIC_VNET_DEFINE(struct tcp_hostcache, tcp_hostcache);
109 +static VNET_DEFINE(struct tcp_hostcache, tcp_hostcache);
110 #define V_tcp_hostcache VNET(tcp_hostcache)
111
112 -STATIC_VNET_DEFINE(struct callout, tcp_hc_callout);
112 +static VNET_DEFINE(struct callout, tcp_hc_callout);
113#define V_tcp_hc_callout VNET(tcp_hc_callout)
114
115static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *);
116static struct hc_metrics *tcp_hc_insert(struct in_conninfo *);
117static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS);
118static void tcp_hc_purge_internal(int);
119static void tcp_hc_purge(void *);
120
121SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0,
122 "TCP Host cache");
123
124SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
125 &VNET_NAME(tcp_hostcache.cache_limit), 0,
126 "Overall entry limit for hostcache");
127
128SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
129 &VNET_NAME(tcp_hostcache.hashsize), 0,
130 "Size of TCP hostcache hashtable");
131
132SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit,
133 CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.bucket_limit), 0,
134 "Per-bucket hash limit for hostcache");
135
136SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_RD,
137 &VNET_NAME(tcp_hostcache.cache_count), 0,
138 "Current number of entries in hostcache");
139
140SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_RW,
141 &VNET_NAME(tcp_hostcache.expire), 0,
142 "Expire time of TCP hostcache entries");
143
144SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_RW,
145 &VNET_NAME(tcp_hostcache.prune), 0,
146 "Time between purge runs");
147
148SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_RW,
149 &VNET_NAME(tcp_hostcache.purgeall), 0,
150 "Expire all entries on next purge run");
151
152SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list,
153 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0,
154 sysctl_tcp_hc_list, "A", "List of all hostcache entries");
155
156
157static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache");
158
159#define HOSTCACHE_HASH(ip) \
160 (((ip)->s_addr ^ ((ip)->s_addr >> 7) ^ ((ip)->s_addr >> 17)) & \
161 V_tcp_hostcache.hashmask)
162
163/* XXX: What is the recommended hash to get good entropy for IPv6 addresses? */
164#define HOSTCACHE_HASH6(ip6) \
165 (((ip6)->s6_addr32[0] ^ \
166 (ip6)->s6_addr32[1] ^ \
167 (ip6)->s6_addr32[2] ^ \
168 (ip6)->s6_addr32[3]) & \
169 V_tcp_hostcache.hashmask)
170
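/*
 * Editorial sketch of the IPv4 hash above as a standalone userspace
 * program (kernel-only parts replaced by constants; the address is
 * written in host order purely for illustration):
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t s_addr = 0xc0a80001;	/* 192.168.0.1, illustrative */
	uint32_t hashmask = 512 - 1;	/* default hashsize is 512 */
	uint32_t hash;

	/* Fold the address with itself shifted by 7 and 17, then mask. */
	hash = (s_addr ^ (s_addr >> 7) ^ (s_addr >> 17)) & hashmask;
	printf("bucket %u of 512\n", hash);
	return (0);
}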
171#define THC_LOCK(lp) mtx_lock(lp)
172#define THC_UNLOCK(lp) mtx_unlock(lp)
173
174void
175tcp_hc_init(void)
176{
177 int i;
178
179 /*
180 * Initialize hostcache structures.
181 */
182 V_tcp_hostcache.cache_count = 0;
183 V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE;
184 V_tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT;
185 V_tcp_hostcache.cache_limit =
186 V_tcp_hostcache.hashsize * V_tcp_hostcache.bucket_limit;
187 V_tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE;
188 V_tcp_hostcache.prune = TCP_HOSTCACHE_PRUNE;
189
190 TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize",
191 &V_tcp_hostcache.hashsize);
192 TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit",
193 &V_tcp_hostcache.cache_limit);
194 TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit",
195 &V_tcp_hostcache.bucket_limit);
196 if (!powerof2(V_tcp_hostcache.hashsize)) {
197 printf("WARNING: hostcache hash size is not a power of 2.\n");
198 V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; /* default */
199 }
200 V_tcp_hostcache.hashmask = V_tcp_hostcache.hashsize - 1;
201
202 /*
203 * Allocate the hash table.
204 */
205 V_tcp_hostcache.hashbase = (struct hc_head *)
206 malloc(V_tcp_hostcache.hashsize * sizeof(struct hc_head),
207 M_HOSTCACHE, M_WAITOK | M_ZERO);
208
209 /*
210 * Initialize the hash buckets.
211 */
212 for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
213 TAILQ_INIT(&V_tcp_hostcache.hashbase[i].hch_bucket);
214 V_tcp_hostcache.hashbase[i].hch_length = 0;
215 mtx_init(&V_tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry",
216 NULL, MTX_DEF);
217 }
218
219 /*
220 * Allocate the hostcache entries.
221 */
222 V_tcp_hostcache.zone =
223 uma_zcreate("hostcache", sizeof(struct hc_metrics),
224 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
225 uma_zone_set_max(V_tcp_hostcache.zone, V_tcp_hostcache.cache_limit);
226
227 /*
228 * Set up periodic cache cleanup.
229 */
230 callout_init(&V_tcp_hc_callout, CALLOUT_MPSAFE);
231 callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
232 tcp_hc_purge, curvnet);
233}
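/*
 * Editorial sketch of why tcp_hc_init() insists on a power-of-2 hash size:
 * masking with (hashsize - 1), as the HOSTCACHE_HASH macros do, equals a
 * modulo by hashsize only when hashsize is a power of two. Standalone
 * userspace illustration:
 */
#include <assert.h>

#define powerof2(x)	((((x) - 1) & (x)) == 0)	/* as in <sys/param.h> */

int
main(void)
{
	unsigned hash = 0xdeadbeef;

	/* 512: mask 511 == 0x1ff selects exactly the buckets 0..511. */
	assert(powerof2(512) && (hash & (512 - 1)) == hash % 512);
	/* 500: mask 499 == 0x1f3 aliases and leaves buckets unreachable. */
	assert(!powerof2(500) && (hash & (500 - 1)) != hash % 500);
	return (0);
}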
234
235#ifdef VIMAGE
236void
237tcp_hc_destroy(void)
238{
239 int i;
240
241 callout_drain(&V_tcp_hc_callout);
242
243 /* Purge all hc entries. */
244 tcp_hc_purge_internal(1);
245
246 /* Free the uma zone and the allocated hash table. */
247 uma_zdestroy(V_tcp_hostcache.zone);
248
249 for (i = 0; i < V_tcp_hostcache.hashsize; i++)
250 mtx_destroy(&V_tcp_hostcache.hashbase[i].hch_mtx);
251 free(V_tcp_hostcache.hashbase, M_HOSTCACHE);
252}
253#endif
254
255/*
256 * Internal function: look up an entry in the hostcache or return NULL.
257 *
258 * If an entry has been returned, the caller becomes responsible for
259 * unlocking the bucket row after he is done reading/modifying the entry.
260 */
261static struct hc_metrics *
262tcp_hc_lookup(struct in_conninfo *inc)
263{
264 int hash;
265 struct hc_head *hc_head;
266 struct hc_metrics *hc_entry;
267
268 KASSERT(inc != NULL, ("tcp_hc_lookup with NULL in_conninfo pointer"));
269
270 /*
271 * Hash the foreign ip address.
272 */
273 if (inc->inc_flags & INC_ISIPV6)
274 hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
275 else
276 hash = HOSTCACHE_HASH(&inc->inc_faddr);
277
278 hc_head = &V_tcp_hostcache.hashbase[hash];
279
280 /*
281 * Acquire lock for this bucket row; we release the lock if we don't
282 * find an entry, otherwise the caller has to unlock after he is
283 * done.
284 */
285 THC_LOCK(&hc_head->hch_mtx);
286
287 /*
288 * Iterate through entries in bucket row looking for a match.
289 */
290 TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) {
291 if (inc->inc_flags & INC_ISIPV6) {
292 if (memcmp(&inc->inc6_faddr, &hc_entry->ip6,
293 sizeof(inc->inc6_faddr)) == 0)
294 return hc_entry;
295 } else {
296 if (memcmp(&inc->inc_faddr, &hc_entry->ip4,
297 sizeof(inc->inc_faddr)) == 0)
298 return hc_entry;
299 }
300 }
301
302 /*
303 * We were unsuccessful and didn't find anything.
304 */
305 THC_UNLOCK(&hc_head->hch_mtx);
306 return NULL;
307}
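/*
 * Editorial sketch of the locking contract: on a hit, tcp_hc_lookup()
 * returns with the bucket row mutex still held, and the caller must drop
 * it, exactly as tcp_hc_get() and tcp_hc_getmtu() below do. The reader
 * function here is invented for illustration.
 */
static u_long
example_read_rtt(struct in_conninfo *inc)
{
	struct hc_metrics *hc_entry;
	u_long rtt;

	hc_entry = tcp_hc_lookup(inc);	/* bucket row locked on success */
	if (hc_entry == NULL)
		return (0);		/* miss: lock already dropped */
	rtt = hc_entry->rmx_rtt;
	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
	return (rtt);
}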
308
309/*
310 * Internal function: insert an entry into the hostcache or return NULL if
311 * unable to allocate a new one.
312 *
313 * If an entry has been returned, the caller becomes responsible for
314 * unlocking the bucket row after he is done reading/modifying the entry.
315 */
316static struct hc_metrics *
317tcp_hc_insert(struct in_conninfo *inc)
318{
319 int hash;
320 struct hc_head *hc_head;
321 struct hc_metrics *hc_entry;
322
323 KASSERT(inc != NULL, ("tcp_hc_insert with NULL in_conninfo pointer"));
324
325 /*
326 * Hash the foreign ip address.
327 */
328 if (inc->inc_flags & INC_ISIPV6)
329 hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
330 else
331 hash = HOSTCACHE_HASH(&inc->inc_faddr);
332
333 hc_head = &V_tcp_hostcache.hashbase[hash];
334
335 /*
336 * Acquire lock for this bucket row; we release the lock if we don't
337 * find an entry, otherwise the caller has to unlock after he is
338 * done.
339 */
340 THC_LOCK(&hc_head->hch_mtx);
341
342 /*
343 * If the bucket limit is reached, reuse the least-used element.
344 */
345 if (hc_head->hch_length >= V_tcp_hostcache.bucket_limit ||
346 V_tcp_hostcache.cache_count >= V_tcp_hostcache.cache_limit) {
347 hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead);
348 /*
349 * At first we were dropping the last element, just to
350 * reacquire it in the next two lines again, which isn't very
351 * efficient. Instead just reuse the least used element.
352 * We may drop something that is still "in-use" but we can be
353 * "lossy".
354 * Just give up if this bucket row is empty and we don't have
355 * anything to replace.
356 */
357 if (hc_entry == NULL) {
358 THC_UNLOCK(&hc_head->hch_mtx);
359 return NULL;
360 }
361 TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q);
362 V_tcp_hostcache.hashbase[hash].hch_length--;
363 V_tcp_hostcache.cache_count--;
364 TCPSTAT_INC(tcps_hc_bucketoverflow);
365#if 0
366 uma_zfree(V_tcp_hostcache.zone, hc_entry);
367#endif
368 } else {
369 /*
370 * Allocate a new entry, or balk if not possible.
371 */
372 hc_entry = uma_zalloc(V_tcp_hostcache.zone, M_NOWAIT);
373 if (hc_entry == NULL) {
374 THC_UNLOCK(&hc_head->hch_mtx);
375 return NULL;
376 }
377 }
378
379 /*
380 * Initialize basic information of hostcache entry.
381 */
382 bzero(hc_entry, sizeof(*hc_entry));
383 if (inc->inc_flags & INC_ISIPV6)
384 bcopy(&inc->inc6_faddr, &hc_entry->ip6, sizeof(hc_entry->ip6));
385 else
386 hc_entry->ip4 = inc->inc_faddr;
387 hc_entry->rmx_head = hc_head;
388 hc_entry->rmx_expire = V_tcp_hostcache.expire;
389
390 /*
391 * Put it upfront.
392 */
393 TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q);
394 V_tcp_hostcache.hashbase[hash].hch_length++;
395 V_tcp_hostcache.cache_count++;
396 TCPSTAT_INC(tcps_hc_added);
397
398 return hc_entry;
399}
400
401/*
402 * External function: look up an entry in the hostcache and fill out the
403 * supplied TCP metrics structure. The structure is zeroed when no entry
404 * is found, and individual fields remain zero when a value is not set.
405 */
406void
407tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite)
408{
409 struct hc_metrics *hc_entry;
410
411 /*
412 * Find the right bucket.
413 */
414 hc_entry = tcp_hc_lookup(inc);
415
416 /*
417 * If we don't have an existing object.
418 */
419 if (hc_entry == NULL) {
420 bzero(hc_metrics_lite, sizeof(*hc_metrics_lite));
421 return;
422 }
423 hc_entry->rmx_hits++;
424 hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */
425
426 hc_metrics_lite->rmx_mtu = hc_entry->rmx_mtu;
427 hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh;
428 hc_metrics_lite->rmx_rtt = hc_entry->rmx_rtt;
429 hc_metrics_lite->rmx_rttvar = hc_entry->rmx_rttvar;
430 hc_metrics_lite->rmx_bandwidth = hc_entry->rmx_bandwidth;
431 hc_metrics_lite->rmx_cwnd = hc_entry->rmx_cwnd;
432 hc_metrics_lite->rmx_sendpipe = hc_entry->rmx_sendpipe;
433 hc_metrics_lite->rmx_recvpipe = hc_entry->rmx_recvpipe;
434
435 /*
436 * Unlock bucket row.
437 */
438 THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
439}
440
441/*
442 * External function: look up an entry in the hostcache and return the
443 * discovered path MTU. Returns 0 if no entry is found or the value is
444 * not set.
445 */
446u_long
447tcp_hc_getmtu(struct in_conninfo *inc)
448{
449 struct hc_metrics *hc_entry;
450 u_long mtu;
451
452 hc_entry = tcp_hc_lookup(inc);
453 if (hc_entry == NULL) {
454 return 0;
455 }
456 hc_entry->rmx_hits++;
457 hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */
458
459 mtu = hc_entry->rmx_mtu;
460 THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
461 return mtu;
462}
463
464/*
465 * External function: update the MTU value of an entry in the hostcache.
466 * Creates a new entry if none was found.
467 */
468void
469tcp_hc_updatemtu(struct in_conninfo *inc, u_long mtu)
470{
471 struct hc_metrics *hc_entry;
472
473 /*
474 * Find the right bucket.
475 */
476 hc_entry = tcp_hc_lookup(inc);
477
478 /*
479 * If we don't have an existing object, try to insert a new one.
480 */
481 if (hc_entry == NULL) {
482 hc_entry = tcp_hc_insert(inc);
483 if (hc_entry == NULL)
484 return;
485 }
486 hc_entry->rmx_updates++;
487 hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */
488
489 hc_entry->rmx_mtu = mtu;
490
491 /*
492 * Put it upfront so we find it faster next time.
493 */
494 TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
495 TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
496
497 /*
498 * Unlock bucket row.
499 */
500 THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
501}
502
503/*
504 * External function: update the TCP metrics of an entry in the hostcache.
505 * Creates a new entry if none was found.
506 */
507void
508tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml)
509{
510 struct hc_metrics *hc_entry;
511
512 hc_entry = tcp_hc_lookup(inc);
513 if (hc_entry == NULL) {
514 hc_entry = tcp_hc_insert(inc);
515 if (hc_entry == NULL)
516 return;
517 }
518 hc_entry->rmx_updates++;
519 hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */
520
521 if (hcml->rmx_rtt != 0) {
522 if (hc_entry->rmx_rtt == 0)
523 hc_entry->rmx_rtt = hcml->rmx_rtt;
524 else
525 hc_entry->rmx_rtt =
526 (hc_entry->rmx_rtt + hcml->rmx_rtt) / 2;
527 TCPSTAT_INC(tcps_cachedrtt);
528 }
529 if (hcml->rmx_rttvar != 0) {
530 if (hc_entry->rmx_rttvar == 0)
531 hc_entry->rmx_rttvar = hcml->rmx_rttvar;
532 else
533 hc_entry->rmx_rttvar =
534 (hc_entry->rmx_rttvar + hcml->rmx_rttvar) / 2;
535 TCPSTAT_INC(tcps_cachedrttvar);
536 }
537 if (hcml->rmx_ssthresh != 0) {
538 if (hc_entry->rmx_ssthresh == 0)
539 hc_entry->rmx_ssthresh = hcml->rmx_ssthresh;
540 else
541 hc_entry->rmx_ssthresh =
542 (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2;
543 TCPSTAT_INC(tcps_cachedssthresh);
544 }
545 if (hcml->rmx_bandwidth != 0) {
546 if (hc_entry->rmx_bandwidth == 0)
547 hc_entry->rmx_bandwidth = hcml->rmx_bandwidth;
548 else
549 hc_entry->rmx_bandwidth =
550 (hc_entry->rmx_bandwidth + hcml->rmx_bandwidth) / 2;
551 /* TCPSTAT_INC(tcps_cachedbandwidth); */
552 }
553 if (hcml->rmx_cwnd != 0) {
554 if (hc_entry->rmx_cwnd == 0)
555 hc_entry->rmx_cwnd = hcml->rmx_cwnd;
556 else
557 hc_entry->rmx_cwnd =
558 (hc_entry->rmx_cwnd + hcml->rmx_cwnd) / 2;
559 /* TCPSTAT_INC(tcps_cachedcwnd); */
560 }
561 if (hcml->rmx_sendpipe != 0) {
562 if (hc_entry->rmx_sendpipe == 0)
563 hc_entry->rmx_sendpipe = hcml->rmx_sendpipe;
564 else
565 hc_entry->rmx_sendpipe =
566 (hc_entry->rmx_sendpipe + hcml->rmx_sendpipe) / 2;
567 /* TCPSTAT_INC(tcps_cachedsendpipe); */
568 }
569 if (hcml->rmx_recvpipe != 0) {
570 if (hc_entry->rmx_recvpipe == 0)
571 hc_entry->rmx_recvpipe = hcml->rmx_recvpipe;
572 else
573 hc_entry->rmx_recvpipe =
574 (hc_entry->rmx_recvpipe + hcml->rmx_recvpipe) / 2;
575 /* TCPSTAT_INC(tcps_cachedrecvpipe); */
576 }
577
578 TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
579 TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
580 THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
581}
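/*
 * Editorial sketch: the (old + new) / 2 rule above is an exponentially
 * weighted moving average with gain 1/2, so the weight of history halves
 * on every update. Standalone userspace illustration:
 */
#include <stdio.h>

int
main(void)
{
	unsigned long rtt = 100;			/* cached value */
	unsigned long samples[] = { 60, 60, 60 };	/* new measurements */

	for (int i = 0; i < 3; i++) {
		rtt = (rtt + samples[i]) / 2;	/* same rule as tcp_hc_update() */
		printf("after sample %d: %lu\n", i + 1, rtt);	/* 80, 70, 65 */
	}
	return (0);
}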
582
583/*
584 * Sysctl function: prints the list and values of all hostcache entries in
585 * unsorted order.
586 */
587static int
588sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS)
589{
590 int bufsize;
591 int linesize = 128;
592 char *p, *buf;
593 int len, i, error;
594 struct hc_metrics *hc_entry;
595#ifdef INET6
596 char ip6buf[INET6_ADDRSTRLEN];
597#endif
598
599 bufsize = linesize * (V_tcp_hostcache.cache_count + 1);
600
601 p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO);
602
603 len = snprintf(p, linesize,
604 "\nIP address MTU SSTHRESH RTT RTTVAR BANDWIDTH "
605 " CWND SENDPIPE RECVPIPE HITS UPD EXP\n");
606 p += len;
607
608#define msec(u) (((u) + 500) / 1000)
609 for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
610 THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
611 TAILQ_FOREACH(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket,
612 rmx_q) {
613 len = snprintf(p, linesize,
614 "%-15s %5lu %8lu %6lums %6lums %9lu %8lu %8lu %8lu "
615 "%4lu %4lu %4i\n",
616 hc_entry->ip4.s_addr ? inet_ntoa(hc_entry->ip4) :
617#ifdef INET6
618 ip6_sprintf(ip6buf, &hc_entry->ip6),
619#else
620 "IPv6?",
621#endif
622 hc_entry->rmx_mtu,
623 hc_entry->rmx_ssthresh,
624 msec(hc_entry->rmx_rtt *
625 (RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
626 msec(hc_entry->rmx_rttvar *
627 (RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
628 hc_entry->rmx_bandwidth * 8,
629 hc_entry->rmx_cwnd,
630 hc_entry->rmx_sendpipe,
631 hc_entry->rmx_recvpipe,
632 hc_entry->rmx_hits,
633 hc_entry->rmx_updates,
634 hc_entry->rmx_expire);
635 p += len;
636 }
637 THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
638 }
639#undef msec
640 error = SYSCTL_OUT(req, buf, p - buf);
641 free(buf, M_TEMP);
642 return(error);
643}
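/*
 * Usage note (editorial, not part of this file): the handler above backs
 * the net.inet.tcp.hostcache.list OID declared earlier, so running
 * "sysctl net.inet.tcp.hostcache.list" from userland prints the header
 * line plus one line per cached host, taking each bucket row lock only
 * for the duration of that row's walk.
 */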
644
645/*
646 * Caller has to make sure the curvnet is set properly.
647 */
648static void
649tcp_hc_purge_internal(int all)
650{
651 struct hc_metrics *hc_entry, *hc_next;
652 int i;
653
654 for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
655 THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
656 TAILQ_FOREACH_SAFE(hc_entry,
657 &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q, hc_next) {
658 if (all || hc_entry->rmx_expire <= 0) {
659 TAILQ_REMOVE(&V_tcp_hostcache.hashbase[i].hch_bucket,
660 hc_entry, rmx_q);
661 uma_zfree(V_tcp_hostcache.zone, hc_entry);
662 V_tcp_hostcache.hashbase[i].hch_length--;
663 V_tcp_hostcache.cache_count--;
664 } else
665 hc_entry->rmx_expire -= V_tcp_hostcache.prune;
666 }
667 THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
668 }
669}
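/*
 * Worked example of the expiry arithmetic above (editorial note): with the
 * defaults expire = 60*60 = 3600s and prune = 5*60 = 300s, an entry that is
 * never hit or updated loses 300 from rmx_expire on each periodic run and
 * is freed after 3600/300 = 12 runs, i.e. about one hour. Any
 * tcp_hc_get()/tcp_hc_update() hit resets rmx_expire to the full 3600.
 */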
670
671/*
672 * Expire and purge (old|all) entries in the tcp_hostcache. Runs
673 * periodically from the callout.
674 */
675static void
676tcp_hc_purge(void *arg)
677{
678 CURVNET_SET((struct vnet *) arg);
679 int all = 0;
680
681 if (V_tcp_hostcache.purgeall) {
682 all = 1;
683 V_tcp_hostcache.purgeall = 0;
684 }
685
686 tcp_hc_purge_internal(all);
687
688 callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
689 tcp_hc_purge, arg);
690 CURVNET_RESTORE();
691}