1/*- 2 * Copyright (c) 2019 Mindaugas Rasiukevicius <rmind at noxt eu> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27/* 28 * NPF port map mechanism. 29 * 30 * The port map is a bitmap used to track TCP/UDP ports used for 31 * translation. Port maps are per IP addresses, therefore multiple 32 * NAT policies operating on the same IP address will share the 33 * same port map. 34 */ 35 36#ifdef _KERNEL 37#include <sys/cdefs.h> 38__KERNEL_RCSID(0, "$NetBSD: npf_portmap.c,v 1.7 2020/08/28 06:35:50 riastradh Exp $"); 39 40#include <sys/param.h> 41#include <sys/types.h> 42 43#include <sys/atomic.h> 44#include <sys/bitops.h> 45#include <sys/kmem.h> 46#include <sys/mutex.h> 47#include <sys/cprng.h> 48#include <sys/thmap.h> 49#endif 50 51#include "npf_impl.h" 52 53/* 54 * Port map uses two-level bitmaps with compression to efficiently 55 * represent the maximum of 65536 (2^16) values. 56 * 57 * Level 0: 64 chunks each representing 1048 bits in two modes: 58 * 59 * a) If PORTMAP_L1_TAG, then up to 5 values are packed in the 60 * 64-bit integer using 12 bits for each value, starting from the 61 * most significant bits. The four 4 least significant bits are 62 * unused or reserved for pointer tagging. 63 * 64 * b) If there are more than 5 values, then PORTMAP_L1_TAG is set 65 * and the value serves as a pointer to the second level bitmap. 66 * 67 * Level 1: 16 chunks each representing 64 bits in plain uint64_t. 68 */ 69 70#define PORTMAP_MAX_BITS (65536U) 71#define PORTMAP_MASK (PORTMAP_MAX_BITS - 1) 72 73#define PORTMAP_L0_SHIFT (10) // or 11 74#define PORTMAP_L0_MASK ((1U << PORTMAP_L0_SHIFT) - 1) 75#define PORTMAP_L0_WORDS (PORTMAP_MAX_BITS >> PORTMAP_L0_SHIFT) 76 77#define PORTMAP_L1_SHIFT (6) 78#define PORTMAP_L1_MASK ((1U << PORTMAP_L1_SHIFT) - 1) 79#define PORTMAP_L1_WORDS \ 80 ((PORTMAP_MAX_BITS / PORTMAP_L0_WORDS) >> PORTMAP_L1_SHIFT) 81 82#define PORTMAP_L1_TAG (UINT64_C(1)) // use level 1 83#define PORTMAP_L1_GET(p) ((void *)((uintptr_t)(p) & ~(uintptr_t)3)) 84 85CTASSERT(sizeof(uint64_t) >= sizeof(uintptr_t)); 86 87typedef struct { 88 volatile uint64_t bits1[PORTMAP_L1_WORDS]; 89} bitmap_l1_t; 90 91typedef struct bitmap { 92 npf_addr_t addr; 93 volatile uint64_t bits0[PORTMAP_L0_WORDS]; 94 LIST_ENTRY(bitmap) entry; 95 unsigned addr_len; 96} bitmap_t; 97 98#define NPF_PORTMAP_MINPORT 1024 99#define NPF_PORTMAP_MAXPORT 65535 100 101struct npf_portmap { 102 thmap_t * addr_map; 103 LIST_HEAD(, bitmap) bitmap_list; 104 kmutex_t list_lock; 105 int min_port; 106 int max_port; 107}; 108 109static kmutex_t portmap_lock; 110 111void 112npf_portmap_sysinit(void) 113{ 114 115 mutex_init(&portmap_lock, MUTEX_DEFAULT, IPL_SOFTNET); 116} 117 118void 119npf_portmap_sysfini(void) 120{ 121 122 mutex_destroy(&portmap_lock); 123} 124 125void 126npf_portmap_init(npf_t *npf) 127{ 128 npf_portmap_t *pm = npf_portmap_create( 129 NPF_PORTMAP_MINPORT, NPF_PORTMAP_MAXPORT); 130 npf_param_t param_map[] = { 131 { 132 "portmap.min_port", 133 &pm->min_port, 134 .default_val = NPF_PORTMAP_MINPORT, 135 .min = 1024, .max = 65535 136 }, 137 { 138 "portmap.max_port", 139 &pm->max_port, 140 .default_val = 49151, // RFC 6335 141 .min = 1024, .max = 65535 142 } 143 }; 144 145 npf_param_register(npf, param_map, __arraycount(param_map)); 146 npf->portmap = pm; 147} 148 149void 150npf_portmap_fini(npf_t *npf) 151{ 152 153 npf_portmap_destroy(npf->portmap); 154 npf->portmap = NULL; // diagnostic 155} 156 157npf_portmap_t * 158npf_portmap_create(int min_port, int max_port) 159{ 160 npf_portmap_t *pm; 161 162 pm = kmem_zalloc(sizeof(npf_portmap_t), KM_SLEEP); 163 mutex_init(&pm->list_lock, MUTEX_DEFAULT, IPL_SOFTNET); 164 pm->addr_map = thmap_create(0, NULL, THMAP_NOCOPY); 165 pm->min_port = min_port; 166 pm->max_port = max_port; 167 return pm; 168} 169 170void 171npf_portmap_destroy(npf_portmap_t *pm) 172{ 173 npf_portmap_flush(pm); 174 KASSERT(LIST_EMPTY(&pm->bitmap_list)); 175 176 thmap_destroy(pm->addr_map); 177 mutex_destroy(&pm->list_lock); 178 kmem_free(pm, sizeof(npf_portmap_t)); 179} 180 181///////////////////////////////////////////////////////////////////////// 182 183#if defined(_LP64) 184#define __npf_atomic_cas_64 atomic_cas_64 185#else 186static uint64_t 187__npf_atomic_cas_64(volatile uint64_t *ptr, uint64_t old, uint64_t new) 188{ 189 uint64_t prev; 190 191 mutex_enter(&portmap_lock); 192 prev = *ptr; 193 if (prev == old) { 194 *ptr = new; 195 } 196 mutex_exit(&portmap_lock); 197 198 return prev; 199} 200#endif 201 202/* 203 * bitmap_word_isset: test whether the bit value is in the packed array. 204 * 205 * => Return true if any value equals the bit number value. 206 * 207 * Packed array: 60 MSB bits, 5 values, 12 bits each. 208 * 209 * Reference: "Bit Twiddling Hacks" by S.E. Anderson, Stanford. 210 * Based on the hasvalue() and haszero() ideas. Since values are 211 * represented by upper 60 bits, we shift right by 4. 212 */ 213static bool 214bitmap_word_isset(uint64_t x, unsigned bit) 215{ 216 uint64_t m, r; 217 218 bit++; 219 KASSERT((x & PORTMAP_L1_TAG) == 0); 220 KASSERT(bit <= (PORTMAP_L0_MASK + 1)); 221 222 m = (x >> 4) ^ (UINT64_C(0x1001001001001) * bit); 223 r = (m - UINT64_C(0x1001001001001)) & (~m & UINT64_C(0x800800800800800)); 224 return r != 0; 225} 226 227/* 228 * bitmap_word_cax: compare-and-xor on packed array elements. 229 */ 230static uint64_t 231bitmap_word_cax(uint64_t x, int exp, int bit) 232{ 233 unsigned e = exp + 1; 234 235 /* 236 * We need to distinguish "no value" from zero. Just add one, 237 * since we use 12 bits to represent 11 bit values. 238 */ 239 bit++; 240 KASSERT((unsigned)bit <= (PORTMAP_L0_MASK + 1)); 241 KASSERT((x & PORTMAP_L1_TAG) == 0); 242 243 if (((x >> 52) & 0xfff) == e) 244 return x ^ ((uint64_t)bit << 52); 245 if (((x >> 40) & 0xfff) == e) 246 return x ^ ((uint64_t)bit << 40); 247 if (((x >> 28) & 0xfff) == e) 248 return x ^ ((uint64_t)bit << 28); 249 if (((x >> 16) & 0xfff) == e) 250 return x ^ ((uint64_t)bit << 16); 251 if (((x >> 4) & 0xfff) == e) 252 return x ^ ((uint64_t)bit << 4); 253 return 0; 254} 255 256static unsigned 257bitmap_word_unpack(uint64_t x, unsigned bitvals[static 5]) 258{ 259 unsigned n = 0; 260 uint64_t v; 261 262 KASSERT((x & PORTMAP_L1_TAG) == 0); 263 264 if ((v = ((x >> 52)) & 0xfff) != 0) 265 bitvals[n++] = v - 1; 266 if ((v = ((x >> 40)) & 0xfff) != 0) 267 bitvals[n++] = v - 1; 268 if ((v = ((x >> 28)) & 0xfff) != 0) 269 bitvals[n++] = v - 1; 270 if ((v = ((x >> 16)) & 0xfff) != 0) 271 bitvals[n++] = v - 1; 272 if ((v = ((x >> 4)) & 0xfff) != 0) 273 bitvals[n++] = v - 1; 274 return n; 275} 276 277#if 0 278static bool 279bitmap_isset(const bitmap_t *bm, unsigned bit) 280{ 281 unsigned i, chunk_bit; 282 uint64_t bval, b; 283 bitmap_l1_t *bm1; 284 285 KASSERT(bit < PORTMAP_MAX_BITS); 286 i = bit >> PORTMAP_L0_SHIFT; 287 bval = atomic_load_relaxed(&bm->bits0[i]); 288 289 /* 290 * Empty check. Note: we can test the whole word against zero, 291 * since zero bit values in the packed array result in bits set. 292 */ 293 if (bval == 0) 294 return false; 295 296 /* Level 0 check. */ 297 chunk_bit = bit & PORTMAP_L0_MASK; 298 if ((bval & PORTMAP_L1_TAG) == 0) 299 return bitmap_word_isset(bval, chunk_bit); 300 301 /* Level 1 check. */ 302 bm1 = PORTMAP_L1_GET(bval); 303 KASSERT(bm1 != NULL); 304 i = chunk_bit >> PORTMAP_L1_SHIFT; 305 b = UINT64_C(1) << (chunk_bit & PORTMAP_L1_MASK); 306 return (bm1->bits1[i] & b) != 0; 307} 308#endif 309 310static bool 311bitmap_set(bitmap_t *bm, unsigned bit) 312{ 313 unsigned i, chunk_bit; 314 uint64_t bval, b, oval, nval; 315 bitmap_l1_t *bm1; 316again: 317 KASSERT(bit < PORTMAP_MAX_BITS); 318 i = bit >> PORTMAP_L0_SHIFT; 319 chunk_bit = bit & PORTMAP_L0_MASK; 320 bval = bm->bits0[i]; 321 322 if ((bval & PORTMAP_L1_TAG) == 0) { 323 unsigned n = 0, bitvals[5]; 324 uint64_t bm1p; 325 326 if (bitmap_word_isset(bval, chunk_bit)) { 327 return false; 328 } 329 330 /* 331 * Look for a zero-slot and put a value there. 332 */ 333 if ((nval = bitmap_word_cax(bval, -1, chunk_bit)) != 0) { 334 KASSERT((nval & PORTMAP_L1_TAG) == 0); 335 if (__npf_atomic_cas_64(&bm->bits0[i], bval, nval) != bval) { 336 goto again; 337 } 338 return true; 339 } 340 341 /* 342 * Full: allocate L1 block and copy over the current 343 * values into the level. 344 */ 345 bm1 = kmem_intr_zalloc(sizeof(bitmap_l1_t), KM_NOSLEEP); 346 if (bm1 == NULL) { 347 return false; // error 348 } 349 n = bitmap_word_unpack(bval, bitvals); 350 while (n--) { 351 const unsigned v = bitvals[n]; 352 const unsigned off = v >> PORTMAP_L1_SHIFT; 353 354 KASSERT(v <= PORTMAP_L0_MASK); 355 KASSERT(off < (sizeof(uint64_t) * CHAR_BIT)); 356 bm1->bits1[off] |= UINT64_C(1) << (v & PORTMAP_L1_MASK); 357 } 358 359 /* 360 * Attempt to set the L1 structure. Note: there is no 361 * ABA problem since the we compare the actual values. 362 * Note: CAS serves as a memory barrier. 363 */ 364 bm1p = (uintptr_t)bm1; 365 KASSERT((bm1p & PORTMAP_L1_TAG) == 0); 366 bm1p |= PORTMAP_L1_TAG; 367 if (__npf_atomic_cas_64(&bm->bits0[i], bval, bm1p) != bval) { 368 kmem_intr_free(bm1, sizeof(bitmap_l1_t)); 369 goto again; 370 } 371 bval = bm1p; 372 } 373 374 bm1 = PORTMAP_L1_GET(bval); 375 KASSERT(bm1 != NULL); 376 i = chunk_bit >> PORTMAP_L1_SHIFT; 377 b = UINT64_C(1) << (chunk_bit & PORTMAP_L1_MASK); 378 379 oval = bm1->bits1[i]; 380 if (oval & b) { 381 return false; 382 } 383 nval = oval | b; 384 if (__npf_atomic_cas_64(&bm1->bits1[i], oval, nval) != oval) { 385 goto again; 386 } 387 return true; 388} 389 390static bool 391bitmap_clr(bitmap_t *bm, unsigned bit) 392{ 393 unsigned i, chunk_bit; 394 uint64_t bval, b, oval, nval; 395 bitmap_l1_t *bm1; 396again: 397 KASSERT(bit < PORTMAP_MAX_BITS); 398 i = bit >> PORTMAP_L0_SHIFT; 399 chunk_bit = bit & PORTMAP_L0_MASK; 400 bval = bm->bits0[i]; 401 402 if ((bval & PORTMAP_L1_TAG) == 0) { 403 if (!bitmap_word_isset(bval, chunk_bit)) { 404 return false; 405 } 406 nval = bitmap_word_cax(bval, chunk_bit, chunk_bit); 407 KASSERT((nval & PORTMAP_L1_TAG) == 0); 408 if (__npf_atomic_cas_64(&bm->bits0[i], bval, nval) != bval) { 409 goto again; 410 } 411 return true; 412 } 413 414 bm1 = PORTMAP_L1_GET(bval); 415 KASSERT(bm1 != NULL); 416 i = chunk_bit >> PORTMAP_L1_SHIFT; 417 b = UINT64_C(1) << (chunk_bit & PORTMAP_L1_MASK); 418 419 oval = bm1->bits1[i]; 420 if ((oval & b) == 0) { 421 return false; 422 } 423 nval = oval & ~b; 424 if (__npf_atomic_cas_64(&bm1->bits1[i], oval, nval) != oval) { 425 goto again; 426 } 427 return true; 428} 429 430///////////////////////////////////////////////////////////////////////// 431 432static bitmap_t * 433npf_portmap_autoget(npf_portmap_t *pm, unsigned alen, const npf_addr_t *addr) 434{ 435 bitmap_t *bm; 436 437 KASSERT(pm && pm->addr_map); 438 KASSERT(alen && alen <= sizeof(npf_addr_t)); 439 440 /* Lookup the port map for this address. */ 441 bm = thmap_get(pm->addr_map, addr, alen); 442 if (bm == NULL) { 443 void *ret; 444 445 /* 446 * Allocate a new port map for this address and 447 * attempt to insert it. 448 */ 449 bm = kmem_intr_zalloc(sizeof(bitmap_t), KM_NOSLEEP); 450 if (bm == NULL) { 451 return NULL; 452 } 453 memcpy(&bm->addr, addr, alen); 454 bm->addr_len = alen; 455 456 int s = splsoftnet(); 457 ret = thmap_put(pm->addr_map, &bm->addr, alen, bm); 458 splx(s); 459 460 if (ret == bm) { 461 /* Success: insert the bitmap into the list. */ 462 mutex_enter(&pm->list_lock); 463 LIST_INSERT_HEAD(&pm->bitmap_list, bm, entry); 464 mutex_exit(&pm->list_lock); 465 } else { 466 /* Race: use an existing bitmap. */ 467 kmem_free(bm, sizeof(bitmap_t)); 468 bm = ret; 469 } 470 } 471 return bm; 472} 473 474/* 475 * npf_portmap_flush: free all bitmaps and remove all addresses. 476 * 477 * => Concurrent calls to this routine are not allowed; therefore no 478 * need to acquire locks. 479 */ 480void 481npf_portmap_flush(npf_portmap_t *pm) 482{ 483 bitmap_t *bm; 484 485 while ((bm = LIST_FIRST(&pm->bitmap_list)) != NULL) { 486 for (unsigned i = 0; i < PORTMAP_L0_WORDS; i++) { 487 uintptr_t bm1 = bm->bits0[i]; 488 489 if (bm1 & PORTMAP_L1_TAG) { 490 bitmap_l1_t *bm1p = PORTMAP_L1_GET(bm1); 491 kmem_intr_free(bm1p, sizeof(bitmap_l1_t)); 492 } 493 bm->bits0[i] = UINT64_C(0); 494 } 495 LIST_REMOVE(bm, entry); 496 thmap_del(pm->addr_map, &bm->addr, bm->addr_len); 497 kmem_intr_free(bm, sizeof(bitmap_t)); 498 } 499 /* Note: the caller ensures there are no active references. */ 500 thmap_gc(pm->addr_map, thmap_stage_gc(pm->addr_map)); 501} 502 503/* 504 * npf_portmap_get: allocate and return a port from the given portmap. 505 * 506 * => Returns the port value in network byte-order. 507 * => Zero indicates a failure. 508 */ 509in_port_t 510npf_portmap_get(npf_portmap_t *pm, int alen, const npf_addr_t *addr) 511{ 512 const unsigned min_port = atomic_load_relaxed(&pm->min_port); 513 const unsigned max_port = atomic_load_relaxed(&pm->max_port); 514 const unsigned port_delta = max_port - min_port + 1; 515 unsigned bit, target; 516 bitmap_t *bm; 517 518 /* Sanity check: the user might set incorrect parameters. */ 519 if (__predict_false(min_port > max_port)) { 520 return 0; 521 } 522 523 bm = npf_portmap_autoget(pm, alen, addr); 524 if (__predict_false(bm == NULL)) { 525 /* No memory. */ 526 return 0; 527 } 528 529 /* Randomly select a port. */ 530 target = min_port + (cprng_fast32() % port_delta); 531 bit = target; 532next: 533 if (bitmap_set(bm, bit)) { 534 /* Success. */ 535 return htons(bit); 536 } 537 bit = min_port + ((bit + 1) % port_delta); 538 if (target != bit) { 539 /* Next.. */ 540 goto next; 541 } 542 /* No space. */ 543 return 0; 544} 545 546/* 547 * npf_portmap_take: allocate a specific port in the portmap. 548 */ 549bool 550npf_portmap_take(npf_portmap_t *pm, int alen, 551 const npf_addr_t *addr, in_port_t port) 552{ 553 bitmap_t *bm = npf_portmap_autoget(pm, alen, addr); 554 555 port = ntohs(port); 556 if (!bm || port < pm->min_port || port > pm->max_port) { 557 /* Out of memory / invalid port. */ 558 return false; 559 } 560 return bitmap_set(bm, port); 561} 562 563/* 564 * npf_portmap_put: release the port, making it available in the portmap. 565 * 566 * => The port value should be in network byte-order. 567 */ 568void 569npf_portmap_put(npf_portmap_t *pm, int alen, 570 const npf_addr_t *addr, in_port_t port) 571{ 572 bitmap_t *bm; 573 574 bm = npf_portmap_autoget(pm, alen, addr); 575 if (bm) { 576 port = ntohs(port); 577 bitmap_clr(bm, port); 578 } 579} 580