// SPDX-License-Identifier: GPL-2.0-only
/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>
#include <net/ip.h>

#include "nf_internals.h"

__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);

__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);

struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);

struct conntrack_gc_work {
	struct delayed_work	dwork;
	u32			next_bucket;
	u32			avg_timeout;
	u32			count;
	u32			start_time;
	bool			exiting;
	bool			early_drop;
};

static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;

/* serialize hash resizes and nf_ct_iterate_cleanup */
static DEFINE_MUTEX(nf_conntrack_mutex);

#define GC_SCAN_INTERVAL_MAX	(60ul * HZ)
#define GC_SCAN_INTERVAL_MIN	(1ul * HZ)

/* clamp timeouts to this value (TCP unacked) */
#define GC_SCAN_INTERVAL_CLAMP	(300ul * HZ)

/* Initial bias pretending we have 100 entries at the upper bound so we don't
 * wakeup often just because we have three entries with a 1s timeout while still
 * allowing non-idle machines to wakeup more often when needed.
 */
#define GC_SCAN_INITIAL_COUNT	100
#define GC_SCAN_INTERVAL_INIT	GC_SCAN_INTERVAL_MAX

#define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)
#define GC_SCAN_EXPIRED_MAX	(64000u / HZ)

#define MIN_CHAINLEN	50u
#define MAX_CHAINLEN	(80u - MIN_CHAINLEN)

static struct conntrack_gc_work conntrack_gc_work;

void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
	/* 1) Acquire the lock */
	spin_lock(lock);

	/* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
	 * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
	 */
	if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
		return;

	/* fast path failed, unlock */
	spin_unlock(lock);

	/* Slow path 1) get global lock */
	spin_lock(&nf_conntrack_locks_all_lock);

	/* Slow path 2) get the lock we want */
	spin_lock(lock);

	/* Slow path 3) release the global lock */
	spin_unlock(&nf_conntrack_locks_all_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}

/* return true if we need to recompute hashes (in case hash table was resized) */
static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
				     unsigned int h2, unsigned int sequence)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	if (h1 <= h2) {
		nf_conntrack_lock(&nf_conntrack_locks[h1]);
		if (h1 != h2)
			spin_lock_nested(&nf_conntrack_locks[h2],
					 SINGLE_DEPTH_NESTING);
	} else {
		nf_conntrack_lock(&nf_conntrack_locks[h2]);
		spin_lock_nested(&nf_conntrack_locks[h1],
				 SINGLE_DEPTH_NESTING);
	}
	if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
		nf_conntrack_double_unlock(h1, h2);
		return true;
	}
	return false;
}

static void nf_conntrack_all_lock(void)
	__acquires(&nf_conntrack_locks_all_lock)
{
	int i;

	spin_lock(&nf_conntrack_locks_all_lock);

	/* For nf_conntrack_locks_all, only the latest time when another
	 * CPU will see an update is controlled by the "release" of the
	 * spin_lock below.
	 * The earliest time is not controlled, and thus KCSAN could detect
	 * a race when nf_conntrack_lock() reads the variable.
	 * WRITE_ONCE() is used to ensure the compiler will not
	 * optimize the write.
	 */
	WRITE_ONCE(nf_conntrack_locks_all, true);

	for (i = 0; i < CONNTRACK_LOCKS; i++) {
		spin_lock(&nf_conntrack_locks[i]);

		/* This spin_unlock provides the "release" to ensure that
		 * nf_conntrack_locks_all==true is visible to everyone that
		 * acquired spin_lock(&nf_conntrack_locks[]).
		 */
		spin_unlock(&nf_conntrack_locks[i]);
	}
}

static void nf_conntrack_all_unlock(void)
	__releases(&nf_conntrack_locks_all_lock)
{
	/* All prior stores must be complete before we clear
	 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
	 * might observe the false value but not the entire
	 * critical section.
	 * It pairs with the smp_load_acquire() in nf_conntrack_lock()
	 */
	smp_store_release(&nf_conntrack_locks_all, false);
	spin_unlock(&nf_conntrack_locks_all_lock);
}
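
/* Illustrative note (not part of the original source): writers that must
 * exclude every per-bucket lock holder, e.g. the hash resize path, bracket
 * their work with nf_conntrack_all_lock()/nf_conntrack_all_unlock(), while
 * packet-path code only takes the bucket lock through nf_conntrack_lock().
 * A rough usage sketch:
 *
 *	nf_conntrack_all_lock();
 *	... swap hash table pointers ...
 *	nf_conntrack_all_unlock();
 */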

unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
static siphash_aligned_key_t nf_conntrack_hash_rnd;

static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
			      unsigned int zoneid,
			      const struct net *net)
{
	siphash_key_t key;

	get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));

	key = nf_conntrack_hash_rnd;

	key.key[0] ^= zoneid;
	key.key[1] ^= net_hash_mix(net);

	return siphash((void *)tuple,
		       offsetofend(struct nf_conntrack_tuple, dst.__nfct_hash_offsetend),
		       &key);
}

static u32 scale_hash(u32 hash)
{
	return reciprocal_scale(hash, nf_conntrack_htable_size);
}

static u32 __hash_conntrack(const struct net *net,
			    const struct nf_conntrack_tuple *tuple,
			    unsigned int zoneid,
			    unsigned int size)
{
	return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size);
}

static u32 hash_conntrack(const struct net *net,
			  const struct nf_conntrack_tuple *tuple,
			  unsigned int zoneid)
{
	return scale_hash(hash_conntrack_raw(tuple, zoneid, net));
}
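
/* Illustrative note (not part of the original source): reciprocal_scale()
 * maps the 32-bit siphash value into [0, size) without a modulo, roughly
 *
 *	bucket = ((u64)hash * size) >> 32;
 *
 * so the raw hash depends only on the tuple, the zone id and the per-netns
 * mix, while the bucket additionally depends on the current table size.
 */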

static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
				  unsigned int dataoff,
				  struct nf_conntrack_tuple *tuple)
{	struct {
		__be16 sport;
		__be16 dport;
	} _inet_hdr, *inet_hdr;

	/* Actually only need first 4 bytes to get ports. */
	inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
	if (!inet_hdr)
		return false;

	tuple->src.u.udp.port = inet_hdr->sport;
	tuple->dst.u.udp.port = inet_hdr->dport;
	return true;
}

static bool
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct net *net,
		struct nf_conntrack_tuple *tuple)
{
	unsigned int size;
	const __be32 *ap;
	__be32 _addrs[8];

	memset(tuple, 0, sizeof(*tuple));

	tuple->src.l3num = l3num;
	switch (l3num) {
	case NFPROTO_IPV4:
		nhoff += offsetof(struct iphdr, saddr);
		size = 2 * sizeof(__be32);
		break;
	case NFPROTO_IPV6:
		nhoff += offsetof(struct ipv6hdr, saddr);
		size = sizeof(_addrs);
		break;
	default:
		return true;
	}

	ap = skb_header_pointer(skb, nhoff, size, _addrs);
	if (!ap)
		return false;

	switch (l3num) {
	case NFPROTO_IPV4:
		tuple->src.u3.ip = ap[0];
		tuple->dst.u3.ip = ap[1];
		break;
	case NFPROTO_IPV6:
		memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
		memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
		break;
	}

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	switch (protonum) {
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_ICMP:
		return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return gre_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
	case IPPROTO_DCCP:
#endif
		/* fallthrough */
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
	default:
		break;
	}

	return true;
}

static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u_int8_t *protonum)
{
	int dataoff = -1;
	const struct iphdr *iph;
	struct iphdr _iph;

	iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
	if (!iph)
		return -1;

	/* Conntrack defragments packets, we might still see fragments
	 * inside ICMP packets though.
	 */
	if (iph->frag_off & htons(IP_OFFSET))
		return -1;

	dataoff = nhoff + (iph->ihl << 2);
	*protonum = iph->protocol;

	/* Check bogus IP headers */
	if (dataoff > skb->len) {
		pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
			 nhoff, iph->ihl << 2, skb->len);
		return -1;
	}
	return dataoff;
}

#if IS_ENABLED(CONFIG_IPV6)
static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u8 *protonum)
{
	int protoff = -1;
	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
	__be16 frag_off;
	u8 nexthdr;

	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
			  &nexthdr, sizeof(nexthdr)) != 0) {
		pr_debug("can't get nexthdr\n");
		return -1;
	}
	protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
	/*
	 * (protoff == skb->len) means the packet has no data, just
	 * IPv6 and possibly extension headers, but it is tracked anyway
	 */
	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
		pr_debug("can't find proto in pkt\n");
		return -1;
	}

	*protonum = nexthdr;
	return protoff;
}
#endif

static int get_l4proto(const struct sk_buff *skb,
		       unsigned int nhoff, u8 pf, u8 *l4num)
{
	switch (pf) {
	case NFPROTO_IPV4:
		return ipv4_get_l4proto(skb, nhoff, l4num);
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6:
		return ipv6_get_l4proto(skb, nhoff, l4num);
#endif
	default:
		*l4num = 0;
		break;
	}
	return -1;
}

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num,
		       struct net *net, struct nf_conntrack_tuple *tuple)
{
	u8 protonum;
	int protoff;

	protoff = get_l4proto(skb, nhoff, l3num, &protonum);
	if (protoff <= 0)
		return false;

	return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;

	switch (orig->src.l3num) {
	case NFPROTO_IPV4:
		inverse->src.u3.ip = orig->dst.u3.ip;
		inverse->dst.u3.ip = orig->src.u3.ip;
		break;
	case NFPROTO_IPV6:
		inverse->src.u3.in6 = orig->dst.u3.in6;
		inverse->dst.u3.in6 = orig->src.u3.in6;
		break;
	default:
		break;
	}

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;

	switch (orig->dst.protonum) {
	case IPPROTO_ICMP:
		return nf_conntrack_invert_icmp_tuple(inverse, orig);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
#endif
	}

	inverse->src.u.all = orig->dst.u.all;
	inverse->dst.u.all = orig->src.u.all;
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
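
/* Illustrative example (not part of the original source): for a UDP flow
 * 10.0.0.1:1234 -> 10.0.0.2:53 the inverted tuple used for the REPLY
 * direction is 10.0.0.2:53 -> 10.0.0.1:1234, with the protocol number
 * carried over and the direction flag flipped.
 */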

/* Generate an almost-unique pseudo-id for a given conntrack.
 *
 * intentionally doesn't re-use any of the seeds used for hash
 * table location, we assume id gets exposed to userspace.
 *
 * Following nf_conn items do not change throughout lifetime
 * of the nf_conn:
 *
 * 1. nf_conn address
 * 2. nf_conn->master address (normally NULL)
 * 3. the associated net namespace
 * 4. the original direction tuple
 */
u32 nf_ct_get_id(const struct nf_conn *ct)
{
	static siphash_aligned_key_t ct_id_seed;
	unsigned long a, b, c, d;

	net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));

	a = (unsigned long)ct;
	b = (unsigned long)ct->master;
	c = (unsigned long)nf_ct_net(ct);
	d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				   sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
				   &ct_id_seed);
#ifdef CONFIG_64BIT
	return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
#else
	return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
#endif
}
EXPORT_SYMBOL_GPL(nf_ct_get_id);

static void
clean_from_lists(struct nf_conn *ct)
{
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);

	/* Destroy all pending expectations */
	nf_ct_remove_expectations(ct);
}

#define NFCT_ALIGN(len)	(((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)

/* Released via nf_ct_destroy() */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
				 const struct nf_conntrack_zone *zone,
				 gfp_t flags)
{
	struct nf_conn *tmpl, *p;

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
		tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
		if (!tmpl)
			return NULL;

		p = tmpl;
		tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
		if (tmpl != p) {
			tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
			tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
		}
	} else {
		tmpl = kzalloc(sizeof(*tmpl), flags);
		if (!tmpl)
			return NULL;
	}

	tmpl->status = IPS_TEMPLATE;
	write_pnet(&tmpl->ct_net, net);
	nf_ct_zone_add(tmpl, zone);
	refcount_set(&tmpl->ct_general.use, 1);

	return tmpl;
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);

void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
	kfree(tmpl->ext);

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
		kfree((char *)tmpl - tmpl->proto.tmpl_padto);
	else
		kfree(tmpl);
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);

static void destroy_gre_conntrack(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CT_PROTO_GRE
	struct nf_conn *master = ct->master;

	if (master)
		nf_ct_gre_keymap_destroy(master);
#endif
}

void nf_ct_destroy(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;

	WARN_ON(refcount_read(&nfct->use) != 0);

	if (unlikely(nf_ct_is_template(ct))) {
		nf_ct_tmpl_free(ct);
		return;
	}

	if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
		destroy_gre_conntrack(ct);

	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too.
	 */
	nf_ct_remove_expectations(ct);

	if (ct->master)
		nf_ct_put(ct->master);

	nf_conntrack_free(ct);
}
EXPORT_SYMBOL(nf_ct_destroy);
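
/* Illustrative note (not part of the original source): nf_ct_destroy() is
 * the refcount-zero path and is not called directly by users.  A typical
 * borrow-and-release pattern looks roughly like:
 *
 *	if (refcount_inc_not_zero(&ct->ct_general.use)) {
 *		... use ct ...
 *		nf_ct_put(ct);	// last put ends up in nf_ct_destroy()
 *	}
 */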

static void __nf_ct_delete_from_lists(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	unsigned int sequence;

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	clean_from_lists(ct);
	nf_conntrack_double_unlock(hash, reply_hash);
}

static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
	nf_ct_helper_destroy(ct);
	local_bh_disable();

	__nf_ct_delete_from_lists(ct);

	local_bh_enable();
}

static void nf_ct_add_to_ecache_list(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));

	spin_lock(&cnet->ecache.dying_lock);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &cnet->ecache.dying_list);
	spin_unlock(&cnet->ecache.dying_lock);
#endif
}

bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
	struct nf_conn_tstamp *tstamp;
	struct net *net;

	if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
		return false;

	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp) {
		s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;

		tstamp->stop = ktime_get_real_ns();
		if (timeout < 0)
			tstamp->stop -= jiffies_to_nsecs(-timeout);
	}

	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
				      portid, report) < 0) {
		/* destroy event was not delivered. nf_ct_put will
		 * be done by event cache worker on redelivery.
		 */
		nf_ct_helper_destroy(ct);
		local_bh_disable();
		__nf_ct_delete_from_lists(ct);
		nf_ct_add_to_ecache_list(ct);
		local_bh_enable();

		nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
		return false;
	}

	net = nf_ct_net(ct);
	if (nf_conntrack_ecache_dwork_pending(net))
		nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT);
	nf_ct_delete_from_lists(ct);
	nf_ct_put(ct);
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_delete);

static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
		const struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_zone *zone,
		const struct net *net)
{
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

	/* A conntrack can be recreated with the equal tuple,
	 * so we need to check that the conntrack is confirmed
	 */
	return nf_ct_tuple_equal(tuple, &h->tuple) &&
	       nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
	       nf_ct_is_confirmed(ct) &&
	       net_eq(net, nf_ct_net(ct));
}

static inline bool
nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
{
	return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
	       nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
				 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
	       net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
}

/* caller must hold rcu readlock and none of the nf_conntrack_locks */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
	if (!refcount_inc_not_zero(&ct->ct_general.use))
		return;

	/* load ->status after refcount increase */
	smp_acquire__after_ctrl_dep();

	if (nf_ct_should_gc(ct))
		nf_ct_kill(ct);

	nf_ct_put(ct);
}

/*
 * Warning :
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 */
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	struct hlist_nulls_node *n;
	unsigned int bucket, hsize;

begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	bucket = reciprocal_scale(hash, hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
		struct nf_conn *ct;

		ct = nf_ct_tuplehash_to_ctrack(h);
		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net))
			return h;
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(n) != bucket) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	return NULL;
}

/* Find a connection corresponding to a tuple. */
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
			const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	h = ____nf_conntrack_find(net, zone, tuple, hash);
	if (h) {
		/* We have a candidate that matches the tuple we're interested
		 * in, try to obtain a reference and re-check tuple
		 */
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (likely(refcount_inc_not_zero(&ct->ct_general.use))) {
			/* re-check key after refcount */
			smp_acquire__after_ctrl_dep();

			if (likely(nf_ct_key_equal(h, tuple, zone, net)))
				return h;

			/* TYPESAFE_BY_RCU recycled the candidate */
			nf_ct_put(ct);
		}

		h = NULL;
	}

	return h;
}

struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple)
{
	unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	struct nf_conntrack_tuple_hash *thash;

	rcu_read_lock();

	thash = __nf_conntrack_find_get(net, zone, tuple,
					hash_conntrack_raw(tuple, zone_id, net));

	if (thash)
		goto out_unlock;

	rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
	if (rid != zone_id)
		thash = __nf_conntrack_find_get(net, zone, tuple,
						hash_conntrack_raw(tuple, rid, net));

out_unlock:
	rcu_read_unlock();
	return thash;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
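
/* Illustrative usage sketch (not part of the original source): a reader
 * that wants the nf_conn behind a tuple typically does
 *
 *	h = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple);
 *	if (h) {
 *		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 *		... inspect ct ...
 *		nf_ct_put(ct);	// drop the reference taken by the lookup
 *	}
 *
 * The returned entry already holds a reference, so it stays valid after
 * the RCU read side is left.
 */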

static void __nf_conntrack_hash_insert(struct nf_conn *ct,
				       unsigned int hash,
				       unsigned int reply_hash)
{
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &nf_conntrack_hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[reply_hash]);
}

static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext)
{
	/* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions
	 * may contain stale pointers to e.g. helper that has been removed.
	 *
	 * The helper can't clear this because the nf_conn object isn't in
	 * any hash and synchronize_rcu() isn't enough because associated skb
	 * might sit in a queue.
	 */
	return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid);
}

static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext)
{
	if (!ext)
		return true;

	if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid))
		return false;

	/* inserted into conntrack table, nf_ct_iterate_cleanup()
	 * will find it.  Disable nf_ct_ext_find() id check.
	 */
	WRITE_ONCE(ext->gen_id, 0);
	return true;
}

int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int max_chainlen;
	unsigned int chainlen = 0;
	unsigned int sequence;
	int err = -EEXIST;

	zone = nf_ct_zone(ct);

	if (!nf_ct_ext_valid_pre(ct->ext))
		return -EAGAIN;

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);

	/* See if there's one in the list already, including reverse */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	/* If genid has changed, we can't insert anymore because ct
	 * extensions could have stale pointers and nf_ct_iterate_destroy
	 * might have completed its table scan already.
	 *
	 * Increment of the ext genid right after this check is fine:
	 * nf_ct_iterate_destroy blocks until locks are released.
	 */
	if (!nf_ct_ext_valid_post(ct->ext)) {
		err = -EAGAIN;
		goto out;
	}

	smp_wmb();
	/* The caller holds a reference to this object */
	refcount_set(&ct->ct_general.use, 2);
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert);
	local_bh_enable();

	return 0;
chaintoolong:
	NF_CT_STAT_INC(net, chaintoolong);
	err = -ENOSPC;
out:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return err;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
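
/* Illustrative summary (not part of the original source) of the return
 * values of nf_conntrack_hash_check_insert():
 *
 *	0	entry inserted, refcount raised for the table
 *	-EEXIST	an identical confirmed tuple already exists
 *	-ENOSPC	a chain exceeded the randomized length limit
 *	-EAGAIN	extension genid changed, caller may retry
 */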

void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
		    unsigned int bytes)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;

		atomic64_add(packets, &counter[dir].packets);
		atomic64_add(bytes, &counter[dir].bytes);
	}
}
EXPORT_SYMBOL_GPL(nf_ct_acct_add);

static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
			     const struct nf_conn *loser_ct)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(loser_ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;
		unsigned int bytes;

		/* u32 should be fine since we must have seen one packet. */
		bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
		nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
	}
}

static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
{
	struct nf_conn_tstamp *tstamp;

	refcount_inc(&ct->ct_general.use);

	/* set conntrack timestamp, if enabled. */
	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp)
		tstamp->start = ktime_get_real_ns();
}

/* caller must hold locks to prevent concurrent changes */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
				 struct nf_conntrack_tuple_hash *h)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;

	loser_ct = nf_ct_get(skb, &ctinfo);

	if (nf_ct_is_dying(ct))
		return NF_DROP;

	if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
	    nf_ct_match(ct, loser_ct)) {
		struct net *net = nf_ct_net(ct);

		nf_conntrack_get(&ct->ct_general);

		nf_ct_acct_merge(ct, ctinfo, loser_ct);
		nf_ct_put(loser_ct);
		nf_ct_set(skb, ct, ctinfo);

		NF_CT_STAT_INC(net, clash_resolve);
		return NF_ACCEPT;
	}

	return NF_DROP;
}

/**
 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
 *
 * @skb: skb that causes the collision
 * @repl_idx: hash slot for reply direction
 *
 * Called when origin or reply direction had a clash.
 * The skb can be handled without packet drop provided the reply direction
 * is unique or the existing entry has the identical tuple in both
 * directions.
 *
 * Caller must hold conntrack table locks to prevent concurrent updates.
 *
 * Returns NF_DROP if the clash could not be handled.
 */
static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
{
	struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	struct net *net;

	zone = nf_ct_zone(loser_ct);
	net = nf_ct_net(loser_ct);

	/* Reply direction must never result in a clash, unless both origin
	 * and reply tuples are identical.
	 */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
		if (nf_ct_key_equal(h,
				    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			return __nf_ct_resolve_clash(skb, h);
	}

	/* We want the clashing entry to go away real soon: 1 second timeout. */
	WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);

	/* IPS_NAT_CLASH removes the entry automatically on the first
	 * reply.  Also prevents UDP tracker from moving the entry to
	 * ASSURED state, i.e. the entry can always be evicted under
	 * pressure.
	 */
	loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;

	__nf_conntrack_insert_prepare(loser_ct);

	/* fake add for ORIGINAL dir: we want lookups to only find the entry
	 * already in the table.  This also hides the clashing entry from
	 * ctnetlink iteration, i.e. conntrack -L won't show them.
	 */
	hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[repl_idx]);

	NF_CT_STAT_INC(net, clash_resolve);
	return NF_ACCEPT;
}

/**
 * nf_ct_resolve_clash - attempt to handle clash without packet drop
 *
 * @skb: skb that causes the clash
 * @h: tuplehash of the clashing entry already in table
 * @reply_hash: hash slot for reply direction
 *
 * A conntrack entry can be inserted into the connection tracking table
 * if there is no existing entry with an identical tuple.
 *
 * If there is one, @skb (and the associated, unconfirmed conntrack) has
 * to be dropped.  In case @skb is retransmitted, next conntrack lookup
 * will find the already-existing entry.
 *
 * The major problem with such packet drop is the extra delay added by
 * the packet loss -- it will take some time for a retransmit to occur
 * (or the sender to time out when waiting for a reply).
 *
 * This function attempts to handle the situation without packet drop.
 *
 * If @skb has no NAT transformation or if the colliding entries are
 * exactly the same, only the to-be-confirmed conntrack entry is discarded
 * and @skb is associated with the conntrack entry already in the table.
 *
 * Failing that, the new, unconfirmed conntrack is still added to the table
 * provided that the collision only occurs in the ORIGINAL direction.
 * The new entry will be added only in the non-clashing REPLY direction,
 * so packets in the ORIGINAL direction will continue to match the existing
 * entry.  The new entry will also have a fixed timeout so it expires --
 * due to the collision, it will only see reply traffic.
 *
 * Returns NF_DROP if the clash could not be resolved.
 */
static __cold noinline int
nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
		    u32 reply_hash)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *l4proto;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;
	struct net *net;
	int ret;

	loser_ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(loser_ct);

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (!l4proto->allow_clash)
		goto drop;

	ret = __nf_ct_resolve_clash(skb, h);
	if (ret == NF_ACCEPT)
		return ret;

	ret = nf_ct_resolve_clash_harder(skb, reply_hash);
	if (ret == NF_ACCEPT)
		return ret;

drop:
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);
	return NF_DROP;
}

/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	unsigned int chainlen = 0, sequence, max_chainlen;
	const struct nf_conntrack_zone *zone;
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;
	int ret = NF_DROP;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in the other direction.  The actual packet
	   which created the connection will be IP_CT_NEW or, for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	zone = nf_ct_zone(ct);
	local_bh_disable();

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		/* reuse the hash saved before */
		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
		hash = scale_hash(hash);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* We're not in hash table, and we refuse to set up related
	 * connections for unconfirmed conns.  But packet copies and
	 * REJECT will give spurious warnings here.
	 */

	/* Another skb with the same unconfirmed conntrack may
	 * win the race.  This may happen for bridge (br_flood)
	 * or broadcast/multicast packets that do skb_clone with
	 * an unconfirmed conntrack.
	 */
	if (unlikely(nf_ct_is_confirmed(ct))) {
		WARN_ON_ONCE(1);
		nf_conntrack_double_unlock(hash, reply_hash);
		local_bh_enable();
		return NF_DROP;
	}

	if (!nf_ct_ext_valid_pre(ct->ext)) {
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	/* We have to check the DYING flag after unlink to prevent
	 * a race against nf_ct_get_next_corpse() possibly called from
	 * user context, else we insert an already 'dead' hash, blocking
	 * further use of that particular connection -JM.
	 */
	ct->status |= IPS_CONFIRMED;

	if (unlikely(nf_ct_is_dying(ct))) {
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen) {
chaintoolong:
			NF_CT_STAT_INC(net, chaintoolong);
			NF_CT_STAT_INC(net, insert_failed);
			ret = NF_DROP;
			goto dying;
		}
	}

	/* Timer relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
	   weird delay cases. */
	ct->timeout += nfct_time_stamp;

	__nf_conntrack_insert_prepare(ct);

	/* Since the lookup is lockless, hash insertion must be done after
	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
	 * guarantee that no other CPU can find the conntrack before the above
	 * stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();

	/* ext area is still valid (rcu read lock is held), but it will go
	 * out of scope soon; if the extension genid changed we need to
	 * remove this conntrack again.
	 */
	if (!nf_ct_ext_valid_post(ct->ext)) {
		nf_ct_kill(ct);
		NF_CT_STAT_INC_ATOMIC(net, drop);
		return NF_DROP;
	}

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	unsigned int hash, hsize;
	struct hlist_nulls_node *n;
	struct nf_conn *ct;

	zone = nf_ct_zone(ignored_conntrack);

	rcu_read_lock();
 begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);

		if (ct == ignored_conntrack)
			continue;

		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net)) {
			/* Tuple is taken already, so caller will need to find
			 * a new source port to use.
			 *
			 * Only exception:
			 * If the *original tuples* are identical, then both
			 * conntracks refer to the same flow.
			 * This is a rare situation, it can occur e.g. when
			 * more than one UDP packet is sent from same socket
			 * in different threads.
			 *
			 * Let nf_ct_resolve_clash() deal with this later.
			 */
			if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
			    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
				continue;

			NF_CT_STAT_INC_ATOMIC(net, found);
			rcu_read_unlock();
			return 1;
		}
	}

	if (get_nulls_value(n) != hash) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
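
/* Illustrative note (not part of the original source): the NAT engine uses
 * this check while probing candidate source ports, conceptually
 *
 *	for each candidate port p:
 *		build the would-be reply tuple with p;
 *		if (!nf_conntrack_tuple_taken(&tuple, ct))
 *			use p;
 *
 * which is why the lookup above is keyed on the REPLY-direction zone id.
 */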

#define NF_CT_EVICTION_RANGE	8

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static unsigned int early_drop_list(struct net *net,
				    struct hlist_nulls_head *head)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int drops = 0;
	struct nf_conn *tmp;

	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
		tmp = nf_ct_tuplehash_to_ctrack(h);

		if (nf_ct_is_expired(tmp)) {
			nf_ct_gc_expired(tmp);
			continue;
		}

		if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
		    !net_eq(nf_ct_net(tmp), net) ||
		    nf_ct_is_dying(tmp))
			continue;

		if (!refcount_inc_not_zero(&tmp->ct_general.use))
			continue;

		/* load ->ct_net and ->status after refcount increase */
		smp_acquire__after_ctrl_dep();

		/* kill only if still in same netns -- might have moved due to
		 * SLAB_TYPESAFE_BY_RCU rules.
		 *
		 * We steal the timer reference.  If that fails timer has
		 * already fired or someone else deleted it. Just drop ref
		 * and move to next entry.
		 */
		if (net_eq(nf_ct_net(tmp), net) &&
		    nf_ct_is_confirmed(tmp) &&
		    nf_ct_delete(tmp, 0, 0))
			drops++;

		nf_ct_put(tmp);
	}

	return drops;
}

static noinline int early_drop(struct net *net, unsigned int hash)
{
	unsigned int i, bucket;

	for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
		struct hlist_nulls_head *ct_hash;
		unsigned int hsize, drops;

		rcu_read_lock();
		nf_conntrack_get_ht(&ct_hash, &hsize);
		if (!i)
			bucket = reciprocal_scale(hash, hsize);
		else
			bucket = (bucket + 1) % hsize;

		drops = early_drop_list(net, &ct_hash[bucket]);
		rcu_read_unlock();

		if (drops) {
			NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
			return true;
		}
	}

	return false;
}

static bool gc_worker_skip_ct(const struct nf_conn *ct)
{
	return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
}

static bool gc_worker_can_early_drop(const struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;
	u8 protonum = nf_ct_protonum(ct);

	if (test_bit(IPS_OFFLOAD_BIT, &ct->status) && protonum != IPPROTO_UDP)
		return false;
	if (!test_bit(IPS_ASSURED_BIT, &ct->status))
		return true;

	l4proto = nf_ct_l4proto_find(protonum);
	if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
		return true;

	return false;
}

static void gc_worker(struct work_struct *work)
{
	unsigned int i, hashsz, nf_conntrack_max95 = 0;
	u32 end_time, start_time = nfct_time_stamp;
	struct conntrack_gc_work *gc_work;
	unsigned int expired_count = 0;
	unsigned long next_run;
	s32 delta_time;
	long count;

	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);

	i = gc_work->next_bucket;
	if (gc_work->early_drop)
		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;

	if (i == 0) {
		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
		gc_work->count = GC_SCAN_INITIAL_COUNT;
		gc_work->start_time = start_time;
	}

	next_run = gc_work->avg_timeout;
	count = gc_work->count;

	end_time = start_time + GC_SCAN_MAX_DURATION;

	do {
		struct nf_conntrack_tuple_hash *h;
		struct hlist_nulls_head *ct_hash;
		struct hlist_nulls_node *n;
		struct nf_conn *tmp;

		rcu_read_lock();

		nf_conntrack_get_ht(&ct_hash, &hashsz);
		if (i >= hashsz) {
			rcu_read_unlock();
			break;
		}

		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
			struct nf_conntrack_net *cnet;
			struct net *net;
			long expires;

			tmp = nf_ct_tuplehash_to_ctrack(h);

			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
				nf_ct_offload_timeout(tmp);
				if (!nf_conntrack_max95)
					continue;
			}

			if (expired_count > GC_SCAN_EXPIRED_MAX) {
				rcu_read_unlock();

				gc_work->next_bucket = i;
				gc_work->avg_timeout = next_run;
				gc_work->count = count;

				delta_time = nfct_time_stamp - gc_work->start_time;

				/* re-sched immediately if total cycle time is exceeded */
				next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX;
				goto early_exit;
			}

			if (nf_ct_is_expired(tmp)) {
				nf_ct_gc_expired(tmp);
				expired_count++;
				continue;
			}

			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
			expires = (expires - (long)next_run) / ++count;
			next_run += expires;

			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
				continue;

			net = nf_ct_net(tmp);
			cnet = nf_ct_pernet(net);
			if (atomic_read(&cnet->count) < nf_conntrack_max95)
				continue;

			/* need to take reference to avoid possible races */
			if (!refcount_inc_not_zero(&tmp->ct_general.use))
				continue;

			/* load ->status after refcount increase */
			smp_acquire__after_ctrl_dep();

			if (gc_worker_skip_ct(tmp)) {
				nf_ct_put(tmp);
				continue;
			}

			if (gc_worker_can_early_drop(tmp)) {
				nf_ct_kill(tmp);
				expired_count++;
			}

			nf_ct_put(tmp);
		}

		/* could check get_nulls_value() here and restart if ct
		 * was moved to another chain.  But given gc is best-effort
		 * we will just continue with next hash slot.
		 */
		rcu_read_unlock();
		cond_resched();
		i++;

		delta_time = nfct_time_stamp - end_time;
		if (delta_time > 0 && i < hashsz) {
			gc_work->avg_timeout = next_run;
			gc_work->count = count;
			gc_work->next_bucket = i;
			next_run = 0;
			goto early_exit;
		}
	} while (i < hashsz);

	gc_work->next_bucket = 0;

	next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);

	delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
	if (next_run > (unsigned long)delta_time)
		next_run -= delta_time;
	else
		next_run = 1;

early_exit:
	if (gc_work->exiting)
		return;

	if (next_run)
		gc_work->early_drop = false;

	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
}

static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
	INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
	gc_work->exiting = false;
}
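
/* Illustrative note (not part of the original source): the next scan
 * interval is a running average of the clamped remaining timeouts seen in
 * this cycle, seeded with GC_SCAN_INITIAL_COUNT entries at
 * GC_SCAN_INTERVAL_INIT.  With the initial bias of 100 entries at 60s, a
 * single extra entry expiring in 1s only moves the average to about
 * 60s - (60s - 1s) / 101, i.e. roughly 59.4s, so a few short-lived entries
 * do not make the worker reschedule aggressively.
 */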

static struct nf_conn *
__nf_conntrack_alloc(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *orig,
		     const struct nf_conntrack_tuple *repl,
		     gfp_t gfp, u32 hash)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
	unsigned int ct_count;
	struct nf_conn *ct;

	/* We don't want any race condition at early drop stage */
	ct_count = atomic_inc_return(&cnet->count);

	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
		if (!early_drop(net, hash)) {
			if (!conntrack_gc_work.early_drop)
				conntrack_gc_work.early_drop = true;
			atomic_dec(&cnet->count);
			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	/*
	 * Do not use kmem_cache_zalloc(), as this cache uses
	 * SLAB_TYPESAFE_BY_RCU.
	 */
	ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
	if (ct == NULL)
		goto out;

	spin_lock_init(&ct->lock);
	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	/* save hash for reusing when confirming */
	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
	ct->status = 0;
	WRITE_ONCE(ct->timeout, 0);
	write_pnet(&ct->ct_net, net);
	memset_after(ct, 0, __nfct_init_offset);

	nf_ct_zone_add(ct, zone);

	/* Because we use RCU lookups, we set ct_general.use to zero before
	 * this is inserted in any list.
	 */
	refcount_set(&ct->ct_general.use, 0);
	return ct;
out:
	atomic_dec(&cnet->count);
	return ERR_PTR(-ENOMEM);
}

struct nf_conn *nf_conntrack_alloc(struct net *net,
				   const struct nf_conntrack_zone *zone,
				   const struct nf_conntrack_tuple *orig,
				   const struct nf_conntrack_tuple *repl,
				   gfp_t gfp)
{
	return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);

void nf_conntrack_free(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_net *cnet;

	/* A freed object has refcnt == 0, that's
	 * the golden rule for SLAB_TYPESAFE_BY_RCU
	 */
	WARN_ON(refcount_read(&ct->ct_general.use) != 0);

	if (ct->status & IPS_SRC_NAT_DONE) {
		const struct nf_nat_hook *nat_hook;

		rcu_read_lock();
		nat_hook = rcu_dereference(nf_nat_hook);
		if (nat_hook)
			nat_hook->remove_nat_bysrc(ct);
		rcu_read_unlock();
	}

	kfree(ct->ext);
	kmem_cache_free(nf_conntrack_cachep, ct);
	cnet = nf_ct_pernet(net);

	smp_mb__before_atomic();
	atomic_dec(&cnet->count);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);
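
/* Illustrative note (not part of the original source): with
 * SLAB_TYPESAFE_BY_RCU the object memory may be recycled for a new
 * conntrack while an RCU reader still holds a pointer to it.  Lookups
 * therefore follow the pattern
 *
 *	if (refcount_inc_not_zero(&ct->ct_general.use)) {
 *		smp_acquire__after_ctrl_dep();
 *		if (!nf_ct_key_equal(h, tuple, zone, net))
 *			nf_ct_put(ct);	// recycled entry, not ours
 *	}
 *
 * and freed objects must keep a refcount of zero, as checked above.
 */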

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
	       const struct nf_conntrack_tuple *tuple,
	       struct sk_buff *skb,
	       unsigned int dataoff, u32 hash)
{
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conntrack_tuple repl_tuple;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	struct nf_conntrack_ecache *ecache;
#endif
	struct nf_conntrack_expect *exp = NULL;
	const struct nf_conntrack_zone *zone;
	struct nf_conn_timeout *timeout_ext;
	struct nf_conntrack_zone tmp;
	struct nf_conntrack_net *cnet;

	if (!nf_ct_invert_tuple(&repl_tuple, tuple))
		return NULL;

	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
				  hash);
	if (IS_ERR(ct))
		return (struct nf_conntrack_tuple_hash *)ct;

	if (!nf_ct_add_synproxy(ct, tmpl)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}

	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;

	if (timeout_ext)
		nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
				      GFP_ATOMIC);

	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
	nf_ct_labels_ext_add(ct);

#ifdef CONFIG_NF_CONNTRACK_EVENTS
	ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;

	if ((ecache || net->ct.sysctl_events) &&
	    !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
				  ecache ? ecache->expmask : 0,
				  GFP_ATOMIC)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}
#endif

	cnet = nf_ct_pernet(net);
	if (cnet->expect_count) {
		spin_lock_bh(&nf_conntrack_expect_lock);
		exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl));
		if (exp) {
			/* Welcome, Mr. Bond.  We've been expecting you... */
			__set_bit(IPS_EXPECTED_BIT, &ct->status);
			/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
			ct->master = exp->master;
			if (exp->helper) {
				help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
				if (help)
					rcu_assign_pointer(help->helper, exp->helper);
			}

#ifdef CONFIG_NF_CONNTRACK_MARK
			ct->mark = READ_ONCE(exp->master->mark);
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
			ct->secmark = exp->master->secmark;
#endif
			NF_CT_STAT_INC(net, expect_new);
		}
		spin_unlock_bh(&nf_conntrack_expect_lock);
	}
	if (!exp && tmpl)
		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);

	/* Other CPU might have obtained a pointer to this object before it was
	 * released.  Because refcount is 0, refcount_inc_not_zero() will fail.
	 *
	 * After refcount_set(1) it will succeed; ensure that zeroing of
	 * ct->status and the correct ct->net pointer are visible; else other
	 * core might observe CONFIRMED bit which means the entry is valid and
	 * in the hash table, but it's not (anymore).
	 */
	smp_wmb();

	/* Now it is going to be associated with an sk_buff, set refcount to 1. */
	refcount_set(&ct->ct_general.use, 1);

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(ct, exp);
		nf_ct_expect_put(exp);
	}

	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns 0, sets skb->_nfct | ctinfo */
static int
resolve_normal_ct(struct nf_conn *tmpl,
		  struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int8_t protonum,
		  const struct nf_hook_state *state)
{
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_zone tmp;
	u32 hash, zone_id, rid;
	struct nf_conn *ct;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, state->pf, protonum, state->net,
			     &tuple))
		return 0;

	/* look for tuple match */
	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);

	zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	hash = hash_conntrack_raw(&tuple, zone_id, state->net);
	h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);

	if (!h) {
		rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
		if (zone_id != rid) {
			u32 tmp = hash_conntrack_raw(&tuple, rid, state->net);

			h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp);
		}
	}

	if (!h) {
		h = init_conntrack(state->net, tmpl, &tuple,
				   skb, dataoff, hash);
		if (!h)
			return 0;
		if (IS_ERR(h))
			return PTR_ERR(h);
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
		ctinfo = IP_CT_ESTABLISHED_REPLY;
	} else {
		unsigned long status = READ_ONCE(ct->status);

		/* Once we've had two way comms, always ESTABLISHED. */
		if (likely(status & IPS_SEEN_REPLY))
			ctinfo = IP_CT_ESTABLISHED;
		else if (status & IPS_EXPECTED)
			ctinfo = IP_CT_RELATED;
		else
			ctinfo = IP_CT_NEW;
	}
	nf_ct_set(skb, ct, ctinfo);
	return 0;
}
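
/* Illustrative summary (not part of the original source) of the ctinfo
 * chosen by resolve_normal_ct():
 *
 *	REPLY direction hit		-> IP_CT_ESTABLISHED_REPLY
 *	ORIGINAL, IPS_SEEN_REPLY set	-> IP_CT_ESTABLISHED
 *	ORIGINAL, IPS_EXPECTED set	-> IP_CT_RELATED
 *	ORIGINAL, otherwise		-> IP_CT_NEW
 */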

/*
 * icmp packets need special treatment to handle error messages that are
 * related to a connection.
 *
 * Callers need to check if skb has a conntrack assigned when this
 * helper returns; in such case skb belongs to an already known connection.
 */
static unsigned int __cold
nf_conntrack_handle_icmp(struct nf_conn *tmpl,
			 struct sk_buff *skb,
			 unsigned int dataoff,
			 u8 protonum,
			 const struct nf_hook_state *state)
{
	int ret;

	if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP)
		ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
#if IS_ENABLED(CONFIG_IPV6)
	else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6)
		ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
#endif
	else
		return NF_ACCEPT;

	if (ret <= 0)
		NF_CT_STAT_INC_ATOMIC(state->net, error);

	return ret;
}

static int generic_packet(struct nf_conn *ct, struct sk_buff *skb,
			  enum ip_conntrack_info ctinfo)
{
	const unsigned int *timeout = nf_ct_timeout_lookup(ct);

	if (!timeout)
		timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;

	nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
	return NF_ACCEPT;
}

/* Returns verdict for packet, or -1 for invalid. */
static int nf_conntrack_handle_packet(struct nf_conn *ct,
				      struct sk_buff *skb,
				      unsigned int dataoff,
				      enum ip_conntrack_info ctinfo,
				      const struct nf_hook_state *state)
{
	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		return nf_conntrack_tcp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_UDP:
		return nf_conntrack_udp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_ICMP:
		return nf_conntrack_icmp_packet(ct, skb, ctinfo, state);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
		return nf_conntrack_udplite_packet(ct, skb, dataoff,
						   ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
		return nf_conntrack_sctp_packet(ct, skb, dataoff,
						ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
	case IPPROTO_DCCP:
		return nf_conntrack_dccp_packet(ct, skb, dataoff,
						ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return nf_conntrack_gre_packet(ct, skb, dataoff,
					       ctinfo, state);
#endif
	}

	return generic_packet(ct, skb, ctinfo);
}
*/ 1970 if ((tmpl && !nf_ct_is_template(tmpl)) || 1971 ctinfo == IP_CT_UNTRACKED) 1972 return NF_ACCEPT; 1973 skb->_nfct = 0; 1974 } 1975 1976 /* rcu_read_lock()ed by nf_hook_thresh */ 1977 dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); 1978 if (dataoff <= 0) { 1979 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 1980 ret = NF_ACCEPT; 1981 goto out; 1982 } 1983 1984 if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) { 1985 ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff, 1986 protonum, state); 1987 if (ret <= 0) { 1988 ret = -ret; 1989 goto out; 1990 } 1991 /* ICMP[v6] protocol trackers may assign one conntrack. */ 1992 if (skb->_nfct) 1993 goto out; 1994 } 1995repeat: 1996 ret = resolve_normal_ct(tmpl, skb, dataoff, 1997 protonum, state); 1998 if (ret < 0) { 1999 /* Too stressed to deal. */ 2000 NF_CT_STAT_INC_ATOMIC(state->net, drop); 2001 ret = NF_DROP; 2002 goto out; 2003 } 2004 2005 ct = nf_ct_get(skb, &ctinfo); 2006 if (!ct) { 2007 /* Not valid part of a connection */ 2008 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 2009 ret = NF_ACCEPT; 2010 goto out; 2011 } 2012 2013 ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state); 2014 if (ret <= 0) { 2015 /* Invalid: inverse of the return code tells 2016 * the netfilter core what to do */ 2017 nf_ct_put(ct); 2018 skb->_nfct = 0; 2019 /* Special case: TCP tracker reports an attempt to reopen a 2020 * closed/aborted connection. We have to go back and create a 2021 * fresh conntrack. 2022 */ 2023 if (ret == -NF_REPEAT) 2024 goto repeat; 2025 2026 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 2027 if (ret == -NF_DROP) 2028 NF_CT_STAT_INC_ATOMIC(state->net, drop); 2029 2030 ret = -ret; 2031 goto out; 2032 } 2033 2034 if (ctinfo == IP_CT_ESTABLISHED_REPLY && 2035 !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) 2036 nf_conntrack_event_cache(IPCT_REPLY, ct); 2037out: 2038 if (tmpl) 2039 nf_ct_put(tmpl); 2040 2041 return ret; 2042} 2043EXPORT_SYMBOL_GPL(nf_conntrack_in); 2044 2045/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ 2046void __nf_ct_refresh_acct(struct nf_conn *ct, 2047 enum ip_conntrack_info ctinfo, 2048 const struct sk_buff *skb, 2049 u32 extra_jiffies, 2050 bool do_acct) 2051{ 2052 /* Only update if this is not a fixed timeout */ 2053 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) 2054 goto acct; 2055 2056 /* If not in hash table, timer will not be active yet */ 2057 if (nf_ct_is_confirmed(ct)) 2058 extra_jiffies += nfct_time_stamp; 2059 2060 if (READ_ONCE(ct->timeout) != extra_jiffies) 2061 WRITE_ONCE(ct->timeout, extra_jiffies); 2062acct: 2063 if (do_acct) 2064 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); 2065} 2066EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); 2067 2068bool nf_ct_kill_acct(struct nf_conn *ct, 2069 enum ip_conntrack_info ctinfo, 2070 const struct sk_buff *skb) 2071{ 2072 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); 2073 2074 return nf_ct_delete(ct, 0, 0); 2075} 2076EXPORT_SYMBOL_GPL(nf_ct_kill_acct); 2077 2078#if IS_ENABLED(CONFIG_NF_CT_NETLINK) 2079 2080#include <linux/netfilter/nfnetlink.h> 2081#include <linux/netfilter/nfnetlink_conntrack.h> 2082#include <linux/mutex.h> 2083 2084/* Generic function for tcp/udp/sctp/dccp and alike. 
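 *
 * Trackers for protocols that use 16 bit ports can plug the helpers below
 * straight into their nf_conntrack_l4proto definition instead of providing
 * their own netlink conversions.  Rough sketch (IPPROTO_FOO and the struct
 * name are made up; the callback fields live behind CONFIG_NF_CT_NETLINK and
 * mirror what the tcp/udp trackers use):
 *
 *	const struct nf_conntrack_l4proto nf_conntrack_l4proto_foo = {
 *		.l4proto		= IPPROTO_FOO,
 *		.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
 *		.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
 *		.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
 *		.nla_policy		= nf_ct_port_nla_policy,
 *	};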
*/ 2085int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, 2086 const struct nf_conntrack_tuple *tuple) 2087{ 2088 if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) || 2089 nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port)) 2090 goto nla_put_failure; 2091 return 0; 2092 2093nla_put_failure: 2094 return -1; 2095} 2096EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr); 2097 2098const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { 2099 [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 }, 2100 [CTA_PROTO_DST_PORT] = { .type = NLA_U16 }, 2101}; 2102EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); 2103 2104int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], 2105 struct nf_conntrack_tuple *t, 2106 u_int32_t flags) 2107{ 2108 if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { 2109 if (!tb[CTA_PROTO_SRC_PORT]) 2110 return -EINVAL; 2111 2112 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); 2113 } 2114 2115 if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { 2116 if (!tb[CTA_PROTO_DST_PORT]) 2117 return -EINVAL; 2118 2119 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); 2120 } 2121 2122 return 0; 2123} 2124EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); 2125 2126unsigned int nf_ct_port_nlattr_tuple_size(void) 2127{ 2128 static unsigned int size __read_mostly; 2129 2130 if (!size) 2131 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); 2132 2133 return size; 2134} 2135EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); 2136#endif 2137 2138/* Used by ipt_REJECT and ip6t_REJECT. */ 2139static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) 2140{ 2141 struct nf_conn *ct; 2142 enum ip_conntrack_info ctinfo; 2143 2144 /* This ICMP is in reverse direction to the packet which caused it */ 2145 ct = nf_ct_get(skb, &ctinfo); 2146 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) 2147 ctinfo = IP_CT_RELATED_REPLY; 2148 else 2149 ctinfo = IP_CT_RELATED; 2150 2151 /* Attach to new skbuff, and increment count */ 2152 nf_ct_set(nskb, ct, ctinfo); 2153 nf_conntrack_get(skb_nfct(nskb)); 2154} 2155 2156static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, 2157 struct nf_conn *ct, 2158 enum ip_conntrack_info ctinfo) 2159{ 2160 const struct nf_nat_hook *nat_hook; 2161 struct nf_conntrack_tuple_hash *h; 2162 struct nf_conntrack_tuple tuple; 2163 unsigned int status; 2164 int dataoff; 2165 u16 l3num; 2166 u8 l4num; 2167 2168 l3num = nf_ct_l3num(ct); 2169 2170 dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); 2171 if (dataoff <= 0) 2172 return NF_DROP; 2173 2174 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, 2175 l4num, net, &tuple)) 2176 return NF_DROP; 2177 2178 if (ct->status & IPS_SRC_NAT) { 2179 memcpy(tuple.src.u3.all, 2180 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all, 2181 sizeof(tuple.src.u3.all)); 2182 tuple.src.u.all = 2183 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all; 2184 } 2185 2186 if (ct->status & IPS_DST_NAT) { 2187 memcpy(tuple.dst.u3.all, 2188 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all, 2189 sizeof(tuple.dst.u3.all)); 2190 tuple.dst.u.all = 2191 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all; 2192 } 2193 2194 h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple); 2195 if (!h) 2196 return NF_ACCEPT; 2197 2198 /* Store status bits of the conntrack that is clashing to re-do NAT 2199 * mangling according to what it has been done already to this packet. 
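	 *
	 * This path is only taken for packets that were queued to userspace
	 * while their conntrack was still unconfirmed; in the meantime another
	 * CPU may have confirmed a clashing entry for the same tuple.  The skb
	 * is switched over to that confirmed entry below, and the NAT
	 * transformations that had already been applied (recorded in the old
	 * entry's status bits) are redone against the entry that won, via
	 * nf_nat_hook->manip_pkt().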
2200 */ 2201 status = ct->status; 2202 2203 nf_ct_put(ct); 2204 ct = nf_ct_tuplehash_to_ctrack(h); 2205 nf_ct_set(skb, ct, ctinfo); 2206 2207 nat_hook = rcu_dereference(nf_nat_hook); 2208 if (!nat_hook) 2209 return NF_ACCEPT; 2210 2211 if (status & IPS_SRC_NAT) { 2212 unsigned int verdict = nat_hook->manip_pkt(skb, ct, 2213 NF_NAT_MANIP_SRC, 2214 IP_CT_DIR_ORIGINAL); 2215 if (verdict != NF_ACCEPT) 2216 return verdict; 2217 } 2218 2219 if (status & IPS_DST_NAT) { 2220 unsigned int verdict = nat_hook->manip_pkt(skb, ct, 2221 NF_NAT_MANIP_DST, 2222 IP_CT_DIR_ORIGINAL); 2223 if (verdict != NF_ACCEPT) 2224 return verdict; 2225 } 2226 2227 return NF_ACCEPT; 2228} 2229 2230/* This packet is coming from userspace via nf_queue, complete the packet 2231 * processing after the helper invocation in nf_confirm(). 2232 */ 2233static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, 2234 enum ip_conntrack_info ctinfo) 2235{ 2236 const struct nf_conntrack_helper *helper; 2237 const struct nf_conn_help *help; 2238 int protoff; 2239 2240 help = nfct_help(ct); 2241 if (!help) 2242 return NF_ACCEPT; 2243 2244 helper = rcu_dereference(help->helper); 2245 if (!helper) 2246 return NF_ACCEPT; 2247 2248 if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) 2249 return NF_ACCEPT; 2250 2251 switch (nf_ct_l3num(ct)) { 2252 case NFPROTO_IPV4: 2253 protoff = skb_network_offset(skb) + ip_hdrlen(skb); 2254 break; 2255#if IS_ENABLED(CONFIG_IPV6) 2256 case NFPROTO_IPV6: { 2257 __be16 frag_off; 2258 u8 pnum; 2259 2260 pnum = ipv6_hdr(skb)->nexthdr; 2261 protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, 2262 &frag_off); 2263 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) 2264 return NF_ACCEPT; 2265 break; 2266 } 2267#endif 2268 default: 2269 return NF_ACCEPT; 2270 } 2271 2272 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && 2273 !nf_is_loopback_packet(skb)) { 2274 if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { 2275 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); 2276 return NF_DROP; 2277 } 2278 } 2279 2280 /* We've seen it coming out the other side: confirm it */ 2281 return nf_conntrack_confirm(skb); 2282} 2283 2284static int nf_conntrack_update(struct net *net, struct sk_buff *skb) 2285{ 2286 enum ip_conntrack_info ctinfo; 2287 struct nf_conn *ct; 2288 2289 ct = nf_ct_get(skb, &ctinfo); 2290 if (!ct) 2291 return NF_ACCEPT; 2292 2293 if (!nf_ct_is_confirmed(ct)) { 2294 int ret = __nf_conntrack_update(net, skb, ct, ctinfo); 2295 2296 if (ret != NF_ACCEPT) 2297 return ret; 2298 2299 ct = nf_ct_get(skb, &ctinfo); 2300 if (!ct) 2301 return NF_ACCEPT; 2302 } 2303 2304 return nf_confirm_cthelper(skb, ct, ctinfo); 2305} 2306 2307static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, 2308 const struct sk_buff *skb) 2309{ 2310 const struct nf_conntrack_tuple *src_tuple; 2311 const struct nf_conntrack_tuple_hash *hash; 2312 struct nf_conntrack_tuple srctuple; 2313 enum ip_conntrack_info ctinfo; 2314 struct nf_conn *ct; 2315 2316 ct = nf_ct_get(skb, &ctinfo); 2317 if (ct) { 2318 src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); 2319 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 2320 return true; 2321 } 2322 2323 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 2324 NFPROTO_IPV4, dev_net(skb->dev), 2325 &srctuple)) 2326 return false; 2327 2328 hash = nf_conntrack_find_get(dev_net(skb->dev), 2329 &nf_ct_zone_dflt, 2330 &srctuple); 2331 if (!hash) 2332 return false; 2333 2334 ct = nf_ct_tuplehash_to_ctrack(hash); 2335 src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); 2336 
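	/* The lookup above matched the packet's own tuple against one
	 * direction of the entry; src_tuple now refers to the opposite
	 * direction, i.e. the tuple describing traffic flowing the other
	 * way on this connection.
	 */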
	memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
	nf_ct_put(ct);

	return true;
}

/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
		const struct nf_ct_iter_data *iter_data, unsigned int *bucket)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;
	spinlock_t *lockp;

	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
		struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];

		if (hlist_nulls_empty(hslot))
			continue;

		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
		local_bh_disable();
		nf_conntrack_lock(lockp);
		hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
			if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
				continue;
			/* All nf_conn objects are added to the hash table twice,
			 * once for the original direction tuple, once for the
			 * reply tuple.
			 *
			 * Exception: In the IPS_NAT_CLASH case, only the reply
			 * tuple is added (the original tuple already existed for
			 * a different object).
			 *
			 * We only need to call the iterator once for each
			 * conntrack, so we just use the 'reply' direction
			 * tuple while iterating.
			 */
			ct = nf_ct_tuplehash_to_ctrack(h);

			if (iter_data->net &&
			    !net_eq(iter_data->net, nf_ct_net(ct)))
				continue;

			if (iter(ct, iter_data->data))
				goto found;
		}
		spin_unlock(lockp);
		local_bh_enable();
		cond_resched();
	}

	return NULL;
found:
	refcount_inc(&ct->ct_general.use);
	spin_unlock(lockp);
	local_bh_enable();
	return ct;
}

static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
				  const struct nf_ct_iter_data *iter_data)
{
	unsigned int bucket = 0;
	struct nf_conn *ct;

	might_sleep();

	mutex_lock(&nf_conntrack_mutex);
	while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) {
		/* Time to push up daisies... */

		nf_ct_delete(ct, iter_data->portid, iter_data->report);
		nf_ct_put(ct);
		cond_resched();
	}
	mutex_unlock(&nf_conntrack_mutex);
}

void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
			       const struct nf_ct_iter_data *iter_data)
{
	struct net *net = iter_data->net;
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	might_sleep();

	if (atomic_read(&cnet->count) == 0)
		return;

	nf_ct_iterate_cleanup(iter, iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);

/**
 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
 * @iter: callback to invoke for each conntrack
 * @data: data to pass to @iter
 *
 * Like nf_ct_iterate_cleanup, but first marks conntracks on the
 * unconfirmed list as dying (so they will not be inserted into
 * the main table).
 *
 * Can only be called in module exit path.
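 *
 * The @iter callback decides per entry whether it should be removed; any
 * non-zero return value kills that conntrack.  A sketch of a module that,
 * on unload, flushes every entry still referencing its helper ("my_helper"
 * and the callback name are made up):
 *
 *	static int untrack_my_helper(struct nf_conn *ct, void *data)
 *	{
 *		const struct nf_conn_help *help = nfct_help(ct);
 *
 *		return help && rcu_access_pointer(help->helper) == &my_helper;
 *	}
 *
 *	nf_ct_iterate_destroy(untrack_my_helper, NULL);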
 */
void
nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
{
	struct nf_ct_iter_data iter_data = {};
	struct net *net;

	down_read(&net_rwsem);
	for_each_net(net) {
		struct nf_conntrack_net *cnet = nf_ct_pernet(net);

		if (atomic_read(&cnet->count) == 0)
			continue;
		nf_queue_nf_hook_drop(net);
	}
	up_read(&net_rwsem);

	/* Need to wait for the netns cleanup worker to finish, if it's
	 * running -- it might have deleted a net namespace from
	 * the global list, so the hook drop above might not have
	 * affected all namespaces.
	 */
	net_ns_barrier();

	/* An skb with an unconfirmed conntrack could have been reinjected
	 * just before we called nf_queue_nf_hook_drop().
	 *
	 * This makes sure it's inserted into the conntrack table.
	 */
	synchronize_net();

	nf_ct_ext_bump_genid();
	iter_data.data = data;
	nf_ct_iterate_cleanup(iter, &iter_data);

	/* Another cpu might be in an rcu read section with the
	 * rcu protected pointer cleared in the iter callback
	 * or hidden via nf_ct_ext_bump_genid() above.
	 *
	 * Wait until those are done.
	 */
	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);

static int kill_all(struct nf_conn *i, void *data)
{
	return 1;
}

void nf_conntrack_cleanup_start(void)
{
	cleanup_nf_conntrack_bpf();
	conntrack_gc_work.exiting = true;
}

void nf_conntrack_cleanup_end(void)
{
	RCU_INIT_POINTER(nf_ct_hook, NULL);
	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
	kvfree(nf_conntrack_hash);

	nf_conntrack_proto_fini();
	nf_conntrack_helper_fini();
	nf_conntrack_expect_fini();

	kmem_cache_destroy(nf_conntrack_cachep);
}

/*
 * Mishearing the voices in his head, our hero wonders how he's
 * supposed to kill the mall.
 */
void nf_conntrack_cleanup_net(struct net *net)
{
	LIST_HEAD(single);

	list_add(&net->exit_list, &single);
	nf_conntrack_cleanup_net_list(&single);
}

void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
{
	struct nf_ct_iter_data iter_data = {};
	struct net *net;
	int busy;

	/*
	 * This makes sure all current packets have passed through
	 * the netfilter framework.  Roll on, two-stage module
	 * delete...
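	 *
	 * The i_see_dead_people loop below then keeps sweeping the table
	 * with kill_all until the per netns entry count drops to zero:
	 * references can still be held elsewhere (e.g. packets in flight or
	 * pending event delivery), so a single pass is not necessarily
	 * enough.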
2532 */ 2533 synchronize_rcu_expedited(); 2534i_see_dead_people: 2535 busy = 0; 2536 list_for_each_entry(net, net_exit_list, exit_list) { 2537 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2538 2539 iter_data.net = net; 2540 nf_ct_iterate_cleanup_net(kill_all, &iter_data); 2541 if (atomic_read(&cnet->count) != 0) 2542 busy = 1; 2543 } 2544 if (busy) { 2545 schedule(); 2546 goto i_see_dead_people; 2547 } 2548 2549 list_for_each_entry(net, net_exit_list, exit_list) { 2550 nf_conntrack_ecache_pernet_fini(net); 2551 nf_conntrack_expect_pernet_fini(net); 2552 free_percpu(net->ct.stat); 2553 } 2554} 2555 2556void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) 2557{ 2558 struct hlist_nulls_head *hash; 2559 unsigned int nr_slots, i; 2560 2561 if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) 2562 return NULL; 2563 2564 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); 2565 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); 2566 2567 hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); 2568 2569 if (hash && nulls) 2570 for (i = 0; i < nr_slots; i++) 2571 INIT_HLIST_NULLS_HEAD(&hash[i], i); 2572 2573 return hash; 2574} 2575EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); 2576 2577int nf_conntrack_hash_resize(unsigned int hashsize) 2578{ 2579 int i, bucket; 2580 unsigned int old_size; 2581 struct hlist_nulls_head *hash, *old_hash; 2582 struct nf_conntrack_tuple_hash *h; 2583 struct nf_conn *ct; 2584 2585 if (!hashsize) 2586 return -EINVAL; 2587 2588 hash = nf_ct_alloc_hashtable(&hashsize, 1); 2589 if (!hash) 2590 return -ENOMEM; 2591 2592 mutex_lock(&nf_conntrack_mutex); 2593 old_size = nf_conntrack_htable_size; 2594 if (old_size == hashsize) { 2595 mutex_unlock(&nf_conntrack_mutex); 2596 kvfree(hash); 2597 return 0; 2598 } 2599 2600 local_bh_disable(); 2601 nf_conntrack_all_lock(); 2602 write_seqcount_begin(&nf_conntrack_generation); 2603 2604 /* Lookups in the old hash might happen in parallel, which means we 2605 * might get false negatives during connection lookup. New connections 2606 * created because of a false negative won't make it into the hash 2607 * though since that required taking the locks. 2608 */ 2609 2610 for (i = 0; i < nf_conntrack_htable_size; i++) { 2611 while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { 2612 unsigned int zone_id; 2613 2614 h = hlist_nulls_entry(nf_conntrack_hash[i].first, 2615 struct nf_conntrack_tuple_hash, hnnode); 2616 ct = nf_ct_tuplehash_to_ctrack(h); 2617 hlist_nulls_del_rcu(&h->hnnode); 2618 2619 zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h)); 2620 bucket = __hash_conntrack(nf_ct_net(ct), 2621 &h->tuple, zone_id, hashsize); 2622 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); 2623 } 2624 } 2625 old_hash = nf_conntrack_hash; 2626 2627 nf_conntrack_hash = hash; 2628 nf_conntrack_htable_size = hashsize; 2629 2630 write_seqcount_end(&nf_conntrack_generation); 2631 nf_conntrack_all_unlock(); 2632 local_bh_enable(); 2633 2634 mutex_unlock(&nf_conntrack_mutex); 2635 2636 synchronize_net(); 2637 kvfree(old_hash); 2638 return 0; 2639} 2640 2641int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) 2642{ 2643 unsigned int hashsize; 2644 int rc; 2645 2646 if (current->nsproxy->net_ns != &init_net) 2647 return -EOPNOTSUPP; 2648 2649 /* On boot, we can set this without any fancy locking. 
*/ 2650 if (!nf_conntrack_hash) 2651 return param_set_uint(val, kp); 2652 2653 rc = kstrtouint(val, 0, &hashsize); 2654 if (rc) 2655 return rc; 2656 2657 return nf_conntrack_hash_resize(hashsize); 2658} 2659 2660int nf_conntrack_init_start(void) 2661{ 2662 unsigned long nr_pages = totalram_pages(); 2663 int max_factor = 8; 2664 int ret = -ENOMEM; 2665 int i; 2666 2667 seqcount_spinlock_init(&nf_conntrack_generation, 2668 &nf_conntrack_locks_all_lock); 2669 2670 for (i = 0; i < CONNTRACK_LOCKS; i++) 2671 spin_lock_init(&nf_conntrack_locks[i]); 2672 2673 if (!nf_conntrack_htable_size) { 2674 nf_conntrack_htable_size 2675 = (((nr_pages << PAGE_SHIFT) / 16384) 2676 / sizeof(struct hlist_head)); 2677 if (BITS_PER_LONG >= 64 && 2678 nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) 2679 nf_conntrack_htable_size = 262144; 2680 else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) 2681 nf_conntrack_htable_size = 65536; 2682 2683 if (nf_conntrack_htable_size < 1024) 2684 nf_conntrack_htable_size = 1024; 2685 /* Use a max. factor of one by default to keep the average 2686 * hash chain length at 2 entries. Each entry has to be added 2687 * twice (once for original direction, once for reply). 2688 * When a table size is given we use the old value of 8 to 2689 * avoid implicit reduction of the max entries setting. 2690 */ 2691 max_factor = 1; 2692 } 2693 2694 nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); 2695 if (!nf_conntrack_hash) 2696 return -ENOMEM; 2697 2698 nf_conntrack_max = max_factor * nf_conntrack_htable_size; 2699 2700 nf_conntrack_cachep = kmem_cache_create("nf_conntrack", 2701 sizeof(struct nf_conn), 2702 NFCT_INFOMASK + 1, 2703 SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); 2704 if (!nf_conntrack_cachep) 2705 goto err_cachep; 2706 2707 ret = nf_conntrack_expect_init(); 2708 if (ret < 0) 2709 goto err_expect; 2710 2711 ret = nf_conntrack_helper_init(); 2712 if (ret < 0) 2713 goto err_helper; 2714 2715 ret = nf_conntrack_proto_init(); 2716 if (ret < 0) 2717 goto err_proto; 2718 2719 conntrack_gc_work_init(&conntrack_gc_work); 2720 queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ); 2721 2722 ret = register_nf_conntrack_bpf(); 2723 if (ret < 0) 2724 goto err_kfunc; 2725 2726 return 0; 2727 2728err_kfunc: 2729 cancel_delayed_work_sync(&conntrack_gc_work.dwork); 2730 nf_conntrack_proto_fini(); 2731err_proto: 2732 nf_conntrack_helper_fini(); 2733err_helper: 2734 nf_conntrack_expect_fini(); 2735err_expect: 2736 kmem_cache_destroy(nf_conntrack_cachep); 2737err_cachep: 2738 kvfree(nf_conntrack_hash); 2739 return ret; 2740} 2741 2742static void nf_conntrack_set_closing(struct nf_conntrack *nfct) 2743{ 2744 struct nf_conn *ct = nf_ct_to_nf_conn(nfct); 2745 2746 switch (nf_ct_protonum(ct)) { 2747 case IPPROTO_TCP: 2748 nf_conntrack_tcp_set_closing(ct); 2749 break; 2750 } 2751} 2752 2753static const struct nf_ct_hook nf_conntrack_hook = { 2754 .update = nf_conntrack_update, 2755 .destroy = nf_ct_destroy, 2756 .get_tuple_skb = nf_conntrack_get_tuple_skb, 2757 .attach = nf_conntrack_attach, 2758 .set_closing = nf_conntrack_set_closing, 2759 .confirm = __nf_conntrack_confirm, 2760}; 2761 2762void nf_conntrack_init_end(void) 2763{ 2764 RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook); 2765} 2766 2767/* 2768 * We need to use special "null" values, not used in hash table 2769 */ 2770#define UNCONFIRMED_NULLS_VAL ((1<<30)+0) 2771 2772int nf_conntrack_init_net(struct net *net) 2773{ 2774 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2775 int ret = 
-ENOMEM;

	BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
	BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
	atomic_set(&cnet->count, 0);

	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
	if (!net->ct.stat)
		return ret;

	ret = nf_conntrack_expect_pernet_init(net);
	if (ret < 0)
		goto err_expect;

	nf_conntrack_acct_pernet_init(net);
	nf_conntrack_tstamp_pernet_init(net);
	nf_conntrack_ecache_pernet_init(net);
	nf_conntrack_proto_pernet_init(net);

	return 0;

err_expect:
	free_percpu(net->ct.stat);
	return ret;
}

/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */

int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout)
{
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
		return -EPERM;

	__nf_ct_set_timeout(ct, timeout);

	if (test_bit(IPS_DYING_BIT, &ct->status))
		return -ETIME;

	return 0;
}
EXPORT_SYMBOL_GPL(__nf_ct_change_timeout);

void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off)
{
	unsigned int bit;

	/* Ignore these unchangeable bits */
	on &= ~IPS_UNCHANGEABLE_MASK;
	off &= ~IPS_UNCHANGEABLE_MASK;

	for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
		if (on & (1 << bit))
			set_bit(bit, &ct->status);
		else if (off & (1 << bit))
			clear_bit(bit, &ct->status);
	}
}
EXPORT_SYMBOL_GPL(__nf_ct_change_status);

int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status)
{
	unsigned long d;

	d = ct->status ^ status;

	if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
		/* unchangeable */
		return -EBUSY;

	if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
		/* SEEN_REPLY bit can only be set */
		return -EBUSY;

	if (d & IPS_ASSURED && !(status & IPS_ASSURED))
		/* ASSURED bit can only be set */
		return -EBUSY;

	__nf_ct_change_status(ct, status, 0);
	return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_change_status_common);
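
/* Usage sketch (not part of this file): both ctnetlink and the conntrack BPF
 * kfuncs are expected to funnel user requested status updates through
 * nf_ct_change_status_common() so that the "can only be set, never cleared"
 * rules above are enforced in one place, roughly:
 *
 *	err = nf_ct_change_status_common(ct, new_status);
 *	if (err)
 *		return err;
 *
 * where -EBUSY means the request tried to clear one of the set-only bits.
 */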