1/* 2 * IPVS An implementation of the IP virtual server support for the 3 * LINUX operating system. IPVS is now implemented as a module 4 * over the Netfilter framework. IPVS can be used to build a 5 * high-performance and highly available server based on a 6 * cluster of servers. 7 * 8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 9 * Peter Kese <peter.kese@ijs.si> 10 * Julian Anastasov <ja@ssi.bg> 11 * 12 * This program is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU General Public License 14 * as published by the Free Software Foundation; either version 15 * 2 of the License, or (at your option) any later version. 16 * 17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, 18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms 19 * and others. Many code here is taken from IP MASQ code of kernel 2.2. 20 * 21 * Changes: 22 * 23 */ 24 25#define KMSG_COMPONENT "IPVS" 26#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 27 28#include <linux/interrupt.h> 29#include <linux/in.h> 30#include <linux/net.h> 31#include <linux/kernel.h> 32#include <linux/module.h> 33#include <linux/vmalloc.h> 34#include <linux/proc_fs.h> /* for proc_net_* */ 35#include <linux/slab.h> 36#include <linux/seq_file.h> 37#include <linux/jhash.h> 38#include <linux/random.h> 39 40#include <net/net_namespace.h> 41#include <net/ip_vs.h> 42 43 44#ifndef CONFIG_IP_VS_TAB_BITS 45#define CONFIG_IP_VS_TAB_BITS 12 46#endif 47 48/* 49 * Connection hash size. Default is what was selected at compile time. 50*/ 51int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; 52module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); 53MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); 54 55/* size and mask values */ 56int ip_vs_conn_tab_size; 57int ip_vs_conn_tab_mask; 58 59/* 60 * Connection hash table: for input and output packets lookups of IPVS 61 */ 62static struct list_head *ip_vs_conn_tab; 63 64/* SLAB cache for IPVS connections */ 65static struct kmem_cache *ip_vs_conn_cachep __read_mostly; 66 67/* counter for current IPVS connections */ 68static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); 69 70/* counter for no client port connections */ 71static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); 72 73/* random value for IPVS connection hash */ 74static unsigned int ip_vs_conn_rnd; 75 76/* 77 * Fine locking granularity for big connection hash table 78 */ 79#define CT_LOCKARRAY_BITS 4 80#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) 81#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) 82 83struct ip_vs_aligned_lock 84{ 85 rwlock_t l; 86} __attribute__((__aligned__(SMP_CACHE_BYTES))); 87 88/* lock array for conn table */ 89static struct ip_vs_aligned_lock 90__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; 91 92static inline void ct_read_lock(unsigned key) 93{ 94 read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 95} 96 97static inline void ct_read_unlock(unsigned key) 98{ 99 read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 100} 101 102static inline void ct_write_lock(unsigned key) 103{ 104 write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 105} 106 107static inline void ct_write_unlock(unsigned key) 108{ 109 write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 110} 111 112static inline void ct_read_lock_bh(unsigned key) 113{ 114 read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 115} 116 117static inline void ct_read_unlock_bh(unsigned key) 118{ 119 read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 120} 121 122static inline void ct_write_lock_bh(unsigned key) 123{ 124 write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 125} 126 127static inline void ct_write_unlock_bh(unsigned key) 128{ 129 write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 130} 131 132 133/* 134 * Returns hash value for IPVS connection entry 135 */ 136static unsigned int ip_vs_conn_hashkey(int af, unsigned proto, 137 const union nf_inet_addr *addr, 138 __be16 port) 139{ 140#ifdef CONFIG_IP_VS_IPV6 141 if (af == AF_INET6) 142 return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), 143 (__force u32)port, proto, ip_vs_conn_rnd) 144 & ip_vs_conn_tab_mask; 145#endif 146 return jhash_3words((__force u32)addr->ip, (__force u32)port, proto, 147 ip_vs_conn_rnd) 148 & ip_vs_conn_tab_mask; 149} 150 151 152/* 153 * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. 154 * returns bool success. 155 */ 156static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) 157{ 158 unsigned hash; 159 int ret; 160 161 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 162 return 0; 163 164 /* Hash by protocol, client address and port */ 165 hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); 166 167 ct_write_lock(hash); 168 spin_lock(&cp->lock); 169 170 if (!(cp->flags & IP_VS_CONN_F_HASHED)) { 171 list_add(&cp->c_list, &ip_vs_conn_tab[hash]); 172 cp->flags |= IP_VS_CONN_F_HASHED; 173 atomic_inc(&cp->refcnt); 174 ret = 1; 175 } else { 176 pr_err("%s(): request for already hashed, called from %pF\n", 177 __func__, __builtin_return_address(0)); 178 ret = 0; 179 } 180 181 spin_unlock(&cp->lock); 182 ct_write_unlock(hash); 183 184 return ret; 185} 186 187 188/* 189 * UNhashes ip_vs_conn from ip_vs_conn_tab. 190 * returns bool success. 191 */ 192static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) 193{ 194 unsigned hash; 195 int ret; 196 197 /* unhash it and decrease its reference counter */ 198 hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); 199 200 ct_write_lock(hash); 201 spin_lock(&cp->lock); 202 203 if (cp->flags & IP_VS_CONN_F_HASHED) { 204 list_del(&cp->c_list); 205 cp->flags &= ~IP_VS_CONN_F_HASHED; 206 atomic_dec(&cp->refcnt); 207 ret = 1; 208 } else 209 ret = 0; 210 211 spin_unlock(&cp->lock); 212 ct_write_unlock(hash); 213 214 return ret; 215} 216 217 218/* 219 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 220 * Called for pkts coming from OUTside-to-INside. 221 * s_addr, s_port: pkt source address (foreign host) 222 * d_addr, d_port: pkt dest address (load balancer) 223 */ 224static inline struct ip_vs_conn *__ip_vs_conn_in_get 225(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, 226 const union nf_inet_addr *d_addr, __be16 d_port) 227{ 228 unsigned hash; 229 struct ip_vs_conn *cp; 230 231 hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); 232 233 ct_read_lock(hash); 234 235 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 236 if (cp->af == af && 237 ip_vs_addr_equal(af, s_addr, &cp->caddr) && 238 ip_vs_addr_equal(af, d_addr, &cp->vaddr) && 239 s_port == cp->cport && d_port == cp->vport && 240 ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && 241 protocol == cp->protocol) { 242 /* HIT */ 243 atomic_inc(&cp->refcnt); 244 ct_read_unlock(hash); 245 return cp; 246 } 247 } 248 249 ct_read_unlock(hash); 250 251 return NULL; 252} 253 254struct ip_vs_conn *ip_vs_conn_in_get 255(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, 256 const union nf_inet_addr *d_addr, __be16 d_port) 257{ 258 struct ip_vs_conn *cp; 259 260 cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port); 261 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) 262 cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr, 263 d_port); 264 265 IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", 266 ip_vs_proto_name(protocol), 267 IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), 268 IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), 269 cp ? "hit" : "not hit"); 270 271 return cp; 272} 273 274struct ip_vs_conn * 275ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, 276 struct ip_vs_protocol *pp, 277 const struct ip_vs_iphdr *iph, 278 unsigned int proto_off, int inverse) 279{ 280 __be16 _ports[2], *pptr; 281 282 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); 283 if (pptr == NULL) 284 return NULL; 285 286 if (likely(!inverse)) 287 return ip_vs_conn_in_get(af, iph->protocol, 288 &iph->saddr, pptr[0], 289 &iph->daddr, pptr[1]); 290 else 291 return ip_vs_conn_in_get(af, iph->protocol, 292 &iph->daddr, pptr[1], 293 &iph->saddr, pptr[0]); 294} 295EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); 296 297/* Get reference to connection template */ 298struct ip_vs_conn *ip_vs_ct_in_get 299(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, 300 const union nf_inet_addr *d_addr, __be16 d_port) 301{ 302 unsigned hash; 303 struct ip_vs_conn *cp; 304 305 hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); 306 307 ct_read_lock(hash); 308 309 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 310 if (cp->af == af && 311 ip_vs_addr_equal(af, s_addr, &cp->caddr) && 312 /* protocol should only be IPPROTO_IP if 313 * d_addr is a fwmark */ 314 ip_vs_addr_equal(protocol == IPPROTO_IP ? AF_UNSPEC : af, 315 d_addr, &cp->vaddr) && 316 s_port == cp->cport && d_port == cp->vport && 317 cp->flags & IP_VS_CONN_F_TEMPLATE && 318 protocol == cp->protocol) { 319 /* HIT */ 320 atomic_inc(&cp->refcnt); 321 goto out; 322 } 323 } 324 cp = NULL; 325 326 out: 327 ct_read_unlock(hash); 328 329 IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", 330 ip_vs_proto_name(protocol), 331 IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), 332 IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), 333 cp ? "hit" : "not hit"); 334 335 return cp; 336} 337 338/* 339 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 340 * Called for pkts coming from inside-to-OUTside. 341 * s_addr, s_port: pkt source address (inside host) 342 * d_addr, d_port: pkt dest address (foreign host) 343 */ 344struct ip_vs_conn *ip_vs_conn_out_get 345(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, 346 const union nf_inet_addr *d_addr, __be16 d_port) 347{ 348 unsigned hash; 349 struct ip_vs_conn *cp, *ret=NULL; 350 351 /* 352 * Check for "full" addressed entries 353 */ 354 hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port); 355 356 ct_read_lock(hash); 357 358 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 359 if (cp->af == af && 360 ip_vs_addr_equal(af, d_addr, &cp->caddr) && 361 ip_vs_addr_equal(af, s_addr, &cp->daddr) && 362 d_port == cp->cport && s_port == cp->dport && 363 protocol == cp->protocol) { 364 /* HIT */ 365 atomic_inc(&cp->refcnt); 366 ret = cp; 367 break; 368 } 369 } 370 371 ct_read_unlock(hash); 372 373 IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", 374 ip_vs_proto_name(protocol), 375 IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), 376 IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), 377 ret ? "hit" : "not hit"); 378 379 return ret; 380} 381 382struct ip_vs_conn * 383ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, 384 struct ip_vs_protocol *pp, 385 const struct ip_vs_iphdr *iph, 386 unsigned int proto_off, int inverse) 387{ 388 __be16 _ports[2], *pptr; 389 390 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); 391 if (pptr == NULL) 392 return NULL; 393 394 if (likely(!inverse)) 395 return ip_vs_conn_out_get(af, iph->protocol, 396 &iph->saddr, pptr[0], 397 &iph->daddr, pptr[1]); 398 else 399 return ip_vs_conn_out_get(af, iph->protocol, 400 &iph->daddr, pptr[1], 401 &iph->saddr, pptr[0]); 402} 403EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); 404 405/* 406 * Put back the conn and restart its timer with its timeout 407 */ 408void ip_vs_conn_put(struct ip_vs_conn *cp) 409{ 410 unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ? 411 0 : cp->timeout; 412 mod_timer(&cp->timer, jiffies+t); 413 414 __ip_vs_conn_put(cp); 415} 416 417 418/* 419 * Fill a no_client_port connection with a client port number 420 */ 421void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) 422{ 423 if (ip_vs_conn_unhash(cp)) { 424 spin_lock(&cp->lock); 425 if (cp->flags & IP_VS_CONN_F_NO_CPORT) { 426 atomic_dec(&ip_vs_conn_no_cport_cnt); 427 cp->flags &= ~IP_VS_CONN_F_NO_CPORT; 428 cp->cport = cport; 429 } 430 spin_unlock(&cp->lock); 431 432 /* hash on new dport */ 433 ip_vs_conn_hash(cp); 434 } 435} 436 437 438/* 439 * Bind a connection entry with the corresponding packet_xmit. 440 * Called by ip_vs_conn_new. 441 */ 442static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) 443{ 444 switch (IP_VS_FWD_METHOD(cp)) { 445 case IP_VS_CONN_F_MASQ: 446 cp->packet_xmit = ip_vs_nat_xmit; 447 break; 448 449 case IP_VS_CONN_F_TUNNEL: 450 cp->packet_xmit = ip_vs_tunnel_xmit; 451 break; 452 453 case IP_VS_CONN_F_DROUTE: 454 cp->packet_xmit = ip_vs_dr_xmit; 455 break; 456 457 case IP_VS_CONN_F_LOCALNODE: 458 cp->packet_xmit = ip_vs_null_xmit; 459 break; 460 461 case IP_VS_CONN_F_BYPASS: 462 cp->packet_xmit = ip_vs_bypass_xmit; 463 break; 464 } 465} 466 467#ifdef CONFIG_IP_VS_IPV6 468static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) 469{ 470 switch (IP_VS_FWD_METHOD(cp)) { 471 case IP_VS_CONN_F_MASQ: 472 cp->packet_xmit = ip_vs_nat_xmit_v6; 473 break; 474 475 case IP_VS_CONN_F_TUNNEL: 476 cp->packet_xmit = ip_vs_tunnel_xmit_v6; 477 break; 478 479 case IP_VS_CONN_F_DROUTE: 480 cp->packet_xmit = ip_vs_dr_xmit_v6; 481 break; 482 483 case IP_VS_CONN_F_LOCALNODE: 484 cp->packet_xmit = ip_vs_null_xmit; 485 break; 486 487 case IP_VS_CONN_F_BYPASS: 488 cp->packet_xmit = ip_vs_bypass_xmit_v6; 489 break; 490 } 491} 492#endif 493 494 495static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) 496{ 497 return atomic_read(&dest->activeconns) 498 + atomic_read(&dest->inactconns); 499} 500 501/* 502 * Bind a connection entry with a virtual service destination 503 * Called just after a new connection entry is created. 504 */ 505static inline void 506ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) 507{ 508 /* if dest is NULL, then return directly */ 509 if (!dest) 510 return; 511 512 /* Increase the refcnt counter of the dest */ 513 atomic_inc(&dest->refcnt); 514 515 /* Bind with the destination and its corresponding transmitter */ 516 if ((cp->flags & IP_VS_CONN_F_SYNC) && 517 (!(cp->flags & IP_VS_CONN_F_TEMPLATE))) 518 /* if the connection is not template and is created 519 * by sync, preserve the activity flag. 520 */ 521 cp->flags |= atomic_read(&dest->conn_flags) & 522 (~IP_VS_CONN_F_INACTIVE); 523 else 524 cp->flags |= atomic_read(&dest->conn_flags); 525 cp->dest = dest; 526 527 IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " 528 "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " 529 "dest->refcnt:%d\n", 530 ip_vs_proto_name(cp->protocol), 531 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), 532 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), 533 IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), 534 ip_vs_fwd_tag(cp), cp->state, 535 cp->flags, atomic_read(&cp->refcnt), 536 atomic_read(&dest->refcnt)); 537 538 /* Update the connection counters */ 539 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { 540 /* It is a normal connection, so increase the inactive 541 connection counter because it is in TCP SYNRECV 542 state (inactive) or other protocol inacive state */ 543 if ((cp->flags & IP_VS_CONN_F_SYNC) && 544 (!(cp->flags & IP_VS_CONN_F_INACTIVE))) 545 atomic_inc(&dest->activeconns); 546 else 547 atomic_inc(&dest->inactconns); 548 } else { 549 /* It is a persistent connection/template, so increase 550 the peristent connection counter */ 551 atomic_inc(&dest->persistconns); 552 } 553 554 if (dest->u_threshold != 0 && 555 ip_vs_dest_totalconns(dest) >= dest->u_threshold) 556 dest->flags |= IP_VS_DEST_F_OVERLOAD; 557} 558 559 560/* 561 * Check if there is a destination for the connection, if so 562 * bind the connection to the destination. 563 */ 564struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) 565{ 566 struct ip_vs_dest *dest; 567 568 if ((cp) && (!cp->dest)) { 569 dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport, 570 &cp->vaddr, cp->vport, 571 cp->protocol); 572 ip_vs_bind_dest(cp, dest); 573 return dest; 574 } else 575 return NULL; 576} 577 578 579/* 580 * Unbind a connection entry with its VS destination 581 * Called by the ip_vs_conn_expire function. 582 */ 583static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) 584{ 585 struct ip_vs_dest *dest = cp->dest; 586 587 if (!dest) 588 return; 589 590 IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " 591 "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " 592 "dest->refcnt:%d\n", 593 ip_vs_proto_name(cp->protocol), 594 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), 595 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), 596 IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), 597 ip_vs_fwd_tag(cp), cp->state, 598 cp->flags, atomic_read(&cp->refcnt), 599 atomic_read(&dest->refcnt)); 600 601 /* Update the connection counters */ 602 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { 603 /* It is a normal connection, so decrease the inactconns 604 or activeconns counter */ 605 if (cp->flags & IP_VS_CONN_F_INACTIVE) { 606 atomic_dec(&dest->inactconns); 607 } else { 608 atomic_dec(&dest->activeconns); 609 } 610 } else { 611 /* It is a persistent connection/template, so decrease 612 the peristent connection counter */ 613 atomic_dec(&dest->persistconns); 614 } 615 616 if (dest->l_threshold != 0) { 617 if (ip_vs_dest_totalconns(dest) < dest->l_threshold) 618 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 619 } else if (dest->u_threshold != 0) { 620 if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) 621 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 622 } else { 623 if (dest->flags & IP_VS_DEST_F_OVERLOAD) 624 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 625 } 626 627 /* 628 * Simply decrease the refcnt of the dest, because the 629 * dest will be either in service's destination list 630 * or in the trash. 631 */ 632 atomic_dec(&dest->refcnt); 633} 634 635 636/* 637 * Checking if the destination of a connection template is available. 638 * If available, return 1, otherwise invalidate this connection 639 * template and return 0. 640 */ 641int ip_vs_check_template(struct ip_vs_conn *ct) 642{ 643 struct ip_vs_dest *dest = ct->dest; 644 645 /* 646 * Checking the dest server status. 647 */ 648 if ((dest == NULL) || 649 !(dest->flags & IP_VS_DEST_F_AVAILABLE) || 650 (sysctl_ip_vs_expire_quiescent_template && 651 (atomic_read(&dest->weight) == 0))) { 652 IP_VS_DBG_BUF(9, "check_template: dest not available for " 653 "protocol %s s:%s:%d v:%s:%d " 654 "-> d:%s:%d\n", 655 ip_vs_proto_name(ct->protocol), 656 IP_VS_DBG_ADDR(ct->af, &ct->caddr), 657 ntohs(ct->cport), 658 IP_VS_DBG_ADDR(ct->af, &ct->vaddr), 659 ntohs(ct->vport), 660 IP_VS_DBG_ADDR(ct->af, &ct->daddr), 661 ntohs(ct->dport)); 662 663 /* 664 * Invalidate the connection template 665 */ 666 if (ct->vport != htons(0xffff)) { 667 if (ip_vs_conn_unhash(ct)) { 668 ct->dport = htons(0xffff); 669 ct->vport = htons(0xffff); 670 ct->cport = 0; 671 ip_vs_conn_hash(ct); 672 } 673 } 674 675 /* 676 * Simply decrease the refcnt of the template, 677 * don't restart its timer. 678 */ 679 atomic_dec(&ct->refcnt); 680 return 0; 681 } 682 return 1; 683} 684 685static void ip_vs_conn_expire(unsigned long data) 686{ 687 struct ip_vs_conn *cp = (struct ip_vs_conn *)data; 688 689 cp->timeout = 60*HZ; 690 691 /* 692 * hey, I'm using it 693 */ 694 atomic_inc(&cp->refcnt); 695 696 /* 697 * do I control anybody? 698 */ 699 if (atomic_read(&cp->n_control)) 700 goto expire_later; 701 702 /* 703 * unhash it if it is hashed in the conn table 704 */ 705 if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) 706 goto expire_later; 707 708 /* 709 * refcnt==1 implies I'm the only one referrer 710 */ 711 if (likely(atomic_read(&cp->refcnt) == 1)) { 712 /* delete the timer if it is activated by other users */ 713 if (timer_pending(&cp->timer)) 714 del_timer(&cp->timer); 715 716 /* does anybody control me? */ 717 if (cp->control) 718 ip_vs_control_del(cp); 719 720 if (unlikely(cp->app != NULL)) 721 ip_vs_unbind_app(cp); 722 ip_vs_unbind_dest(cp); 723 if (cp->flags & IP_VS_CONN_F_NO_CPORT) 724 atomic_dec(&ip_vs_conn_no_cport_cnt); 725 atomic_dec(&ip_vs_conn_count); 726 727 kmem_cache_free(ip_vs_conn_cachep, cp); 728 return; 729 } 730 731 /* hash it back to the table */ 732 ip_vs_conn_hash(cp); 733 734 expire_later: 735 IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", 736 atomic_read(&cp->refcnt)-1, 737 atomic_read(&cp->n_control)); 738 739 ip_vs_conn_put(cp); 740} 741 742 743void ip_vs_conn_expire_now(struct ip_vs_conn *cp) 744{ 745 if (del_timer(&cp->timer)) 746 mod_timer(&cp->timer, jiffies); 747} 748 749 750/* 751 * Create a new connection entry and hash it into the ip_vs_conn_tab 752 */ 753struct ip_vs_conn * 754ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, 755 const union nf_inet_addr *vaddr, __be16 vport, 756 const union nf_inet_addr *daddr, __be16 dport, unsigned flags, 757 struct ip_vs_dest *dest) 758{ 759 struct ip_vs_conn *cp; 760 struct ip_vs_protocol *pp = ip_vs_proto_get(proto); 761 762 cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); 763 if (cp == NULL) { 764 IP_VS_ERR_RL("%s(): no memory\n", __func__); 765 return NULL; 766 } 767 768 INIT_LIST_HEAD(&cp->c_list); 769 setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); 770 cp->af = af; 771 cp->protocol = proto; 772 ip_vs_addr_copy(af, &cp->caddr, caddr); 773 cp->cport = cport; 774 ip_vs_addr_copy(af, &cp->vaddr, vaddr); 775 cp->vport = vport; 776 /* proto should only be IPPROTO_IP if d_addr is a fwmark */ 777 ip_vs_addr_copy(proto == IPPROTO_IP ? AF_UNSPEC : af, 778 &cp->daddr, daddr); 779 cp->dport = dport; 780 cp->flags = flags; 781 spin_lock_init(&cp->lock); 782 783 /* 784 * Set the entry is referenced by the current thread before hashing 785 * it in the table, so that other thread run ip_vs_random_dropentry 786 * but cannot drop this entry. 787 */ 788 atomic_set(&cp->refcnt, 1); 789 790 atomic_set(&cp->n_control, 0); 791 atomic_set(&cp->in_pkts, 0); 792 793 atomic_inc(&ip_vs_conn_count); 794 if (flags & IP_VS_CONN_F_NO_CPORT) 795 atomic_inc(&ip_vs_conn_no_cport_cnt); 796 797 /* Bind the connection with a destination server */ 798 ip_vs_bind_dest(cp, dest); 799 800 /* Set its state and timeout */ 801 cp->state = 0; 802 cp->timeout = 3*HZ; 803 804 /* Bind its packet transmitter */ 805#ifdef CONFIG_IP_VS_IPV6 806 if (af == AF_INET6) 807 ip_vs_bind_xmit_v6(cp); 808 else 809#endif 810 ip_vs_bind_xmit(cp); 811 812 if (unlikely(pp && atomic_read(&pp->appcnt))) 813 ip_vs_bind_app(cp, pp); 814 815 /* Hash it in the ip_vs_conn_tab finally */ 816 ip_vs_conn_hash(cp); 817 818 return cp; 819} 820 821 822/* 823 * /proc/net/ip_vs_conn entries 824 */ 825#ifdef CONFIG_PROC_FS 826 827static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) 828{ 829 int idx; 830 struct ip_vs_conn *cp; 831 832 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { 833 ct_read_lock_bh(idx); 834 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 835 if (pos-- == 0) { 836 seq->private = &ip_vs_conn_tab[idx]; 837 return cp; 838 } 839 } 840 ct_read_unlock_bh(idx); 841 } 842 843 return NULL; 844} 845 846static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) 847{ 848 seq->private = NULL; 849 return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; 850} 851 852static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) 853{ 854 struct ip_vs_conn *cp = v; 855 struct list_head *e, *l = seq->private; 856 int idx; 857 858 ++*pos; 859 if (v == SEQ_START_TOKEN) 860 return ip_vs_conn_array(seq, 0); 861 862 /* more on same hash chain? */ 863 if ((e = cp->c_list.next) != l) 864 return list_entry(e, struct ip_vs_conn, c_list); 865 866 idx = l - ip_vs_conn_tab; 867 ct_read_unlock_bh(idx); 868 869 while (++idx < ip_vs_conn_tab_size) { 870 ct_read_lock_bh(idx); 871 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 872 seq->private = &ip_vs_conn_tab[idx]; 873 return cp; 874 } 875 ct_read_unlock_bh(idx); 876 } 877 seq->private = NULL; 878 return NULL; 879} 880 881static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) 882{ 883 struct list_head *l = seq->private; 884 885 if (l) 886 ct_read_unlock_bh(l - ip_vs_conn_tab); 887} 888 889static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) 890{ 891 892 if (v == SEQ_START_TOKEN) 893 seq_puts(seq, 894 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); 895 else { 896 const struct ip_vs_conn *cp = v; 897 898#ifdef CONFIG_IP_VS_IPV6 899 if (cp->af == AF_INET6) 900 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %7lu\n", 901 ip_vs_proto_name(cp->protocol), 902 &cp->caddr.in6, ntohs(cp->cport), 903 &cp->vaddr.in6, ntohs(cp->vport), 904 &cp->daddr.in6, ntohs(cp->dport), 905 ip_vs_state_name(cp->protocol, cp->state), 906 (cp->timer.expires-jiffies)/HZ); 907 else 908#endif 909 seq_printf(seq, 910 "%-3s %08X %04X %08X %04X" 911 " %08X %04X %-11s %7lu\n", 912 ip_vs_proto_name(cp->protocol), 913 ntohl(cp->caddr.ip), ntohs(cp->cport), 914 ntohl(cp->vaddr.ip), ntohs(cp->vport), 915 ntohl(cp->daddr.ip), ntohs(cp->dport), 916 ip_vs_state_name(cp->protocol, cp->state), 917 (cp->timer.expires-jiffies)/HZ); 918 } 919 return 0; 920} 921 922static const struct seq_operations ip_vs_conn_seq_ops = { 923 .start = ip_vs_conn_seq_start, 924 .next = ip_vs_conn_seq_next, 925 .stop = ip_vs_conn_seq_stop, 926 .show = ip_vs_conn_seq_show, 927}; 928 929static int ip_vs_conn_open(struct inode *inode, struct file *file) 930{ 931 return seq_open(file, &ip_vs_conn_seq_ops); 932} 933 934static const struct file_operations ip_vs_conn_fops = { 935 .owner = THIS_MODULE, 936 .open = ip_vs_conn_open, 937 .read = seq_read, 938 .llseek = seq_lseek, 939 .release = seq_release, 940}; 941 942static const char *ip_vs_origin_name(unsigned flags) 943{ 944 if (flags & IP_VS_CONN_F_SYNC) 945 return "SYNC"; 946 else 947 return "LOCAL"; 948} 949 950static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) 951{ 952 953 if (v == SEQ_START_TOKEN) 954 seq_puts(seq, 955 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); 956 else { 957 const struct ip_vs_conn *cp = v; 958 959#ifdef CONFIG_IP_VS_IPV6 960 if (cp->af == AF_INET6) 961 seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %-6s %7lu\n", 962 ip_vs_proto_name(cp->protocol), 963 &cp->caddr.in6, ntohs(cp->cport), 964 &cp->vaddr.in6, ntohs(cp->vport), 965 &cp->daddr.in6, ntohs(cp->dport), 966 ip_vs_state_name(cp->protocol, cp->state), 967 ip_vs_origin_name(cp->flags), 968 (cp->timer.expires-jiffies)/HZ); 969 else 970#endif 971 seq_printf(seq, 972 "%-3s %08X %04X %08X %04X " 973 "%08X %04X %-11s %-6s %7lu\n", 974 ip_vs_proto_name(cp->protocol), 975 ntohl(cp->caddr.ip), ntohs(cp->cport), 976 ntohl(cp->vaddr.ip), ntohs(cp->vport), 977 ntohl(cp->daddr.ip), ntohs(cp->dport), 978 ip_vs_state_name(cp->protocol, cp->state), 979 ip_vs_origin_name(cp->flags), 980 (cp->timer.expires-jiffies)/HZ); 981 } 982 return 0; 983} 984 985static const struct seq_operations ip_vs_conn_sync_seq_ops = { 986 .start = ip_vs_conn_seq_start, 987 .next = ip_vs_conn_seq_next, 988 .stop = ip_vs_conn_seq_stop, 989 .show = ip_vs_conn_sync_seq_show, 990}; 991 992static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) 993{ 994 return seq_open(file, &ip_vs_conn_sync_seq_ops); 995} 996 997static const struct file_operations ip_vs_conn_sync_fops = { 998 .owner = THIS_MODULE, 999 .open = ip_vs_conn_sync_open, 1000 .read = seq_read, 1001 .llseek = seq_lseek, 1002 .release = seq_release, 1003}; 1004 1005#endif 1006 1007 1008/* 1009 * Randomly drop connection entries before running out of memory 1010 */ 1011static inline int todrop_entry(struct ip_vs_conn *cp) 1012{ 1013 /* 1014 * The drop rate array needs tuning for real environments. 1015 * Called from timer bh only => no locking 1016 */ 1017 static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; 1018 static char todrop_counter[9] = {0}; 1019 int i; 1020 1021 /* if the conn entry hasn't lasted for 60 seconds, don't drop it. 1022 This will leave enough time for normal connection to get 1023 through. */ 1024 if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) 1025 return 0; 1026 1027 /* Don't drop the entry if its number of incoming packets is not 1028 located in [0, 8] */ 1029 i = atomic_read(&cp->in_pkts); 1030 if (i > 8 || i < 0) return 0; 1031 1032 if (!todrop_rate[i]) return 0; 1033 if (--todrop_counter[i] > 0) return 0; 1034 1035 todrop_counter[i] = todrop_rate[i]; 1036 return 1; 1037} 1038 1039/* Called from keventd and must protect itself from softirqs */ 1040void ip_vs_random_dropentry(void) 1041{ 1042 int idx; 1043 struct ip_vs_conn *cp; 1044 1045 /* 1046 * Randomly scan 1/32 of the whole table every second 1047 */ 1048 for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { 1049 unsigned hash = net_random() & ip_vs_conn_tab_mask; 1050 1051 /* 1052 * Lock is actually needed in this loop. 1053 */ 1054 ct_write_lock_bh(hash); 1055 1056 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 1057 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 1058 /* connection template */ 1059 continue; 1060 1061 if (cp->protocol == IPPROTO_TCP) { 1062 switch(cp->state) { 1063 case IP_VS_TCP_S_SYN_RECV: 1064 case IP_VS_TCP_S_SYNACK: 1065 break; 1066 1067 case IP_VS_TCP_S_ESTABLISHED: 1068 if (todrop_entry(cp)) 1069 break; 1070 continue; 1071 1072 default: 1073 continue; 1074 } 1075 } else { 1076 if (!todrop_entry(cp)) 1077 continue; 1078 } 1079 1080 IP_VS_DBG(4, "del connection\n"); 1081 ip_vs_conn_expire_now(cp); 1082 if (cp->control) { 1083 IP_VS_DBG(4, "del conn template\n"); 1084 ip_vs_conn_expire_now(cp->control); 1085 } 1086 } 1087 ct_write_unlock_bh(hash); 1088 } 1089} 1090 1091 1092/* 1093 * Flush all the connection entries in the ip_vs_conn_tab 1094 */ 1095static void ip_vs_conn_flush(void) 1096{ 1097 int idx; 1098 struct ip_vs_conn *cp; 1099 1100 flush_again: 1101 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { 1102 /* 1103 * Lock is actually needed in this loop. 1104 */ 1105 ct_write_lock_bh(idx); 1106 1107 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 1108 1109 IP_VS_DBG(4, "del connection\n"); 1110 ip_vs_conn_expire_now(cp); 1111 if (cp->control) { 1112 IP_VS_DBG(4, "del conn template\n"); 1113 ip_vs_conn_expire_now(cp->control); 1114 } 1115 } 1116 ct_write_unlock_bh(idx); 1117 } 1118 1119 /* the counter may be not NULL, because maybe some conn entries 1120 are run by slow timer handler or unhashed but still referred */ 1121 if (atomic_read(&ip_vs_conn_count) != 0) { 1122 schedule(); 1123 goto flush_again; 1124 } 1125} 1126 1127 1128int __init ip_vs_conn_init(void) 1129{ 1130 int idx; 1131 1132 /* Compute size and mask */ 1133 ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; 1134 ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; 1135 1136 /* 1137 * Allocate the connection hash table and initialize its list heads 1138 */ 1139 ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * 1140 sizeof(struct list_head)); 1141 if (!ip_vs_conn_tab) 1142 return -ENOMEM; 1143 1144 /* Allocate ip_vs_conn slab cache */ 1145 ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", 1146 sizeof(struct ip_vs_conn), 0, 1147 SLAB_HWCACHE_ALIGN, NULL); 1148 if (!ip_vs_conn_cachep) { 1149 vfree(ip_vs_conn_tab); 1150 return -ENOMEM; 1151 } 1152 1153 pr_info("Connection hash table configured " 1154 "(size=%d, memory=%ldKbytes)\n", 1155 ip_vs_conn_tab_size, 1156 (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); 1157 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", 1158 sizeof(struct ip_vs_conn)); 1159 1160 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { 1161 INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); 1162 } 1163 1164 for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { 1165 rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); 1166 } 1167 1168 proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); 1169 proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); 1170 1171 /* calculate the random value for connection hash */ 1172 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); 1173 1174 return 0; 1175} 1176 1177 1178void ip_vs_conn_cleanup(void) 1179{ 1180 /* flush all the connection entries first */ 1181 ip_vs_conn_flush(); 1182 1183 /* Release the empty cache */ 1184 kmem_cache_destroy(ip_vs_conn_cachep); 1185 proc_net_remove(&init_net, "ip_vs_conn"); 1186 proc_net_remove(&init_net, "ip_vs_conn_sync"); 1187 vfree(ip_vs_conn_tab); 1188} 1189