1/* 2 * IPVS An implementation of the IP virtual server support for the 3 * LINUX operating system. IPVS is now implemented as a module 4 * over the Netfilter framework. IPVS can be used to build a 5 * high-performance and highly available server based on a 6 * cluster of servers. 7 * 8 * Version: $Id: ip_vs_conn.c,v 1.1.1.1 2007/08/03 18:53:51 Exp $ 9 * 10 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 11 * Peter Kese <peter.kese@ijs.si> 12 * Julian Anastasov <ja@ssi.bg> 13 * 14 * This program is free software; you can redistribute it and/or 15 * modify it under the terms of the GNU General Public License 16 * as published by the Free Software Foundation; either version 17 * 2 of the License, or (at your option) any later version. 18 * 19 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, 20 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms 21 * and others. Many code here is taken from IP MASQ code of kernel 2.2. 22 * 23 * Changes: 24 * 25 */ 26 27#include <linux/interrupt.h> 28#include <linux/in.h> 29#include <linux/net.h> 30#include <linux/kernel.h> 31#include <linux/module.h> 32#include <linux/vmalloc.h> 33#include <linux/proc_fs.h> /* for proc_net_* */ 34#include <linux/seq_file.h> 35#include <linux/jhash.h> 36#include <linux/random.h> 37 38#include <net/ip_vs.h> 39 40 41/* 42 * Connection hash table: for input and output packets lookups of IPVS 43 */ 44static struct list_head *ip_vs_conn_tab; 45 46/* SLAB cache for IPVS connections */ 47static struct kmem_cache *ip_vs_conn_cachep __read_mostly; 48 49/* counter for current IPVS connections */ 50static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); 51 52/* counter for no client port connections */ 53static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); 54 55/* random value for IPVS connection hash */ 56static unsigned int ip_vs_conn_rnd; 57 58/* 59 * Fine locking granularity for big connection hash table 60 */ 61#define CT_LOCKARRAY_BITS 4 62#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) 63#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) 64 65struct ip_vs_aligned_lock 66{ 67 rwlock_t l; 68} __attribute__((__aligned__(SMP_CACHE_BYTES))); 69 70/* lock array for conn table */ 71static struct ip_vs_aligned_lock 72__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; 73 74static inline void ct_read_lock(unsigned key) 75{ 76 read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 77} 78 79static inline void ct_read_unlock(unsigned key) 80{ 81 read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 82} 83 84static inline void ct_write_lock(unsigned key) 85{ 86 write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 87} 88 89static inline void ct_write_unlock(unsigned key) 90{ 91 write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 92} 93 94static inline void ct_read_lock_bh(unsigned key) 95{ 96 read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 97} 98 99static inline void ct_read_unlock_bh(unsigned key) 100{ 101 read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 102} 103 104static inline void ct_write_lock_bh(unsigned key) 105{ 106 write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 107} 108 109static inline void ct_write_unlock_bh(unsigned key) 110{ 111 write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); 112} 113 114 115/* 116 * Returns hash value for IPVS connection entry 117 */ 118static unsigned int ip_vs_conn_hashkey(unsigned proto, __be32 addr, __be16 port) 119{ 120 return jhash_3words((__force u32)addr, (__force u32)port, proto, ip_vs_conn_rnd) 121 & IP_VS_CONN_TAB_MASK; 122} 123 124 125/* 126 * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. 127 * returns bool success. 128 */ 129static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) 130{ 131 unsigned hash; 132 int ret; 133 134 /* Hash by protocol, client address and port */ 135 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); 136 137 ct_write_lock(hash); 138 139 if (!(cp->flags & IP_VS_CONN_F_HASHED)) { 140 list_add(&cp->c_list, &ip_vs_conn_tab[hash]); 141 cp->flags |= IP_VS_CONN_F_HASHED; 142 atomic_inc(&cp->refcnt); 143 ret = 1; 144 } else { 145 IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " 146 "called from %p\n", __builtin_return_address(0)); 147 ret = 0; 148 } 149 150 ct_write_unlock(hash); 151 152 return ret; 153} 154 155 156/* 157 * UNhashes ip_vs_conn from ip_vs_conn_tab. 158 * returns bool success. 159 */ 160static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) 161{ 162 unsigned hash; 163 int ret; 164 165 /* unhash it and decrease its reference counter */ 166 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); 167 168 ct_write_lock(hash); 169 170 if (cp->flags & IP_VS_CONN_F_HASHED) { 171 list_del(&cp->c_list); 172 cp->flags &= ~IP_VS_CONN_F_HASHED; 173 atomic_dec(&cp->refcnt); 174 ret = 1; 175 } else 176 ret = 0; 177 178 ct_write_unlock(hash); 179 180 return ret; 181} 182 183 184/* 185 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 186 * Called for pkts coming from OUTside-to-INside. 187 * s_addr, s_port: pkt source address (foreign host) 188 * d_addr, d_port: pkt dest address (load balancer) 189 */ 190static inline struct ip_vs_conn *__ip_vs_conn_in_get 191(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) 192{ 193 unsigned hash; 194 struct ip_vs_conn *cp; 195 196 hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); 197 198 ct_read_lock(hash); 199 200 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 201 if (s_addr==cp->caddr && s_port==cp->cport && 202 d_port==cp->vport && d_addr==cp->vaddr && 203 ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && 204 protocol==cp->protocol) { 205 /* HIT */ 206 atomic_inc(&cp->refcnt); 207 ct_read_unlock(hash); 208 return cp; 209 } 210 } 211 212 ct_read_unlock(hash); 213 214 return NULL; 215} 216 217struct ip_vs_conn *ip_vs_conn_in_get 218(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) 219{ 220 struct ip_vs_conn *cp; 221 222 cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port); 223 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) 224 cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); 225 226 IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 227 ip_vs_proto_name(protocol), 228 NIPQUAD(s_addr), ntohs(s_port), 229 NIPQUAD(d_addr), ntohs(d_port), 230 cp?"hit":"not hit"); 231 232 return cp; 233} 234 235/* Get reference to connection template */ 236struct ip_vs_conn *ip_vs_ct_in_get 237(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) 238{ 239 unsigned hash; 240 struct ip_vs_conn *cp; 241 242 hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); 243 244 ct_read_lock(hash); 245 246 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 247 if (s_addr==cp->caddr && s_port==cp->cport && 248 d_port==cp->vport && d_addr==cp->vaddr && 249 cp->flags & IP_VS_CONN_F_TEMPLATE && 250 protocol==cp->protocol) { 251 /* HIT */ 252 atomic_inc(&cp->refcnt); 253 goto out; 254 } 255 } 256 cp = NULL; 257 258 out: 259 ct_read_unlock(hash); 260 261 IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 262 ip_vs_proto_name(protocol), 263 NIPQUAD(s_addr), ntohs(s_port), 264 NIPQUAD(d_addr), ntohs(d_port), 265 cp?"hit":"not hit"); 266 267 return cp; 268} 269 270/* 271 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. 272 * Called for pkts coming from inside-to-OUTside. 273 * s_addr, s_port: pkt source address (inside host) 274 * d_addr, d_port: pkt dest address (foreign host) 275 */ 276struct ip_vs_conn *ip_vs_conn_out_get 277(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) 278{ 279 unsigned hash; 280 struct ip_vs_conn *cp, *ret=NULL; 281 282 /* 283 * Check for "full" addressed entries 284 */ 285 hash = ip_vs_conn_hashkey(protocol, d_addr, d_port); 286 287 ct_read_lock(hash); 288 289 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 290 if (d_addr == cp->caddr && d_port == cp->cport && 291 s_port == cp->dport && s_addr == cp->daddr && 292 protocol == cp->protocol) { 293 /* HIT */ 294 atomic_inc(&cp->refcnt); 295 ret = cp; 296 break; 297 } 298 } 299 300 ct_read_unlock(hash); 301 302 IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 303 ip_vs_proto_name(protocol), 304 NIPQUAD(s_addr), ntohs(s_port), 305 NIPQUAD(d_addr), ntohs(d_port), 306 ret?"hit":"not hit"); 307 308 return ret; 309} 310 311 312/* 313 * Put back the conn and restart its timer with its timeout 314 */ 315void ip_vs_conn_put(struct ip_vs_conn *cp) 316{ 317 /* reset it expire in its timeout */ 318 mod_timer(&cp->timer, jiffies+cp->timeout); 319 320 __ip_vs_conn_put(cp); 321} 322 323 324/* 325 * Fill a no_client_port connection with a client port number 326 */ 327void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) 328{ 329 if (ip_vs_conn_unhash(cp)) { 330 spin_lock(&cp->lock); 331 if (cp->flags & IP_VS_CONN_F_NO_CPORT) { 332 atomic_dec(&ip_vs_conn_no_cport_cnt); 333 cp->flags &= ~IP_VS_CONN_F_NO_CPORT; 334 cp->cport = cport; 335 } 336 spin_unlock(&cp->lock); 337 338 /* hash on new dport */ 339 ip_vs_conn_hash(cp); 340 } 341} 342 343 344/* 345 * Bind a connection entry with the corresponding packet_xmit. 346 * Called by ip_vs_conn_new. 347 */ 348static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) 349{ 350 switch (IP_VS_FWD_METHOD(cp)) { 351 case IP_VS_CONN_F_MASQ: 352 cp->packet_xmit = ip_vs_nat_xmit; 353 break; 354 355 case IP_VS_CONN_F_TUNNEL: 356 cp->packet_xmit = ip_vs_tunnel_xmit; 357 break; 358 359 case IP_VS_CONN_F_DROUTE: 360 cp->packet_xmit = ip_vs_dr_xmit; 361 break; 362 363 case IP_VS_CONN_F_LOCALNODE: 364 cp->packet_xmit = ip_vs_null_xmit; 365 break; 366 367 case IP_VS_CONN_F_BYPASS: 368 cp->packet_xmit = ip_vs_bypass_xmit; 369 break; 370 } 371} 372 373 374static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) 375{ 376 return atomic_read(&dest->activeconns) 377 + atomic_read(&dest->inactconns); 378} 379 380/* 381 * Bind a connection entry with a virtual service destination 382 * Called just after a new connection entry is created. 383 */ 384static inline void 385ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) 386{ 387 /* if dest is NULL, then return directly */ 388 if (!dest) 389 return; 390 391 /* Increase the refcnt counter of the dest */ 392 atomic_inc(&dest->refcnt); 393 394 /* Bind with the destination and its corresponding transmitter */ 395 cp->flags |= atomic_read(&dest->conn_flags); 396 cp->dest = dest; 397 398 IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 399 "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " 400 "dest->refcnt:%d\n", 401 ip_vs_proto_name(cp->protocol), 402 NIPQUAD(cp->caddr), ntohs(cp->cport), 403 NIPQUAD(cp->vaddr), ntohs(cp->vport), 404 NIPQUAD(cp->daddr), ntohs(cp->dport), 405 ip_vs_fwd_tag(cp), cp->state, 406 cp->flags, atomic_read(&cp->refcnt), 407 atomic_read(&dest->refcnt)); 408 409 /* Update the connection counters */ 410 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { 411 /* It is a normal connection, so increase the inactive 412 connection counter because it is in TCP SYNRECV 413 state (inactive) or other protocol inacive state */ 414 atomic_inc(&dest->inactconns); 415 } else { 416 /* It is a persistent connection/template, so increase 417 the peristent connection counter */ 418 atomic_inc(&dest->persistconns); 419 } 420 421 if (dest->u_threshold != 0 && 422 ip_vs_dest_totalconns(dest) >= dest->u_threshold) 423 dest->flags |= IP_VS_DEST_F_OVERLOAD; 424} 425 426 427/* 428 * Unbind a connection entry with its VS destination 429 * Called by the ip_vs_conn_expire function. 430 */ 431static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) 432{ 433 struct ip_vs_dest *dest = cp->dest; 434 435 if (!dest) 436 return; 437 438 IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 439 "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " 440 "dest->refcnt:%d\n", 441 ip_vs_proto_name(cp->protocol), 442 NIPQUAD(cp->caddr), ntohs(cp->cport), 443 NIPQUAD(cp->vaddr), ntohs(cp->vport), 444 NIPQUAD(cp->daddr), ntohs(cp->dport), 445 ip_vs_fwd_tag(cp), cp->state, 446 cp->flags, atomic_read(&cp->refcnt), 447 atomic_read(&dest->refcnt)); 448 449 /* Update the connection counters */ 450 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { 451 /* It is a normal connection, so decrease the inactconns 452 or activeconns counter */ 453 if (cp->flags & IP_VS_CONN_F_INACTIVE) { 454 atomic_dec(&dest->inactconns); 455 } else { 456 atomic_dec(&dest->activeconns); 457 } 458 } else { 459 /* It is a persistent connection/template, so decrease 460 the peristent connection counter */ 461 atomic_dec(&dest->persistconns); 462 } 463 464 if (dest->l_threshold != 0) { 465 if (ip_vs_dest_totalconns(dest) < dest->l_threshold) 466 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 467 } else if (dest->u_threshold != 0) { 468 if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) 469 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 470 } else { 471 if (dest->flags & IP_VS_DEST_F_OVERLOAD) 472 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 473 } 474 475 /* 476 * Simply decrease the refcnt of the dest, because the 477 * dest will be either in service's destination list 478 * or in the trash. 479 */ 480 atomic_dec(&dest->refcnt); 481} 482 483 484/* 485 * Checking if the destination of a connection template is available. 486 * If available, return 1, otherwise invalidate this connection 487 * template and return 0. 488 */ 489int ip_vs_check_template(struct ip_vs_conn *ct) 490{ 491 struct ip_vs_dest *dest = ct->dest; 492 493 /* 494 * Checking the dest server status. 495 */ 496 if ((dest == NULL) || 497 !(dest->flags & IP_VS_DEST_F_AVAILABLE) || 498 (sysctl_ip_vs_expire_quiescent_template && 499 (atomic_read(&dest->weight) == 0))) { 500 IP_VS_DBG(9, "check_template: dest not available for " 501 "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 502 "-> d:%u.%u.%u.%u:%d\n", 503 ip_vs_proto_name(ct->protocol), 504 NIPQUAD(ct->caddr), ntohs(ct->cport), 505 NIPQUAD(ct->vaddr), ntohs(ct->vport), 506 NIPQUAD(ct->daddr), ntohs(ct->dport)); 507 508 /* 509 * Invalidate the connection template 510 */ 511 if (ct->vport != htons(0xffff)) { 512 if (ip_vs_conn_unhash(ct)) { 513 ct->dport = htons(0xffff); 514 ct->vport = htons(0xffff); 515 ct->cport = 0; 516 ip_vs_conn_hash(ct); 517 } 518 } 519 520 /* 521 * Simply decrease the refcnt of the template, 522 * don't restart its timer. 523 */ 524 atomic_dec(&ct->refcnt); 525 return 0; 526 } 527 return 1; 528} 529 530static void ip_vs_conn_expire(unsigned long data) 531{ 532 struct ip_vs_conn *cp = (struct ip_vs_conn *)data; 533 534 cp->timeout = 60*HZ; 535 536 /* 537 * hey, I'm using it 538 */ 539 atomic_inc(&cp->refcnt); 540 541 /* 542 * do I control anybody? 543 */ 544 if (atomic_read(&cp->n_control)) 545 goto expire_later; 546 547 /* 548 * unhash it if it is hashed in the conn table 549 */ 550 if (!ip_vs_conn_unhash(cp)) 551 goto expire_later; 552 553 /* 554 * refcnt==1 implies I'm the only one referrer 555 */ 556 if (likely(atomic_read(&cp->refcnt) == 1)) { 557 /* delete the timer if it is activated by other users */ 558 if (timer_pending(&cp->timer)) 559 del_timer(&cp->timer); 560 561 /* does anybody control me? */ 562 if (cp->control) 563 ip_vs_control_del(cp); 564 565 if (unlikely(cp->app != NULL)) 566 ip_vs_unbind_app(cp); 567 ip_vs_unbind_dest(cp); 568 if (cp->flags & IP_VS_CONN_F_NO_CPORT) 569 atomic_dec(&ip_vs_conn_no_cport_cnt); 570 atomic_dec(&ip_vs_conn_count); 571 572 kmem_cache_free(ip_vs_conn_cachep, cp); 573 return; 574 } 575 576 /* hash it back to the table */ 577 ip_vs_conn_hash(cp); 578 579 expire_later: 580 IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", 581 atomic_read(&cp->refcnt)-1, 582 atomic_read(&cp->n_control)); 583 584 ip_vs_conn_put(cp); 585} 586 587 588void ip_vs_conn_expire_now(struct ip_vs_conn *cp) 589{ 590 if (del_timer(&cp->timer)) 591 mod_timer(&cp->timer, jiffies); 592} 593 594 595/* 596 * Create a new connection entry and hash it into the ip_vs_conn_tab 597 */ 598struct ip_vs_conn * 599ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport, 600 __be32 daddr, __be16 dport, unsigned flags, 601 struct ip_vs_dest *dest) 602{ 603 struct ip_vs_conn *cp; 604 struct ip_vs_protocol *pp = ip_vs_proto_get(proto); 605 606 cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); 607 if (cp == NULL) { 608 IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); 609 return NULL; 610 } 611 612 INIT_LIST_HEAD(&cp->c_list); 613 init_timer(&cp->timer); 614 cp->timer.data = (unsigned long)cp; 615 cp->timer.function = ip_vs_conn_expire; 616 cp->protocol = proto; 617 cp->caddr = caddr; 618 cp->cport = cport; 619 cp->vaddr = vaddr; 620 cp->vport = vport; 621 cp->daddr = daddr; 622 cp->dport = dport; 623 cp->flags = flags; 624 spin_lock_init(&cp->lock); 625 626 /* 627 * Set the entry is referenced by the current thread before hashing 628 * it in the table, so that other thread run ip_vs_random_dropentry 629 * but cannot drop this entry. 630 */ 631 atomic_set(&cp->refcnt, 1); 632 633 atomic_set(&cp->n_control, 0); 634 atomic_set(&cp->in_pkts, 0); 635 636 atomic_inc(&ip_vs_conn_count); 637 if (flags & IP_VS_CONN_F_NO_CPORT) 638 atomic_inc(&ip_vs_conn_no_cport_cnt); 639 640 /* Bind the connection with a destination server */ 641 ip_vs_bind_dest(cp, dest); 642 643 /* Set its state and timeout */ 644 cp->state = 0; 645 cp->timeout = 3*HZ; 646 647 /* Bind its packet transmitter */ 648 ip_vs_bind_xmit(cp); 649 650 if (unlikely(pp && atomic_read(&pp->appcnt))) 651 ip_vs_bind_app(cp, pp); 652 653 /* Hash it in the ip_vs_conn_tab finally */ 654 ip_vs_conn_hash(cp); 655 656 return cp; 657} 658 659 660/* 661 * /proc/net/ip_vs_conn entries 662 */ 663#ifdef CONFIG_PROC_FS 664 665static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) 666{ 667 int idx; 668 struct ip_vs_conn *cp; 669 670 for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { 671 ct_read_lock_bh(idx); 672 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 673 if (pos-- == 0) { 674 seq->private = &ip_vs_conn_tab[idx]; 675 return cp; 676 } 677 } 678 ct_read_unlock_bh(idx); 679 } 680 681 return NULL; 682} 683 684static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) 685{ 686 seq->private = NULL; 687 return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; 688} 689 690static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) 691{ 692 struct ip_vs_conn *cp = v; 693 struct list_head *e, *l = seq->private; 694 int idx; 695 696 ++*pos; 697 if (v == SEQ_START_TOKEN) 698 return ip_vs_conn_array(seq, 0); 699 700 /* more on same hash chain? */ 701 if ((e = cp->c_list.next) != l) 702 return list_entry(e, struct ip_vs_conn, c_list); 703 704 idx = l - ip_vs_conn_tab; 705 ct_read_unlock_bh(idx); 706 707 while (++idx < IP_VS_CONN_TAB_SIZE) { 708 ct_read_lock_bh(idx); 709 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 710 seq->private = &ip_vs_conn_tab[idx]; 711 return cp; 712 } 713 ct_read_unlock_bh(idx); 714 } 715 seq->private = NULL; 716 return NULL; 717} 718 719static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) 720{ 721 struct list_head *l = seq->private; 722 723 if (l) 724 ct_read_unlock_bh(l - ip_vs_conn_tab); 725} 726 727static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) 728{ 729 730 if (v == SEQ_START_TOKEN) 731 seq_puts(seq, 732 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); 733 else { 734 const struct ip_vs_conn *cp = v; 735 736 seq_printf(seq, 737 "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n", 738 ip_vs_proto_name(cp->protocol), 739 ntohl(cp->caddr), ntohs(cp->cport), 740 ntohl(cp->vaddr), ntohs(cp->vport), 741 ntohl(cp->daddr), ntohs(cp->dport), 742 ip_vs_state_name(cp->protocol, cp->state), 743 (cp->timer.expires-jiffies)/HZ); 744 } 745 return 0; 746} 747 748static struct seq_operations ip_vs_conn_seq_ops = { 749 .start = ip_vs_conn_seq_start, 750 .next = ip_vs_conn_seq_next, 751 .stop = ip_vs_conn_seq_stop, 752 .show = ip_vs_conn_seq_show, 753}; 754 755static int ip_vs_conn_open(struct inode *inode, struct file *file) 756{ 757 return seq_open(file, &ip_vs_conn_seq_ops); 758} 759 760static const struct file_operations ip_vs_conn_fops = { 761 .owner = THIS_MODULE, 762 .open = ip_vs_conn_open, 763 .read = seq_read, 764 .llseek = seq_lseek, 765 .release = seq_release, 766}; 767#endif 768 769 770/* 771 * Randomly drop connection entries before running out of memory 772 */ 773static inline int todrop_entry(struct ip_vs_conn *cp) 774{ 775 /* 776 * The drop rate array needs tuning for real environments. 777 * Called from timer bh only => no locking 778 */ 779 static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; 780 static char todrop_counter[9] = {0}; 781 int i; 782 783 /* if the conn entry hasn't lasted for 60 seconds, don't drop it. 784 This will leave enough time for normal connection to get 785 through. */ 786 if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) 787 return 0; 788 789 /* Don't drop the entry if its number of incoming packets is not 790 located in [0, 8] */ 791 i = atomic_read(&cp->in_pkts); 792 if (i > 8 || i < 0) return 0; 793 794 if (!todrop_rate[i]) return 0; 795 if (--todrop_counter[i] > 0) return 0; 796 797 todrop_counter[i] = todrop_rate[i]; 798 return 1; 799} 800 801/* Called from keventd and must protect itself from softirqs */ 802void ip_vs_random_dropentry(void) 803{ 804 int idx; 805 struct ip_vs_conn *cp; 806 807 /* 808 * Randomly scan 1/32 of the whole table every second 809 */ 810 for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { 811 unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; 812 813 /* 814 * Lock is actually needed in this loop. 815 */ 816 ct_write_lock_bh(hash); 817 818 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 819 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 820 /* connection template */ 821 continue; 822 823 if (cp->protocol == IPPROTO_TCP) { 824 switch(cp->state) { 825 case IP_VS_TCP_S_SYN_RECV: 826 case IP_VS_TCP_S_SYNACK: 827 break; 828 829 case IP_VS_TCP_S_ESTABLISHED: 830 if (todrop_entry(cp)) 831 break; 832 continue; 833 834 default: 835 continue; 836 } 837 } else { 838 if (!todrop_entry(cp)) 839 continue; 840 } 841 842 IP_VS_DBG(4, "del connection\n"); 843 ip_vs_conn_expire_now(cp); 844 if (cp->control) { 845 IP_VS_DBG(4, "del conn template\n"); 846 ip_vs_conn_expire_now(cp->control); 847 } 848 } 849 ct_write_unlock_bh(hash); 850 } 851} 852 853 854/* 855 * Flush all the connection entries in the ip_vs_conn_tab 856 */ 857static void ip_vs_conn_flush(void) 858{ 859 int idx; 860 struct ip_vs_conn *cp; 861 862 flush_again: 863 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { 864 /* 865 * Lock is actually needed in this loop. 866 */ 867 ct_write_lock_bh(idx); 868 869 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 870 871 IP_VS_DBG(4, "del connection\n"); 872 ip_vs_conn_expire_now(cp); 873 if (cp->control) { 874 IP_VS_DBG(4, "del conn template\n"); 875 ip_vs_conn_expire_now(cp->control); 876 } 877 } 878 ct_write_unlock_bh(idx); 879 } 880 881 /* the counter may be not NULL, because maybe some conn entries 882 are run by slow timer handler or unhashed but still referred */ 883 if (atomic_read(&ip_vs_conn_count) != 0) { 884 schedule(); 885 goto flush_again; 886 } 887} 888 889 890int ip_vs_conn_init(void) 891{ 892 int idx; 893 894 /* 895 * Allocate the connection hash table and initialize its list heads 896 */ 897 ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); 898 if (!ip_vs_conn_tab) 899 return -ENOMEM; 900 901 /* Allocate ip_vs_conn slab cache */ 902 ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", 903 sizeof(struct ip_vs_conn), 0, 904 SLAB_HWCACHE_ALIGN, NULL, NULL); 905 if (!ip_vs_conn_cachep) { 906 vfree(ip_vs_conn_tab); 907 return -ENOMEM; 908 } 909 910 IP_VS_INFO("Connection hash table configured " 911 "(size=%d, memory=%ldKbytes)\n", 912 IP_VS_CONN_TAB_SIZE, 913 (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); 914 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", 915 sizeof(struct ip_vs_conn)); 916 917 for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { 918 INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); 919 } 920 921 for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { 922 rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); 923 } 924 925 proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops); 926 927 /* calculate the random value for connection hash */ 928 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); 929 930 return 0; 931} 932 933 934void ip_vs_conn_cleanup(void) 935{ 936 /* flush all the connection entries first */ 937 ip_vs_conn_flush(); 938 939 /* Release the empty cache */ 940 kmem_cache_destroy(ip_vs_conn_cachep); 941 proc_net_remove("ip_vs_conn"); 942 vfree(ip_vs_conn_tab); 943} 944