/*
 * IPVS: Locality-Based Least-Connection scheduling module
 *
 * Version:     $Id: ip_vs_lblc.c,v 1.1.1.1 2007/08/03 18:53:51 Exp $
 *
 * Authors:     Wensong Zhang <wensong@gnuchina.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Changes:
 *     Martin Hamilton         :    fixed the terrible locking bugs
 *                                   *lock(tbl->lock) ==> *lock(&tbl->lock)
 *     Wensong Zhang           :    fixed the uninitialized tbl->lock bug
 *     Wensong Zhang           :    added doing full expiration check to
 *                                   collect stale entries of 24+ hours when
 *                                   no partial expire check in a half hour
 *     Julian Anastasov        :    replaced del_timer call with del_timer_sync
 *                                   to avoid the possible race between timer
 *                                   handler and del_timer thread in SMP
 *
 */

/*
 * The lblc algorithm is as follows (pseudo code):
 *
 *       if cachenode[dest_ip] is null then
 *               n, cachenode[dest_ip] <- {weighted least-conn node};
 *       else
 *               n <- cachenode[dest_ip];
 *               if (n is dead) OR
 *                  (n.conns>n.weight AND
 *                   there is a node m with m.conns<m.weight/2) then
 *                 n, cachenode[dest_ip] <- {weighted least-conn node};
 *
 *       return n;
 *
 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
 * me to write this module.
 */

#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/jiffies.h>

/* for sysctl */
#include <linux/fs.h>
#include <linux/sysctl.h>

#include <net/ip_vs.h>


/*
 * It is for garbage collection of stale IPVS lblc entries,
 * when the table is full.
 */
#define CHECK_EXPIRE_INTERVAL   (60*HZ)
#define ENTRY_TIMEOUT           (6*60*HZ)

/*
 * It is for full expiration check.
66 * When there is no partial expiration check (garbage collection) 67 * in a half hour, do a full expiration check to collect stale 68 * entries that haven't been touched for a day. 69 */ 70#define COUNT_FOR_FULL_EXPIRATION 30 71static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; 72 73 74/* 75 * for IPVS lblc entry hash table 76 */ 77#ifndef CONFIG_IP_VS_LBLC_TAB_BITS 78#define CONFIG_IP_VS_LBLC_TAB_BITS 10 79#endif 80#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS 81#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) 82#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) 83 84 85/* 86 * IPVS lblc entry represents an association between destination 87 * IP address and its destination server 88 */ 89struct ip_vs_lblc_entry { 90 struct list_head list; 91 __be32 addr; /* destination IP address */ 92 struct ip_vs_dest *dest; /* real server (cache) */ 93 unsigned long lastuse; /* last used time */ 94}; 95 96 97/* 98 * IPVS lblc hash table 99 */ 100struct ip_vs_lblc_table { 101 rwlock_t lock; /* lock for this table */ 102 struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ 103 atomic_t entries; /* number of entries */ 104 int max_size; /* maximum size of entries */ 105 struct timer_list periodic_timer; /* collect stale entries */ 106 int rover; /* rover for expire check */ 107 int counter; /* counter for no expire */ 108}; 109 110 111/* 112 * IPVS LBLC sysctl table 113 */ 114 115static ctl_table vs_vars_table[] = { 116 { 117 .ctl_name = NET_IPV4_VS_LBLC_EXPIRE, 118 .procname = "lblc_expiration", 119 .data = &sysctl_ip_vs_lblc_expiration, 120 .maxlen = sizeof(int), 121 .mode = 0644, 122 .proc_handler = &proc_dointvec_jiffies, 123 }, 124 { .ctl_name = 0 } 125}; 126 127static ctl_table vs_table[] = { 128 { 129 .ctl_name = NET_IPV4_VS, 130 .procname = "vs", 131 .mode = 0555, 132 .child = vs_vars_table 133 }, 134 { .ctl_name = 0 } 135}; 136 137static ctl_table ipvs_ipv4_table[] = { 138 { 139 .ctl_name = NET_IPV4, 140 .procname = "ipv4", 141 .mode = 
0555, 142 .child = vs_table 143 }, 144 { .ctl_name = 0 } 145}; 146 147static ctl_table lblc_root_table[] = { 148 { 149 .ctl_name = CTL_NET, 150 .procname = "net", 151 .mode = 0555, 152 .child = ipvs_ipv4_table 153 }, 154 { .ctl_name = 0 } 155}; 156 157static struct ctl_table_header * sysctl_header; 158 159/* 160 * new/free a ip_vs_lblc_entry, which is a mapping of a destionation 161 * IP address to a server. 162 */ 163static inline struct ip_vs_lblc_entry * 164ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest) 165{ 166 struct ip_vs_lblc_entry *en; 167 168 en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC); 169 if (en == NULL) { 170 IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); 171 return NULL; 172 } 173 174 INIT_LIST_HEAD(&en->list); 175 en->addr = daddr; 176 177 atomic_inc(&dest->refcnt); 178 en->dest = dest; 179 180 return en; 181} 182 183 184static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) 185{ 186 list_del(&en->list); 187 /* 188 * We don't kfree dest because it is refered either by its service 189 * or the trash dest list. 190 */ 191 atomic_dec(&en->dest->refcnt); 192 kfree(en); 193} 194 195 196/* 197 * Returns hash value for IPVS LBLC entry 198 */ 199static inline unsigned ip_vs_lblc_hashkey(__be32 addr) 200{ 201 return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; 202} 203 204 205/* 206 * Hash an entry in the ip_vs_lblc_table. 207 * returns bool success. 
 */
static int
ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
{
	unsigned hash;

	/* An entry already on a list was hashed before: refuse and warn. */
	if (!list_empty(&en->list)) {
		IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
			  "called from %p\n", __builtin_return_address(0));
		return 0;
	}

	/*
	 * Hash by destination IP address
	 */
	hash = ip_vs_lblc_hashkey(en->addr);

	/* Insert under the table write lock; entry count kept in sync. */
	write_lock(&tbl->lock);
	list_add(&en->list, &tbl->bucket[hash]);
	atomic_inc(&tbl->entries);
	write_unlock(&tbl->lock);

	return 1;
}


/*
 * Get ip_vs_lblc_entry associated with supplied parameters.
 *
 * NOTE(review): the entry pointer is returned after the read lock is
 * dropped; presumably callers rely on the GC timer's coarse timeouts
 * to keep it alive — verify against the expire paths.
 */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
{
	unsigned hash;
	struct ip_vs_lblc_entry *en;

	hash = ip_vs_lblc_hashkey(addr);

	read_lock(&tbl->lock);

	list_for_each_entry(en, &tbl->bucket[hash], list) {
		if (en->addr == addr) {
			/* HIT */
			read_unlock(&tbl->lock);
			return en;
		}
	}

	read_unlock(&tbl->lock);

	return NULL;
}


/*
 * Flush all the entries of the specified table.
 */
static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
{
	int i;
	struct ip_vs_lblc_entry *en, *nxt;

	/* Free every entry, bucket by bucket, under the write lock. */
	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
		write_lock(&tbl->lock);
		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
			ip_vs_lblc_free(en);
			atomic_dec(&tbl->entries);
		}
		write_unlock(&tbl->lock);
	}
}


/*
 * Full expiration sweep: walk the whole table once, starting after the
 * rover, and drop every entry not used within sysctl_ip_vs_lblc_expiration
 * (default 24h).  Called from the timer when no partial GC ran recently.
 */
static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
{
	unsigned long now = jiffies;
	int i, j;
	struct ip_vs_lblc_entry *en, *nxt;

	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
		/* j wraps around the table; rover resumes here next time */
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		write_lock(&tbl->lock);
		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
			if (time_before(now,
					en->lastuse + sysctl_ip_vs_lblc_expiration))
				continue;

			ip_vs_lblc_free(en);
			atomic_dec(&tbl->entries);
		}
		write_unlock(&tbl->lock);
	}
	tbl->rover = j;
}


/*
 * Periodic timer handler (every CHECK_EXPIRE_INTERVAL): does a full
 * sweep every COUNT_FOR_FULL_EXPIRATION ticks, otherwise a partial GC
 * of ENTRY_TIMEOUT-stale entries, but only when the table is over
 * max_size.  Always rearms itself.
 */
static void ip_vs_lblc_check_expire(unsigned long data)
{
	struct ip_vs_lblc_table *tbl;
	unsigned long now = jiffies;
	int goal;
	int i, j;
	struct ip_vs_lblc_entry *en, *nxt;

	/* data carries the table pointer set up in ip_vs_lblc_init_svc */
	tbl = (struct ip_vs_lblc_table *)data;

	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
		/* do full expiration check */
		ip_vs_lblc_full_check(tbl);
		tbl->counter = 1;
		goto out;
	}

	/* Under the size limit: nothing to collect this round. */
	if (atomic_read(&tbl->entries) <= tbl->max_size) {
		tbl->counter++;
		goto out;
	}

	/* Aim to free a bit more than the overshoot, capped at half the table. */
	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
	if (goal > tbl->max_size/2)
		goal = tbl->max_size/2;

	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		write_lock(&tbl->lock);
		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
				continue;

			ip_vs_lblc_free(en);
			atomic_dec(&tbl->entries);
			goal--;
		}
		write_unlock(&tbl->lock);
		if (goal <= 0)
			break;
	}
	tbl->rover = j;

  out:
	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
}


/*
 * Scheduler init_service hook: allocate and initialize the per-service
 * LBLC hash table and start the GC timer.
 */
static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_lblc_table *tbl;

	/*
	 *    Allocate the ip_vs_lblc_table for this service
	 */
	tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
	if (tbl == NULL) {
		IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
		return -ENOMEM;
	}
	svc->sched_data = tbl;
	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
		  "current service\n",
		  sizeof(struct ip_vs_lblc_table));

	/*
	 *    Initialize the hash buckets
	 */
	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
		INIT_LIST_HEAD(&tbl->bucket[i]);
	}
	rwlock_init(&tbl->lock);
	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
	tbl->rover = 0;
	tbl->counter = 1;

	/*
	 *    Hook periodic timer for garbage collection
	 */
	init_timer(&tbl->periodic_timer);
	tbl->periodic_timer.data = (unsigned long)tbl;
	tbl->periodic_timer.function = ip_vs_lblc_check_expire;
	tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
	add_timer(&tbl->periodic_timer);

	return 0;
}


/*
 * Scheduler done_service hook: stop the GC timer (synchronously, so no
 * handler can still run), flush all entries and free the table.
 */
static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;

	/* remove periodic timer */
	del_timer_sync(&tbl->periodic_timer);

	/* got to clean up table entries here */
	ip_vs_lblc_flush(tbl);

	/* release the table itself */
	kfree(svc->sched_data);
	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
		  sizeof(struct ip_vs_lblc_table));

	return 0;
}


/* update_service hook: nothing to refresh for LBLC. */
static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
{
	return 0;
}


/*
 * Weighted least-connection selection among the service's destinations;
 * used both for cache misses and for re-assignment of overloaded nodes.
 * Returns NULL when no destination has a positive weight.
 */
static inline struct ip_vs_dest *
__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	/*
	 * We think the overhead of processing active connections is fifty
	 * times higher than that of inactive connections in average. (This
	 * fifty times might not be accurate, we will change it later.) We
	 * use the following formula to estimate the overhead:
	 *		  dest->activeconns*50 + dest->inactconns
	 * and the load:
	 *		  (dest overhead) / dest->weight
	 *
	 * Remember -- no floats in kernel mode!!!
	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
	 *		  h1/w1 > h2/w2
	 * if every weight is larger than zero.
	 *
	 * The server with weight=0 is quiesced and will not receive any
	 * new connection.
	 */
	list_for_each_entry(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;
		if (atomic_read(&dest->weight) > 0) {
			least = dest;
			loh = atomic_read(&least->activeconns) * 50
				+ atomic_read(&least->inactconns);
			goto nextstage;
		}
	}
	return NULL;

	/*
	 *    Find the destination with the least load.
	 */
  nextstage:
	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = atomic_read(&dest->activeconns) * 50
			+ atomic_read(&dest->inactconns);
		/* cross-multiplied comparison: loh/lweight > doh/dweight */
		if (loh * atomic_read(&dest->weight) >
		    doh * atomic_read(&least->weight)) {
			least = dest;
			loh = doh;
		}
	}

	IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
		  "activeconns %d refcnt %d weight %d overhead %d\n",
		  NIPQUAD(least->addr), ntohs(least->port),
		  atomic_read(&least->activeconns),
		  atomic_read(&least->refcnt),
		  atomic_read(&least->weight), loh);

	return least;
}


/*
 *   If this destination server is overloaded and there is a less loaded
 *   server, then return true.
486 */ 487static inline int 488is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) 489{ 490 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { 491 struct ip_vs_dest *d; 492 493 list_for_each_entry(d, &svc->destinations, n_list) { 494 if (atomic_read(&d->activeconns)*2 495 < atomic_read(&d->weight)) { 496 return 1; 497 } 498 } 499 } 500 return 0; 501} 502 503 504/* 505 * Locality-Based (weighted) Least-Connection scheduling 506 */ 507static struct ip_vs_dest * 508ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) 509{ 510 struct ip_vs_dest *dest; 511 struct ip_vs_lblc_table *tbl; 512 struct ip_vs_lblc_entry *en; 513 struct iphdr *iph = ip_hdr(skb); 514 515 IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); 516 517 tbl = (struct ip_vs_lblc_table *)svc->sched_data; 518 en = ip_vs_lblc_get(tbl, iph->daddr); 519 if (en == NULL) { 520 dest = __ip_vs_wlc_schedule(svc, iph); 521 if (dest == NULL) { 522 IP_VS_DBG(1, "no destination available\n"); 523 return NULL; 524 } 525 en = ip_vs_lblc_new(iph->daddr, dest); 526 if (en == NULL) { 527 return NULL; 528 } 529 ip_vs_lblc_hash(tbl, en); 530 } else { 531 dest = en->dest; 532 if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) 533 || atomic_read(&dest->weight) <= 0 534 || is_overloaded(dest, svc)) { 535 dest = __ip_vs_wlc_schedule(svc, iph); 536 if (dest == NULL) { 537 IP_VS_DBG(1, "no destination available\n"); 538 return NULL; 539 } 540 atomic_dec(&en->dest->refcnt); 541 atomic_inc(&dest->refcnt); 542 en->dest = dest; 543 } 544 } 545 en->lastuse = jiffies; 546 547 IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " 548 "--> server %u.%u.%u.%u:%d\n", 549 NIPQUAD(en->addr), 550 NIPQUAD(dest->addr), 551 ntohs(dest->port)); 552 553 return dest; 554} 555 556 557/* 558 * IPVS LBLC Scheduler structure 559 */ 560static struct ip_vs_scheduler ip_vs_lblc_scheduler = 561{ 562 .name = "lblc", 563 .refcnt = ATOMIC_INIT(0), 564 .module = THIS_MODULE, 565 .init_service = 
ip_vs_lblc_init_svc, 566 .done_service = ip_vs_lblc_done_svc, 567 .update_service = ip_vs_lblc_update_svc, 568 .schedule = ip_vs_lblc_schedule, 569}; 570 571 572static int __init ip_vs_lblc_init(void) 573{ 574 INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list); 575 sysctl_header = register_sysctl_table(lblc_root_table); 576 return register_ip_vs_scheduler(&ip_vs_lblc_scheduler); 577} 578 579 580static void __exit ip_vs_lblc_cleanup(void) 581{ 582 unregister_sysctl_table(sysctl_header); 583 unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); 584} 585 586 587module_init(ip_vs_lblc_init); 588module_exit(ip_vs_lblc_cleanup); 589MODULE_LICENSE("GPL"); 590