/*
 * IPVS:	Locality-Based Least-Connection scheduling module
 *
 * Authors:	Wensong Zhang <wensong@gnuchina.org>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Changes:
 *	Martin Hamilton		:	fixed the terrible locking bugs
 *					*lock(tbl->lock) ==> *lock(&tbl->lock)
 *	Wensong Zhang		:	fixed the uninitialized tbl->lock bug
 *	Wensong Zhang		:	added a full expiration check to
 *					collect stale entries of 24+ hours
 *					when no partial expire check ran in
 *					half an hour
 *	Julian Anastasov	:	replaced the del_timer call with
 *					del_timer_sync to avoid a possible
 *					race between the timer handler and
 *					the del_timer thread on SMP
 *
 */

/*
 * The lblc algorithm is as follows (pseudo code):
 *
 *	if cachenode[dest_ip] is null then
 *		n, cachenode[dest_ip] <- {weighted least-conn node};
 *	else
 *		n <- cachenode[dest_ip];
 *		if (n is dead) OR
 *		   (n.conns > n.weight AND
 *		    there is a node m with m.conns < m.weight/2) then
 *			n, cachenode[dest_ip] <- {weighted least-conn node};
 *
 *	return n;
 *
 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
 * me to write this module.
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/ip.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/jiffies.h>

/* for sysctl */
#include <linux/fs.h>
#include <linux/sysctl.h>

#include <net/ip_vs.h>


/*
 * Interval and timeout used for garbage collection of stale IPVS lblc
 * entries when the table is full.
 */
#define CHECK_EXPIRE_INTERVAL	(60*HZ)
#define ENTRY_TIMEOUT		(6*60*HZ)

/*
 * Full expiration check: when there has been no partial expiration
 * check (garbage collection) in half an hour, do a full expiration
 * check to collect stale entries that haven't been touched for a day.
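 * With the defaults, the periodic timer fires every CHECK_EXPIRE_INTERVAL
 * (60 seconds), so COUNT_FOR_FULL_EXPIRATION == 30 partial runs between
 * full checks works out to the half hour mentioned above.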
 */
#define COUNT_FOR_FULL_EXPIRATION	30
static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;


/*
 * for IPVS lblc entry hash table
 */
#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
#define CONFIG_IP_VS_LBLC_TAB_BITS	10
#endif
#define IP_VS_LBLC_TAB_BITS	CONFIG_IP_VS_LBLC_TAB_BITS
#define IP_VS_LBLC_TAB_SIZE	(1 << IP_VS_LBLC_TAB_BITS)
#define IP_VS_LBLC_TAB_MASK	(IP_VS_LBLC_TAB_SIZE - 1)


/*
 * IPVS lblc entry represents an association between a destination
 * IP address and its destination server
 */
struct ip_vs_lblc_entry {
	struct list_head	list;
	int			af;		/* address family */
	union nf_inet_addr	addr;		/* destination IP address */
	struct ip_vs_dest	*dest;		/* real server (cache) */
	unsigned long		lastuse;	/* last used time */
};


/*
 * IPVS lblc hash table
 */
struct ip_vs_lblc_table {
	struct list_head	bucket[IP_VS_LBLC_TAB_SIZE];	/* hash bucket */
	atomic_t		entries;	/* number of entries */
	int			max_size;	/* maximum size of entries */
	struct timer_list	periodic_timer;	/* collect stale entries */
	int			rover;		/* rover for expire check */
	int			counter;	/* counter for no expire */
};


/*
 * IPVS LBLC sysctl table
 */

static ctl_table vs_vars_table[] = {
	{
		.procname	= "lblc_expiration",
		.data		= &sysctl_ip_vs_lblc_expiration,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{ }
};

static struct ctl_table_header *sysctl_header;

static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
{
	list_del(&en->list);
	/*
	 * We don't kfree dest because it is referred to either by its
	 * service or the trash dest list.
	 */
	atomic_dec(&en->dest->refcnt);
	kfree(en);
}


/*
 * Returns hash value for IPVS LBLC entry
 */
static inline unsigned
ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
{
	__be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		addr_fold = addr->ip6[0]^addr->ip6[1]^
			    addr->ip6[2]^addr->ip6[3];
#endif
	return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
}


/*
 * Hash an entry in the ip_vs_lblc_table.
 */
static void
ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
{
	unsigned hash = ip_vs_lblc_hashkey(en->af, &en->addr);

	list_add(&en->list, &tbl->bucket[hash]);
	atomic_inc(&tbl->entries);
}


/*
 * Get the ip_vs_lblc_entry associated with the supplied parameters.
 * Called under read lock.
 */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
	       const union nf_inet_addr *addr)
{
	unsigned hash = ip_vs_lblc_hashkey(af, addr);
	struct ip_vs_lblc_entry *en;

	list_for_each_entry(en, &tbl->bucket[hash], list)
		if (ip_vs_addr_equal(af, &en->addr, addr))
			return en;

	return NULL;
}


/*
 * Create or update an ip_vs_lblc_entry, which is a mapping of a destination
 * IP address to a server. Called under write lock.
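 * "Write lock" here means the caller holds svc->sched_lock for writing
 * (see ip_vs_lblc_schedule()); that is also why the allocation below
 * must use GFP_ATOMIC rather than a sleeping allocation.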
 */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
	       struct ip_vs_dest *dest)
{
	struct ip_vs_lblc_entry *en;

	en = ip_vs_lblc_get(dest->af, tbl, daddr);
	if (!en) {
		en = kmalloc(sizeof(*en), GFP_ATOMIC);
		if (!en) {
			pr_err("%s(): no memory\n", __func__);
			return NULL;
		}

		en->af = dest->af;
		ip_vs_addr_copy(dest->af, &en->addr, daddr);
		en->lastuse = jiffies;

		atomic_inc(&dest->refcnt);
		en->dest = dest;

		ip_vs_lblc_hash(tbl, en);
	} else if (en->dest != dest) {
		atomic_dec(&en->dest->refcnt);
		atomic_inc(&dest->refcnt);
		en->dest = dest;
	}

	return en;
}


/*
 * Flush all the entries of the specified table.
 */
static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
{
	struct ip_vs_lblc_entry *en, *nxt;
	int i;

	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
			ip_vs_lblc_free(en);
			atomic_dec(&tbl->entries);
		}
	}
}


static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	struct ip_vs_lblc_entry *en, *nxt;
	unsigned long now = jiffies;
	int i, j;

	for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		write_lock(&svc->sched_lock);
		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
			if (time_before(now,
					en->lastuse +
					sysctl_ip_vs_lblc_expiration))
				continue;

			ip_vs_lblc_free(en);
			atomic_dec(&tbl->entries);
		}
		write_unlock(&svc->sched_lock);
	}
	tbl->rover = j;
}


static void ip_vs_lblc_check_expire(unsigned long data)
{
	struct ip_vs_service *svc = (struct ip_vs_service *) data;
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	unsigned long now = jiffies;
	int goal;
	int i, j;
	struct ip_vs_lblc_entry *en, *nxt;

	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
		/* do full expiration check */
		ip_vs_lblc_full_check(svc);
		tbl->counter = 1;
		goto out;
	}

	if (atomic_read(&tbl->entries) <= tbl->max_size) {
		tbl->counter++;
		goto out;
	}

	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
	if (goal > tbl->max_size/2)
		goal = tbl->max_size/2;

	for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		write_lock(&svc->sched_lock);
		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
				continue;

			ip_vs_lblc_free(en);
			atomic_dec(&tbl->entries);
			goal--;
		}
		write_unlock(&svc->sched_lock);
		if (goal <= 0)
			break;
	}
	tbl->rover = j;

  out:
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
}


static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_lblc_table *tbl;

	/*
	 * Allocate the ip_vs_lblc_table for this service
	 */
	tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
	if (tbl == NULL) {
		pr_err("%s(): no memory\n", __func__);
		return -ENOMEM;
	}
	svc->sched_data = tbl;
	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
		  "current service\n", sizeof(*tbl));

	/*
	 * Initialize the hash buckets
	 */
	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
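		/* each bucket starts out as an empty chain; entries are
		 * added later by ip_vs_lblc_hash() */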
		INIT_LIST_HEAD(&tbl->bucket[i]);
	}
	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
	tbl->rover = 0;
	tbl->counter = 1;

	/*
	 * Hook periodic timer for garbage collection
	 */
	setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
		    (unsigned long)svc);
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);

	return 0;
}


static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;

	/* remove periodic timer */
	del_timer_sync(&tbl->periodic_timer);

	/* clean up table entries */
	ip_vs_lblc_flush(tbl);

	/* release the table itself */
	kfree(tbl);
	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
		  sizeof(*tbl));

	return 0;
}


static inline struct ip_vs_dest *
__ip_vs_lblc_schedule(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	/*
	 * We assume that processing an active connection costs, on
	 * average, fifty times as much as processing an inactive one.
	 * (This factor of fifty may not be accurate; we may change it
	 * later.) We estimate the overhead as:
	 *		dest->activeconns*50 + dest->inactconns
	 * and the load as:
	 *		(dest overhead) / dest->weight
	 *
	 * Remember -- no floats in kernel mode!!!
	 * The comparison h1*w2 > h2*w1 is equivalent to h1/w1 > h2/w2
	 * if every weight is larger than zero.
	 *
	 * A server with weight=0 is quiesced and will not receive any
	 * new connections.
	 */
	list_for_each_entry(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;
		if (atomic_read(&dest->weight) > 0) {
			least = dest;
			loh = atomic_read(&least->activeconns) * 50
				+ atomic_read(&least->inactconns);
			goto nextstage;
		}
	}
	return NULL;

	/*
	 * Find the destination with the least load.
	 */
  nextstage:
	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = atomic_read(&dest->activeconns) * 50
			+ atomic_read(&dest->inactconns);
		if (loh * atomic_read(&dest->weight) >
		    doh * atomic_read(&least->weight)) {
			least = dest;
			loh = doh;
		}
	}

	IP_VS_DBG_BUF(6, "LBLC: server %s:%d "
		      "activeconns %d refcnt %d weight %d overhead %d\n",
		      IP_VS_DBG_ADDR(least->af, &least->addr),
		      ntohs(least->port),
		      atomic_read(&least->activeconns),
		      atomic_read(&least->refcnt),
		      atomic_read(&least->weight), loh);

	return least;
}


/*
 * If this destination server is overloaded and there is a less loaded
 * server, then return true.
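 * This is the second condition of the pseudo code at the top of this
 * file: n.conns > n.weight, while some node m has m.conns < m.weight/2.
 * Only active connections are counted against the weight here.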
 */
static inline int
is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
		struct ip_vs_dest *d;

		list_for_each_entry(d, &svc->destinations, n_list) {
			if (atomic_read(&d->activeconns)*2
			    < atomic_read(&d->weight)) {
				return 1;
			}
		}
	}
	return 0;
}


/*
 * Locality-Based (weighted) Least-Connection scheduling
 */
static struct ip_vs_dest *
ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	struct ip_vs_iphdr iph;
	struct ip_vs_dest *dest = NULL;
	struct ip_vs_lblc_entry *en;

	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);

	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);

	/* First look in our cache */
	read_lock(&svc->sched_lock);
	en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr);
	if (en) {
		/* We only hold a read lock, but this is atomic */
		en->lastuse = jiffies;

		/*
		 * If the destination is not available, i.e. it's in the
		 * trash, we must ignore it, as it may be removed from under
		 * our feet if someone drops our reference count. Our caller
		 * only makes sure that destinations that are not in the
		 * trash are not moved to the trash while we are scheduling.
		 * But anyone can free up entries from the trash at any time.
		 */
		if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
			dest = en->dest;
	}
	read_unlock(&svc->sched_lock);

	/* If the destination has a weight and is not overloaded, use it */
	if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
		goto out;

	/* No cache entry or it is invalid, time to schedule */
	dest = __ip_vs_lblc_schedule(svc);
	if (!dest) {
		IP_VS_ERR_RL("LBLC: no destination available\n");
		return NULL;
	}

	/* If we fail to create a cache entry, we'll just use the valid dest */
	write_lock(&svc->sched_lock);
	ip_vs_lblc_new(tbl, &iph.daddr, dest);
	write_unlock(&svc->sched_lock);

out:
	IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
		      IP_VS_DBG_ADDR(svc->af, &iph.daddr),
		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));

	return dest;
}


/*
 * IPVS LBLC Scheduler structure
 */
static struct ip_vs_scheduler ip_vs_lblc_scheduler = {
	.name =			"lblc",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.n_list =		LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
	.init_service =		ip_vs_lblc_init_svc,
	.done_service =		ip_vs_lblc_done_svc,
	.schedule =		ip_vs_lblc_schedule,
};


static int __init ip_vs_lblc_init(void)
{
	int ret;

	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
	ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
	if (ret)
		unregister_sysctl_table(sysctl_header);
	return ret;
}


static void __exit ip_vs_lblc_cleanup(void)
{
	unregister_sysctl_table(sysctl_header);
	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
}


module_init(ip_vs_lblc_init);
module_exit(ip_vs_lblc_cleanup);
MODULE_LICENSE("GPL");
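MODULE_DESCRIPTION("ipvs locality-based least-connection scheduler");

/*
 * Usage sketch (illustrative; assumes the ipvsadm userspace tool and
 * made-up addresses):
 *
 *	# create a virtual service that schedules with lblc
 *	ipvsadm -A -t 192.168.0.1:80 -s lblc
 *	# attach a real server with weight 1 (direct routing)
 *	ipvsadm -a -t 192.168.0.1:80 -r 192.168.1.2:80 -g -w 1
 *
 * The expiration time registered above is tunable via
 * /proc/sys/net/ipv4/vs/lblc_expiration; proc_dointvec_jiffies
 * converts between seconds (userspace) and jiffies (kernel).
 */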