1/*
2 * IPVS:        Locality-Based Least-Connection scheduling module
3 *
4 * Version:     $Id: ip_vs_lblc.c,v 1.1.1.1 2007/08/03 18:53:51 Exp $
5 *
6 * Authors:     Wensong Zhang <wensong@gnuchina.org>
7 *
8 *              This program is free software; you can redistribute it and/or
9 *              modify it under the terms of the GNU General Public License
10 *              as published by the Free Software Foundation; either version
11 *              2 of the License, or (at your option) any later version.
12 *
13 * Changes:
14 *     Martin Hamilton         :    fixed the terrible locking bugs
15 *                                   *lock(tbl->lock) ==> *lock(&tbl->lock)
 *     Wensong Zhang           :    fixed the uninitialized tbl->lock bug
17 *     Wensong Zhang           :    added doing full expiration check to
18 *                                   collect stale entries of 24+ hours when
19 *                                   no partial expire check in a half hour
20 *     Julian Anastasov        :    replaced del_timer call with del_timer_sync
21 *                                   to avoid the possible race between timer
22 *                                   handler and del_timer thread in SMP
23 *
24 */
25
26/*
27 * The lblc algorithm is as follows (pseudo code):
28 *
29 *       if cachenode[dest_ip] is null then
30 *               n, cachenode[dest_ip] <- {weighted least-conn node};
31 *       else
32 *               n <- cachenode[dest_ip];
33 *               if (n is dead) OR
34 *                  (n.conns>n.weight AND
35 *                   there is a node m with m.conns<m.weight/2) then
36 *                 n, cachenode[dest_ip] <- {weighted least-conn node};
37 *
38 *       return n;
39 *
40 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
41 * me to write this module.
42 */
43
44#include <linux/ip.h>
45#include <linux/module.h>
46#include <linux/kernel.h>
47#include <linux/skbuff.h>
48#include <linux/jiffies.h>
49
50/* for sysctl */
51#include <linux/fs.h>
52#include <linux/sysctl.h>
53
54#include <net/ip_vs.h>
55
56
57/*
58 *    It is for garbage collection of stale IPVS lblc entries,
59 *    when the table is full.
60 */
#define CHECK_EXPIRE_INTERVAL   (60*HZ)
#define ENTRY_TIMEOUT           (6*60*HZ)

/*
 *    It is for full expiration check.
 *    When there is no partial expiration check (garbage collection)
 *    in a half hour, do a full expiration check to collect stale
 *    entries that haven't been touched for a day.
 */
#define COUNT_FOR_FULL_EXPIRATION   30
/* full-expiration age threshold in jiffies; tunable via
 * the net.ipv4.vs.lblc_expiration sysctl below */
static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;


/*
 *     for IPVS lblc entry hash table
 */
#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
#define CONFIG_IP_VS_LBLC_TAB_BITS      10
#endif
#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
/* table size is a power of two so the mask selects a bucket index */
#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)
#define IP_VS_LBLC_TAB_MASK     (IP_VS_LBLC_TAB_SIZE - 1)
83
84
85/*
86 *      IPVS lblc entry represents an association between destination
87 *      IP address and its destination server
88 */
struct ip_vs_lblc_entry {
	struct list_head        list;           /* linkage in a hash bucket */
	__be32                  addr;           /* destination IP address (hash key) */
	struct ip_vs_dest       *dest;          /* real server (cache); holds a refcnt */
	unsigned long           lastuse;        /* last used time, in jiffies */
};
95
96
97/*
98 *      IPVS lblc hash table
99 */
struct ip_vs_lblc_table {
	rwlock_t	        lock;           /* lock for this table */
	struct list_head        bucket[IP_VS_LBLC_TAB_SIZE];  /* hash buckets */
	atomic_t                entries;        /* current number of entries */
	int                     max_size;       /* soft cap before GC kicks in */
	struct timer_list       periodic_timer; /* collect stale entries */
	int                     rover;          /* last bucket swept by expire check */
	int                     counter;        /* rounds since last full expiration */
};
109
110
111/*
112 *      IPVS LBLC sysctl table
113 */
114
/* leaf sysctl: net.ipv4.vs.lblc_expiration (value stored in jiffies) */
static ctl_table vs_vars_table[] = {
	{
		.ctl_name	= NET_IPV4_VS_LBLC_EXPIRE,
		.procname	= "lblc_expiration",
		.data		= &sysctl_ip_vs_lblc_expiration,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{ .ctl_name = 0 }
};
126
/* directory entries building the net/ipv4/vs/ sysctl path for the leaf above */
static ctl_table vs_table[] = {
	{
		.ctl_name	= NET_IPV4_VS,
		.procname	= "vs",
		.mode		= 0555,
		.child		= vs_vars_table
	},
	{ .ctl_name = 0 }
};

static ctl_table ipvs_ipv4_table[] = {
	{
		.ctl_name	= NET_IPV4,
		.procname	= "ipv4",
		.mode		= 0555,
		.child		= vs_table
	},
	{ .ctl_name = 0 }
};

static ctl_table lblc_root_table[] = {
	{
		.ctl_name	= CTL_NET,
		.procname	= "net",
		.mode		= 0555,
		.child		= ipvs_ipv4_table
	},
	{ .ctl_name = 0 }
};

/* handle returned by register_sysctl_table(), needed for unregistration */
static struct ctl_table_header * sysctl_header;
158
159/*
 *      new/free an ip_vs_lblc_entry, which is a mapping of a destination
161 *      IP address to a server.
162 */
163static inline struct ip_vs_lblc_entry *
164ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest)
165{
166	struct ip_vs_lblc_entry *en;
167
168	en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
169	if (en == NULL) {
170		IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
171		return NULL;
172	}
173
174	INIT_LIST_HEAD(&en->list);
175	en->addr = daddr;
176
177	atomic_inc(&dest->refcnt);
178	en->dest = dest;
179
180	return en;
181}
182
183
184static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
185{
186	list_del(&en->list);
187	/*
188	 * We don't kfree dest because it is refered either by its service
189	 * or the trash dest list.
190	 */
191	atomic_dec(&en->dest->refcnt);
192	kfree(en);
193}
194
195
196/*
197 *	Returns hash value for IPVS LBLC entry
198 */
199static inline unsigned ip_vs_lblc_hashkey(__be32 addr)
200{
201	return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
202}
203
204
205/*
206 *	Hash an entry in the ip_vs_lblc_table.
207 *	returns bool success.
208 */
209static int
210ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
211{
212	unsigned hash;
213
214	if (!list_empty(&en->list)) {
215		IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
216			  "called from %p\n", __builtin_return_address(0));
217		return 0;
218	}
219
220	/*
221	 *	Hash by destination IP address
222	 */
223	hash = ip_vs_lblc_hashkey(en->addr);
224
225	write_lock(&tbl->lock);
226	list_add(&en->list, &tbl->bucket[hash]);
227	atomic_inc(&tbl->entries);
228	write_unlock(&tbl->lock);
229
230	return 1;
231}
232
233
234/*
235 *  Get ip_vs_lblc_entry associated with supplied parameters.
236 */
237static inline struct ip_vs_lblc_entry *
238ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
239{
240	unsigned hash;
241	struct ip_vs_lblc_entry *en;
242
243	hash = ip_vs_lblc_hashkey(addr);
244
245	read_lock(&tbl->lock);
246
247	list_for_each_entry(en, &tbl->bucket[hash], list) {
248		if (en->addr == addr) {
249			/* HIT */
250			read_unlock(&tbl->lock);
251			return en;
252		}
253	}
254
255	read_unlock(&tbl->lock);
256
257	return NULL;
258}
259
260
261/*
262 *      Flush all the entries of the specified table.
263 */
264static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
265{
266	int i;
267	struct ip_vs_lblc_entry *en, *nxt;
268
269	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
270		write_lock(&tbl->lock);
271		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
272			ip_vs_lblc_free(en);
273			atomic_dec(&tbl->entries);
274		}
275		write_unlock(&tbl->lock);
276	}
277}
278
279
280static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
281{
282	unsigned long now = jiffies;
283	int i, j;
284	struct ip_vs_lblc_entry *en, *nxt;
285
286	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
287		j = (j + 1) & IP_VS_LBLC_TAB_MASK;
288
289		write_lock(&tbl->lock);
290		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
291			if (time_before(now,
292					en->lastuse + sysctl_ip_vs_lblc_expiration))
293				continue;
294
295			ip_vs_lblc_free(en);
296			atomic_dec(&tbl->entries);
297		}
298		write_unlock(&tbl->lock);
299	}
300	tbl->rover = j;
301}
302
303
/*
 * Periodic timer handler: garbage-collect stale cache entries.
 * Runs every CHECK_EXPIRE_INTERVAL.  Normally it only collects when
 * the table has grown past max_size; every COUNT_FOR_FULL_EXPIRATION
 * rounds without that happening, it falls back to a full sweep so
 * day-old entries still get reclaimed.
 */
static void ip_vs_lblc_check_expire(unsigned long data)
{
	struct ip_vs_lblc_table *tbl;
	unsigned long now = jiffies;
	int goal;
	int i, j;
	struct ip_vs_lblc_entry *en, *nxt;

	tbl = (struct ip_vs_lblc_table *)data;

	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
		/* do full expiration check */
		ip_vs_lblc_full_check(tbl);
		tbl->counter = 1;
		goto out;
	}

	/* table still below the soft cap: nothing to collect this round */
	if (atomic_read(&tbl->entries) <= tbl->max_size) {
		tbl->counter++;
		goto out;
	}

	/* aim to free a third more than the overshoot, capped at half
	 * the nominal table size */
	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
	if (goal > tbl->max_size/2)
		goal = tbl->max_size/2;

	/* sweep buckets round-robin, resuming where the last run stopped */
	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		write_lock(&tbl->lock);
		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
			/* entries used within ENTRY_TIMEOUT survive */
			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
				continue;

			ip_vs_lblc_free(en);
			atomic_dec(&tbl->entries);
			goal--;
		}
		write_unlock(&tbl->lock);
		if (goal <= 0)
			break;
	}
	tbl->rover = j;

  out:
	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
}
351
352
353static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
354{
355	int i;
356	struct ip_vs_lblc_table *tbl;
357
358	/*
359	 *    Allocate the ip_vs_lblc_table for this service
360	 */
361	tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
362	if (tbl == NULL) {
363		IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
364		return -ENOMEM;
365	}
366	svc->sched_data = tbl;
367	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
368		  "current service\n",
369		  sizeof(struct ip_vs_lblc_table));
370
371	/*
372	 *    Initialize the hash buckets
373	 */
374	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
375		INIT_LIST_HEAD(&tbl->bucket[i]);
376	}
377	rwlock_init(&tbl->lock);
378	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
379	tbl->rover = 0;
380	tbl->counter = 1;
381
382	/*
383	 *    Hook periodic timer for garbage collection
384	 */
385	init_timer(&tbl->periodic_timer);
386	tbl->periodic_timer.data = (unsigned long)tbl;
387	tbl->periodic_timer.function = ip_vs_lblc_check_expire;
388	tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
389	add_timer(&tbl->periodic_timer);
390
391	return 0;
392}
393
394
395static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
396{
397	struct ip_vs_lblc_table *tbl = svc->sched_data;
398
399	/* remove periodic timer */
400	del_timer_sync(&tbl->periodic_timer);
401
402	/* got to clean up table entries here */
403	ip_vs_lblc_flush(tbl);
404
405	/* release the table itself */
406	kfree(svc->sched_data);
407	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
408		  sizeof(struct ip_vs_lblc_table));
409
410	return 0;
411}
412
413
/* Service-update hook: this scheduler keeps no per-update state. */
static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
{
	return 0;
}
418
419
420static inline struct ip_vs_dest *
421__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
422{
423	struct ip_vs_dest *dest, *least;
424	int loh, doh;
425
426	/*
427	 * We think the overhead of processing active connections is fifty
428	 * times higher than that of inactive connections in average. (This
429	 * fifty times might not be accurate, we will change it later.) We
430	 * use the following formula to estimate the overhead:
431	 *                dest->activeconns*50 + dest->inactconns
432	 * and the load:
433	 *                (dest overhead) / dest->weight
434	 *
435	 * Remember -- no floats in kernel mode!!!
436	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
437	 *                h1/w1 > h2/w2
438	 * if every weight is larger than zero.
439	 *
440	 * The server with weight=0 is quiesced and will not receive any
441	 * new connection.
442	 */
443	list_for_each_entry(dest, &svc->destinations, n_list) {
444		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
445			continue;
446		if (atomic_read(&dest->weight) > 0) {
447			least = dest;
448			loh = atomic_read(&least->activeconns) * 50
449				+ atomic_read(&least->inactconns);
450			goto nextstage;
451		}
452	}
453	return NULL;
454
455	/*
456	 *    Find the destination with the least load.
457	 */
458  nextstage:
459	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
460		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
461			continue;
462
463		doh = atomic_read(&dest->activeconns) * 50
464			+ atomic_read(&dest->inactconns);
465		if (loh * atomic_read(&dest->weight) >
466		    doh * atomic_read(&least->weight)) {
467			least = dest;
468			loh = doh;
469		}
470	}
471
472	IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
473		  "activeconns %d refcnt %d weight %d overhead %d\n",
474		  NIPQUAD(least->addr), ntohs(least->port),
475		  atomic_read(&least->activeconns),
476		  atomic_read(&least->refcnt),
477		  atomic_read(&least->weight), loh);
478
479	return least;
480}
481
482
483/*
484 *   If this destination server is overloaded and there is a less loaded
485 *   server, then return true.
486 */
487static inline int
488is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
489{
490	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
491		struct ip_vs_dest *d;
492
493		list_for_each_entry(d, &svc->destinations, n_list) {
494			if (atomic_read(&d->activeconns)*2
495			    < atomic_read(&d->weight)) {
496				return 1;
497			}
498		}
499	}
500	return 0;
501}
502
503
504/*
505 *    Locality-Based (weighted) Least-Connection scheduling
506 */
static struct ip_vs_dest *
ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
	struct ip_vs_dest *dest;
	struct ip_vs_lblc_table *tbl;
	struct ip_vs_lblc_entry *en;
	struct iphdr *iph = ip_hdr(skb);

	IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");

	tbl = (struct ip_vs_lblc_table *)svc->sched_data;
	/*
	 * NOTE(review): the entry is used after ip_vs_lblc_get() drops
	 * tbl->lock, while the expire timer frees entries under the
	 * write lock — confirm what serializes scheduling against the
	 * GC timer here.
	 */
	en = ip_vs_lblc_get(tbl, iph->daddr);
	if (en == NULL) {
		/* cache miss: pick the weighted least-connection server
		 * and remember it for this destination IP */
		dest = __ip_vs_wlc_schedule(svc, iph);
		if (dest == NULL) {
			IP_VS_DBG(1, "no destination available\n");
			return NULL;
		}
		en = ip_vs_lblc_new(iph->daddr, dest);
		if (en == NULL) {
			return NULL;
		}
		ip_vs_lblc_hash(tbl, en);
	} else {
		dest = en->dest;
		/* re-schedule when the cached server is unavailable,
		 * quiesced (weight <= 0), or overloaded while a lighter
		 * peer exists */
		if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
		    || atomic_read(&dest->weight) <= 0
		    || is_overloaded(dest, svc)) {
			dest = __ip_vs_wlc_schedule(svc, iph);
			if (dest == NULL) {
				IP_VS_DBG(1, "no destination available\n");
				return NULL;
			}
			/* swap the cached reference to the new server */
			atomic_dec(&en->dest->refcnt);
			atomic_inc(&dest->refcnt);
			en->dest = dest;
		}
	}
	/* touch the entry so the GC keeps it alive */
	en->lastuse = jiffies;

	IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
		  "--> server %u.%u.%u.%u:%d\n",
		  NIPQUAD(en->addr),
		  NIPQUAD(dest->addr),
		  ntohs(dest->port));

	return dest;
}
555
556
557/*
558 *      IPVS LBLC Scheduler structure
559 */
static struct ip_vs_scheduler ip_vs_lblc_scheduler =
{
	.name =			"lblc",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.init_service =		ip_vs_lblc_init_svc,	/* per-service table setup */
	.done_service =		ip_vs_lblc_done_svc,	/* per-service table teardown */
	.update_service =	ip_vs_lblc_update_svc,	/* no-op for this scheduler */
	.schedule =		ip_vs_lblc_schedule,	/* pick a real server */
};
570
571
572static int __init ip_vs_lblc_init(void)
573{
574	INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
575	sysctl_header = register_sysctl_table(lblc_root_table);
576	return register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
577}
578
579
/* Module exit: remove the sysctl tree and unregister the scheduler. */
static void __exit ip_vs_lblc_cleanup(void)
{
	unregister_sysctl_table(sysctl_header);
	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
}
585
586
/* Module entry/exit hooks and license declaration. */
module_init(ip_vs_lblc_init);
module_exit(ip_vs_lblc_cleanup);
MODULE_LICENSE("GPL");
590