// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * IPVS:        Locality-Based Least-Connection scheduling module
 *
 * Authors:     Wensong Zhang <wensong@gnuchina.org>
 *
 * Changes:
 *     Martin Hamilton         :    fixed the terrible locking bugs
 *                                   *lock(tbl->lock) ==> *lock(&tbl->lock)
 *     Wensong Zhang           :    fixed the uninitialized tbl->lock bug
 *     Wensong Zhang           :    added doing full expiration check to
 *                                   collect stale entries of 24+ hours when
 *                                   no partial expire check in a half hour
 *     Julian Anastasov        :    replaced del_timer call with del_timer_sync
 *                                   to avoid the possible race between timer
 *                                   handler and del_timer thread in SMP
 */

/*
 * The lblc algorithm is as follows (pseudo code):
 *
 *       if cachenode[dest_ip] is null then
 *               n, cachenode[dest_ip] <- {weighted least-conn node};
 *       else
 *               n <- cachenode[dest_ip];
 *               if (n is dead) OR
 *                  (n.conns>n.weight AND
 *                   there is a node m with m.conns<m.weight/2) then
 *                 n, cachenode[dest_ip] <- {weighted least-conn node};
 *
 *       return n;
 *
 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
 * me to write this module.
 */
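
/*
 * The scheduler is selected per virtual service from userspace; a minimal
 * example (assuming the standard ipvsadm tool and a TCP virtual service):
 *
 *	ipvsadm -A -t 192.168.10.1:80 -s lblc
 */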

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/ip.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/jiffies.h>
#include <linux/hash.h>

/* for sysctl */
#include <linux/fs.h>
#include <linux/sysctl.h>

#include <net/ip_vs.h>


/*
 *    Garbage collection of stale IPVS lblc entries is done when the
 *    table has grown past its soft maximum size.
 */
#define CHECK_EXPIRE_INTERVAL   (60*HZ)
#define ENTRY_TIMEOUT           (6*60*HZ)

#define DEFAULT_EXPIRATION	(24*60*60*HZ)

/*
 *    Full expiration check:
 *    when there has been no partial expiration check (garbage collection)
 *    for half an hour, do a full expiration check to collect stale
 *    entries that haven't been touched for a day.
 */
#define COUNT_FOR_FULL_EXPIRATION   30
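/* 30 * CHECK_EXPIRE_INTERVAL (60 seconds) == the half hour mentioned above */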


/*
 *     for IPVS lblc entry hash table
 */
#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
#define CONFIG_IP_VS_LBLC_TAB_BITS      10
#endif
#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)
#define IP_VS_LBLC_TAB_MASK     (IP_VS_LBLC_TAB_SIZE - 1)
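/* with the default of 10 bits this gives 1024 hash buckets */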


/*
 *      IPVS lblc entry represents an association between destination
 *      IP address and its destination server
 */
struct ip_vs_lblc_entry {
	struct hlist_node	list;
	int			af;		/* address family */
	union nf_inet_addr      addr;           /* destination IP address */
	struct ip_vs_dest	*dest;          /* real server (cache) */
	unsigned long           lastuse;        /* last used time */
	struct rcu_head		rcu_head;
};


/*
 *      IPVS lblc hash table
 */
struct ip_vs_lblc_table {
	struct rcu_head		rcu_head;
	struct hlist_head	bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */
	struct timer_list       periodic_timer; /* collect stale entries */
	struct ip_vs_service	*svc;		/* pointer back to service */
	atomic_t                entries;        /* number of entries */
	int                     max_size;       /* maximum size of entries */
	int                     rover;          /* rover for expire check */
	int                     counter;        /* counter for no expire */
	bool			dead;
};


/*
 *      IPVS LBLC sysctl table
 */
#ifdef CONFIG_SYSCTL
static struct ctl_table vs_vars_table[] = {
	{
		.procname	= "lblc_expiration",
		.data		= NULL,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{ }
};
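/*
 * .data is assigned per net namespace in __ip_vs_lblc_init() below; the knob
 * then appears as net.ipv4.vs.lblc_expiration and is read/written in seconds
 * through proc_dointvec_jiffies.
 */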
#endif

static void ip_vs_lblc_rcu_free(struct rcu_head *head)
{
	struct ip_vs_lblc_entry *en = container_of(head,
						   struct ip_vs_lblc_entry,
						   rcu_head);

	ip_vs_dest_put_and_free(en->dest);
	kfree(en);
}

static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en)
{
	hlist_del_rcu(&en->list);
	call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free);
}

/*
 *	Returns hash value for IPVS LBLC entry
 */
static inline unsigned int
ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
{
	__be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		addr_fold = addr->ip6[0]^addr->ip6[1]^
			    addr->ip6[2]^addr->ip6[3];
#endif
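	/* the folded 32-bit address selects one of the IP_VS_LBLC_TAB_SIZE buckets */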
	return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS);
}


/*
 *	Hash an entry into the ip_vs_lblc_table and bump the entry count.
 */
static void
ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
{
	unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr);

	hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
	atomic_inc(&tbl->entries);
}


/* Get ip_vs_lblc_entry associated with supplied parameters. */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
	       const union nf_inet_addr *addr)
{
	unsigned int hash = ip_vs_lblc_hashkey(af, addr);
	struct ip_vs_lblc_entry *en;

	hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
		if (ip_vs_addr_equal(af, &en->addr, addr))
			return en;

	return NULL;
}


/*
 * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
 * address to a server. Called under spin lock.
 */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
	       u16 af, struct ip_vs_dest *dest)
{
	struct ip_vs_lblc_entry *en;

	en = ip_vs_lblc_get(af, tbl, daddr);
	if (en) {
		if (en->dest == dest)
			return en;
		ip_vs_lblc_del(en);
	}
	en = kmalloc(sizeof(*en), GFP_ATOMIC);
	if (!en)
		return NULL;

	en->af = af;
	ip_vs_addr_copy(af, &en->addr, daddr);
	en->lastuse = jiffies;

	ip_vs_dest_hold(dest);
	en->dest = dest;

	ip_vs_lblc_hash(tbl, en);

	return en;
}


/*
 *      Flush all the entries of the specified table.
 */
static void ip_vs_lblc_flush(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	struct ip_vs_lblc_entry *en;
	struct hlist_node *next;
	int i;

	spin_lock_bh(&svc->sched_lock);
	tbl->dead = true;
	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
			ip_vs_lblc_del(en);
			atomic_dec(&tbl->entries);
		}
	}
	spin_unlock_bh(&svc->sched_lock);
}

static int sysctl_lblc_expiration(struct ip_vs_service *svc)
{
#ifdef CONFIG_SYSCTL
	return svc->ipvs->sysctl_lblc_expiration;
#else
	return DEFAULT_EXPIRATION;
#endif
}

static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	struct ip_vs_lblc_entry *en;
	struct hlist_node *next;
	unsigned long now = jiffies;
	int i, j;

	for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		spin_lock(&svc->sched_lock);
		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
			if (time_before(now,
					en->lastuse +
					sysctl_lblc_expiration(svc)))
				continue;

			ip_vs_lblc_del(en);
			atomic_dec(&tbl->entries);
		}
		spin_unlock(&svc->sched_lock);
	}
	tbl->rover = j;
}


/*
 *      Periodic timer handler for the IPVS lblc table.
 *      It is used to collect stale entries when the number of entries
 *      exceeds the maximum size of the table.
 *
 *      Fixme: we probably need a more complicated algorithm to collect
 *             entries that have not been used for a long time even
 *             if the number of entries doesn't exceed the maximum size
 *             of the table.
 *      The full expiration check serves this purpose for now.
 */
static void ip_vs_lblc_check_expire(struct timer_list *t)
{
	struct ip_vs_lblc_table *tbl = from_timer(tbl, t, periodic_timer);
	struct ip_vs_service *svc = tbl->svc;
	unsigned long now = jiffies;
	int goal;
	int i, j;
	struct ip_vs_lblc_entry *en;
	struct hlist_node *next;

	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
		/* do full expiration check */
		ip_vs_lblc_full_check(svc);
		tbl->counter = 1;
		goto out;
	}

	if (atomic_read(&tbl->entries) <= tbl->max_size) {
		tbl->counter++;
		goto out;
	}

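	/*
	 * Aim to free 4/3 of the excess over max_size in this pass, but
	 * never more than half of the soft limit at once.
	 */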
	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
	if (goal > tbl->max_size/2)
		goal = tbl->max_size/2;

	for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		spin_lock(&svc->sched_lock);
		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
				continue;

			ip_vs_lblc_del(en);
			atomic_dec(&tbl->entries);
			goal--;
		}
		spin_unlock(&svc->sched_lock);
		if (goal <= 0)
			break;
	}
	tbl->rover = j;

  out:
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
}


static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_lblc_table *tbl;

	/*
	 *    Allocate the ip_vs_lblc_table for this service
	 */
	tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
	if (tbl == NULL)
		return -ENOMEM;

	svc->sched_data = tbl;
	IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) allocated for "
		  "current service\n", sizeof(*tbl));

	/*
	 *    Initialize the hash buckets
	 */
	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
		INIT_HLIST_HEAD(&tbl->bucket[i]);
	}
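	/* soft limit: on average up to 16 entries per bucket before GC trims */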
	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
	tbl->rover = 0;
	tbl->counter = 1;
	tbl->dead = false;
	tbl->svc = svc;
	atomic_set(&tbl->entries, 0);

	/*
	 *    Hook periodic timer for garbage collection
	 */
	timer_setup(&tbl->periodic_timer, ip_vs_lblc_check_expire, 0);
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);

	return 0;
}


static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;

	/* remove periodic timer */
	timer_shutdown_sync(&tbl->periodic_timer);

	/* got to clean up table entries here */
	ip_vs_lblc_flush(svc);

	/* release the table itself */
	kfree_rcu(tbl, rcu_head);
	IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) released\n",
		  sizeof(*tbl));
}


static inline struct ip_vs_dest *
__ip_vs_lblc_schedule(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	/*
	 * We use the following formula to estimate the load:
	 *                (dest overhead) / dest->weight
	 *
	 * Remember -- no floats in kernel mode!!!
	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
	 *                h1/w1 > h2/w2
	 * if every weight is larger than zero.
	 *
	 * The server with weight=0 is quiesced and will not receive any
	 * new connection.
	 */
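	/*
	 * Worked example: with loh=10, least->weight=2 and doh=12,
	 * dest->weight=3, the check below computes 10*3 > 12*2 (30 > 24),
	 * so dest replaces least: its per-weight overhead 12/3 = 4 is lower
	 * than 10/2 = 5.
	 */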
	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;
		if (atomic_read(&dest->weight) > 0) {
			least = dest;
			loh = ip_vs_dest_conn_overhead(least);
			goto nextstage;
		}
	}
	return NULL;

	/*
	 *    Find the destination with the least load.
	 */
  nextstage:
	list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = ip_vs_dest_conn_overhead(dest);
		if ((__s64)loh * atomic_read(&dest->weight) >
		    (__s64)doh * atomic_read(&least->weight)) {
			least = dest;
			loh = doh;
		}
	}

	IP_VS_DBG_BUF(6, "LBLC: server %s:%d "
		      "activeconns %d refcnt %d weight %d overhead %d\n",
		      IP_VS_DBG_ADDR(least->af, &least->addr),
		      ntohs(least->port),
		      atomic_read(&least->activeconns),
		      refcount_read(&least->refcnt),
		      atomic_read(&least->weight), loh);

	return least;
}


/*
 *   If this destination server is overloaded and there is a less loaded
 *   server, then return true.
 */
static inline int
is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
		struct ip_vs_dest *d;

		list_for_each_entry_rcu(d, &svc->destinations, n_list) {
			if (atomic_read(&d->activeconns)*2
			    < atomic_read(&d->weight)) {
				return 1;
			}
		}
	}
	return 0;
}


/*
 *    Locality-Based (weighted) Least-Connection scheduling
 */
static struct ip_vs_dest *
ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
		    struct ip_vs_iphdr *iph)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	struct ip_vs_dest *dest = NULL;
	struct ip_vs_lblc_entry *en;

	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);

	/* First look in our cache */
	en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr);
	if (en) {
		/* We are only under RCU protection here, but this word-sized store is safe */
		en->lastuse = jiffies;

		/*
		 * If the destination is not available, i.e. it's in the trash,
		 * we must ignore it, as it may be removed from under our feet
		 * if someone drops our reference count. Our caller only makes
		 * sure that destinations that are not in the trash are not
		 * moved to the trash while we are scheduling. But anyone can
		 * free up entries from the trash at any time.
		 */

		dest = en->dest;
		if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
		    atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
			goto out;
	}

	/* No cache entry or it is invalid, time to schedule */
	dest = __ip_vs_lblc_schedule(svc);
	if (!dest) {
		ip_vs_scheduler_err(svc, "no destination available");
		return NULL;
	}

	/* If we fail to create a cache entry, we'll just use the valid dest */
	spin_lock_bh(&svc->sched_lock);
	if (!tbl->dead)
		ip_vs_lblc_new(tbl, &iph->daddr, svc->af, dest);
	spin_unlock_bh(&svc->sched_lock);

out:
	IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
		      IP_VS_DBG_ADDR(svc->af, &iph->daddr),
		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));

	return dest;
}


/*
 *      IPVS LBLC Scheduler structure
 */
static struct ip_vs_scheduler ip_vs_lblc_scheduler = {
	.name =			"lblc",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.n_list =		LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
	.init_service =		ip_vs_lblc_init_svc,
	.done_service =		ip_vs_lblc_done_svc,
	.schedule =		ip_vs_lblc_schedule,
};

/*
 *  per netns init.
 */
#ifdef CONFIG_SYSCTL
static int __net_init __ip_vs_lblc_init(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);
	size_t vars_table_size = ARRAY_SIZE(vs_vars_table);

	if (!ipvs)
		return -ENOENT;

	if (!net_eq(net, &init_net)) {
		ipvs->lblc_ctl_table = kmemdup(vs_vars_table,
						sizeof(vs_vars_table),
						GFP_KERNEL);
		if (ipvs->lblc_ctl_table == NULL)
			return -ENOMEM;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns) {
			ipvs->lblc_ctl_table[0].procname = NULL;
			vars_table_size = 0;
		}

	} else
		ipvs->lblc_ctl_table = vs_vars_table;
	ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION;
	ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;

	ipvs->lblc_ctl_header = register_net_sysctl_sz(net, "net/ipv4/vs",
						       ipvs->lblc_ctl_table,
						       vars_table_size);
	if (!ipvs->lblc_ctl_header) {
		if (!net_eq(net, &init_net))
			kfree(ipvs->lblc_ctl_table);
		return -ENOMEM;
	}

	return 0;
}

static void __net_exit __ip_vs_lblc_exit(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);

	unregister_net_sysctl_table(ipvs->lblc_ctl_header);

	if (!net_eq(net, &init_net))
		kfree(ipvs->lblc_ctl_table);
}

#else

static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; }
static void __net_exit __ip_vs_lblc_exit(struct net *net) { }

#endif

static struct pernet_operations ip_vs_lblc_ops = {
	.init = __ip_vs_lblc_init,
	.exit = __ip_vs_lblc_exit,
};

static int __init ip_vs_lblc_init(void)
{
	int ret;

	ret = register_pernet_subsys(&ip_vs_lblc_ops);
	if (ret)
		return ret;

	ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
	if (ret)
		unregister_pernet_subsys(&ip_vs_lblc_ops);
	return ret;
}

static void __exit ip_vs_lblc_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
	unregister_pernet_subsys(&ip_vs_lblc_ops);
	rcu_barrier();
}


module_init(ip_vs_lblc_init);
module_exit(ip_vs_lblc_cleanup);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("ipvs locality-based least-connection scheduler");
