/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the Netfilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Peter Kese <peter.kese@ijs.si>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
 * and others. Much of the code here is taken from the IP MASQ code of
 * kernel 2.2.
 *
 * Changes:
 *
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/net.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/proc_fs.h>		/* for proc_net_* */
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/jhash.h>
#include <linux/random.h>

#include <net/net_namespace.h>
#include <net/ip_vs.h>


#ifndef CONFIG_IP_VS_TAB_BITS
#define CONFIG_IP_VS_TAB_BITS	12
#endif

/*
 * Connection hash size. Default is what was selected at compile time.
 */
int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");

/* size and mask values */
int ip_vs_conn_tab_size;
int ip_vs_conn_tab_mask;

/*
 *  Connection hash table: for input and output packets lookups of IPVS
 */
static struct list_head *ip_vs_conn_tab;

/*  SLAB cache for IPVS connections */
static struct kmem_cache *ip_vs_conn_cachep __read_mostly;

/*  counter for current IPVS connections */
static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);

/*  counter for no client port connections */
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);

/* random value for IPVS connection hash */
static unsigned int ip_vs_conn_rnd;

/*
 *  Fine locking granularity for big connection hash table
 */
#define CT_LOCKARRAY_BITS  4
#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)

struct ip_vs_aligned_lock
{
	rwlock_t	l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));

/* lock array for conn table */
static struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;

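/*
 * The lock helpers below take the connection-table hash bucket index as
 * the key; its low CT_LOCKARRAY_BITS bits select one of the
 * CT_LOCKARRAY_SIZE stripe locks, so many buckets share a lock while
 * contention stays low even for large tables.
 */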
static inline void ct_read_lock(unsigned key)
{
	read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_unlock(unsigned key)
{
	read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_lock(unsigned key)
{
	write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_unlock(unsigned key)
{
	write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_lock_bh(unsigned key)
{
	read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_unlock_bh(unsigned key)
{
	read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_lock_bh(unsigned key)
{
	write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_unlock_bh(unsigned key)
{
	write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}


/*
 *	Returns hash value for IPVS connection entry
 */
static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
				       const union nf_inet_addr *addr,
				       __be16 port)
{
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
				    (__force u32)port, proto, ip_vs_conn_rnd)
			& ip_vs_conn_tab_mask;
#endif
	return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
			    ip_vs_conn_rnd)
		& ip_vs_conn_tab_mask;
}


/*
 *	Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
 *	returns bool success.
 */
static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
{
	unsigned hash;
	int ret;

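	/*
	 * One-packet scheduling (IP_VS_CONN_F_ONE_PACKET) connections are
	 * used for a single packet and are never inserted into the table.
	 */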
	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		return 0;

	/* Hash by protocol, client address and port */
	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);

	ct_write_lock(hash);
	spin_lock(&cp->lock);

	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
		list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
		cp->flags |= IP_VS_CONN_F_HASHED;
		atomic_inc(&cp->refcnt);
		ret = 1;
	} else {
		pr_err("%s(): request for already hashed, called from %pF\n",
		       __func__, __builtin_return_address(0));
		ret = 0;
	}

	spin_unlock(&cp->lock);
	ct_write_unlock(hash);

	return ret;
}


/*
 *	UNhashes ip_vs_conn from ip_vs_conn_tab.
 *	returns bool success.
 */
static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
{
	unsigned hash;
	int ret;

	/* unhash it and decrease its reference counter */
	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);

	ct_write_lock(hash);
	spin_lock(&cp->lock);

	if (cp->flags & IP_VS_CONN_F_HASHED) {
		list_del(&cp->c_list);
		cp->flags &= ~IP_VS_CONN_F_HASHED;
		atomic_dec(&cp->refcnt);
		ret = 1;
	} else
		ret = 0;

	spin_unlock(&cp->lock);
	ct_write_unlock(hash);

	return ret;
}


/*
 *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 *  Called for pkts coming from OUTside-to-INside.
 *	s_addr, s_port: pkt source address (foreign host)
 *	d_addr, d_port: pkt dest address (load balancer)
 */
static inline struct ip_vs_conn *__ip_vs_conn_in_get
(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
 const union nf_inet_addr *d_addr, __be16 d_port)
{
	unsigned hash;
	struct ip_vs_conn *cp;

	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);

	ct_read_lock(hash);

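	/*
	 * An entry matches only when the presence of a client port in the
	 * lookup key agrees with the entry: the XOR below rejects entries
	 * whose IP_VS_CONN_F_NO_CPORT flag does not correspond to a zero
	 * s_port.
	 */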
	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
		if (cp->af == af &&
		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
		    ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
		    s_port == cp->cport && d_port == cp->vport &&
		    ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
		    protocol == cp->protocol) {
			/* HIT */
			atomic_inc(&cp->refcnt);
			ct_read_unlock(hash);
			return cp;
		}
	}

	ct_read_unlock(hash);

	return NULL;
}

struct ip_vs_conn *ip_vs_conn_in_get
(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
 const union nf_inet_addr *d_addr, __be16 d_port)
{
	struct ip_vs_conn *cp;

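	/*
	 * If the exact lookup misses and connections without a known client
	 * port exist, retry the lookup with a zero source port.
	 */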
	cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port);
	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
		cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
					 d_port);

	IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
		      ip_vs_proto_name(protocol),
		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
		      cp ? "hit" : "not hit");

	return cp;
}

struct ip_vs_conn *
ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
			struct ip_vs_protocol *pp,
			const struct ip_vs_iphdr *iph,
			unsigned int proto_off, int inverse)
{
	__be16 _ports[2], *pptr;

	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
	if (pptr == NULL)
		return NULL;

	if (likely(!inverse))
		return ip_vs_conn_in_get(af, iph->protocol,
					 &iph->saddr, pptr[0],
					 &iph->daddr, pptr[1]);
	else
		return ip_vs_conn_in_get(af, iph->protocol,
					 &iph->daddr, pptr[1],
					 &iph->saddr, pptr[0]);
}
EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);

/* Get reference to connection template */
struct ip_vs_conn *ip_vs_ct_in_get
(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
 const union nf_inet_addr *d_addr, __be16 d_port)
{
	unsigned hash;
	struct ip_vs_conn *cp;

	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);

	ct_read_lock(hash);

	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
		if (cp->af == af &&
		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
		    /* protocol should only be IPPROTO_IP if
		     * d_addr is a fwmark */
		    ip_vs_addr_equal(protocol == IPPROTO_IP ? AF_UNSPEC : af,
		                     d_addr, &cp->vaddr) &&
		    s_port == cp->cport && d_port == cp->vport &&
		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
		    protocol == cp->protocol) {
			/* HIT */
			atomic_inc(&cp->refcnt);
			goto out;
		}
	}
	cp = NULL;

  out:
	ct_read_unlock(hash);

	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
		      ip_vs_proto_name(protocol),
		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
		      cp ? "hit" : "not hit");

	return cp;
}

/*
 *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 *  Called for pkts coming from inside-to-OUTside.
 *	s_addr, s_port: pkt source address (inside host)
 *	d_addr, d_port: pkt dest address (foreign host)
 */
struct ip_vs_conn *ip_vs_conn_out_get
(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
 const union nf_inet_addr *d_addr, __be16 d_port)
{
	unsigned hash;
	struct ip_vs_conn *cp, *ret=NULL;

	/*
	 *	Check for "full" addressed entries
	 */
	hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);

	ct_read_lock(hash);

	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
		if (cp->af == af &&
		    ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
		    ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
		    d_port == cp->cport && s_port == cp->dport &&
		    protocol == cp->protocol) {
			/* HIT */
			atomic_inc(&cp->refcnt);
			ret = cp;
			break;
		}
	}

	ct_read_unlock(hash);

	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
		      ip_vs_proto_name(protocol),
		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
		      ret ? "hit" : "not hit");

	return ret;
}

struct ip_vs_conn *
ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
			 struct ip_vs_protocol *pp,
			 const struct ip_vs_iphdr *iph,
			 unsigned int proto_off, int inverse)
{
	__be16 _ports[2], *pptr;

	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
	if (pptr == NULL)
		return NULL;

	if (likely(!inverse))
		return ip_vs_conn_out_get(af, iph->protocol,
					  &iph->saddr, pptr[0],
					  &iph->daddr, pptr[1]);
	else
		return ip_vs_conn_out_get(af, iph->protocol,
					  &iph->daddr, pptr[1],
					  &iph->saddr, pptr[0]);
}
EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);

/*
 *      Put back the conn and restart its timer with its timeout
 */
void ip_vs_conn_put(struct ip_vs_conn *cp)
{
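	/* One-packet connections get a zero timeout so they expire at once. */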
	unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
		0 : cp->timeout;
	mod_timer(&cp->timer, jiffies+t);

	__ip_vs_conn_put(cp);
}


/*
 *	Fill a no_client_port connection with a client port number
 */
void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
{
	if (ip_vs_conn_unhash(cp)) {
		spin_lock(&cp->lock);
		if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
			atomic_dec(&ip_vs_conn_no_cport_cnt);
			cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
			cp->cport = cport;
		}
		spin_unlock(&cp->lock);

		/* rehash on the new cport */
		ip_vs_conn_hash(cp);
	}
}


/*
 *	Bind a connection entry with the corresponding packet_xmit.
 *	Called by ip_vs_conn_new.
 */
static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
	switch (IP_VS_FWD_METHOD(cp)) {
	case IP_VS_CONN_F_MASQ:
		cp->packet_xmit = ip_vs_nat_xmit;
		break;

	case IP_VS_CONN_F_TUNNEL:
		cp->packet_xmit = ip_vs_tunnel_xmit;
		break;

	case IP_VS_CONN_F_DROUTE:
		cp->packet_xmit = ip_vs_dr_xmit;
		break;

	case IP_VS_CONN_F_LOCALNODE:
		cp->packet_xmit = ip_vs_null_xmit;
		break;

	case IP_VS_CONN_F_BYPASS:
		cp->packet_xmit = ip_vs_bypass_xmit;
		break;
	}
}

#ifdef CONFIG_IP_VS_IPV6
static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
{
	switch (IP_VS_FWD_METHOD(cp)) {
	case IP_VS_CONN_F_MASQ:
		cp->packet_xmit = ip_vs_nat_xmit_v6;
		break;

	case IP_VS_CONN_F_TUNNEL:
		cp->packet_xmit = ip_vs_tunnel_xmit_v6;
		break;

	case IP_VS_CONN_F_DROUTE:
		cp->packet_xmit = ip_vs_dr_xmit_v6;
		break;

	case IP_VS_CONN_F_LOCALNODE:
		cp->packet_xmit = ip_vs_null_xmit;
		break;

	case IP_VS_CONN_F_BYPASS:
		cp->packet_xmit = ip_vs_bypass_xmit_v6;
		break;
	}
}
#endif


static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
{
	return atomic_read(&dest->activeconns)
		+ atomic_read(&dest->inactconns);
}

/*
 *	Bind a connection entry with a virtual service destination
 *	Called just after a new connection entry is created.
 */
static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
	/* if dest is NULL, then return directly */
	if (!dest)
		return;

	/* Increase the refcnt counter of the dest */
	atomic_inc(&dest->refcnt);

	/* Bind with the destination and its corresponding transmitter */
	if ((cp->flags & IP_VS_CONN_F_SYNC) &&
	    (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
		/* if the connection is not a template and was created
		 * by sync, preserve the activity flag.
		 */
		cp->flags |= atomic_read(&dest->conn_flags) &
			     (~IP_VS_CONN_F_INACTIVE);
	else
		cp->flags |= atomic_read(&dest->conn_flags);
	cp->dest = dest;

	IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
		      "dest->refcnt:%d\n",
		      ip_vs_proto_name(cp->protocol),
		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
		      ip_vs_fwd_tag(cp), cp->state,
		      cp->flags, atomic_read(&cp->refcnt),
		      atomic_read(&dest->refcnt));

	/* Update the connection counters */
	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
		/* It is a normal connection, so increase the inactive
		   connection counter because it is in TCP SYNRECV
		   state (inactive) or another protocol's inactive state */
		if ((cp->flags & IP_VS_CONN_F_SYNC) &&
		    (!(cp->flags & IP_VS_CONN_F_INACTIVE)))
			atomic_inc(&dest->activeconns);
		else
			atomic_inc(&dest->inactconns);
	} else {
		/* It is a persistent connection/template, so increase
		   the persistent connection counter */
		atomic_inc(&dest->persistconns);
	}

	if (dest->u_threshold != 0 &&
	    ip_vs_dest_totalconns(dest) >= dest->u_threshold)
		dest->flags |= IP_VS_DEST_F_OVERLOAD;
}


/*
 * Check if there is a destination for the connection, if so
 * bind the connection to the destination.
 */
struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
{
	struct ip_vs_dest *dest;

	if ((cp) && (!cp->dest)) {
		dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
				       &cp->vaddr, cp->vport,
				       cp->protocol);
		ip_vs_bind_dest(cp, dest);
		return dest;
	} else
		return NULL;
}


/*
 *	Unbind a connection entry with its VS destination
 *	Called by the ip_vs_conn_expire function.
 */
static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
{
	struct ip_vs_dest *dest = cp->dest;

	if (!dest)
		return;

	IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
		      "dest->refcnt:%d\n",
		      ip_vs_proto_name(cp->protocol),
		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
		      ip_vs_fwd_tag(cp), cp->state,
		      cp->flags, atomic_read(&cp->refcnt),
		      atomic_read(&dest->refcnt));

	/* Update the connection counters */
	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
		/* It is a normal connection, so decrease the inactconns
		   or activeconns counter */
		if (cp->flags & IP_VS_CONN_F_INACTIVE) {
			atomic_dec(&dest->inactconns);
		} else {
			atomic_dec(&dest->activeconns);
		}
	} else {
		/* It is a persistent connection/template, so decrease
		   the persistent connection counter */
		atomic_dec(&dest->persistconns);
	}

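	/*
	 * Clear the overload flag once the connection count falls below the
	 * lower threshold, or below 3/4 of the upper threshold when only
	 * that one is set (simple hysteresis); with no thresholds the flag
	 * is always cleared.
	 */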
	if (dest->l_threshold != 0) {
		if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	} else if (dest->u_threshold != 0) {
		if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	} else {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	}

	/*
	 * Simply decrease the refcnt of the dest, because the
	 * dest will be either in service's destination list
	 * or in the trash.
	 */
	atomic_dec(&dest->refcnt);
}


/*
 *	Checking if the destination of a connection template is available.
 *	If available, return 1, otherwise invalidate this connection
 *	template and return 0.
 */
int ip_vs_check_template(struct ip_vs_conn *ct)
{
	struct ip_vs_dest *dest = ct->dest;

	/*
	 * Checking the dest server status.
	 */
	if ((dest == NULL) ||
	    !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
	    (sysctl_ip_vs_expire_quiescent_template &&
	     (atomic_read(&dest->weight) == 0))) {
		IP_VS_DBG_BUF(9, "check_template: dest not available for "
			      "protocol %s s:%s:%d v:%s:%d "
			      "-> d:%s:%d\n",
			      ip_vs_proto_name(ct->protocol),
			      IP_VS_DBG_ADDR(ct->af, &ct->caddr),
			      ntohs(ct->cport),
			      IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
			      ntohs(ct->vport),
			      IP_VS_DBG_ADDR(ct->af, &ct->daddr),
			      ntohs(ct->dport));

		/*
		 * Invalidate the connection template
		 */
		if (ct->vport != htons(0xffff)) {
			if (ip_vs_conn_unhash(ct)) {
				ct->dport = htons(0xffff);
				ct->vport = htons(0xffff);
				ct->cport = 0;
				ip_vs_conn_hash(ct);
			}
		}

		/*
		 * Simply decrease the refcnt of the template,
		 * don't restart its timer.
		 */
		atomic_dec(&ct->refcnt);
		return 0;
	}
	return 1;
}

static void ip_vs_conn_expire(unsigned long data)
{
	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;

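	/* Requeue interval used when the entry cannot be released below. */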
	cp->timeout = 60*HZ;

	/*
	 *	hey, I'm using it
	 */
	atomic_inc(&cp->refcnt);

	/*
	 *	do I control anybody?
	 */
	if (atomic_read(&cp->n_control))
		goto expire_later;

	/*
	 *	unhash it if it is hashed in the conn table
	 */
	if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET))
		goto expire_later;

	/*
	 *	refcnt==1 implies I'm the only referrer
	 */
	if (likely(atomic_read(&cp->refcnt) == 1)) {
		/* delete the timer if it is activated by other users */
		if (timer_pending(&cp->timer))
			del_timer(&cp->timer);

		/* does anybody control me? */
		if (cp->control)
			ip_vs_control_del(cp);

		if (unlikely(cp->app != NULL))
			ip_vs_unbind_app(cp);
		ip_vs_unbind_dest(cp);
		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
			atomic_dec(&ip_vs_conn_no_cport_cnt);
		atomic_dec(&ip_vs_conn_count);

		kmem_cache_free(ip_vs_conn_cachep, cp);
		return;
	}

	/* hash it back to the table */
	ip_vs_conn_hash(cp);

  expire_later:
	IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
		  atomic_read(&cp->refcnt)-1,
		  atomic_read(&cp->n_control));

	ip_vs_conn_put(cp);
}


void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
	if (del_timer(&cp->timer))
		mod_timer(&cp->timer, jiffies);
}


/*
 *	Create a new connection entry and hash it into the ip_vs_conn_tab
 */
struct ip_vs_conn *
ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
	       const union nf_inet_addr *vaddr, __be16 vport,
	       const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
	       struct ip_vs_dest *dest)
{
	struct ip_vs_conn *cp;
	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);

	cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
	if (cp == NULL) {
		IP_VS_ERR_RL("%s(): no memory\n", __func__);
		return NULL;
	}

	INIT_LIST_HEAD(&cp->c_list);
	setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
	cp->af		   = af;
	cp->protocol	   = proto;
	ip_vs_addr_copy(af, &cp->caddr, caddr);
	cp->cport	   = cport;
	ip_vs_addr_copy(af, &cp->vaddr, vaddr);
	cp->vport	   = vport;
	/* proto should only be IPPROTO_IP if d_addr is a fwmark */
	ip_vs_addr_copy(proto == IPPROTO_IP ? AF_UNSPEC : af,
			&cp->daddr, daddr);
	cp->dport          = dport;
	cp->flags	   = flags;
	spin_lock_init(&cp->lock);

	/*
	 * Mark the entry as referenced by the current thread before hashing
	 * it into the table, so that another thread running
	 * ip_vs_random_dropentry cannot drop this entry.
	 */
	atomic_set(&cp->refcnt, 1);

	atomic_set(&cp->n_control, 0);
	atomic_set(&cp->in_pkts, 0);

	atomic_inc(&ip_vs_conn_count);
	if (flags & IP_VS_CONN_F_NO_CPORT)
		atomic_inc(&ip_vs_conn_no_cport_cnt);

	/* Bind the connection with a destination server */
	ip_vs_bind_dest(cp, dest);

	/* Set its state and timeout */
	cp->state = 0;
	cp->timeout = 3*HZ;

	/* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		ip_vs_bind_xmit_v6(cp);
	else
#endif
		ip_vs_bind_xmit(cp);

	if (unlikely(pp && atomic_read(&pp->appcnt)))
		ip_vs_bind_app(cp, pp);

	/* Hash it in the ip_vs_conn_tab finally */
	ip_vs_conn_hash(cp);

	return cp;
}


/*
 *	/proc/net/ip_vs_conn entries
 */
#ifdef CONFIG_PROC_FS

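/*
 * The seq_file iterator below keeps the bucket read lock covering the
 * current entry held across calls; seq->private records the bucket head so
 * ip_vs_conn_seq_next()/_stop() know which lock to release.
 */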
static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
{
	int idx;
	struct ip_vs_conn *cp;

	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
		ct_read_lock_bh(idx);
		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
			if (pos-- == 0) {
				seq->private = &ip_vs_conn_tab[idx];
				return cp;
			}
		}
		ct_read_unlock_bh(idx);
	}

	return NULL;
}

static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
{
	seq->private = NULL;
	return *pos ? ip_vs_conn_array(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ip_vs_conn *cp = v;
	struct list_head *e, *l = seq->private;
	int idx;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ip_vs_conn_array(seq, 0);

	/* more on same hash chain? */
	if ((e = cp->c_list.next) != l)
		return list_entry(e, struct ip_vs_conn, c_list);

	idx = l - ip_vs_conn_tab;
	ct_read_unlock_bh(idx);

	while (++idx < ip_vs_conn_tab_size) {
		ct_read_lock_bh(idx);
		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
			seq->private = &ip_vs_conn_tab[idx];
			return cp;
		}
		ct_read_unlock_bh(idx);
	}
	seq->private = NULL;
	return NULL;
}

static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
{
	struct list_head *l = seq->private;

	if (l)
		ct_read_unlock_bh(l - ip_vs_conn_tab);
}

static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq,
   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires\n");
	else {
		const struct ip_vs_conn *cp = v;

#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %7lu\n",
				ip_vs_proto_name(cp->protocol),
				&cp->caddr.in6, ntohs(cp->cport),
				&cp->vaddr.in6, ntohs(cp->vport),
				&cp->daddr.in6, ntohs(cp->dport),
				ip_vs_state_name(cp->protocol, cp->state),
				(cp->timer.expires-jiffies)/HZ);
		else
#endif
			seq_printf(seq,
				"%-3s %08X %04X %08X %04X"
				" %08X %04X %-11s %7lu\n",
				ip_vs_proto_name(cp->protocol),
				ntohl(cp->caddr.ip), ntohs(cp->cport),
				ntohl(cp->vaddr.ip), ntohs(cp->vport),
				ntohl(cp->daddr.ip), ntohs(cp->dport),
				ip_vs_state_name(cp->protocol, cp->state),
				(cp->timer.expires-jiffies)/HZ);
	}
	return 0;
}

static const struct seq_operations ip_vs_conn_seq_ops = {
	.start = ip_vs_conn_seq_start,
	.next  = ip_vs_conn_seq_next,
	.stop  = ip_vs_conn_seq_stop,
	.show  = ip_vs_conn_seq_show,
};

static int ip_vs_conn_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &ip_vs_conn_seq_ops);
}

static const struct file_operations ip_vs_conn_fops = {
	.owner	 = THIS_MODULE,
	.open    = ip_vs_conn_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};

static const char *ip_vs_origin_name(unsigned flags)
{
	if (flags & IP_VS_CONN_F_SYNC)
		return "SYNC";
	else
		return "LOCAL";
}

static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq,
   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Origin Expires\n");
	else {
		const struct ip_vs_conn *cp = v;

#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %-6s %7lu\n",
				ip_vs_proto_name(cp->protocol),
				&cp->caddr.in6, ntohs(cp->cport),
				&cp->vaddr.in6, ntohs(cp->vport),
				&cp->daddr.in6, ntohs(cp->dport),
				ip_vs_state_name(cp->protocol, cp->state),
				ip_vs_origin_name(cp->flags),
				(cp->timer.expires-jiffies)/HZ);
		else
#endif
			seq_printf(seq,
				"%-3s %08X %04X %08X %04X "
				"%08X %04X %-11s %-6s %7lu\n",
				ip_vs_proto_name(cp->protocol),
				ntohl(cp->caddr.ip), ntohs(cp->cport),
				ntohl(cp->vaddr.ip), ntohs(cp->vport),
				ntohl(cp->daddr.ip), ntohs(cp->dport),
				ip_vs_state_name(cp->protocol, cp->state),
				ip_vs_origin_name(cp->flags),
				(cp->timer.expires-jiffies)/HZ);
	}
	return 0;
}

static const struct seq_operations ip_vs_conn_sync_seq_ops = {
	.start = ip_vs_conn_seq_start,
	.next  = ip_vs_conn_seq_next,
	.stop  = ip_vs_conn_seq_stop,
	.show  = ip_vs_conn_sync_seq_show,
};

static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &ip_vs_conn_sync_seq_ops);
}

static const struct file_operations ip_vs_conn_sync_fops = {
	.owner	 = THIS_MODULE,
	.open    = ip_vs_conn_sync_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};

#endif


/*
 *      Randomly drop connection entries before running out of memory
 */
static inline int todrop_entry(struct ip_vs_conn *cp)
{
	/*
	 * The drop rate array needs tuning for real environments.
	 * Called from timer bh only => no locking
	 */
	static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
	static char todrop_counter[9] = {0};
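	/*
	 * Entries that have seen i packets (0 <= i <= 8) are dropped at a
	 * rate of one per todrop_rate[i] calls; todrop_counter[] counts
	 * down between drops, and a zero rate means never drop.
	 */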
	int i;

	/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
	   This will leave enough time for normal connections to get
	   through. */
	if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
		return 0;

	/* Don't drop the entry if its number of incoming packets is not
	   in [0, 8] */
	i = atomic_read(&cp->in_pkts);
	if (i > 8 || i < 0) return 0;

	if (!todrop_rate[i]) return 0;
	if (--todrop_counter[i] > 0) return 0;

	todrop_counter[i] = todrop_rate[i];
	return 1;
}

/* Called from keventd and must protect itself from softirqs */
void ip_vs_random_dropentry(void)
{
	int idx;
	struct ip_vs_conn *cp;

	/*
	 * Randomly scan 1/32 of the whole table every second
	 */
	for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
		unsigned hash = net_random() & ip_vs_conn_tab_mask;

		/*
		 *  Lock is actually needed in this loop.
		 */
		ct_write_lock_bh(hash);

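		/*
		 * Half-open TCP connections (SYN_RECV/SYNACK) are always
		 * candidates for dropping; established TCP connections and
		 * non-TCP protocols go through the todrop_entry() rate
		 * limiter, and other TCP states are left alone.
		 */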
		list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
			if (cp->flags & IP_VS_CONN_F_TEMPLATE)
				/* connection template */
				continue;

			if (cp->protocol == IPPROTO_TCP) {
				switch (cp->state) {
				case IP_VS_TCP_S_SYN_RECV:
				case IP_VS_TCP_S_SYNACK:
					break;

				case IP_VS_TCP_S_ESTABLISHED:
					if (todrop_entry(cp))
						break;
					continue;

				default:
					continue;
				}
			} else {
				if (!todrop_entry(cp))
					continue;
			}

			IP_VS_DBG(4, "del connection\n");
			ip_vs_conn_expire_now(cp);
			if (cp->control) {
				IP_VS_DBG(4, "del conn template\n");
				ip_vs_conn_expire_now(cp->control);
			}
		}
		ct_write_unlock_bh(hash);
	}
}


/*
 *      Flush all the connection entries in the ip_vs_conn_tab
 */
static void ip_vs_conn_flush(void)
{
	int idx;
	struct ip_vs_conn *cp;

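	/*
	 * Expire every entry, then reschedule and retry until the global
	 * connection counter drops to zero; entries still referenced or
	 * running in a timer handler are picked up on a later pass.
	 */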
  flush_again:
	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
		/*
		 *  Lock is actually needed in this loop.
		 */
		ct_write_lock_bh(idx);

		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {

			IP_VS_DBG(4, "del connection\n");
			ip_vs_conn_expire_now(cp);
			if (cp->control) {
				IP_VS_DBG(4, "del conn template\n");
				ip_vs_conn_expire_now(cp->control);
			}
		}
		ct_write_unlock_bh(idx);
	}

	/* the counter may not be zero, because some conn entries may still
	   be run by slow timer handlers or be unhashed but still referenced */
	if (atomic_read(&ip_vs_conn_count) != 0) {
		schedule();
		goto flush_again;
	}
}


int __init ip_vs_conn_init(void)
{
	int idx;

	/* Compute size and mask */
	ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
	ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;

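	/*
	 * With the default of 12 bits the table has 4096 buckets, each of
	 * which is only a struct list_head.
	 */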
	/*
	 * Allocate the connection hash table and initialize its list heads
	 */
	ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size *
				 sizeof(struct list_head));
	if (!ip_vs_conn_tab)
		return -ENOMEM;

	/* Allocate ip_vs_conn slab cache */
	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
					      sizeof(struct ip_vs_conn), 0,
					      SLAB_HWCACHE_ALIGN, NULL);
	if (!ip_vs_conn_cachep) {
		vfree(ip_vs_conn_tab);
		return -ENOMEM;
	}

	pr_info("Connection hash table configured "
		"(size=%d, memory=%ldKbytes)\n",
		ip_vs_conn_tab_size,
		(long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
	IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
		  sizeof(struct ip_vs_conn));

	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
		INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
	}

	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
		rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
	}

	proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
	proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);

	/* calculate the random value for connection hash */
	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));

	return 0;
}


void ip_vs_conn_cleanup(void)
{
	/* flush all the connection entries first */
	ip_vs_conn_flush();

	/* Release the empty cache */
	kmem_cache_destroy(ip_vs_conn_cachep);
	proc_net_remove(&init_net, "ip_vs_conn");
	proc_net_remove(&init_net, "ip_vs_conn_sync");
	vfree(ip_vs_conn_tab);
}