1/*
2 * IPVS         An implementation of the IP virtual server support for the
3 *              LINUX operating system.  IPVS is now implemented as a module
4 *              over the NetFilter framework. IPVS can be used to build a
5 *              high-performance and highly available server based on a
6 *              cluster of servers.
7 *
8 * Version:     $Id: ip_vs_ctl.c,v 1.1.1.1 2007/08/03 18:53:51 Exp $
9 *
10 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
11 *              Peter Kese <peter.kese@ijs.si>
12 *              Julian Anastasov <ja@ssi.bg>
13 *
14 *              This program is free software; you can redistribute it and/or
15 *              modify it under the terms of the GNU General Public License
16 *              as published by the Free Software Foundation; either version
17 *              2 of the License, or (at your option) any later version.
18 *
19 * Changes:
20 *
21 */
22
23#include <linux/module.h>
24#include <linux/init.h>
25#include <linux/types.h>
26#include <linux/capability.h>
27#include <linux/fs.h>
28#include <linux/sysctl.h>
29#include <linux/proc_fs.h>
30#include <linux/workqueue.h>
31#include <linux/swap.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34
35#include <linux/netfilter.h>
36#include <linux/netfilter_ipv4.h>
37#include <linux/mutex.h>
38
39#include <net/ip.h>
40#include <net/route.h>
41#include <net/sock.h>
42
43#include <asm/uaccess.h>
44
45#include <net/ip_vs.h>
46
47/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
48static DEFINE_MUTEX(__ip_vs_mutex);
49
50/* lock for service table */
51static DEFINE_RWLOCK(__ip_vs_svc_lock);
52
53/* lock for table with the real services */
54static DEFINE_RWLOCK(__ip_vs_rs_lock);
55
56/* lock for state and timeout tables */
57static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
58
59/* lock for drop entry handling */
60static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
61
62/* lock for drop packet handling */
63static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
64
65/* 1/rate drop and drop-entry variables */
66int ip_vs_drop_rate = 0;
67int ip_vs_drop_counter = 0;
68static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
69
70/* number of virtual services */
71static int ip_vs_num_services = 0;
72
73/* sysctl variables */
74static int sysctl_ip_vs_drop_entry = 0;
75static int sysctl_ip_vs_drop_packet = 0;
76static int sysctl_ip_vs_secure_tcp = 0;
77static int sysctl_ip_vs_amemthresh = 1024;
78static int sysctl_ip_vs_am_droprate = 10;
79int sysctl_ip_vs_cache_bypass = 0;
80int sysctl_ip_vs_expire_nodest_conn = 0;
81int sysctl_ip_vs_expire_quiescent_template = 0;
82int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
83int sysctl_ip_vs_nat_icmp_send = 0;
84
85
#ifdef CONFIG_IP_VS_DEBUG
/* Debug level; only built when CONFIG_IP_VS_DEBUG is set. */
static int sysctl_ip_vs_debug_level;

/*
 *	Returns the current value of sysctl_ip_vs_debug_level.
 */
int ip_vs_get_debug_level(void)
{
	int level = sysctl_ip_vs_debug_level;

	return level;
}
#endif /* CONFIG_IP_VS_DEBUG */
94
95/*
96 *	update_defense_level is called from keventd and from sysctl,
97 *	so it needs to protect itself from softirqs
98 */
99static void update_defense_level(void)
100{
101	struct sysinfo i;
102	static int old_secure_tcp = 0;
103	int availmem;
104	int nomem;
105	int to_change = -1;
106
107	/* we only count free and buffered memory (in pages) */
108	si_meminfo(&i);
109	availmem = i.freeram + i.bufferram;
110	/* however in linux 2.5 the i.bufferram is total page cache size,
111	   we need adjust it */
112	/* si_swapinfo(&i); */
113	/* availmem = availmem - (i.totalswap - i.freeswap); */
114
115	nomem = (availmem < sysctl_ip_vs_amemthresh);
116
117	local_bh_disable();
118
119	/* drop_entry */
120	spin_lock(&__ip_vs_dropentry_lock);
121	switch (sysctl_ip_vs_drop_entry) {
122	case 0:
123		atomic_set(&ip_vs_dropentry, 0);
124		break;
125	case 1:
126		if (nomem) {
127			atomic_set(&ip_vs_dropentry, 1);
128			sysctl_ip_vs_drop_entry = 2;
129		} else {
130			atomic_set(&ip_vs_dropentry, 0);
131		}
132		break;
133	case 2:
134		if (nomem) {
135			atomic_set(&ip_vs_dropentry, 1);
136		} else {
137			atomic_set(&ip_vs_dropentry, 0);
138			sysctl_ip_vs_drop_entry = 1;
139		};
140		break;
141	case 3:
142		atomic_set(&ip_vs_dropentry, 1);
143		break;
144	}
145	spin_unlock(&__ip_vs_dropentry_lock);
146
147	/* drop_packet */
148	spin_lock(&__ip_vs_droppacket_lock);
149	switch (sysctl_ip_vs_drop_packet) {
150	case 0:
151		ip_vs_drop_rate = 0;
152		break;
153	case 1:
154		if (nomem) {
155			ip_vs_drop_rate = ip_vs_drop_counter
156				= sysctl_ip_vs_amemthresh /
157				(sysctl_ip_vs_amemthresh-availmem);
158			sysctl_ip_vs_drop_packet = 2;
159		} else {
160			ip_vs_drop_rate = 0;
161		}
162		break;
163	case 2:
164		if (nomem) {
165			ip_vs_drop_rate = ip_vs_drop_counter
166				= sysctl_ip_vs_amemthresh /
167				(sysctl_ip_vs_amemthresh-availmem);
168		} else {
169			ip_vs_drop_rate = 0;
170			sysctl_ip_vs_drop_packet = 1;
171		}
172		break;
173	case 3:
174		ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
175		break;
176	}
177	spin_unlock(&__ip_vs_droppacket_lock);
178
179	/* secure_tcp */
180	write_lock(&__ip_vs_securetcp_lock);
181	switch (sysctl_ip_vs_secure_tcp) {
182	case 0:
183		if (old_secure_tcp >= 2)
184			to_change = 0;
185		break;
186	case 1:
187		if (nomem) {
188			if (old_secure_tcp < 2)
189				to_change = 1;
190			sysctl_ip_vs_secure_tcp = 2;
191		} else {
192			if (old_secure_tcp >= 2)
193				to_change = 0;
194		}
195		break;
196	case 2:
197		if (nomem) {
198			if (old_secure_tcp < 2)
199				to_change = 1;
200		} else {
201			if (old_secure_tcp >= 2)
202				to_change = 0;
203			sysctl_ip_vs_secure_tcp = 1;
204		}
205		break;
206	case 3:
207		if (old_secure_tcp < 2)
208			to_change = 1;
209		break;
210	}
211	old_secure_tcp = sysctl_ip_vs_secure_tcp;
212	if (to_change >= 0)
213		ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
214	write_unlock(&__ip_vs_securetcp_lock);
215
216	local_bh_enable();
217}
218
219
220/*
221 *	Timer for checking the defense
222 */
223#define DEFENSE_TIMER_PERIOD	1*HZ
224static void defense_work_handler(struct work_struct *work);
225static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
226
227static void defense_work_handler(struct work_struct *work)
228{
229	update_defense_level();
230	if (atomic_read(&ip_vs_dropentry))
231		ip_vs_random_dropentry();
232
233	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
234}
235
236int
237ip_vs_use_count_inc(void)
238{
239	return try_module_get(THIS_MODULE);
240}
241
242void
243ip_vs_use_count_dec(void)
244{
245	module_put(THIS_MODULE);
246}
247
248
249/*
250 *	Hash table: for virtual service lookups
251 */
252#define IP_VS_SVC_TAB_BITS 8
253#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
254#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
255
256/* the service table hashed by <protocol, addr, port> */
257static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
258/* the service table hashed by fwmark */
259static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
260
261/*
262 *	Hash table: for real service lookups
263 */
264#define IP_VS_RTAB_BITS 4
265#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
266#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
267
268static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
269
270/*
271 *	Trash for destinations
272 */
273static LIST_HEAD(ip_vs_dest_trash);
274
275/*
276 *	FTP & NULL virtual service counters
277 */
278static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
279static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
280
281
282/*
283 *	Returns hash value for virtual service
284 */
285static __inline__ unsigned
286ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port)
287{
288	register unsigned porth = ntohs(port);
289
290	return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
291		& IP_VS_SVC_TAB_MASK;
292}
293
294/*
295 *	Returns hash value of fwmark for virtual service lookup
296 */
297static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
298{
299	return fwmark & IP_VS_SVC_TAB_MASK;
300}
301
302/*
303 *	Hashes a service in the ip_vs_svc_table by <proto,addr,port>
304 *	or in the ip_vs_svc_fwm_table by fwmark.
305 *	Should be called with locked tables.
306 */
307static int ip_vs_svc_hash(struct ip_vs_service *svc)
308{
309	unsigned hash;
310
311	if (svc->flags & IP_VS_SVC_F_HASHED) {
312		IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
313			  "called from %p\n", __builtin_return_address(0));
314		return 0;
315	}
316
317	if (svc->fwmark == 0) {
318		/*
319		 *  Hash it by <protocol,addr,port> in ip_vs_svc_table
320		 */
321		hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
322		list_add(&svc->s_list, &ip_vs_svc_table[hash]);
323	} else {
324		/*
325		 *  Hash it by fwmark in ip_vs_svc_fwm_table
326		 */
327		hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
328		list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
329	}
330
331	svc->flags |= IP_VS_SVC_F_HASHED;
332	/* increase its refcnt because it is referenced by the svc table */
333	atomic_inc(&svc->refcnt);
334	return 1;
335}
336
337
338/*
339 *	Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
340 *	Should be called with locked tables.
341 */
342static int ip_vs_svc_unhash(struct ip_vs_service *svc)
343{
344	if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
345		IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
346			  "called from %p\n", __builtin_return_address(0));
347		return 0;
348	}
349
350	if (svc->fwmark == 0) {
351		/* Remove it from the ip_vs_svc_table table */
352		list_del(&svc->s_list);
353	} else {
354		/* Remove it from the ip_vs_svc_fwm_table table */
355		list_del(&svc->f_list);
356	}
357
358	svc->flags &= ~IP_VS_SVC_F_HASHED;
359	atomic_dec(&svc->refcnt);
360	return 1;
361}
362
363
364/*
365 *	Get service by {proto,addr,port} in the service table.
366 */
367static __inline__ struct ip_vs_service *
368__ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport)
369{
370	unsigned hash;
371	struct ip_vs_service *svc;
372
373	/* Check for "full" addressed entries */
374	hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
375
376	list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
377		if ((svc->addr == vaddr)
378		    && (svc->port == vport)
379		    && (svc->protocol == protocol)) {
380			/* HIT */
381			atomic_inc(&svc->usecnt);
382			return svc;
383		}
384	}
385
386	return NULL;
387}
388
389
390/*
391 *	Get service by {fwmark} in the service table.
392 */
393static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
394{
395	unsigned hash;
396	struct ip_vs_service *svc;
397
398	/* Check for fwmark addressed entries */
399	hash = ip_vs_svc_fwm_hashkey(fwmark);
400
401	list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
402		if (svc->fwmark == fwmark) {
403			/* HIT */
404			atomic_inc(&svc->usecnt);
405			return svc;
406		}
407	}
408
409	return NULL;
410}
411
412struct ip_vs_service *
413ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
414{
415	struct ip_vs_service *svc;
416
417	read_lock(&__ip_vs_svc_lock);
418
419	/*
420	 *	Check the table hashed by fwmark first
421	 */
422	if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
423		goto out;
424
425	/*
426	 *	Check the table hashed by <protocol,addr,port>
427	 *	for "full" addressed entries
428	 */
429	svc = __ip_vs_service_get(protocol, vaddr, vport);
430
431	if (svc == NULL
432	    && protocol == IPPROTO_TCP
433	    && atomic_read(&ip_vs_ftpsvc_counter)
434	    && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
435		/*
436		 * Check if ftp service entry exists, the packet
437		 * might belong to FTP data connections.
438		 */
439		svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
440	}
441
442	if (svc == NULL
443	    && atomic_read(&ip_vs_nullsvc_counter)) {
444		/*
445		 * Check if the catch-all port (port zero) exists
446		 */
447		svc = __ip_vs_service_get(protocol, vaddr, 0);
448	}
449
450  out:
451	read_unlock(&__ip_vs_svc_lock);
452
453	IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
454		  fwmark, ip_vs_proto_name(protocol),
455		  NIPQUAD(vaddr), ntohs(vport),
456		  svc?"hit":"not hit");
457
458	return svc;
459}
460
461
462static inline void
463__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
464{
465	atomic_inc(&svc->refcnt);
466	dest->svc = svc;
467}
468
469static inline void
470__ip_vs_unbind_svc(struct ip_vs_dest *dest)
471{
472	struct ip_vs_service *svc = dest->svc;
473
474	dest->svc = NULL;
475	if (atomic_dec_and_test(&svc->refcnt))
476		kfree(svc);
477}
478
479
480/*
481 *	Returns hash value for real service
482 */
483static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port)
484{
485	register unsigned porth = ntohs(port);
486
487	return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
488		& IP_VS_RTAB_MASK;
489}
490
491/*
492 *	Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
493 *	should be called with locked tables.
494 */
495static int ip_vs_rs_hash(struct ip_vs_dest *dest)
496{
497	unsigned hash;
498
499	if (!list_empty(&dest->d_list)) {
500		return 0;
501	}
502
503	/*
504	 *	Hash by proto,addr,port,
505	 *	which are the parameters of the real service.
506	 */
507	hash = ip_vs_rs_hashkey(dest->addr, dest->port);
508	list_add(&dest->d_list, &ip_vs_rtable[hash]);
509
510	return 1;
511}
512
513/*
514 *	UNhashes ip_vs_dest from ip_vs_rtable.
515 *	should be called with locked tables.
516 */
517static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
518{
519	/*
520	 * Remove it from the ip_vs_rtable table.
521	 */
522	if (!list_empty(&dest->d_list)) {
523		list_del(&dest->d_list);
524		INIT_LIST_HEAD(&dest->d_list);
525	}
526
527	return 1;
528}
529
530/*
531 *	Lookup real service by <proto,addr,port> in the real service table.
532 */
533struct ip_vs_dest *
534ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
535{
536	unsigned hash;
537	struct ip_vs_dest *dest;
538
539	/*
540	 *	Check for "full" addressed entries
541	 *	Return the first found entry
542	 */
543	hash = ip_vs_rs_hashkey(daddr, dport);
544
545	read_lock(&__ip_vs_rs_lock);
546	list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
547		if ((dest->addr == daddr)
548		    && (dest->port == dport)
549		    && ((dest->protocol == protocol) ||
550			dest->vfwmark)) {
551			/* HIT */
552			read_unlock(&__ip_vs_rs_lock);
553			return dest;
554		}
555	}
556	read_unlock(&__ip_vs_rs_lock);
557
558	return NULL;
559}
560
561/*
562 *	Lookup destination by {addr,port} in the given service
563 */
564static struct ip_vs_dest *
565ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
566{
567	struct ip_vs_dest *dest;
568
569	/*
570	 * Find the destination for the given service
571	 */
572	list_for_each_entry(dest, &svc->destinations, n_list) {
573		if ((dest->addr == daddr) && (dest->port == dport)) {
574			/* HIT */
575			return dest;
576		}
577	}
578
579	return NULL;
580}
581
582
583/*
584 *  Lookup dest by {svc,addr,port} in the destination trash.
585 *  The destination trash is used to hold the destinations that are removed
586 *  from the service table but are still referenced by some conn entries.
587 *  The reason to add the destination trash is when the dest is temporary
588 *  down (either by administrator or by monitor program), the dest can be
589 *  picked back from the trash, the remaining connections to the dest can
590 *  continue, and the counting information of the dest is also useful for
591 *  scheduling.
592 */
593static struct ip_vs_dest *
594ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
595{
596	struct ip_vs_dest *dest, *nxt;
597
598	/*
599	 * Find the destination in trash
600	 */
601	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
602		IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
603			  "dest->refcnt=%d\n",
604			  dest->vfwmark,
605			  NIPQUAD(dest->addr), ntohs(dest->port),
606			  atomic_read(&dest->refcnt));
607		if (dest->addr == daddr &&
608		    dest->port == dport &&
609		    dest->vfwmark == svc->fwmark &&
610		    dest->protocol == svc->protocol &&
611		    (svc->fwmark ||
612		     (dest->vaddr == svc->addr &&
613		      dest->vport == svc->port))) {
614			/* HIT */
615			return dest;
616		}
617
618		/*
619		 * Try to purge the destination from trash if not referenced
620		 */
621		if (atomic_read(&dest->refcnt) == 1) {
622			IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
623				  "from trash\n",
624				  dest->vfwmark,
625				  NIPQUAD(dest->addr), ntohs(dest->port));
626			list_del(&dest->n_list);
627			ip_vs_dst_reset(dest);
628			__ip_vs_unbind_svc(dest);
629			kfree(dest);
630		}
631	}
632
633	return NULL;
634}
635
636
637/*
638 *  Clean up all the destinations in the trash
639 *  Called by the ip_vs_control_cleanup()
640 *
641 *  When the ip_vs_control_clearup is activated by ipvs module exit,
642 *  the service tables must have been flushed and all the connections
643 *  are expired, and the refcnt of each destination in the trash must
644 *  be 1, so we simply release them here.
645 */
646static void ip_vs_trash_cleanup(void)
647{
648	struct ip_vs_dest *dest, *nxt;
649
650	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
651		list_del(&dest->n_list);
652		ip_vs_dst_reset(dest);
653		__ip_vs_unbind_svc(dest);
654		kfree(dest);
655	}
656}
657
658
659static void
660ip_vs_zero_stats(struct ip_vs_stats *stats)
661{
662	spin_lock_bh(&stats->lock);
663	memset(stats, 0, (char *)&stats->lock - (char *)stats);
664	spin_unlock_bh(&stats->lock);
665	ip_vs_zero_estimator(stats);
666}
667
668/*
669 *	Update a destination in the given service
670 */
671static void
672__ip_vs_update_dest(struct ip_vs_service *svc,
673		    struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
674{
675	int conn_flags;
676
677	/* set the weight and the flags */
678	atomic_set(&dest->weight, udest->weight);
679	conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
680
681	/* check if local node and update the flags */
682	if (inet_addr_type(udest->addr) == RTN_LOCAL) {
683		conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
684			| IP_VS_CONN_F_LOCALNODE;
685	}
686
687	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
688	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
689		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
690	} else {
691		/*
692		 *    Put the real service in ip_vs_rtable if not present.
693		 *    For now only for NAT!
694		 */
695		write_lock_bh(&__ip_vs_rs_lock);
696		ip_vs_rs_hash(dest);
697		write_unlock_bh(&__ip_vs_rs_lock);
698	}
699	atomic_set(&dest->conn_flags, conn_flags);
700
701	/* bind the service */
702	if (!dest->svc) {
703		__ip_vs_bind_svc(dest, svc);
704	} else {
705		if (dest->svc != svc) {
706			__ip_vs_unbind_svc(dest);
707			ip_vs_zero_stats(&dest->stats);
708			__ip_vs_bind_svc(dest, svc);
709		}
710	}
711
712	/* set the dest status flags */
713	dest->flags |= IP_VS_DEST_F_AVAILABLE;
714
715	if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
716		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
717	dest->u_threshold = udest->u_threshold;
718	dest->l_threshold = udest->l_threshold;
719}
720
721
722/*
723 *	Create a destination for the given service
724 */
725static int
726ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
727	       struct ip_vs_dest **dest_p)
728{
729	struct ip_vs_dest *dest;
730	unsigned atype;
731
732	EnterFunction(2);
733
734	atype = inet_addr_type(udest->addr);
735	if (atype != RTN_LOCAL && atype != RTN_UNICAST)
736		return -EINVAL;
737
738	dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
739	if (dest == NULL) {
740		IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
741		return -ENOMEM;
742	}
743
744	dest->protocol = svc->protocol;
745	dest->vaddr = svc->addr;
746	dest->vport = svc->port;
747	dest->vfwmark = svc->fwmark;
748	dest->addr = udest->addr;
749	dest->port = udest->port;
750
751	atomic_set(&dest->activeconns, 0);
752	atomic_set(&dest->inactconns, 0);
753	atomic_set(&dest->persistconns, 0);
754	atomic_set(&dest->refcnt, 0);
755
756	INIT_LIST_HEAD(&dest->d_list);
757	spin_lock_init(&dest->dst_lock);
758	spin_lock_init(&dest->stats.lock);
759	__ip_vs_update_dest(svc, dest, udest);
760	ip_vs_new_estimator(&dest->stats);
761
762	*dest_p = dest;
763
764	LeaveFunction(2);
765	return 0;
766}
767
768
769/*
770 *	Add a destination into an existing service
771 */
772static int
773ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
774{
775	struct ip_vs_dest *dest;
776	__be32 daddr = udest->addr;
777	__be16 dport = udest->port;
778	int ret;
779
780	EnterFunction(2);
781
782	if (udest->weight < 0) {
783		IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
784		return -ERANGE;
785	}
786
787	if (udest->l_threshold > udest->u_threshold) {
788		IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
789			  "upper threshold\n");
790		return -ERANGE;
791	}
792
793	/*
794	 * Check if the dest already exists in the list
795	 */
796	dest = ip_vs_lookup_dest(svc, daddr, dport);
797	if (dest != NULL) {
798		IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
799		return -EEXIST;
800	}
801
802	/*
803	 * Check if the dest already exists in the trash and
804	 * is from the same service
805	 */
806	dest = ip_vs_trash_get_dest(svc, daddr, dport);
807	if (dest != NULL) {
808		IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
809			  "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
810			  NIPQUAD(daddr), ntohs(dport),
811			  atomic_read(&dest->refcnt),
812			  dest->vfwmark,
813			  NIPQUAD(dest->vaddr),
814			  ntohs(dest->vport));
815		__ip_vs_update_dest(svc, dest, udest);
816
817		/*
818		 * Get the destination from the trash
819		 */
820		list_del(&dest->n_list);
821
822		ip_vs_new_estimator(&dest->stats);
823
824		write_lock_bh(&__ip_vs_svc_lock);
825
826		/*
827		 * Wait until all other svc users go away.
828		 */
829		IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
830
831		list_add(&dest->n_list, &svc->destinations);
832		svc->num_dests++;
833
834		/* call the update_service function of its scheduler */
835		svc->scheduler->update_service(svc);
836
837		write_unlock_bh(&__ip_vs_svc_lock);
838		return 0;
839	}
840
841	/*
842	 * Allocate and initialize the dest structure
843	 */
844	ret = ip_vs_new_dest(svc, udest, &dest);
845	if (ret) {
846		return ret;
847	}
848
849	/*
850	 * Add the dest entry into the list
851	 */
852	atomic_inc(&dest->refcnt);
853
854	write_lock_bh(&__ip_vs_svc_lock);
855
856	/*
857	 * Wait until all other svc users go away.
858	 */
859	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
860
861	list_add(&dest->n_list, &svc->destinations);
862	svc->num_dests++;
863
864	/* call the update_service function of its scheduler */
865	svc->scheduler->update_service(svc);
866
867	write_unlock_bh(&__ip_vs_svc_lock);
868
869	LeaveFunction(2);
870
871	return 0;
872}
873
874
875/*
876 *	Edit a destination in the given service
877 */
878static int
879ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
880{
881	struct ip_vs_dest *dest;
882	__be32 daddr = udest->addr;
883	__be16 dport = udest->port;
884
885	EnterFunction(2);
886
887	if (udest->weight < 0) {
888		IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
889		return -ERANGE;
890	}
891
892	if (udest->l_threshold > udest->u_threshold) {
893		IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
894			  "upper threshold\n");
895		return -ERANGE;
896	}
897
898	/*
899	 *  Lookup the destination list
900	 */
901	dest = ip_vs_lookup_dest(svc, daddr, dport);
902	if (dest == NULL) {
903		IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
904		return -ENOENT;
905	}
906
907	__ip_vs_update_dest(svc, dest, udest);
908
909	write_lock_bh(&__ip_vs_svc_lock);
910
911	/* Wait until all other svc users go away */
912	while (atomic_read(&svc->usecnt) > 1) {};
913
914	/* call the update_service, because server weight may be changed */
915	svc->scheduler->update_service(svc);
916
917	write_unlock_bh(&__ip_vs_svc_lock);
918
919	LeaveFunction(2);
920
921	return 0;
922}
923
924
925/*
926 *	Delete a destination (must be already unlinked from the service)
927 */
928static void __ip_vs_del_dest(struct ip_vs_dest *dest)
929{
930	ip_vs_kill_estimator(&dest->stats);
931
932	/*
933	 *  Remove it from the d-linked list with the real services.
934	 */
935	write_lock_bh(&__ip_vs_rs_lock);
936	ip_vs_rs_unhash(dest);
937	write_unlock_bh(&__ip_vs_rs_lock);
938
939	/*
940	 *  Decrease the refcnt of the dest, and free the dest
941	 *  if nobody refers to it (refcnt=0). Otherwise, throw
942	 *  the destination into the trash.
943	 */
944	if (atomic_dec_and_test(&dest->refcnt)) {
945		ip_vs_dst_reset(dest);
946		/* simply decrease svc->refcnt here, let the caller check
947		   and release the service if nobody refers to it.
948		   Only user context can release destination and service,
949		   and only one user context can update virtual service at a
950		   time, so the operation here is OK */
951		atomic_dec(&dest->svc->refcnt);
952		kfree(dest);
953	} else {
954		IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
955			  "dest->refcnt=%d\n",
956			  NIPQUAD(dest->addr), ntohs(dest->port),
957			  atomic_read(&dest->refcnt));
958		list_add(&dest->n_list, &ip_vs_dest_trash);
959		atomic_inc(&dest->refcnt);
960	}
961}
962
963
964/*
965 *	Unlink a destination from the given service
966 */
967static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
968				struct ip_vs_dest *dest,
969				int svcupd)
970{
971	dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
972
973	/*
974	 *  Remove it from the d-linked destination list.
975	 */
976	list_del(&dest->n_list);
977	svc->num_dests--;
978	if (svcupd) {
979		/*
980		 *  Call the update_service function of its scheduler
981		 */
982		svc->scheduler->update_service(svc);
983	}
984}
985
986
987/*
988 *	Delete a destination server in the given service
989 */
990static int
991ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
992{
993	struct ip_vs_dest *dest;
994	__be32 daddr = udest->addr;
995	__be16 dport = udest->port;
996
997	EnterFunction(2);
998
999	dest = ip_vs_lookup_dest(svc, daddr, dport);
1000	if (dest == NULL) {
1001		IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1002		return -ENOENT;
1003	}
1004
1005	write_lock_bh(&__ip_vs_svc_lock);
1006
1007	/*
1008	 *	Wait until all other svc users go away.
1009	 */
1010	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1011
1012	/*
1013	 *	Unlink dest from the service
1014	 */
1015	__ip_vs_unlink_dest(svc, dest, 1);
1016
1017	write_unlock_bh(&__ip_vs_svc_lock);
1018
1019	/*
1020	 *	Delete the destination
1021	 */
1022	__ip_vs_del_dest(dest);
1023
1024	LeaveFunction(2);
1025
1026	return 0;
1027}
1028
1029
1030/*
1031 *	Add a service into the service hash table
1032 */
1033static int
1034ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1035{
1036	int ret = 0;
1037	struct ip_vs_scheduler *sched = NULL;
1038	struct ip_vs_service *svc = NULL;
1039
1040	/* increase the module use count */
1041	ip_vs_use_count_inc();
1042
1043	/* Lookup the scheduler by 'u->sched_name' */
1044	sched = ip_vs_scheduler_get(u->sched_name);
1045	if (sched == NULL) {
1046		IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1047			   u->sched_name);
1048		ret = -ENOENT;
1049		goto out_mod_dec;
1050	}
1051
1052	svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1053	if (svc == NULL) {
1054		IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1055		ret = -ENOMEM;
1056		goto out_err;
1057	}
1058
1059	/* I'm the first user of the service */
1060	atomic_set(&svc->usecnt, 1);
1061	atomic_set(&svc->refcnt, 0);
1062
1063	svc->protocol = u->protocol;
1064	svc->addr = u->addr;
1065	svc->port = u->port;
1066	svc->fwmark = u->fwmark;
1067	svc->flags = u->flags;
1068	svc->timeout = u->timeout * HZ;
1069	svc->netmask = u->netmask;
1070
1071	INIT_LIST_HEAD(&svc->destinations);
1072	rwlock_init(&svc->sched_lock);
1073	spin_lock_init(&svc->stats.lock);
1074
1075	/* Bind the scheduler */
1076	ret = ip_vs_bind_scheduler(svc, sched);
1077	if (ret)
1078		goto out_err;
1079	sched = NULL;
1080
1081	/* Update the virtual service counters */
1082	if (svc->port == FTPPORT)
1083		atomic_inc(&ip_vs_ftpsvc_counter);
1084	else if (svc->port == 0)
1085		atomic_inc(&ip_vs_nullsvc_counter);
1086
1087	ip_vs_new_estimator(&svc->stats);
1088	ip_vs_num_services++;
1089
1090	/* Hash the service into the service table */
1091	write_lock_bh(&__ip_vs_svc_lock);
1092	ip_vs_svc_hash(svc);
1093	write_unlock_bh(&__ip_vs_svc_lock);
1094
1095	*svc_p = svc;
1096	return 0;
1097
1098  out_err:
1099	if (svc != NULL) {
1100		if (svc->scheduler)
1101			ip_vs_unbind_scheduler(svc);
1102		if (svc->inc) {
1103			local_bh_disable();
1104			ip_vs_app_inc_put(svc->inc);
1105			local_bh_enable();
1106		}
1107		kfree(svc);
1108	}
1109	ip_vs_scheduler_put(sched);
1110
1111  out_mod_dec:
1112	/* decrease the module use count */
1113	ip_vs_use_count_dec();
1114
1115	return ret;
1116}
1117
1118
1119/*
1120 *	Edit a service and bind it with a new scheduler
1121 */
1122static int
1123ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1124{
1125	struct ip_vs_scheduler *sched, *old_sched;
1126	int ret = 0;
1127
1128	/*
1129	 * Lookup the scheduler, by 'u->sched_name'
1130	 */
1131	sched = ip_vs_scheduler_get(u->sched_name);
1132	if (sched == NULL) {
1133		IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1134			   u->sched_name);
1135		return -ENOENT;
1136	}
1137	old_sched = sched;
1138
1139	write_lock_bh(&__ip_vs_svc_lock);
1140
1141	/*
1142	 * Wait until all other svc users go away.
1143	 */
1144	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1145
1146	/*
1147	 * Set the flags and timeout value
1148	 */
1149	svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1150	svc->timeout = u->timeout * HZ;
1151	svc->netmask = u->netmask;
1152
1153	old_sched = svc->scheduler;
1154	if (sched != old_sched) {
1155		/*
1156		 * Unbind the old scheduler
1157		 */
1158		if ((ret = ip_vs_unbind_scheduler(svc))) {
1159			old_sched = sched;
1160			goto out;
1161		}
1162
1163		/*
1164		 * Bind the new scheduler
1165		 */
1166		if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1167			/*
1168			 * If ip_vs_bind_scheduler fails, restore the old
1169			 * scheduler.
1170			 * The main reason of failure is out of memory.
1171			 *
1172			 * The question is if the old scheduler can be
1173			 * restored all the time. TODO: if it cannot be
1174			 * restored some time, we must delete the service,
1175			 * otherwise the system may crash.
1176			 */
1177			ip_vs_bind_scheduler(svc, old_sched);
1178			old_sched = sched;
1179			goto out;
1180		}
1181	}
1182
1183  out:
1184	write_unlock_bh(&__ip_vs_svc_lock);
1185
1186	if (old_sched)
1187		ip_vs_scheduler_put(old_sched);
1188
1189	return ret;
1190}
1191
1192
1193/*
1194 *	Delete a service from the service list
1195 *	- The service must be unlinked, unlocked and not referenced!
1196 *	- We are called under _bh lock
1197 */
1198static void __ip_vs_del_service(struct ip_vs_service *svc)
1199{
1200	struct ip_vs_dest *dest, *nxt;
1201	struct ip_vs_scheduler *old_sched;
1202
1203	ip_vs_num_services--;
1204	ip_vs_kill_estimator(&svc->stats);
1205
1206	/* Unbind scheduler */
1207	old_sched = svc->scheduler;
1208	ip_vs_unbind_scheduler(svc);
1209	if (old_sched)
1210		ip_vs_scheduler_put(old_sched);
1211
1212	/* Unbind app inc */
1213	if (svc->inc) {
1214		ip_vs_app_inc_put(svc->inc);
1215		svc->inc = NULL;
1216	}
1217
1218	/*
1219	 *    Unlink the whole destination list
1220	 */
1221	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1222		__ip_vs_unlink_dest(svc, dest, 0);
1223		__ip_vs_del_dest(dest);
1224	}
1225
1226	/*
1227	 *    Update the virtual service counters
1228	 */
1229	if (svc->port == FTPPORT)
1230		atomic_dec(&ip_vs_ftpsvc_counter);
1231	else if (svc->port == 0)
1232		atomic_dec(&ip_vs_nullsvc_counter);
1233
1234	/*
1235	 *    Free the service if nobody refers to it
1236	 */
1237	if (atomic_read(&svc->refcnt) == 0)
1238		kfree(svc);
1239
1240	/* decrease the module use count */
1241	ip_vs_use_count_dec();
1242}
1243
1244/*
1245 *	Delete a service from the service list
1246 */
1247static int ip_vs_del_service(struct ip_vs_service *svc)
1248{
1249	if (svc == NULL)
1250		return -EEXIST;
1251
1252	/*
1253	 * Unhash it from the service table
1254	 */
1255	write_lock_bh(&__ip_vs_svc_lock);
1256
1257	ip_vs_svc_unhash(svc);
1258
1259	/*
1260	 * Wait until all the svc users go away.
1261	 */
1262	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1263
1264	__ip_vs_del_service(svc);
1265
1266	write_unlock_bh(&__ip_vs_svc_lock);
1267
1268	return 0;
1269}
1270
1271
1272/*
1273 *	Flush all the virtual services
1274 */
1275static int ip_vs_flush(void)
1276{
1277	int idx;
1278	struct ip_vs_service *svc, *nxt;
1279
1280	/*
1281	 * Flush the service table hashed by <protocol,addr,port>
1282	 */
1283	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1284		list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1285			write_lock_bh(&__ip_vs_svc_lock);
1286			ip_vs_svc_unhash(svc);
1287			/*
1288			 * Wait until all the svc users go away.
1289			 */
1290			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1291			__ip_vs_del_service(svc);
1292			write_unlock_bh(&__ip_vs_svc_lock);
1293		}
1294	}
1295
1296	/*
1297	 * Flush the service table hashed by fwmark
1298	 */
1299	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1300		list_for_each_entry_safe(svc, nxt,
1301					 &ip_vs_svc_fwm_table[idx], f_list) {
1302			write_lock_bh(&__ip_vs_svc_lock);
1303			ip_vs_svc_unhash(svc);
1304			/*
1305			 * Wait until all the svc users go away.
1306			 */
1307			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1308			__ip_vs_del_service(svc);
1309			write_unlock_bh(&__ip_vs_svc_lock);
1310		}
1311	}
1312
1313	return 0;
1314}
1315
1316
1317/*
1318 *	Zero counters in a service or all services
1319 */
1320static int ip_vs_zero_service(struct ip_vs_service *svc)
1321{
1322	struct ip_vs_dest *dest;
1323
1324	write_lock_bh(&__ip_vs_svc_lock);
1325	list_for_each_entry(dest, &svc->destinations, n_list) {
1326		ip_vs_zero_stats(&dest->stats);
1327	}
1328	ip_vs_zero_stats(&svc->stats);
1329	write_unlock_bh(&__ip_vs_svc_lock);
1330	return 0;
1331}
1332
1333static int ip_vs_zero_all(void)
1334{
1335	int idx;
1336	struct ip_vs_service *svc;
1337
1338	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1339		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1340			ip_vs_zero_service(svc);
1341		}
1342	}
1343
1344	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1345		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1346			ip_vs_zero_service(svc);
1347		}
1348	}
1349
1350	ip_vs_zero_stats(&ip_vs_stats);
1351	return 0;
1352}
1353
1354
1355static int
1356proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1357		     void __user *buffer, size_t *lenp, loff_t *ppos)
1358{
1359	int *valp = table->data;
1360	int val = *valp;
1361	int rc;
1362
1363	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1364	if (write && (*valp != val)) {
1365		if ((*valp < 0) || (*valp > 3)) {
1366			/* Restore the correct value */
1367			*valp = val;
1368		} else {
1369			update_defense_level();
1370		}
1371	}
1372	return rc;
1373}
1374
1375
1376static int
1377proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1378		       void __user *buffer, size_t *lenp, loff_t *ppos)
1379{
1380	int *valp = table->data;
1381	int val[2];
1382	int rc;
1383
1384	/* backup the value first */
1385	memcpy(val, valp, sizeof(val));
1386
1387	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1388	if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1389		/* Restore the correct value */
1390		memcpy(valp, val, sizeof(val));
1391	}
1392	return rc;
1393}
1394
1395
1396/*
1397 *	IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1398 */
1399
1400static struct ctl_table vs_vars[] = {
1401	{
1402		.ctl_name	= NET_IPV4_VS_AMEMTHRESH,
1403		.procname	= "amemthresh",
1404		.data		= &sysctl_ip_vs_amemthresh,
1405		.maxlen		= sizeof(int),
1406		.mode		= 0644,
1407		.proc_handler	= &proc_dointvec,
1408	},
1409#ifdef CONFIG_IP_VS_DEBUG
1410	{
1411		.ctl_name	= NET_IPV4_VS_DEBUG_LEVEL,
1412		.procname	= "debug_level",
1413		.data		= &sysctl_ip_vs_debug_level,
1414		.maxlen		= sizeof(int),
1415		.mode		= 0644,
1416		.proc_handler	= &proc_dointvec,
1417	},
1418#endif
1419	{
1420		.ctl_name	= NET_IPV4_VS_AMDROPRATE,
1421		.procname	= "am_droprate",
1422		.data		= &sysctl_ip_vs_am_droprate,
1423		.maxlen		= sizeof(int),
1424		.mode		= 0644,
1425		.proc_handler	= &proc_dointvec,
1426	},
1427	{
1428		.ctl_name	= NET_IPV4_VS_DROP_ENTRY,
1429		.procname	= "drop_entry",
1430		.data		= &sysctl_ip_vs_drop_entry,
1431		.maxlen		= sizeof(int),
1432		.mode		= 0644,
1433		.proc_handler	= &proc_do_defense_mode,
1434	},
1435	{
1436		.ctl_name	= NET_IPV4_VS_DROP_PACKET,
1437		.procname	= "drop_packet",
1438		.data		= &sysctl_ip_vs_drop_packet,
1439		.maxlen		= sizeof(int),
1440		.mode		= 0644,
1441		.proc_handler	= &proc_do_defense_mode,
1442	},
1443	{
1444		.ctl_name	= NET_IPV4_VS_SECURE_TCP,
1445		.procname	= "secure_tcp",
1446		.data		= &sysctl_ip_vs_secure_tcp,
1447		.maxlen		= sizeof(int),
1448		.mode		= 0644,
1449		.proc_handler	= &proc_do_defense_mode,
1450	},
1451	{
1452		.ctl_name	= NET_IPV4_VS_CACHE_BYPASS,
1453		.procname	= "cache_bypass",
1454		.data		= &sysctl_ip_vs_cache_bypass,
1455		.maxlen		= sizeof(int),
1456		.mode		= 0644,
1457		.proc_handler	= &proc_dointvec,
1458	},
1459	{
1460		.ctl_name	= NET_IPV4_VS_EXPIRE_NODEST_CONN,
1461		.procname	= "expire_nodest_conn",
1462		.data		= &sysctl_ip_vs_expire_nodest_conn,
1463		.maxlen		= sizeof(int),
1464		.mode		= 0644,
1465		.proc_handler	= &proc_dointvec,
1466	},
1467	{
1468		.ctl_name	= NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
1469		.procname	= "expire_quiescent_template",
1470		.data		= &sysctl_ip_vs_expire_quiescent_template,
1471		.maxlen		= sizeof(int),
1472		.mode		= 0644,
1473		.proc_handler	= &proc_dointvec,
1474	},
1475	{
1476		.ctl_name	= NET_IPV4_VS_SYNC_THRESHOLD,
1477		.procname	= "sync_threshold",
1478		.data		= &sysctl_ip_vs_sync_threshold,
1479		.maxlen		= sizeof(sysctl_ip_vs_sync_threshold),
1480		.mode		= 0644,
1481		.proc_handler	= &proc_do_sync_threshold,
1482	},
1483	{
1484		.ctl_name	= NET_IPV4_VS_NAT_ICMP_SEND,
1485		.procname	= "nat_icmp_send",
1486		.data		= &sysctl_ip_vs_nat_icmp_send,
1487		.maxlen		= sizeof(int),
1488		.mode		= 0644,
1489		.proc_handler	= &proc_dointvec,
1490	},
1491	{ .ctl_name = 0 }
1492};
1493
1494static ctl_table vs_table[] = {
1495	{
1496		.ctl_name	= NET_IPV4_VS,
1497		.procname	= "vs",
1498		.mode		= 0555,
1499		.child		= vs_vars
1500	},
1501	{ .ctl_name = 0 }
1502};
1503
1504static ctl_table ipvs_ipv4_table[] = {
1505	{
1506		.ctl_name	= NET_IPV4,
1507		.procname	= "ipv4",
1508		.mode		= 0555,
1509		.child		= vs_table,
1510	},
1511	{ .ctl_name = 0 }
1512};
1513
1514static ctl_table vs_root_table[] = {
1515	{
1516		.ctl_name	= CTL_NET,
1517		.procname	= "net",
1518		.mode		= 0555,
1519		.child		= ipvs_ipv4_table,
1520	},
1521	{ .ctl_name = 0 }
1522};
1523
/* Handle from register_sysctl_table(), needed again to unregister */
static struct ctl_table_header *sysctl_header;
1525
1526#ifdef CONFIG_PROC_FS
1527
/*
 *	Iterator state kept in seq->private while the service tables are
 *	dumped: the hash table the current entry came from and the index of
 *	its bucket within that table.
 */
1528struct ip_vs_iter {
1529	struct list_head *table;
1530	int bucket;
1531};
1532
1533/*
1534 *	Write the contents of the VS rule table to a PROCfs file.
1535 *	(It is kept just for backward compatibility)
1536 */
1537static inline const char *ip_vs_fwd_name(unsigned flags)
1538{
1539	switch (flags & IP_VS_CONN_F_FWD_MASK) {
1540	case IP_VS_CONN_F_LOCALNODE:
1541		return "Local";
1542	case IP_VS_CONN_F_TUNNEL:
1543		return "Tunnel";
1544	case IP_VS_CONN_F_DROUTE:
1545		return "Route";
1546	default:
1547		return "Masq";
1548	}
1549}
1550
1551
1552/* Get the Nth entry in the two lists */
1553static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1554{
1555	struct ip_vs_iter *iter = seq->private;
1556	int idx;
1557	struct ip_vs_service *svc;
1558
1559	/* look in hash by protocol */
1560	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1561		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1562			if (pos-- == 0){
1563				iter->table = ip_vs_svc_table;
1564				iter->bucket = idx;
1565				return svc;
1566			}
1567		}
1568	}
1569
1570	/* keep looking in fwmark */
1571	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1572		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1573			if (pos-- == 0) {
1574				iter->table = ip_vs_svc_fwm_table;
1575				iter->bucket = idx;
1576				return svc;
1577			}
1578		}
1579	}
1580
1581	return NULL;
1582}
1583
1584static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1585{
1586
1587	read_lock_bh(&__ip_vs_svc_lock);
1588	return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1589}
1590
1591
1592static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1593{
1594	struct list_head *e;
1595	struct ip_vs_iter *iter;
1596	struct ip_vs_service *svc;
1597
1598	++*pos;
1599	if (v == SEQ_START_TOKEN)
1600		return ip_vs_info_array(seq,0);
1601
1602	svc = v;
1603	iter = seq->private;
1604
1605	if (iter->table == ip_vs_svc_table) {
1606		/* next service in table hashed by protocol */
1607		if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1608			return list_entry(e, struct ip_vs_service, s_list);
1609
1610
1611		while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1612			list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1613					    s_list) {
1614				return svc;
1615			}
1616		}
1617
1618		iter->table = ip_vs_svc_fwm_table;
1619		iter->bucket = -1;
1620		goto scan_fwmark;
1621	}
1622
1623	/* next service in hashed by fwmark */
1624	if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1625		return list_entry(e, struct ip_vs_service, f_list);
1626
1627 scan_fwmark:
1628	while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1629		list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1630				    f_list)
1631			return svc;
1632	}
1633
1634	return NULL;
1635}
1636
/*
 * seq_file ->stop: release the service table read lock that
 * ip_vs_info_seq_start() took.  Neither argument is used.
 */
1637static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1638{
1639	read_unlock_bh(&__ip_vs_svc_lock);
1640}
1641
1642
1643static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1644{
1645	if (v == SEQ_START_TOKEN) {
1646		seq_printf(seq,
1647			"IP Virtual Server version %d.%d.%d (size=%d)\n",
1648			NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1649		seq_puts(seq,
1650			 "Prot LocalAddress:Port Scheduler Flags\n");
1651		seq_puts(seq,
1652			 "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1653	} else {
1654		const struct ip_vs_service *svc = v;
1655		const struct ip_vs_iter *iter = seq->private;
1656		const struct ip_vs_dest *dest;
1657
1658		if (iter->table == ip_vs_svc_table)
1659			seq_printf(seq, "%s  %08X:%04X %s ",
1660				   ip_vs_proto_name(svc->protocol),
1661				   ntohl(svc->addr),
1662				   ntohs(svc->port),
1663				   svc->scheduler->name);
1664		else
1665			seq_printf(seq, "FWM  %08X %s ",
1666				   svc->fwmark, svc->scheduler->name);
1667
1668		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1669			seq_printf(seq, "persistent %d %08X\n",
1670				svc->timeout,
1671				ntohl(svc->netmask));
1672		else
1673			seq_putc(seq, '\n');
1674
1675		list_for_each_entry(dest, &svc->destinations, n_list) {
1676			seq_printf(seq,
1677				   "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
1678				   ntohl(dest->addr), ntohs(dest->port),
1679				   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1680				   atomic_read(&dest->weight),
1681				   atomic_read(&dest->activeconns),
1682				   atomic_read(&dest->inactconns));
1683		}
1684	}
1685	return 0;
1686}
1687
1688static struct seq_operations ip_vs_info_seq_ops = {
1689	.start = ip_vs_info_seq_start,
1690	.next  = ip_vs_info_seq_next,
1691	.stop  = ip_vs_info_seq_stop,
1692	.show  = ip_vs_info_seq_show,
1693};
1694
1695static int ip_vs_info_open(struct inode *inode, struct file *file)
1696{
1697	struct seq_file *seq;
1698	int rc = -ENOMEM;
1699	struct ip_vs_iter *s = kzalloc(sizeof(*s), GFP_KERNEL);
1700
1701	if (!s)
1702		goto out;
1703
1704	rc = seq_open(file, &ip_vs_info_seq_ops);
1705	if (rc)
1706		goto out_kfree;
1707
1708	seq	     = file->private_data;
1709	seq->private = s;
1710out:
1711	return rc;
1712out_kfree:
1713	kfree(s);
1714	goto out;
1715}
1716
1717static const struct file_operations ip_vs_info_fops = {
1718	.owner	 = THIS_MODULE,
1719	.open    = ip_vs_info_open,
1720	.read    = seq_read,
1721	.llseek  = seq_lseek,
1722	.release = seq_release_private,
1723};
1724
1725#endif
1726
/*
 * Global IPVS statistics.  Not static, so visible outside this file;
 * cleared and given its lock and estimator in ip_vs_control_init().
 */
1727struct ip_vs_stats ip_vs_stats;
1728
1729#ifdef CONFIG_PROC_FS
1730static int ip_vs_stats_show(struct seq_file *seq, void *v)
1731{
1732
1733/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1734	seq_puts(seq,
1735		 "   Total Incoming Outgoing         Incoming         Outgoing\n");
1736	seq_printf(seq,
1737		   "   Conns  Packets  Packets            Bytes            Bytes\n");
1738
1739	spin_lock_bh(&ip_vs_stats.lock);
1740	seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1741		   ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1742		   (unsigned long long) ip_vs_stats.inbytes,
1743		   (unsigned long long) ip_vs_stats.outbytes);
1744
1745/*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1746	seq_puts(seq,
1747		   " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1748	seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1749			ip_vs_stats.cps,
1750			ip_vs_stats.inpps,
1751			ip_vs_stats.outpps,
1752			ip_vs_stats.inbps,
1753			ip_vs_stats.outbps);
1754	spin_unlock_bh(&ip_vs_stats.lock);
1755
1756	return 0;
1757}
1758
/*
 * open handler of the statistics proc file: a single-record seq_file
 * whose content is produced by ip_vs_stats_show(), with no private data.
 * Returns the result of single_open().
 */
1759static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1760{
1761	return single_open(file, ip_vs_stats_show, NULL);
1762}
1763
1764static const struct file_operations ip_vs_stats_fops = {
1765	.owner = THIS_MODULE,
1766	.open = ip_vs_stats_seq_open,
1767	.read = seq_read,
1768	.llseek = seq_lseek,
1769	.release = single_release,
1770};
1771
1772#endif
1773
1774/*
1775 *	Set timeout values for tcp tcpfin udp in the timeout_table.
1776 */
1777static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1778{
1779	IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1780		  u->tcp_timeout,
1781		  u->tcp_fin_timeout,
1782		  u->udp_timeout);
1783
1784#ifdef CONFIG_IP_VS_PROTO_TCP
1785	if (u->tcp_timeout) {
1786		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1787			= u->tcp_timeout * HZ;
1788	}
1789
1790	if (u->tcp_fin_timeout) {
1791		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1792			= u->tcp_fin_timeout * HZ;
1793	}
1794#endif
1795
1796#ifdef CONFIG_IP_VS_PROTO_UDP
1797	if (u->udp_timeout) {
1798		ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1799			= u->udp_timeout * HZ;
1800	}
1801#endif
1802	return 0;
1803}
1804
1805
/*
 * Helpers for the set-sockopt argument table.
 * SET_CMDID turns a command number into an index relative to
 * IP_VS_BASE_CTL; its argument is parenthesized so that any expression
 * can be passed safely.  The *_ARG_LEN macros are the exact argument
 * sizes of the commands, MAX_ARG_LEN the largest of them.
 */
#define SET_CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)
#define SERVICE_ARG_LEN		(sizeof(struct ip_vs_service_user))
#define SVCDEST_ARG_LEN		(sizeof(struct ip_vs_service_user) +	\
				 sizeof(struct ip_vs_dest_user))
#define TIMEOUT_ARG_LEN		(sizeof(struct ip_vs_timeout_user))
#define DAEMON_ARG_LEN		(sizeof(struct ip_vs_daemon_user))
#define MAX_ARG_LEN		SVCDEST_ARG_LEN
1813
1814static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1815	[SET_CMDID(IP_VS_SO_SET_ADD)]		= SERVICE_ARG_LEN,
1816	[SET_CMDID(IP_VS_SO_SET_EDIT)]		= SERVICE_ARG_LEN,
1817	[SET_CMDID(IP_VS_SO_SET_DEL)]		= SERVICE_ARG_LEN,
1818	[SET_CMDID(IP_VS_SO_SET_FLUSH)]		= 0,
1819	[SET_CMDID(IP_VS_SO_SET_ADDDEST)]	= SVCDEST_ARG_LEN,
1820	[SET_CMDID(IP_VS_SO_SET_DELDEST)]	= SVCDEST_ARG_LEN,
1821	[SET_CMDID(IP_VS_SO_SET_EDITDEST)]	= SVCDEST_ARG_LEN,
1822	[SET_CMDID(IP_VS_SO_SET_TIMEOUT)]	= TIMEOUT_ARG_LEN,
1823	[SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]	= DAEMON_ARG_LEN,
1824	[SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]	= DAEMON_ARG_LEN,
1825	[SET_CMDID(IP_VS_SO_SET_ZERO)]		= SERVICE_ARG_LEN,
1826};
1827
1828static int
1829do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1830{
1831	int ret;
1832	unsigned char arg[MAX_ARG_LEN];
1833	struct ip_vs_service_user *usvc;
1834	struct ip_vs_service *svc;
1835	struct ip_vs_dest_user *udest;
1836
1837	if (!capable(CAP_NET_ADMIN))
1838		return -EPERM;
1839
1840	if (len != set_arglen[SET_CMDID(cmd)]) {
1841		IP_VS_ERR("set_ctl: len %u != %u\n",
1842			  len, set_arglen[SET_CMDID(cmd)]);
1843		return -EINVAL;
1844	}
1845
1846	if (copy_from_user(arg, user, len) != 0)
1847		return -EFAULT;
1848
1849	/* increase the module use count */
1850	ip_vs_use_count_inc();
1851
1852	if (mutex_lock_interruptible(&__ip_vs_mutex)) {
1853		ret = -ERESTARTSYS;
1854		goto out_dec;
1855	}
1856
1857	if (cmd == IP_VS_SO_SET_FLUSH) {
1858		/* Flush the virtual service */
1859		ret = ip_vs_flush();
1860		goto out_unlock;
1861	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1862		/* Set timeout values for (tcp tcpfin udp) */
1863		ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1864		goto out_unlock;
1865	} else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1866		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1867		ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1868		goto out_unlock;
1869	} else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1870		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1871		ret = stop_sync_thread(dm->state);
1872		goto out_unlock;
1873	}
1874
1875	usvc = (struct ip_vs_service_user *)arg;
1876	udest = (struct ip_vs_dest_user *)(usvc + 1);
1877
1878	if (cmd == IP_VS_SO_SET_ZERO) {
1879		/* if no service address is set, zero counters in all */
1880		if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1881			ret = ip_vs_zero_all();
1882			goto out_unlock;
1883		}
1884	}
1885
1886	/* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1887	if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1888		IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1889			  usvc->protocol, NIPQUAD(usvc->addr),
1890			  ntohs(usvc->port), usvc->sched_name);
1891		ret = -EFAULT;
1892		goto out_unlock;
1893	}
1894
1895	/* Lookup the exact service by <protocol, addr, port> or fwmark */
1896	if (usvc->fwmark == 0)
1897		svc = __ip_vs_service_get(usvc->protocol,
1898					  usvc->addr, usvc->port);
1899	else
1900		svc = __ip_vs_svc_fwm_get(usvc->fwmark);
1901
1902	if (cmd != IP_VS_SO_SET_ADD
1903	    && (svc == NULL || svc->protocol != usvc->protocol)) {
1904		ret = -ESRCH;
1905		goto out_unlock;
1906	}
1907
1908	switch (cmd) {
1909	case IP_VS_SO_SET_ADD:
1910		if (svc != NULL)
1911			ret = -EEXIST;
1912		else
1913			ret = ip_vs_add_service(usvc, &svc);
1914		break;
1915	case IP_VS_SO_SET_EDIT:
1916		ret = ip_vs_edit_service(svc, usvc);
1917		break;
1918	case IP_VS_SO_SET_DEL:
1919		ret = ip_vs_del_service(svc);
1920		if (!ret)
1921			goto out_unlock;
1922		break;
1923	case IP_VS_SO_SET_ZERO:
1924		ret = ip_vs_zero_service(svc);
1925		break;
1926	case IP_VS_SO_SET_ADDDEST:
1927		ret = ip_vs_add_dest(svc, udest);
1928		break;
1929	case IP_VS_SO_SET_EDITDEST:
1930		ret = ip_vs_edit_dest(svc, udest);
1931		break;
1932	case IP_VS_SO_SET_DELDEST:
1933		ret = ip_vs_del_dest(svc, udest);
1934		break;
1935	default:
1936		ret = -EINVAL;
1937	}
1938
1939	if (svc)
1940		ip_vs_service_put(svc);
1941
1942  out_unlock:
1943	mutex_unlock(&__ip_vs_mutex);
1944  out_dec:
1945	/* decrease the module use count */
1946	ip_vs_use_count_dec();
1947
1948	return ret;
1949}
1950
1951
1952static void
1953ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
1954{
1955	spin_lock_bh(&src->lock);
1956	memcpy(dst, src, (char*)&src->lock - (char*)src);
1957	spin_unlock_bh(&src->lock);
1958}
1959
1960static void
1961ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
1962{
1963	dst->protocol = src->protocol;
1964	dst->addr = src->addr;
1965	dst->port = src->port;
1966	dst->fwmark = src->fwmark;
1967	strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
1968	dst->flags = src->flags;
1969	dst->timeout = src->timeout / HZ;
1970	dst->netmask = src->netmask;
1971	dst->num_dests = src->num_dests;
1972	ip_vs_copy_stats(&dst->stats, &src->stats);
1973}
1974
1975static inline int
1976__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
1977			    struct ip_vs_get_services __user *uptr)
1978{
1979	int idx, count=0;
1980	struct ip_vs_service *svc;
1981	struct ip_vs_service_entry entry;
1982	int ret = 0;
1983
1984	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1985		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1986			if (count >= get->num_services)
1987				goto out;
1988			memset(&entry, 0, sizeof(entry));
1989			ip_vs_copy_service(&entry, svc);
1990			if (copy_to_user(&uptr->entrytable[count],
1991					 &entry, sizeof(entry))) {
1992				ret = -EFAULT;
1993				goto out;
1994			}
1995			count++;
1996		}
1997	}
1998
1999	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2000		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2001			if (count >= get->num_services)
2002				goto out;
2003			memset(&entry, 0, sizeof(entry));
2004			ip_vs_copy_service(&entry, svc);
2005			if (copy_to_user(&uptr->entrytable[count],
2006					 &entry, sizeof(entry))) {
2007				ret = -EFAULT;
2008				goto out;
2009			}
2010			count++;
2011		}
2012	}
2013  out:
2014	return ret;
2015}
2016
2017static inline int
2018__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2019			 struct ip_vs_get_dests __user *uptr)
2020{
2021	struct ip_vs_service *svc;
2022	int ret = 0;
2023
2024	if (get->fwmark)
2025		svc = __ip_vs_svc_fwm_get(get->fwmark);
2026	else
2027		svc = __ip_vs_service_get(get->protocol,
2028					  get->addr, get->port);
2029	if (svc) {
2030		int count = 0;
2031		struct ip_vs_dest *dest;
2032		struct ip_vs_dest_entry entry;
2033
2034		list_for_each_entry(dest, &svc->destinations, n_list) {
2035			if (count >= get->num_dests)
2036				break;
2037
2038			entry.addr = dest->addr;
2039			entry.port = dest->port;
2040			entry.conn_flags = atomic_read(&dest->conn_flags);
2041			entry.weight = atomic_read(&dest->weight);
2042			entry.u_threshold = dest->u_threshold;
2043			entry.l_threshold = dest->l_threshold;
2044			entry.activeconns = atomic_read(&dest->activeconns);
2045			entry.inactconns = atomic_read(&dest->inactconns);
2046			entry.persistconns = atomic_read(&dest->persistconns);
2047			ip_vs_copy_stats(&entry.stats, &dest->stats);
2048			if (copy_to_user(&uptr->entrytable[count],
2049					 &entry, sizeof(entry))) {
2050				ret = -EFAULT;
2051				break;
2052			}
2053			count++;
2054		}
2055		ip_vs_service_put(svc);
2056	} else
2057		ret = -ESRCH;
2058	return ret;
2059}
2060
2061static inline void
2062__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2063{
2064#ifdef CONFIG_IP_VS_PROTO_TCP
2065	u->tcp_timeout =
2066		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2067	u->tcp_fin_timeout =
2068		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2069#endif
2070#ifdef CONFIG_IP_VS_PROTO_UDP
2071	u->udp_timeout =
2072		ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2073#endif
2074}
2075
2076
/*
 * Helpers for the get-sockopt argument table.
 * GET_CMDID turns a command number into an index relative to
 * IP_VS_BASE_CTL; its argument is parenthesized so that any expression
 * can be passed safely.  The *_ARG_LEN macros are the minimum argument
 * sizes of the commands.
 */
#define GET_CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)
#define GET_INFO_ARG_LEN	(sizeof(struct ip_vs_getinfo))
#define GET_SERVICES_ARG_LEN	(sizeof(struct ip_vs_get_services))
#define GET_SERVICE_ARG_LEN	(sizeof(struct ip_vs_service_entry))
#define GET_DESTS_ARG_LEN	(sizeof(struct ip_vs_get_dests))
#define GET_TIMEOUT_ARG_LEN	(sizeof(struct ip_vs_timeout_user))
#define GET_DAEMON_ARG_LEN	(sizeof(struct ip_vs_daemon_user) * 2)
2084
2085static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2086	[GET_CMDID(IP_VS_SO_GET_VERSION)]	= 64,
2087	[GET_CMDID(IP_VS_SO_GET_INFO)]		= GET_INFO_ARG_LEN,
2088	[GET_CMDID(IP_VS_SO_GET_SERVICES)]	= GET_SERVICES_ARG_LEN,
2089	[GET_CMDID(IP_VS_SO_GET_SERVICE)]	= GET_SERVICE_ARG_LEN,
2090	[GET_CMDID(IP_VS_SO_GET_DESTS)]		= GET_DESTS_ARG_LEN,
2091	[GET_CMDID(IP_VS_SO_GET_TIMEOUT)]	= GET_TIMEOUT_ARG_LEN,
2092	[GET_CMDID(IP_VS_SO_GET_DAEMON)]	= GET_DAEMON_ARG_LEN,
2093};
2094
2095static int
2096do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2097{
2098	unsigned char arg[128];
2099	int ret = 0;
2100
2101	if (!capable(CAP_NET_ADMIN))
2102		return -EPERM;
2103
2104	if (*len < get_arglen[GET_CMDID(cmd)]) {
2105		IP_VS_ERR("get_ctl: len %u < %u\n",
2106			  *len, get_arglen[GET_CMDID(cmd)]);
2107		return -EINVAL;
2108	}
2109
2110	if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2111		return -EFAULT;
2112
2113	if (mutex_lock_interruptible(&__ip_vs_mutex))
2114		return -ERESTARTSYS;
2115
2116	switch (cmd) {
2117	case IP_VS_SO_GET_VERSION:
2118	{
2119		char buf[64];
2120
2121		sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2122			NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2123		if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2124			ret = -EFAULT;
2125			goto out;
2126		}
2127		*len = strlen(buf)+1;
2128	}
2129	break;
2130
2131	case IP_VS_SO_GET_INFO:
2132	{
2133		struct ip_vs_getinfo info;
2134		info.version = IP_VS_VERSION_CODE;
2135		info.size = IP_VS_CONN_TAB_SIZE;
2136		info.num_services = ip_vs_num_services;
2137		if (copy_to_user(user, &info, sizeof(info)) != 0)
2138			ret = -EFAULT;
2139	}
2140	break;
2141
2142	case IP_VS_SO_GET_SERVICES:
2143	{
2144		struct ip_vs_get_services *get;
2145		int size;
2146
2147		get = (struct ip_vs_get_services *)arg;
2148		size = sizeof(*get) +
2149			sizeof(struct ip_vs_service_entry) * get->num_services;
2150		if (*len != size) {
2151			IP_VS_ERR("length: %u != %u\n", *len, size);
2152			ret = -EINVAL;
2153			goto out;
2154		}
2155		ret = __ip_vs_get_service_entries(get, user);
2156	}
2157	break;
2158
2159	case IP_VS_SO_GET_SERVICE:
2160	{
2161		struct ip_vs_service_entry *entry;
2162		struct ip_vs_service *svc;
2163
2164		entry = (struct ip_vs_service_entry *)arg;
2165		if (entry->fwmark)
2166			svc = __ip_vs_svc_fwm_get(entry->fwmark);
2167		else
2168			svc = __ip_vs_service_get(entry->protocol,
2169						  entry->addr, entry->port);
2170		if (svc) {
2171			ip_vs_copy_service(entry, svc);
2172			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2173				ret = -EFAULT;
2174			ip_vs_service_put(svc);
2175		} else
2176			ret = -ESRCH;
2177	}
2178	break;
2179
2180	case IP_VS_SO_GET_DESTS:
2181	{
2182		struct ip_vs_get_dests *get;
2183		int size;
2184
2185		get = (struct ip_vs_get_dests *)arg;
2186		size = sizeof(*get) +
2187			sizeof(struct ip_vs_dest_entry) * get->num_dests;
2188		if (*len != size) {
2189			IP_VS_ERR("length: %u != %u\n", *len, size);
2190			ret = -EINVAL;
2191			goto out;
2192		}
2193		ret = __ip_vs_get_dest_entries(get, user);
2194	}
2195	break;
2196
2197	case IP_VS_SO_GET_TIMEOUT:
2198	{
2199		struct ip_vs_timeout_user t;
2200
2201		__ip_vs_get_timeouts(&t);
2202		if (copy_to_user(user, &t, sizeof(t)) != 0)
2203			ret = -EFAULT;
2204	}
2205	break;
2206
2207	case IP_VS_SO_GET_DAEMON:
2208	{
2209		struct ip_vs_daemon_user d[2];
2210
2211		memset(&d, 0, sizeof(d));
2212		if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2213			d[0].state = IP_VS_STATE_MASTER;
2214			strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2215			d[0].syncid = ip_vs_master_syncid;
2216		}
2217		if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2218			d[1].state = IP_VS_STATE_BACKUP;
2219			strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2220			d[1].syncid = ip_vs_backup_syncid;
2221		}
2222		if (copy_to_user(user, &d, sizeof(d)) != 0)
2223			ret = -EFAULT;
2224	}
2225	break;
2226
2227	default:
2228		ret = -EINVAL;
2229	}
2230
2231  out:
2232	mutex_unlock(&__ip_vs_mutex);
2233	return ret;
2234}
2235
2236
2237static struct nf_sockopt_ops ip_vs_sockopts = {
2238	.pf		= PF_INET,
2239	.set_optmin	= IP_VS_BASE_CTL,
2240	.set_optmax	= IP_VS_SO_SET_MAX+1,
2241	.set		= do_ip_vs_set_ctl,
2242	.get_optmin	= IP_VS_BASE_CTL,
2243	.get_optmax	= IP_VS_SO_GET_MAX+1,
2244	.get		= do_ip_vs_get_ctl,
2245};
2246
2247
2248int ip_vs_control_init(void)
2249{
2250	int ret;
2251	int idx;
2252
2253	EnterFunction(2);
2254
2255	ret = nf_register_sockopt(&ip_vs_sockopts);
2256	if (ret) {
2257		IP_VS_ERR("cannot register sockopt.\n");
2258		return ret;
2259	}
2260
2261	proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
2262	proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
2263
2264	sysctl_header = register_sysctl_table(vs_root_table);
2265
2266	/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2267	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
2268		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2269		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2270	}
2271	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
2272		INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2273	}
2274
2275	memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
2276	spin_lock_init(&ip_vs_stats.lock);
2277	ip_vs_new_estimator(&ip_vs_stats);
2278
2279	/* Hook the defense timer */
2280	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2281
2282	LeaveFunction(2);
2283	return 0;
2284}
2285
2286
/*
 *	Undo ip_vs_control_init(): clean up the trash, stop the defense
 *	work, kill the global estimator and remove the sysctl tree, the
 *	proc files and the sockopt interface again.
 */
2287void ip_vs_control_cleanup(void)
2288{
2289	EnterFunction(2);
2290	ip_vs_trash_cleanup();
	/* stop the periodic defense work scheduled by ip_vs_control_init() */
2291	cancel_rearming_delayed_work(&defense_work);
2292	cancel_work_sync(&defense_work.work);
2293	ip_vs_kill_estimator(&ip_vs_stats);
	/* remove the user-visible interfaces, latest registered first */
2294	unregister_sysctl_table(sysctl_header);
2295	proc_net_remove("ip_vs_stats");
2296	proc_net_remove("ip_vs");
2297	nf_unregister_sockopt(&ip_vs_sockopts);
2298	LeaveFunction(2);
2299}
2300