1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * IPVS         An implementation of the IP virtual server support for the
4 *              LINUX operating system.  IPVS is now implemented as a module
5 *              over the NetFilter framework. IPVS can be used to build a
6 *              high-performance and highly available server based on a
7 *              cluster of servers.
8 *
9 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
10 *              Peter Kese <peter.kese@ijs.si>
11 *              Julian Anastasov <ja@ssi.bg>
12 *
13 * Changes:
14 */
15
16#define KMSG_COMPONENT "IPVS"
17#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
18
19#include <linux/module.h>
20#include <linux/init.h>
21#include <linux/types.h>
22#include <linux/capability.h>
23#include <linux/fs.h>
24#include <linux/sysctl.h>
25#include <linux/proc_fs.h>
26#include <linux/workqueue.h>
27#include <linux/seq_file.h>
28#include <linux/slab.h>
29
30#include <linux/netfilter.h>
31#include <linux/netfilter_ipv4.h>
32#include <linux/mutex.h>
33
34#include <net/net_namespace.h>
35#include <linux/nsproxy.h>
36#include <net/ip.h>
37#ifdef CONFIG_IP_VS_IPV6
38#include <net/ipv6.h>
39#include <net/ip6_route.h>
40#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
41#endif
42#include <net/route.h>
43#include <net/sock.h>
44#include <net/genetlink.h>
45
46#include <linux/uaccess.h>
47
48#include <net/ip_vs.h>
49
/* Module alias for the IPVS generic netlink family name */
MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME);

/* Serialize configuration with sockopt/netlink */
DEFINE_MUTEX(__ip_vs_mutex);
53
54/* sysctl variables */
55
56#ifdef CONFIG_IP_VS_DEBUG
/* Debug verbosity; statics are zero-initialized, so it starts at 0 */
static int sysctl_ip_vs_debug_level;

/* Return the current value of the IPVS debug level variable */
int ip_vs_get_debug_level(void)
{
	return sysctl_ip_vs_debug_level;
}
63#endif
64
65
/* Forward declarations */
static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
68
69
70#ifdef CONFIG_IP_VS_IPV6
71/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
72static bool __ip_vs_addr_is_local_v6(struct net *net,
73				     const struct in6_addr *addr)
74{
75	struct flowi6 fl6 = {
76		.daddr = *addr,
77	};
78	struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
79	bool is_local;
80
81	is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
82
83	dst_release(dst);
84	return is_local;
85}
86#endif
87
88#ifdef CONFIG_SYSCTL
89/*
90 *	update_defense_level is called from keventd and from sysctl,
91 *	so it needs to protect itself from softirqs
92 */
93static void update_defense_level(struct netns_ipvs *ipvs)
94{
95	struct sysinfo i;
96	int availmem;
97	int nomem;
98	int to_change = -1;
99
100	/* we only count free and buffered memory (in pages) */
101	si_meminfo(&i);
102	availmem = i.freeram + i.bufferram;
103	/* however in linux 2.5 the i.bufferram is total page cache size,
104	   we need adjust it */
105	/* si_swapinfo(&i); */
106	/* availmem = availmem - (i.totalswap - i.freeswap); */
107
108	nomem = (availmem < ipvs->sysctl_amemthresh);
109
110	local_bh_disable();
111
112	/* drop_entry */
113	spin_lock(&ipvs->dropentry_lock);
114	switch (ipvs->sysctl_drop_entry) {
115	case 0:
116		atomic_set(&ipvs->dropentry, 0);
117		break;
118	case 1:
119		if (nomem) {
120			atomic_set(&ipvs->dropentry, 1);
121			ipvs->sysctl_drop_entry = 2;
122		} else {
123			atomic_set(&ipvs->dropentry, 0);
124		}
125		break;
126	case 2:
127		if (nomem) {
128			atomic_set(&ipvs->dropentry, 1);
129		} else {
130			atomic_set(&ipvs->dropentry, 0);
131			ipvs->sysctl_drop_entry = 1;
132		}
133		break;
134	case 3:
135		atomic_set(&ipvs->dropentry, 1);
136		break;
137	}
138	spin_unlock(&ipvs->dropentry_lock);
139
140	/* drop_packet */
141	spin_lock(&ipvs->droppacket_lock);
142	switch (ipvs->sysctl_drop_packet) {
143	case 0:
144		ipvs->drop_rate = 0;
145		break;
146	case 1:
147		if (nomem) {
148			ipvs->drop_rate = ipvs->drop_counter
149				= ipvs->sysctl_amemthresh /
150				(ipvs->sysctl_amemthresh-availmem);
151			ipvs->sysctl_drop_packet = 2;
152		} else {
153			ipvs->drop_rate = 0;
154		}
155		break;
156	case 2:
157		if (nomem) {
158			ipvs->drop_rate = ipvs->drop_counter
159				= ipvs->sysctl_amemthresh /
160				(ipvs->sysctl_amemthresh-availmem);
161		} else {
162			ipvs->drop_rate = 0;
163			ipvs->sysctl_drop_packet = 1;
164		}
165		break;
166	case 3:
167		ipvs->drop_rate = ipvs->sysctl_am_droprate;
168		break;
169	}
170	spin_unlock(&ipvs->droppacket_lock);
171
172	/* secure_tcp */
173	spin_lock(&ipvs->securetcp_lock);
174	switch (ipvs->sysctl_secure_tcp) {
175	case 0:
176		if (ipvs->old_secure_tcp >= 2)
177			to_change = 0;
178		break;
179	case 1:
180		if (nomem) {
181			if (ipvs->old_secure_tcp < 2)
182				to_change = 1;
183			ipvs->sysctl_secure_tcp = 2;
184		} else {
185			if (ipvs->old_secure_tcp >= 2)
186				to_change = 0;
187		}
188		break;
189	case 2:
190		if (nomem) {
191			if (ipvs->old_secure_tcp < 2)
192				to_change = 1;
193		} else {
194			if (ipvs->old_secure_tcp >= 2)
195				to_change = 0;
196			ipvs->sysctl_secure_tcp = 1;
197		}
198		break;
199	case 3:
200		if (ipvs->old_secure_tcp < 2)
201			to_change = 1;
202		break;
203	}
204	ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp;
205	if (to_change >= 0)
206		ip_vs_protocol_timeout_change(ipvs,
207					      ipvs->sysctl_secure_tcp > 1);
208	spin_unlock(&ipvs->securetcp_lock);
209
210	local_bh_enable();
211}
212
213/* Handler for delayed work for expiring no
214 * destination connections
215 */
216static void expire_nodest_conn_handler(struct work_struct *work)
217{
218	struct netns_ipvs *ipvs;
219
220	ipvs = container_of(work, struct netns_ipvs,
221			    expire_nodest_conn_work.work);
222	ip_vs_expire_nodest_conn_flush(ipvs);
223}
224
225/*
226 *	Timer for checking the defense
227 */
228#define DEFENSE_TIMER_PERIOD	1*HZ
229
230static void defense_work_handler(struct work_struct *work)
231{
232	struct netns_ipvs *ipvs =
233		container_of(work, struct netns_ipvs, defense_work.work);
234
235	update_defense_level(ipvs);
236	if (atomic_read(&ipvs->dropentry))
237		ip_vs_random_dropentry(ipvs);
238	queue_delayed_work(system_long_wq, &ipvs->defense_work,
239			   DEFENSE_TIMER_PERIOD);
240}
241#endif
242
243static void est_reload_work_handler(struct work_struct *work)
244{
245	struct netns_ipvs *ipvs =
246		container_of(work, struct netns_ipvs, est_reload_work.work);
247	int genid_done = atomic_read(&ipvs->est_genid_done);
248	unsigned long delay = HZ / 10;	/* repeat startups after failure */
249	bool repeat = false;
250	int genid;
251	int id;
252
253	mutex_lock(&ipvs->est_mutex);
254	genid = atomic_read(&ipvs->est_genid);
255	for (id = 0; id < ipvs->est_kt_count; id++) {
256		struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];
257
258		/* netns clean up started, abort delayed work */
259		if (!ipvs->enable)
260			goto unlock;
261		if (!kd)
262			continue;
263		/* New config ? Stop kthread tasks */
264		if (genid != genid_done)
265			ip_vs_est_kthread_stop(kd);
266		if (!kd->task && !ip_vs_est_stopped(ipvs)) {
267			/* Do not start kthreads above 0 in calc phase */
268			if ((!id || !ipvs->est_calc_phase) &&
269			    ip_vs_est_kthread_start(ipvs, kd) < 0)
270				repeat = true;
271		}
272	}
273
274	atomic_set(&ipvs->est_genid_done, genid);
275
276	if (repeat)
277		queue_delayed_work(system_long_wq, &ipvs->est_reload_work,
278				   delay);
279
280unlock:
281	mutex_unlock(&ipvs->est_mutex);
282}
283
284int
285ip_vs_use_count_inc(void)
286{
287	return try_module_get(THIS_MODULE);
288}
289
290void
291ip_vs_use_count_dec(void)
292{
293	module_put(THIS_MODULE);
294}
295
296
/*
 *	Hash tables for virtual service lookups.
 *	Both tables have IP_VS_SVC_TAB_SIZE buckets; hash values are
 *	reduced with IP_VS_SVC_TAB_MASK.
 */
#define IP_VS_SVC_TAB_BITS 8
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)

/* the service table hashed by <netns, protocol, addr, port> */
static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by <netns, fwmark> */
static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
308
309
310/*
311 *	Returns hash value for virtual service
312 */
313static inline unsigned int
314ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto,
315		  const union nf_inet_addr *addr, __be16 port)
316{
317	unsigned int porth = ntohs(port);
318	__be32 addr_fold = addr->ip;
319	__u32 ahash;
320
321#ifdef CONFIG_IP_VS_IPV6
322	if (af == AF_INET6)
323		addr_fold = addr->ip6[0]^addr->ip6[1]^
324			    addr->ip6[2]^addr->ip6[3];
325#endif
326	ahash = ntohl(addr_fold);
327	ahash ^= ((size_t) ipvs >> 8);
328
329	return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
330	       IP_VS_SVC_TAB_MASK;
331}
332
333/*
334 *	Returns hash value of fwmark for virtual service lookup
335 */
336static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark)
337{
338	return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
339}
340
341/*
342 *	Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
343 *	or in the ip_vs_svc_fwm_table by fwmark.
344 *	Should be called with locked tables.
345 */
346static int ip_vs_svc_hash(struct ip_vs_service *svc)
347{
348	unsigned int hash;
349
350	if (svc->flags & IP_VS_SVC_F_HASHED) {
351		pr_err("%s(): request for already hashed, called from %pS\n",
352		       __func__, __builtin_return_address(0));
353		return 0;
354	}
355
356	if (svc->fwmark == 0) {
357		/*
358		 *  Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
359		 */
360		hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol,
361					 &svc->addr, svc->port);
362		hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]);
363	} else {
364		/*
365		 *  Hash it by fwmark in svc_fwm_table
366		 */
367		hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark);
368		hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
369	}
370
371	svc->flags |= IP_VS_SVC_F_HASHED;
372	/* increase its refcnt because it is referenced by the svc table */
373	atomic_inc(&svc->refcnt);
374	return 1;
375}
376
377
378/*
379 *	Unhashes a service from svc_table / svc_fwm_table.
380 *	Should be called with locked tables.
381 */
382static int ip_vs_svc_unhash(struct ip_vs_service *svc)
383{
384	if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
385		pr_err("%s(): request for unhash flagged, called from %pS\n",
386		       __func__, __builtin_return_address(0));
387		return 0;
388	}
389
390	if (svc->fwmark == 0) {
391		/* Remove it from the svc_table table */
392		hlist_del_rcu(&svc->s_list);
393	} else {
394		/* Remove it from the svc_fwm_table table */
395		hlist_del_rcu(&svc->f_list);
396	}
397
398	svc->flags &= ~IP_VS_SVC_F_HASHED;
399	atomic_dec(&svc->refcnt);
400	return 1;
401}
402
403
404/*
405 *	Get service by {netns, proto,addr,port} in the service table.
406 */
407static inline struct ip_vs_service *
408__ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol,
409		     const union nf_inet_addr *vaddr, __be16 vport)
410{
411	unsigned int hash;
412	struct ip_vs_service *svc;
413
414	/* Check for "full" addressed entries */
415	hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport);
416
417	hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) {
418		if ((svc->af == af)
419		    && ip_vs_addr_equal(af, &svc->addr, vaddr)
420		    && (svc->port == vport)
421		    && (svc->protocol == protocol)
422		    && (svc->ipvs == ipvs)) {
423			/* HIT */
424			return svc;
425		}
426	}
427
428	return NULL;
429}
430
431
432/*
433 *	Get service by {fwmark} in the service table.
434 */
435static inline struct ip_vs_service *
436__ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark)
437{
438	unsigned int hash;
439	struct ip_vs_service *svc;
440
441	/* Check for fwmark addressed entries */
442	hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark);
443
444	hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) {
445		if (svc->fwmark == fwmark && svc->af == af
446		    && (svc->ipvs == ipvs)) {
447			/* HIT */
448			return svc;
449		}
450	}
451
452	return NULL;
453}
454
455/* Find service, called under RCU lock */
456struct ip_vs_service *
457ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol,
458		   const union nf_inet_addr *vaddr, __be16 vport)
459{
460	struct ip_vs_service *svc;
461
462	/*
463	 *	Check the table hashed by fwmark first
464	 */
465	if (fwmark) {
466		svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark);
467		if (svc)
468			goto out;
469	}
470
471	/*
472	 *	Check the table hashed by <protocol,addr,port>
473	 *	for "full" addressed entries
474	 */
475	svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);
476
477	if (!svc && protocol == IPPROTO_TCP &&
478	    atomic_read(&ipvs->ftpsvc_counter) &&
479	    (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) {
480		/*
481		 * Check if ftp service entry exists, the packet
482		 * might belong to FTP data connections.
483		 */
484		svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT);
485	}
486
487	if (svc == NULL
488	    && atomic_read(&ipvs->nullsvc_counter)) {
489		/*
490		 * Check if the catch-all port (port zero) exists
491		 */
492		svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0);
493	}
494
495  out:
496	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
497		      fwmark, ip_vs_proto_name(protocol),
498		      IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
499		      svc ? "hit" : "not hit");
500
501	return svc;
502}
503
504
505static inline void
506__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
507{
508	atomic_inc(&svc->refcnt);
509	rcu_assign_pointer(dest->svc, svc);
510}
511
512static void ip_vs_service_free(struct ip_vs_service *svc)
513{
514	ip_vs_stats_release(&svc->stats);
515	kfree(svc);
516}
517
518static void ip_vs_service_rcu_free(struct rcu_head *head)
519{
520	struct ip_vs_service *svc;
521
522	svc = container_of(head, struct ip_vs_service, rcu_head);
523	ip_vs_service_free(svc);
524}
525
526static void __ip_vs_svc_put(struct ip_vs_service *svc)
527{
528	if (atomic_dec_and_test(&svc->refcnt)) {
529		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
530			      svc->fwmark,
531			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
532			      ntohs(svc->port));
533		call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
534	}
535}
536
537
538/*
539 *	Returns hash value for real service
540 */
541static inline unsigned int ip_vs_rs_hashkey(int af,
542					    const union nf_inet_addr *addr,
543					    __be16 port)
544{
545	unsigned int porth = ntohs(port);
546	__be32 addr_fold = addr->ip;
547
548#ifdef CONFIG_IP_VS_IPV6
549	if (af == AF_INET6)
550		addr_fold = addr->ip6[0]^addr->ip6[1]^
551			    addr->ip6[2]^addr->ip6[3];
552#endif
553
554	return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
555		& IP_VS_RTAB_MASK;
556}
557
558/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
559static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
560{
561	unsigned int hash;
562	__be16 port;
563
564	if (dest->in_rs_table)
565		return;
566
567	switch (IP_VS_DFWD_METHOD(dest)) {
568	case IP_VS_CONN_F_MASQ:
569		port = dest->port;
570		break;
571	case IP_VS_CONN_F_TUNNEL:
572		switch (dest->tun_type) {
573		case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
574			port = dest->tun_port;
575			break;
576		case IP_VS_CONN_F_TUNNEL_TYPE_IPIP:
577		case IP_VS_CONN_F_TUNNEL_TYPE_GRE:
578			port = 0;
579			break;
580		default:
581			return;
582		}
583		break;
584	default:
585		return;
586	}
587
588	/*
589	 *	Hash by proto,addr,port,
590	 *	which are the parameters of the real service.
591	 */
592	hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port);
593
594	hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
595	dest->in_rs_table = 1;
596}
597
598/* Unhash ip_vs_dest from rs_table. */
599static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
600{
601	/*
602	 * Remove it from the rs_table table.
603	 */
604	if (dest->in_rs_table) {
605		hlist_del_rcu(&dest->d_list);
606		dest->in_rs_table = 0;
607	}
608}
609
610/* Check if real service by <proto,addr,port> is present */
611bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
612			    const union nf_inet_addr *daddr, __be16 dport)
613{
614	unsigned int hash;
615	struct ip_vs_dest *dest;
616
617	/* Check for "full" addressed entries */
618	hash = ip_vs_rs_hashkey(af, daddr, dport);
619
620	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
621		if (dest->port == dport &&
622		    dest->af == af &&
623		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
624		    (dest->protocol == protocol || dest->vfwmark) &&
625		    IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
626			/* HIT */
627			return true;
628		}
629	}
630
631	return false;
632}
633
634/* Find real service record by <proto,addr,port>.
635 * In case of multiple records with the same <proto,addr,port>, only
636 * the first found record is returned.
637 *
638 * To be called under RCU lock.
639 */
640struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
641					   __u16 protocol,
642					   const union nf_inet_addr *daddr,
643					   __be16 dport)
644{
645	unsigned int hash;
646	struct ip_vs_dest *dest;
647
648	/* Check for "full" addressed entries */
649	hash = ip_vs_rs_hashkey(af, daddr, dport);
650
651	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
652		if (dest->port == dport &&
653		    dest->af == af &&
654		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
655		    (dest->protocol == protocol || dest->vfwmark) &&
656		    IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
657			/* HIT */
658			return dest;
659		}
660	}
661
662	return NULL;
663}
664
665/* Find real service record by <af,addr,tun_port>.
666 * In case of multiple records with the same <af,addr,tun_port>, only
667 * the first found record is returned.
668 *
669 * To be called under RCU lock.
670 */
671struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af,
672				     const union nf_inet_addr *daddr,
673				     __be16 tun_port)
674{
675	struct ip_vs_dest *dest;
676	unsigned int hash;
677
678	/* Check for "full" addressed entries */
679	hash = ip_vs_rs_hashkey(af, daddr, tun_port);
680
681	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
682		if (dest->tun_port == tun_port &&
683		    dest->af == af &&
684		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
685		    IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) {
686			/* HIT */
687			return dest;
688		}
689	}
690
691	return NULL;
692}
693
694/* Lookup destination by {addr,port} in the given service
695 * Called under RCU lock.
696 */
697static struct ip_vs_dest *
698ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af,
699		  const union nf_inet_addr *daddr, __be16 dport)
700{
701	struct ip_vs_dest *dest;
702
703	/*
704	 * Find the destination for the given service
705	 */
706	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
707		if ((dest->af == dest_af) &&
708		    ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
709		    (dest->port == dport)) {
710			/* HIT */
711			return dest;
712		}
713	}
714
715	return NULL;
716}
717
718/*
719 * Find destination by {daddr,dport,vaddr,protocol}
720 * Created to be used in ip_vs_process_message() in
721 * the backup synchronization daemon. It finds the
722 * destination to be bound to the received connection
723 * on the backup.
724 * Called under RCU lock, no refcnt is returned.
725 */
726struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af,
727				   const union nf_inet_addr *daddr,
728				   __be16 dport,
729				   const union nf_inet_addr *vaddr,
730				   __be16 vport, __u16 protocol, __u32 fwmark,
731				   __u32 flags)
732{
733	struct ip_vs_dest *dest;
734	struct ip_vs_service *svc;
735	__be16 port = dport;
736
737	svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport);
738	if (!svc)
739		return NULL;
740	if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
741		port = 0;
742	dest = ip_vs_lookup_dest(svc, dest_af, daddr, port);
743	if (!dest)
744		dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport);
745	return dest;
746}
747
748void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
749{
750	struct ip_vs_dest_dst *dest_dst = container_of(head,
751						       struct ip_vs_dest_dst,
752						       rcu_head);
753
754	dst_release(dest_dst->dst_cache);
755	kfree(dest_dst);
756}
757
758/* Release dest_dst and dst_cache for dest in user context */
759static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
760{
761	struct ip_vs_dest_dst *old;
762
763	old = rcu_dereference_protected(dest->dest_dst, 1);
764	if (old) {
765		RCU_INIT_POINTER(dest->dest_dst, NULL);
766		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
767	}
768}
769
770/*
771 *  Lookup dest by {svc,addr,port} in the destination trash.
772 *  The destination trash is used to hold the destinations that are removed
773 *  from the service table but are still referenced by some conn entries.
774 *  The reason to add the destination trash is when the dest is temporary
775 *  down (either by administrator or by monitor program), the dest can be
776 *  picked back from the trash, the remaining connections to the dest can
777 *  continue, and the counting information of the dest is also useful for
778 *  scheduling.
779 */
780static struct ip_vs_dest *
781ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
782		     const union nf_inet_addr *daddr, __be16 dport)
783{
784	struct ip_vs_dest *dest;
785	struct netns_ipvs *ipvs = svc->ipvs;
786
787	/*
788	 * Find the destination in trash
789	 */
790	spin_lock_bh(&ipvs->dest_trash_lock);
791	list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
792		IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
793			      "dest->refcnt=%d\n",
794			      dest->vfwmark,
795			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
796			      ntohs(dest->port),
797			      refcount_read(&dest->refcnt));
798		if (dest->af == dest_af &&
799		    ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
800		    dest->port == dport &&
801		    dest->vfwmark == svc->fwmark &&
802		    dest->protocol == svc->protocol &&
803		    (svc->fwmark ||
804		     (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
805		      dest->vport == svc->port))) {
806			/* HIT */
807			list_del(&dest->t_list);
808			goto out;
809		}
810	}
811
812	dest = NULL;
813
814out:
815	spin_unlock_bh(&ipvs->dest_trash_lock);
816
817	return dest;
818}
819
820static void ip_vs_dest_rcu_free(struct rcu_head *head)
821{
822	struct ip_vs_dest *dest;
823
824	dest = container_of(head, struct ip_vs_dest, rcu_head);
825	ip_vs_stats_release(&dest->stats);
826	ip_vs_dest_put_and_free(dest);
827}
828
829static void ip_vs_dest_free(struct ip_vs_dest *dest)
830{
831	struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
832
833	__ip_vs_dst_cache_reset(dest);
834	__ip_vs_svc_put(svc);
835	call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free);
836}
837
838/*
839 *  Clean up all the destinations in the trash
840 *  Called by the ip_vs_control_cleanup()
841 *
842 *  When the ip_vs_control_clearup is activated by ipvs module exit,
843 *  the service tables must have been flushed and all the connections
844 *  are expired, and the refcnt of each destination in the trash must
845 *  be 1, so we simply release them here.
846 */
847static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
848{
849	struct ip_vs_dest *dest, *nxt;
850
851	del_timer_sync(&ipvs->dest_trash_timer);
852	/* No need to use dest_trash_lock */
853	list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
854		list_del(&dest->t_list);
855		ip_vs_dest_free(dest);
856	}
857}
858
859static void ip_vs_stats_rcu_free(struct rcu_head *head)
860{
861	struct ip_vs_stats_rcu *rs = container_of(head,
862						  struct ip_vs_stats_rcu,
863						  rcu_head);
864
865	ip_vs_stats_release(&rs->s);
866	kfree(rs);
867}
868
869static void
870ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
871{
872#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c
873
874	spin_lock(&src->lock);
875
876	IP_VS_SHOW_STATS_COUNTER(conns);
877	IP_VS_SHOW_STATS_COUNTER(inpkts);
878	IP_VS_SHOW_STATS_COUNTER(outpkts);
879	IP_VS_SHOW_STATS_COUNTER(inbytes);
880	IP_VS_SHOW_STATS_COUNTER(outbytes);
881
882	ip_vs_read_estimator(dst, src);
883
884	spin_unlock(&src->lock);
885}
886
887static void
888ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
889{
890	dst->conns = (u32)src->conns;
891	dst->inpkts = (u32)src->inpkts;
892	dst->outpkts = (u32)src->outpkts;
893	dst->inbytes = src->inbytes;
894	dst->outbytes = src->outbytes;
895	dst->cps = (u32)src->cps;
896	dst->inpps = (u32)src->inpps;
897	dst->outpps = (u32)src->outpps;
898	dst->inbps = (u32)src->inbps;
899	dst->outbps = (u32)src->outbps;
900}
901
902static void
903ip_vs_zero_stats(struct ip_vs_stats *stats)
904{
905	spin_lock(&stats->lock);
906
907	/* get current counters as zero point, rates are zeroed */
908
909#define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c
910
911	IP_VS_ZERO_STATS_COUNTER(conns);
912	IP_VS_ZERO_STATS_COUNTER(inpkts);
913	IP_VS_ZERO_STATS_COUNTER(outpkts);
914	IP_VS_ZERO_STATS_COUNTER(inbytes);
915	IP_VS_ZERO_STATS_COUNTER(outbytes);
916
917	ip_vs_zero_estimator(stats);
918
919	spin_unlock(&stats->lock);
920}
921
922/* Allocate fields after kzalloc */
923int ip_vs_stats_init_alloc(struct ip_vs_stats *s)
924{
925	int i;
926
927	spin_lock_init(&s->lock);
928	s->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
929	if (!s->cpustats)
930		return -ENOMEM;
931
932	for_each_possible_cpu(i) {
933		struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i);
934
935		u64_stats_init(&cs->syncp);
936	}
937	return 0;
938}
939
940struct ip_vs_stats *ip_vs_stats_alloc(void)
941{
942	struct ip_vs_stats *s = kzalloc(sizeof(*s), GFP_KERNEL);
943
944	if (s && ip_vs_stats_init_alloc(s) >= 0)
945		return s;
946	kfree(s);
947	return NULL;
948}
949
950void ip_vs_stats_release(struct ip_vs_stats *stats)
951{
952	free_percpu(stats->cpustats);
953}
954
/* Release and free @stats; a NULL pointer is accepted and ignored */
void ip_vs_stats_free(struct ip_vs_stats *stats)
{
	if (!stats)
		return;

	ip_vs_stats_release(stats);
	kfree(stats);
}
962
963/*
964 *	Update a destination in the given service
965 */
966static void
967__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
968		    struct ip_vs_dest_user_kern *udest, int add)
969{
970	struct netns_ipvs *ipvs = svc->ipvs;
971	struct ip_vs_service *old_svc;
972	struct ip_vs_scheduler *sched;
973	int conn_flags;
974
975	/* We cannot modify an address and change the address family */
976	BUG_ON(!add && udest->af != dest->af);
977
978	if (add && udest->af != svc->af)
979		ipvs->mixed_address_family_dests++;
980
981	/* keep the last_weight with latest non-0 weight */
982	if (add || udest->weight != 0)
983		atomic_set(&dest->last_weight, udest->weight);
984
985	/* set the weight and the flags */
986	atomic_set(&dest->weight, udest->weight);
987	conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
988	conn_flags |= IP_VS_CONN_F_INACTIVE;
989
990	/* Need to rehash? */
991	if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) !=
992	    IP_VS_DFWD_METHOD(dest) ||
993	    udest->tun_type != dest->tun_type ||
994	    udest->tun_port != dest->tun_port)
995		ip_vs_rs_unhash(dest);
996
997	/* set the tunnel info */
998	dest->tun_type = udest->tun_type;
999	dest->tun_port = udest->tun_port;
1000	dest->tun_flags = udest->tun_flags;
1001
1002	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
1003	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
1004		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
1005	} else {
1006		/* FTP-NAT requires conntrack for mangling */
1007		if (svc->port == FTPPORT)
1008			ip_vs_register_conntrack(svc);
1009	}
1010	atomic_set(&dest->conn_flags, conn_flags);
1011	/* Put the real service in rs_table if not present. */
1012	ip_vs_rs_hash(ipvs, dest);
1013
1014	/* bind the service */
1015	old_svc = rcu_dereference_protected(dest->svc, 1);
1016	if (!old_svc) {
1017		__ip_vs_bind_svc(dest, svc);
1018	} else {
1019		if (old_svc != svc) {
1020			ip_vs_zero_stats(&dest->stats);
1021			__ip_vs_bind_svc(dest, svc);
1022			__ip_vs_svc_put(old_svc);
1023		}
1024	}
1025
1026	/* set the dest status flags */
1027	dest->flags |= IP_VS_DEST_F_AVAILABLE;
1028
1029	if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
1030		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
1031	dest->u_threshold = udest->u_threshold;
1032	dest->l_threshold = udest->l_threshold;
1033
1034	dest->af = udest->af;
1035
1036	spin_lock_bh(&dest->dst_lock);
1037	__ip_vs_dst_cache_reset(dest);
1038	spin_unlock_bh(&dest->dst_lock);
1039
1040	if (add) {
1041		list_add_rcu(&dest->n_list, &svc->destinations);
1042		svc->num_dests++;
1043		sched = rcu_dereference_protected(svc->scheduler, 1);
1044		if (sched && sched->add_dest)
1045			sched->add_dest(svc, dest);
1046	} else {
1047		sched = rcu_dereference_protected(svc->scheduler, 1);
1048		if (sched && sched->upd_dest)
1049			sched->upd_dest(svc, dest);
1050	}
1051}
1052
1053
1054/*
1055 *	Create a destination for the given service
1056 */
1057static int
1058ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1059{
1060	struct ip_vs_dest *dest;
1061	unsigned int atype;
1062	int ret;
1063
1064#ifdef CONFIG_IP_VS_IPV6
1065	if (udest->af == AF_INET6) {
1066		atype = ipv6_addr_type(&udest->addr.in6);
1067		if ((!(atype & IPV6_ADDR_UNICAST) ||
1068			atype & IPV6_ADDR_LINKLOCAL) &&
1069			!__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6))
1070			return -EINVAL;
1071
1072		ret = nf_defrag_ipv6_enable(svc->ipvs->net);
1073		if (ret)
1074			return ret;
1075	} else
1076#endif
1077	{
1078		atype = inet_addr_type(svc->ipvs->net, udest->addr.ip);
1079		if (atype != RTN_LOCAL && atype != RTN_UNICAST)
1080			return -EINVAL;
1081	}
1082
1083	dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
1084	if (dest == NULL)
1085		return -ENOMEM;
1086
1087	ret = ip_vs_stats_init_alloc(&dest->stats);
1088	if (ret < 0)
1089		goto err_alloc;
1090
1091	ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
1092	if (ret < 0)
1093		goto err_stats;
1094
1095	dest->af = udest->af;
1096	dest->protocol = svc->protocol;
1097	dest->vaddr = svc->addr;
1098	dest->vport = svc->port;
1099	dest->vfwmark = svc->fwmark;
1100	ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr);
1101	dest->port = udest->port;
1102
1103	atomic_set(&dest->activeconns, 0);
1104	atomic_set(&dest->inactconns, 0);
1105	atomic_set(&dest->persistconns, 0);
1106	refcount_set(&dest->refcnt, 1);
1107
1108	INIT_HLIST_NODE(&dest->d_list);
1109	spin_lock_init(&dest->dst_lock);
1110	__ip_vs_update_dest(svc, dest, udest, 1);
1111
1112	return 0;
1113
1114err_stats:
1115	ip_vs_stats_release(&dest->stats);
1116
1117err_alloc:
1118	kfree(dest);
1119	return ret;
1120}
1121
1122
1123/*
1124 *	Add a destination into an existing service
1125 */
1126static int
1127ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1128{
1129	struct ip_vs_dest *dest;
1130	union nf_inet_addr daddr;
1131	__be16 dport = udest->port;
1132	int ret;
1133
1134	if (udest->weight < 0) {
1135		pr_err("%s(): server weight less than zero\n", __func__);
1136		return -ERANGE;
1137	}
1138
1139	if (udest->l_threshold > udest->u_threshold) {
1140		pr_err("%s(): lower threshold is higher than upper threshold\n",
1141			__func__);
1142		return -ERANGE;
1143	}
1144
1145	if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1146		if (udest->tun_port == 0) {
1147			pr_err("%s(): tunnel port is zero\n", __func__);
1148			return -EINVAL;
1149		}
1150	}
1151
1152	ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
1153
1154	/* We use function that requires RCU lock */
1155	rcu_read_lock();
1156	dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
1157	rcu_read_unlock();
1158
1159	if (dest != NULL) {
1160		IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
1161		return -EEXIST;
1162	}
1163
1164	/*
1165	 * Check if the dest already exists in the trash and
1166	 * is from the same service
1167	 */
1168	dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport);
1169
1170	if (dest != NULL) {
1171		IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
1172			      "dest->refcnt=%d, service %u/%s:%u\n",
1173			      IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport),
1174			      refcount_read(&dest->refcnt),
1175			      dest->vfwmark,
1176			      IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
1177			      ntohs(dest->vport));
1178
1179		ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
1180		if (ret < 0)
1181			return ret;
1182		__ip_vs_update_dest(svc, dest, udest, 1);
1183	} else {
1184		/*
1185		 * Allocate and initialize the dest structure
1186		 */
1187		ret = ip_vs_new_dest(svc, udest);
1188	}
1189
1190	return ret;
1191}
1192
1193
1194/*
1195 *	Edit a destination in the given service
1196 */
1197static int
1198ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1199{
1200	struct ip_vs_dest *dest;
1201	union nf_inet_addr daddr;
1202	__be16 dport = udest->port;
1203
1204	if (udest->weight < 0) {
1205		pr_err("%s(): server weight less than zero\n", __func__);
1206		return -ERANGE;
1207	}
1208
1209	if (udest->l_threshold > udest->u_threshold) {
1210		pr_err("%s(): lower threshold is higher than upper threshold\n",
1211			__func__);
1212		return -ERANGE;
1213	}
1214
1215	if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1216		if (udest->tun_port == 0) {
1217			pr_err("%s(): tunnel port is zero\n", __func__);
1218			return -EINVAL;
1219		}
1220	}
1221
1222	ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
1223
1224	/* We use function that requires RCU lock */
1225	rcu_read_lock();
1226	dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
1227	rcu_read_unlock();
1228
1229	if (dest == NULL) {
1230		IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
1231		return -ENOENT;
1232	}
1233
1234	__ip_vs_update_dest(svc, dest, udest, 0);
1235
1236	return 0;
1237}
1238
1239/*
1240 *	Delete a destination (must be already unlinked from the service)
1241 */
1242static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
1243			     bool cleanup)
1244{
1245	ip_vs_stop_estimator(ipvs, &dest->stats);
1246
1247	/*
1248	 *  Remove it from the d-linked list with the real services.
1249	 */
1250	ip_vs_rs_unhash(dest);
1251
1252	spin_lock_bh(&ipvs->dest_trash_lock);
1253	IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
1254		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
1255		      refcount_read(&dest->refcnt));
1256	if (list_empty(&ipvs->dest_trash) && !cleanup)
1257		mod_timer(&ipvs->dest_trash_timer,
1258			  jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
1259	/* dest lives in trash with reference */
1260	list_add(&dest->t_list, &ipvs->dest_trash);
1261	dest->idle_start = 0;
1262	spin_unlock_bh(&ipvs->dest_trash_lock);
1263
1264	/* Queue up delayed work to expire all no destination connections.
1265	 * No-op when CONFIG_SYSCTL is disabled.
1266	 */
1267	if (!cleanup)
1268		ip_vs_enqueue_expire_nodest_conns(ipvs);
1269}
1270
1271
1272/*
1273 *	Unlink a destination from the given service
1274 */
1275static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1276				struct ip_vs_dest *dest,
1277				int svcupd)
1278{
1279	dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1280
1281	/*
1282	 *  Remove it from the d-linked destination list.
1283	 */
1284	list_del_rcu(&dest->n_list);
1285	svc->num_dests--;
1286
1287	if (dest->af != svc->af)
1288		svc->ipvs->mixed_address_family_dests--;
1289
1290	if (svcupd) {
1291		struct ip_vs_scheduler *sched;
1292
1293		sched = rcu_dereference_protected(svc->scheduler, 1);
1294		if (sched && sched->del_dest)
1295			sched->del_dest(svc, dest);
1296	}
1297}
1298
1299
1300/*
1301 *	Delete a destination server in the given service
1302 */
1303static int
1304ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1305{
1306	struct ip_vs_dest *dest;
1307	__be16 dport = udest->port;
1308
1309	/* We use function that requires RCU lock */
1310	rcu_read_lock();
1311	dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport);
1312	rcu_read_unlock();
1313
1314	if (dest == NULL) {
1315		IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1316		return -ENOENT;
1317	}
1318
1319	/*
1320	 *	Unlink dest from the service
1321	 */
1322	__ip_vs_unlink_dest(svc, dest, 1);
1323
1324	/*
1325	 *	Delete the destination
1326	 */
1327	__ip_vs_del_dest(svc->ipvs, dest, false);
1328
1329	return 0;
1330}
1331
1332static void ip_vs_dest_trash_expire(struct timer_list *t)
1333{
1334	struct netns_ipvs *ipvs = from_timer(ipvs, t, dest_trash_timer);
1335	struct ip_vs_dest *dest, *next;
1336	unsigned long now = jiffies;
1337
1338	spin_lock(&ipvs->dest_trash_lock);
1339	list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
1340		if (refcount_read(&dest->refcnt) > 1)
1341			continue;
1342		if (dest->idle_start) {
1343			if (time_before(now, dest->idle_start +
1344					     IP_VS_DEST_TRASH_PERIOD))
1345				continue;
1346		} else {
1347			dest->idle_start = max(1UL, now);
1348			continue;
1349		}
1350		IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
1351			      dest->vfwmark,
1352			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
1353			      ntohs(dest->port));
1354		list_del(&dest->t_list);
1355		ip_vs_dest_free(dest);
1356	}
1357	if (!list_empty(&ipvs->dest_trash))
1358		mod_timer(&ipvs->dest_trash_timer,
1359			  jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
1360	spin_unlock(&ipvs->dest_trash_lock);
1361}
1362
1363/*
1364 *	Add a service into the service hash table
1365 */
1366static int
1367ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
1368		  struct ip_vs_service **svc_p)
1369{
1370	int ret = 0;
1371	struct ip_vs_scheduler *sched = NULL;
1372	struct ip_vs_pe *pe = NULL;
1373	struct ip_vs_service *svc = NULL;
1374	int ret_hooks = -1;
1375
1376	/* increase the module use count */
1377	if (!ip_vs_use_count_inc())
1378		return -ENOPROTOOPT;
1379
1380	/* Lookup the scheduler by 'u->sched_name' */
1381	if (strcmp(u->sched_name, "none")) {
1382		sched = ip_vs_scheduler_get(u->sched_name);
1383		if (!sched) {
1384			pr_info("Scheduler module ip_vs_%s not found\n",
1385				u->sched_name);
1386			ret = -ENOENT;
1387			goto out_err;
1388		}
1389	}
1390
1391	if (u->pe_name && *u->pe_name) {
1392		pe = ip_vs_pe_getbyname(u->pe_name);
1393		if (pe == NULL) {
1394			pr_info("persistence engine module ip_vs_pe_%s "
1395				"not found\n", u->pe_name);
1396			ret = -ENOENT;
1397			goto out_err;
1398		}
1399	}
1400
1401#ifdef CONFIG_IP_VS_IPV6
1402	if (u->af == AF_INET6) {
1403		__u32 plen = (__force __u32) u->netmask;
1404
1405		if (plen < 1 || plen > 128) {
1406			ret = -EINVAL;
1407			goto out_err;
1408		}
1409
1410		ret = nf_defrag_ipv6_enable(ipvs->net);
1411		if (ret)
1412			goto out_err;
1413	}
1414#endif
1415
1416	if ((u->af == AF_INET && !ipvs->num_services) ||
1417	    (u->af == AF_INET6 && !ipvs->num_services6)) {
1418		ret = ip_vs_register_hooks(ipvs, u->af);
1419		if (ret < 0)
1420			goto out_err;
1421		ret_hooks = ret;
1422	}
1423
1424	svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
1425	if (svc == NULL) {
1426		IP_VS_DBG(1, "%s(): no memory\n", __func__);
1427		ret = -ENOMEM;
1428		goto out_err;
1429	}
1430	ret = ip_vs_stats_init_alloc(&svc->stats);
1431	if (ret < 0)
1432		goto out_err;
1433
1434	/* I'm the first user of the service */
1435	atomic_set(&svc->refcnt, 0);
1436
1437	svc->af = u->af;
1438	svc->protocol = u->protocol;
1439	ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1440	svc->port = u->port;
1441	svc->fwmark = u->fwmark;
1442	svc->flags = u->flags & ~IP_VS_SVC_F_HASHED;
1443	svc->timeout = u->timeout * HZ;
1444	svc->netmask = u->netmask;
1445	svc->ipvs = ipvs;
1446
1447	INIT_LIST_HEAD(&svc->destinations);
1448	spin_lock_init(&svc->sched_lock);
1449
1450	/* Bind the scheduler */
1451	if (sched) {
1452		ret = ip_vs_bind_scheduler(svc, sched);
1453		if (ret)
1454			goto out_err;
1455		sched = NULL;
1456	}
1457
1458	ret = ip_vs_start_estimator(ipvs, &svc->stats);
1459	if (ret < 0)
1460		goto out_err;
1461
1462	/* Bind the ct retriever */
1463	RCU_INIT_POINTER(svc->pe, pe);
1464	pe = NULL;
1465
1466	/* Update the virtual service counters */
1467	if (svc->port == FTPPORT)
1468		atomic_inc(&ipvs->ftpsvc_counter);
1469	else if (svc->port == 0)
1470		atomic_inc(&ipvs->nullsvc_counter);
1471	if (svc->pe && svc->pe->conn_out)
1472		atomic_inc(&ipvs->conn_out_counter);
1473
1474	/* Count only IPv4 services for old get/setsockopt interface */
1475	if (svc->af == AF_INET)
1476		ipvs->num_services++;
1477	else if (svc->af == AF_INET6)
1478		ipvs->num_services6++;
1479
1480	/* Hash the service into the service table */
1481	ip_vs_svc_hash(svc);
1482
1483	*svc_p = svc;
1484
1485	if (!ipvs->enable) {
1486		/* Now there is a service - full throttle */
1487		ipvs->enable = 1;
1488
1489		/* Start estimation for first time */
1490		ip_vs_est_reload_start(ipvs);
1491	}
1492
1493	return 0;
1494
1495
1496 out_err:
1497	if (ret_hooks >= 0)
1498		ip_vs_unregister_hooks(ipvs, u->af);
1499	if (svc != NULL) {
1500		ip_vs_unbind_scheduler(svc, sched);
1501		ip_vs_service_free(svc);
1502	}
1503	ip_vs_scheduler_put(sched);
1504	ip_vs_pe_put(pe);
1505
1506	/* decrease the module use count */
1507	ip_vs_use_count_dec();
1508
1509	return ret;
1510}
1511
1512
1513/*
1514 *	Edit a service and bind it with a new scheduler
1515 */
1516static int
1517ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1518{
1519	struct ip_vs_scheduler *sched = NULL, *old_sched;
1520	struct ip_vs_pe *pe = NULL, *old_pe = NULL;
1521	int ret = 0;
1522	bool new_pe_conn_out, old_pe_conn_out;
1523
1524	/*
1525	 * Lookup the scheduler, by 'u->sched_name'
1526	 */
1527	if (strcmp(u->sched_name, "none")) {
1528		sched = ip_vs_scheduler_get(u->sched_name);
1529		if (!sched) {
1530			pr_info("Scheduler module ip_vs_%s not found\n",
1531				u->sched_name);
1532			return -ENOENT;
1533		}
1534	}
1535	old_sched = sched;
1536
1537	if (u->pe_name && *u->pe_name) {
1538		pe = ip_vs_pe_getbyname(u->pe_name);
1539		if (pe == NULL) {
1540			pr_info("persistence engine module ip_vs_pe_%s "
1541				"not found\n", u->pe_name);
1542			ret = -ENOENT;
1543			goto out;
1544		}
1545		old_pe = pe;
1546	}
1547
1548#ifdef CONFIG_IP_VS_IPV6
1549	if (u->af == AF_INET6) {
1550		__u32 plen = (__force __u32) u->netmask;
1551
1552		if (plen < 1 || plen > 128) {
1553			ret = -EINVAL;
1554			goto out;
1555		}
1556	}
1557#endif
1558
1559	old_sched = rcu_dereference_protected(svc->scheduler, 1);
1560	if (sched != old_sched) {
1561		if (old_sched) {
1562			ip_vs_unbind_scheduler(svc, old_sched);
1563			RCU_INIT_POINTER(svc->scheduler, NULL);
1564			/* Wait all svc->sched_data users */
1565			synchronize_rcu();
1566		}
1567		/* Bind the new scheduler */
1568		if (sched) {
1569			ret = ip_vs_bind_scheduler(svc, sched);
1570			if (ret) {
1571				ip_vs_scheduler_put(sched);
1572				goto out;
1573			}
1574		}
1575	}
1576
1577	/*
1578	 * Set the flags and timeout value
1579	 */
1580	svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1581	svc->timeout = u->timeout * HZ;
1582	svc->netmask = u->netmask;
1583
1584	old_pe = rcu_dereference_protected(svc->pe, 1);
1585	if (pe != old_pe) {
1586		rcu_assign_pointer(svc->pe, pe);
1587		/* check for optional methods in new pe */
1588		new_pe_conn_out = (pe && pe->conn_out) ? true : false;
1589		old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false;
1590		if (new_pe_conn_out && !old_pe_conn_out)
1591			atomic_inc(&svc->ipvs->conn_out_counter);
1592		if (old_pe_conn_out && !new_pe_conn_out)
1593			atomic_dec(&svc->ipvs->conn_out_counter);
1594	}
1595
1596out:
1597	ip_vs_scheduler_put(old_sched);
1598	ip_vs_pe_put(old_pe);
1599	return ret;
1600}
1601
1602/*
1603 *	Delete a service from the service list
1604 *	- The service must be unlinked, unlocked and not referenced!
1605 *	- We are called under _bh lock
1606 */
1607static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
1608{
1609	struct ip_vs_dest *dest, *nxt;
1610	struct ip_vs_scheduler *old_sched;
1611	struct ip_vs_pe *old_pe;
1612	struct netns_ipvs *ipvs = svc->ipvs;
1613
1614	if (svc->af == AF_INET) {
1615		ipvs->num_services--;
1616		if (!ipvs->num_services)
1617			ip_vs_unregister_hooks(ipvs, svc->af);
1618	} else if (svc->af == AF_INET6) {
1619		ipvs->num_services6--;
1620		if (!ipvs->num_services6)
1621			ip_vs_unregister_hooks(ipvs, svc->af);
1622	}
1623
1624	ip_vs_stop_estimator(svc->ipvs, &svc->stats);
1625
1626	/* Unbind scheduler */
1627	old_sched = rcu_dereference_protected(svc->scheduler, 1);
1628	ip_vs_unbind_scheduler(svc, old_sched);
1629	ip_vs_scheduler_put(old_sched);
1630
1631	/* Unbind persistence engine, keep svc->pe */
1632	old_pe = rcu_dereference_protected(svc->pe, 1);
1633	if (old_pe && old_pe->conn_out)
1634		atomic_dec(&ipvs->conn_out_counter);
1635	ip_vs_pe_put(old_pe);
1636
1637	/*
1638	 *    Unlink the whole destination list
1639	 */
1640	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1641		__ip_vs_unlink_dest(svc, dest, 0);
1642		__ip_vs_del_dest(svc->ipvs, dest, cleanup);
1643	}
1644
1645	/*
1646	 *    Update the virtual service counters
1647	 */
1648	if (svc->port == FTPPORT)
1649		atomic_dec(&ipvs->ftpsvc_counter);
1650	else if (svc->port == 0)
1651		atomic_dec(&ipvs->nullsvc_counter);
1652
1653	/*
1654	 *    Free the service if nobody refers to it
1655	 */
1656	__ip_vs_svc_put(svc);
1657
1658	/* decrease the module use count */
1659	ip_vs_use_count_dec();
1660}
1661
1662/*
1663 * Unlink a service from list and try to delete it if its refcnt reached 0
1664 */
1665static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
1666{
1667	ip_vs_unregister_conntrack(svc);
1668	/* Hold svc to avoid double release from dest_trash */
1669	atomic_inc(&svc->refcnt);
1670	/*
1671	 * Unhash it from the service table
1672	 */
1673	ip_vs_svc_unhash(svc);
1674
1675	__ip_vs_del_service(svc, cleanup);
1676}
1677
1678/*
1679 *	Delete a service from the service list
1680 */
1681static int ip_vs_del_service(struct ip_vs_service *svc)
1682{
1683	if (svc == NULL)
1684		return -EEXIST;
1685	ip_vs_unlink_service(svc, false);
1686
1687	return 0;
1688}
1689
1690
1691/*
1692 *	Flush all the virtual services
1693 */
1694static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
1695{
1696	int idx;
1697	struct ip_vs_service *svc;
1698	struct hlist_node *n;
1699
1700	/*
1701	 * Flush the service table hashed by <netns,protocol,addr,port>
1702	 */
1703	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1704		hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx],
1705					  s_list) {
1706			if (svc->ipvs == ipvs)
1707				ip_vs_unlink_service(svc, cleanup);
1708		}
1709	}
1710
1711	/*
1712	 * Flush the service table hashed by fwmark
1713	 */
1714	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1715		hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx],
1716					  f_list) {
1717			if (svc->ipvs == ipvs)
1718				ip_vs_unlink_service(svc, cleanup);
1719		}
1720	}
1721
1722	return 0;
1723}
1724
1725/*
1726 *	Delete service by {netns} in the service table.
1727 *	Called by __ip_vs_batch_cleanup()
1728 */
1729void ip_vs_service_nets_cleanup(struct list_head *net_list)
1730{
1731	struct netns_ipvs *ipvs;
1732	struct net *net;
1733
1734	/* Check for "full" addressed entries */
1735	mutex_lock(&__ip_vs_mutex);
1736	list_for_each_entry(net, net_list, exit_list) {
1737		ipvs = net_ipvs(net);
1738		ip_vs_flush(ipvs, true);
1739	}
1740	mutex_unlock(&__ip_vs_mutex);
1741}
1742
1743/* Put all references for device (dst_cache) */
1744static inline void
1745ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
1746{
1747	struct ip_vs_dest_dst *dest_dst;
1748
1749	spin_lock_bh(&dest->dst_lock);
1750	dest_dst = rcu_dereference_protected(dest->dest_dst, 1);
1751	if (dest_dst && dest_dst->dst_cache->dev == dev) {
1752		IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
1753			      dev->name,
1754			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
1755			      ntohs(dest->port),
1756			      refcount_read(&dest->refcnt));
1757		__ip_vs_dst_cache_reset(dest);
1758	}
1759	spin_unlock_bh(&dest->dst_lock);
1760
1761}
1762/* Netdev event receiver
1763 * Currently only NETDEV_DOWN is handled to release refs to cached dsts
1764 */
1765static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
1766			   void *ptr)
1767{
1768	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1769	struct net *net = dev_net(dev);
1770	struct netns_ipvs *ipvs = net_ipvs(net);
1771	struct ip_vs_service *svc;
1772	struct ip_vs_dest *dest;
1773	unsigned int idx;
1774
1775	if (event != NETDEV_DOWN || !ipvs)
1776		return NOTIFY_DONE;
1777	IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
1778	mutex_lock(&__ip_vs_mutex);
1779	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1780		hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1781			if (svc->ipvs == ipvs) {
1782				list_for_each_entry(dest, &svc->destinations,
1783						    n_list) {
1784					ip_vs_forget_dev(dest, dev);
1785				}
1786			}
1787		}
1788
1789		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1790			if (svc->ipvs == ipvs) {
1791				list_for_each_entry(dest, &svc->destinations,
1792						    n_list) {
1793					ip_vs_forget_dev(dest, dev);
1794				}
1795			}
1796
1797		}
1798	}
1799
1800	spin_lock_bh(&ipvs->dest_trash_lock);
1801	list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
1802		ip_vs_forget_dev(dest, dev);
1803	}
1804	spin_unlock_bh(&ipvs->dest_trash_lock);
1805	mutex_unlock(&__ip_vs_mutex);
1806	return NOTIFY_DONE;
1807}
1808
1809/*
1810 *	Zero counters in a service or all services
1811 */
1812static int ip_vs_zero_service(struct ip_vs_service *svc)
1813{
1814	struct ip_vs_dest *dest;
1815
1816	list_for_each_entry(dest, &svc->destinations, n_list) {
1817		ip_vs_zero_stats(&dest->stats);
1818	}
1819	ip_vs_zero_stats(&svc->stats);
1820	return 0;
1821}
1822
1823static int ip_vs_zero_all(struct netns_ipvs *ipvs)
1824{
1825	int idx;
1826	struct ip_vs_service *svc;
1827
1828	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1829		hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1830			if (svc->ipvs == ipvs)
1831				ip_vs_zero_service(svc);
1832		}
1833	}
1834
1835	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1836		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1837			if (svc->ipvs == ipvs)
1838				ip_vs_zero_service(svc);
1839		}
1840	}
1841
1842	ip_vs_zero_stats(&ipvs->tot_stats->s);
1843	return 0;
1844}
1845
1846#ifdef CONFIG_SYSCTL
1847
1848static int
1849proc_do_defense_mode(struct ctl_table *table, int write,
1850		     void *buffer, size_t *lenp, loff_t *ppos)
1851{
1852	struct netns_ipvs *ipvs = table->extra2;
1853	int *valp = table->data;
1854	int val = *valp;
1855	int rc;
1856
1857	struct ctl_table tmp = {
1858		.data = &val,
1859		.maxlen = sizeof(int),
1860		.mode = table->mode,
1861	};
1862
1863	rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
1864	if (write && (*valp != val)) {
1865		if (val < 0 || val > 3) {
1866			rc = -EINVAL;
1867		} else {
1868			*valp = val;
1869			update_defense_level(ipvs);
1870		}
1871	}
1872	return rc;
1873}
1874
1875static int
1876proc_do_sync_threshold(struct ctl_table *table, int write,
1877		       void *buffer, size_t *lenp, loff_t *ppos)
1878{
1879	struct netns_ipvs *ipvs = table->extra2;
1880	int *valp = table->data;
1881	int val[2];
1882	int rc;
1883	struct ctl_table tmp = {
1884		.data = &val,
1885		.maxlen = table->maxlen,
1886		.mode = table->mode,
1887	};
1888
1889	mutex_lock(&ipvs->sync_mutex);
1890	memcpy(val, valp, sizeof(val));
1891	rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
1892	if (write) {
1893		if (val[0] < 0 || val[1] < 0 ||
1894		    (val[0] >= val[1] && val[1]))
1895			rc = -EINVAL;
1896		else
1897			memcpy(valp, val, sizeof(val));
1898	}
1899	mutex_unlock(&ipvs->sync_mutex);
1900	return rc;
1901}
1902
1903static int
1904proc_do_sync_ports(struct ctl_table *table, int write,
1905		   void *buffer, size_t *lenp, loff_t *ppos)
1906{
1907	int *valp = table->data;
1908	int val = *valp;
1909	int rc;
1910
1911	struct ctl_table tmp = {
1912		.data = &val,
1913		.maxlen = sizeof(int),
1914		.mode = table->mode,
1915	};
1916
1917	rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
1918	if (write && (*valp != val)) {
1919		if (val < 1 || !is_power_of_2(val))
1920			rc = -EINVAL;
1921		else
1922			*valp = val;
1923	}
1924	return rc;
1925}
1926
/*
 * Parse a CPU list from @buffer and install it as the estimator cpumask
 * stored behind @table->data.
 *
 * Returns 0 on success, -ENOMEM if a cpumask cannot be allocated, or the
 * error from cpulist_parse().
 */
static int ipvs_proc_est_cpumask_set(struct ctl_table *table, void *buffer)
{
	struct netns_ipvs *ipvs = table->extra2;
	cpumask_var_t *valp = table->data;
	cpumask_var_t newmask;
	int ret;

	if (!zalloc_cpumask_var(&newmask, GFP_KERNEL))
		return -ENOMEM;

	/* Parse before taking the mutex, bad input changes nothing */
	ret = cpulist_parse(buffer, newmask);
	if (ret)
		goto out;

	mutex_lock(&ipvs->est_mutex);

	/* The stored mask is allocated on the first successful write */
	if (!ipvs->est_cpulist_valid) {
		if (!zalloc_cpumask_var(valp, GFP_KERNEL)) {
			ret = -ENOMEM;
			goto unlock;
		}
		ipvs->est_cpulist_valid = 1;
	}
	/* Keep only CPUs that are also in the writing task's cpus_mask */
	cpumask_and(newmask, newmask, &current->cpus_mask);
	cpumask_copy(*valp, newmask);
	/* est_max_threads may depend on cpulist size */
	ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
	ipvs->est_calc_phase = 1;
	ip_vs_est_reload_start(ipvs);

unlock:
	mutex_unlock(&ipvs->est_mutex);

out:
	/* newmask was only a scratch copy */
	free_cpumask_var(newmask);
	return ret;
}
1964
1965static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer,
1966				     size_t size)
1967{
1968	struct netns_ipvs *ipvs = table->extra2;
1969	cpumask_var_t *valp = table->data;
1970	struct cpumask *mask;
1971	int ret;
1972
1973	mutex_lock(&ipvs->est_mutex);
1974
1975	if (ipvs->est_cpulist_valid)
1976		mask = *valp;
1977	else
1978		mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
1979	ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));
1980
1981	mutex_unlock(&ipvs->est_mutex);
1982
1983	return ret;
1984}
1985
1986static int ipvs_proc_est_cpulist(struct ctl_table *table, int write,
1987				 void *buffer, size_t *lenp, loff_t *ppos)
1988{
1989	int ret;
1990
1991	/* Ignore both read and write(append) if *ppos not 0 */
1992	if (*ppos || !*lenp) {
1993		*lenp = 0;
1994		return 0;
1995	}
1996	if (write) {
1997		/* proc_sys_call_handler() appends terminator */
1998		ret = ipvs_proc_est_cpumask_set(table, buffer);
1999		if (ret >= 0)
2000			*ppos += *lenp;
2001	} else {
2002		/* proc_sys_call_handler() allocates 1 byte for terminator */
2003		ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1);
2004		if (ret >= 0) {
2005			*lenp = ret;
2006			*ppos += *lenp;
2007			ret = 0;
2008		}
2009	}
2010	return ret;
2011}
2012
2013static int ipvs_proc_est_nice(struct ctl_table *table, int write,
2014			      void *buffer, size_t *lenp, loff_t *ppos)
2015{
2016	struct netns_ipvs *ipvs = table->extra2;
2017	int *valp = table->data;
2018	int val = *valp;
2019	int ret;
2020
2021	struct ctl_table tmp_table = {
2022		.data = &val,
2023		.maxlen = sizeof(int),
2024		.mode = table->mode,
2025	};
2026
2027	ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
2028	if (write && ret >= 0) {
2029		if (val < MIN_NICE || val > MAX_NICE) {
2030			ret = -EINVAL;
2031		} else {
2032			mutex_lock(&ipvs->est_mutex);
2033			if (*valp != val) {
2034				*valp = val;
2035				ip_vs_est_reload_start(ipvs);
2036			}
2037			mutex_unlock(&ipvs->est_mutex);
2038		}
2039	}
2040	return ret;
2041}
2042
2043static int ipvs_proc_run_estimation(struct ctl_table *table, int write,
2044				    void *buffer, size_t *lenp, loff_t *ppos)
2045{
2046	struct netns_ipvs *ipvs = table->extra2;
2047	int *valp = table->data;
2048	int val = *valp;
2049	int ret;
2050
2051	struct ctl_table tmp_table = {
2052		.data = &val,
2053		.maxlen = sizeof(int),
2054		.mode = table->mode,
2055	};
2056
2057	ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
2058	if (write && ret >= 0) {
2059		mutex_lock(&ipvs->est_mutex);
2060		if (*valp != val) {
2061			*valp = val;
2062			ip_vs_est_reload_start(ipvs);
2063		}
2064		mutex_unlock(&ipvs->est_mutex);
2065	}
2066	return ret;
2067}
2068
2069/*
2070 *	IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
2071 *	Do not change order or insert new entries without
2072 *	align with netns init in ip_vs_control_net_init()
2073 */
2074
2075static struct ctl_table vs_vars[] = {
2076	{
2077		.procname	= "amemthresh",
2078		.maxlen		= sizeof(int),
2079		.mode		= 0644,
2080		.proc_handler	= proc_dointvec,
2081	},
2082	{
2083		.procname	= "am_droprate",
2084		.maxlen		= sizeof(int),
2085		.mode		= 0644,
2086		.proc_handler	= proc_dointvec,
2087	},
2088	{
2089		.procname	= "drop_entry",
2090		.maxlen		= sizeof(int),
2091		.mode		= 0644,
2092		.proc_handler	= proc_do_defense_mode,
2093	},
2094	{
2095		.procname	= "drop_packet",
2096		.maxlen		= sizeof(int),
2097		.mode		= 0644,
2098		.proc_handler	= proc_do_defense_mode,
2099	},
2100#ifdef CONFIG_IP_VS_NFCT
2101	{
2102		.procname	= "conntrack",
2103		.maxlen		= sizeof(int),
2104		.mode		= 0644,
2105		.proc_handler	= &proc_dointvec,
2106	},
2107#endif
2108	{
2109		.procname	= "secure_tcp",
2110		.maxlen		= sizeof(int),
2111		.mode		= 0644,
2112		.proc_handler	= proc_do_defense_mode,
2113	},
2114	{
2115		.procname	= "snat_reroute",
2116		.maxlen		= sizeof(int),
2117		.mode		= 0644,
2118		.proc_handler	= &proc_dointvec,
2119	},
2120	{
2121		.procname	= "sync_version",
2122		.maxlen		= sizeof(int),
2123		.mode		= 0644,
2124		.proc_handler	= proc_dointvec_minmax,
2125		.extra1		= SYSCTL_ZERO,
2126		.extra2		= SYSCTL_ONE,
2127	},
2128	{
2129		.procname	= "sync_ports",
2130		.maxlen		= sizeof(int),
2131		.mode		= 0644,
2132		.proc_handler	= proc_do_sync_ports,
2133	},
2134	{
2135		.procname	= "sync_persist_mode",
2136		.maxlen		= sizeof(int),
2137		.mode		= 0644,
2138		.proc_handler	= proc_dointvec,
2139	},
2140	{
2141		.procname	= "sync_qlen_max",
2142		.maxlen		= sizeof(unsigned long),
2143		.mode		= 0644,
2144		.proc_handler	= proc_doulongvec_minmax,
2145	},
2146	{
2147		.procname	= "sync_sock_size",
2148		.maxlen		= sizeof(int),
2149		.mode		= 0644,
2150		.proc_handler	= proc_dointvec,
2151	},
2152	{
2153		.procname	= "cache_bypass",
2154		.maxlen		= sizeof(int),
2155		.mode		= 0644,
2156		.proc_handler	= proc_dointvec,
2157	},
2158	{
2159		.procname	= "expire_nodest_conn",
2160		.maxlen		= sizeof(int),
2161		.mode		= 0644,
2162		.proc_handler	= proc_dointvec,
2163	},
2164	{
2165		.procname	= "sloppy_tcp",
2166		.maxlen		= sizeof(int),
2167		.mode		= 0644,
2168		.proc_handler	= proc_dointvec,
2169	},
2170	{
2171		.procname	= "sloppy_sctp",
2172		.maxlen		= sizeof(int),
2173		.mode		= 0644,
2174		.proc_handler	= proc_dointvec,
2175	},
2176	{
2177		.procname	= "expire_quiescent_template",
2178		.maxlen		= sizeof(int),
2179		.mode		= 0644,
2180		.proc_handler	= proc_dointvec,
2181	},
2182	{
2183		.procname	= "sync_threshold",
2184		.maxlen		=
2185			sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
2186		.mode		= 0644,
2187		.proc_handler	= proc_do_sync_threshold,
2188	},
2189	{
2190		.procname	= "sync_refresh_period",
2191		.maxlen		= sizeof(int),
2192		.mode		= 0644,
2193		.proc_handler	= proc_dointvec_jiffies,
2194	},
2195	{
2196		.procname	= "sync_retries",
2197		.maxlen		= sizeof(int),
2198		.mode		= 0644,
2199		.proc_handler	= proc_dointvec_minmax,
2200		.extra1		= SYSCTL_ZERO,
2201		.extra2		= SYSCTL_THREE,
2202	},
2203	{
2204		.procname	= "nat_icmp_send",
2205		.maxlen		= sizeof(int),
2206		.mode		= 0644,
2207		.proc_handler	= proc_dointvec,
2208	},
2209	{
2210		.procname	= "pmtu_disc",
2211		.maxlen		= sizeof(int),
2212		.mode		= 0644,
2213		.proc_handler	= proc_dointvec,
2214	},
2215	{
2216		.procname	= "backup_only",
2217		.maxlen		= sizeof(int),
2218		.mode		= 0644,
2219		.proc_handler	= proc_dointvec,
2220	},
2221	{
2222		.procname	= "conn_reuse_mode",
2223		.maxlen		= sizeof(int),
2224		.mode		= 0644,
2225		.proc_handler	= proc_dointvec,
2226	},
2227	{
2228		.procname	= "schedule_icmp",
2229		.maxlen		= sizeof(int),
2230		.mode		= 0644,
2231		.proc_handler	= proc_dointvec,
2232	},
2233	{
2234		.procname	= "ignore_tunneled",
2235		.maxlen		= sizeof(int),
2236		.mode		= 0644,
2237		.proc_handler	= proc_dointvec,
2238	},
2239	{
2240		.procname	= "run_estimation",
2241		.maxlen		= sizeof(int),
2242		.mode		= 0644,
2243		.proc_handler	= ipvs_proc_run_estimation,
2244	},
2245	{
2246		.procname	= "est_cpulist",
2247		.maxlen		= NR_CPUS,	/* unused */
2248		.mode		= 0644,
2249		.proc_handler	= ipvs_proc_est_cpulist,
2250	},
2251	{
2252		.procname	= "est_nice",
2253		.maxlen		= sizeof(int),
2254		.mode		= 0644,
2255		.proc_handler	= ipvs_proc_est_nice,
2256	},
2257#ifdef CONFIG_IP_VS_DEBUG
2258	{
2259		.procname	= "debug_level",
2260		.data		= &sysctl_ip_vs_debug_level,
2261		.maxlen		= sizeof(int),
2262		.mode		= 0644,
2263		.proc_handler	= proc_dointvec,
2264	},
2265#endif
2266	{ }
2267};
2268
2269#endif
2270
2271#ifdef CONFIG_PROC_FS
2272
/* Iterator state of the service listing seq_file */
struct ip_vs_iter {
	struct seq_net_private p;  /* Do not move this, netns depends upon it*/
	struct hlist_head *table;	/* service table currently being walked */
	int bucket;			/* bucket index within that table */
};
2278
2279/*
2280 *	Write the contents of the VS rule table to a PROCfs file.
2281 *	(It is kept just for backward compatibility)
2282 */
2283static inline const char *ip_vs_fwd_name(unsigned int flags)
2284{
2285	switch (flags & IP_VS_CONN_F_FWD_MASK) {
2286	case IP_VS_CONN_F_LOCALNODE:
2287		return "Local";
2288	case IP_VS_CONN_F_TUNNEL:
2289		return "Tunnel";
2290	case IP_VS_CONN_F_DROUTE:
2291		return "Route";
2292	default:
2293		return "Masq";
2294	}
2295}
2296
2297
2298/* Get the Nth entry in the two lists */
2299static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
2300{
2301	struct net *net = seq_file_net(seq);
2302	struct netns_ipvs *ipvs = net_ipvs(net);
2303	struct ip_vs_iter *iter = seq->private;
2304	int idx;
2305	struct ip_vs_service *svc;
2306
2307	/* look in hash by protocol */
2308	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2309		hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) {
2310			if ((svc->ipvs == ipvs) && pos-- == 0) {
2311				iter->table = ip_vs_svc_table;
2312				iter->bucket = idx;
2313				return svc;
2314			}
2315		}
2316	}
2317
2318	/* keep looking in fwmark */
2319	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2320		hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx],
2321					 f_list) {
2322			if ((svc->ipvs == ipvs) && pos-- == 0) {
2323				iter->table = ip_vs_svc_fwm_table;
2324				iter->bucket = idx;
2325				return svc;
2326			}
2327		}
2328	}
2329
2330	return NULL;
2331}
2332
2333static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
2334	__acquires(RCU)
2335{
2336	rcu_read_lock();
2337	return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
2338}
2339
2340
/* seq_file next: advance to the service following @v.
 *
 * After the header token the first service is fetched through
 * ip_vs_info_array().  Otherwise the walk continues in the current bucket,
 * then in the following buckets of the protocol-hashed table, and finally
 * in the fwmark table.  Entries found here are not filtered by netns.
 * Returns NULL when both tables are exhausted.
 */
static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct hlist_node *e;
	struct ip_vs_iter *iter;
	struct ip_vs_service *svc;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ip_vs_info_array(seq,0);

	svc = v;
	iter = seq->private;

	if (iter->table == ip_vs_svc_table) {
		/* next service in table hashed by protocol */
		e = rcu_dereference(hlist_next_rcu(&svc->s_list));
		if (e)
			return hlist_entry(e, struct ip_vs_service, s_list);

		/* Current bucket is done: take the first entry of the next
		 * non-empty bucket
		 */
		while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
			hlist_for_each_entry_rcu(svc,
						 &ip_vs_svc_table[iter->bucket],
						 s_list) {
				return svc;
			}
		}

		/* Protocol table exhausted, switch to the fwmark table;
		 * bucket -1 so that the pre-increment below starts at 0
		 */
		iter->table = ip_vs_svc_fwm_table;
		iter->bucket = -1;
		goto scan_fwmark;
	}

	/* next service in hashed by fwmark */
	e = rcu_dereference(hlist_next_rcu(&svc->f_list));
	if (e)
		return hlist_entry(e, struct ip_vs_service, f_list);

 scan_fwmark:
	/* First entry of the next non-empty fwmark bucket */
	while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
		hlist_for_each_entry_rcu(svc,
					 &ip_vs_svc_fwm_table[iter->bucket],
					 f_list)
			return svc;
	}

	return NULL;
}
2388
/*
 * seq_file ->stop callback: leave the RCU read-side critical section that
 * ip_vs_info_seq_start() entered.  Neither @seq nor @v is used.
 */
static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
2394
2395
2396static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
2397{
2398	if (v == SEQ_START_TOKEN) {
2399		seq_printf(seq,
2400			"IP Virtual Server version %d.%d.%d (size=%d)\n",
2401			NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2402		seq_puts(seq,
2403			 "Prot LocalAddress:Port Scheduler Flags\n");
2404		seq_puts(seq,
2405			 "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
2406	} else {
2407		struct net *net = seq_file_net(seq);
2408		struct netns_ipvs *ipvs = net_ipvs(net);
2409		const struct ip_vs_service *svc = v;
2410		const struct ip_vs_iter *iter = seq->private;
2411		const struct ip_vs_dest *dest;
2412		struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
2413		char *sched_name = sched ? sched->name : "none";
2414
2415		if (svc->ipvs != ipvs)
2416			return 0;
2417		if (iter->table == ip_vs_svc_table) {
2418#ifdef CONFIG_IP_VS_IPV6
2419			if (svc->af == AF_INET6)
2420				seq_printf(seq, "%s  [%pI6]:%04X %s ",
2421					   ip_vs_proto_name(svc->protocol),
2422					   &svc->addr.in6,
2423					   ntohs(svc->port),
2424					   sched_name);
2425			else
2426#endif
2427				seq_printf(seq, "%s  %08X:%04X %s %s ",
2428					   ip_vs_proto_name(svc->protocol),
2429					   ntohl(svc->addr.ip),
2430					   ntohs(svc->port),
2431					   sched_name,
2432					   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2433		} else {
2434			seq_printf(seq, "FWM  %08X %s %s",
2435				   svc->fwmark, sched_name,
2436				   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2437		}
2438
2439		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
2440			seq_printf(seq, "persistent %d %08X\n",
2441				svc->timeout,
2442				ntohl(svc->netmask));
2443		else
2444			seq_putc(seq, '\n');
2445
2446		list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
2447#ifdef CONFIG_IP_VS_IPV6
2448			if (dest->af == AF_INET6)
2449				seq_printf(seq,
2450					   "  -> [%pI6]:%04X"
2451					   "      %-7s %-6d %-10d %-10d\n",
2452					   &dest->addr.in6,
2453					   ntohs(dest->port),
2454					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2455					   atomic_read(&dest->weight),
2456					   atomic_read(&dest->activeconns),
2457					   atomic_read(&dest->inactconns));
2458			else
2459#endif
2460				seq_printf(seq,
2461					   "  -> %08X:%04X      "
2462					   "%-7s %-6d %-10d %-10d\n",
2463					   ntohl(dest->addr.ip),
2464					   ntohs(dest->port),
2465					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2466					   atomic_read(&dest->weight),
2467					   atomic_read(&dest->activeconns),
2468					   atomic_read(&dest->inactconns));
2469
2470		}
2471	}
2472	return 0;
2473}
2474
/*
 * seq_file operations for the service listing.  ->start takes the RCU read
 * lock and ->stop drops it, so ->next and ->show run inside the read-side
 * critical section.
 */
static const struct seq_operations ip_vs_info_seq_ops = {
	.start = ip_vs_info_seq_start,
	.next  = ip_vs_info_seq_next,
	.stop  = ip_vs_info_seq_stop,
	.show  = ip_vs_info_seq_show,
};
2481
2482static int ip_vs_stats_show(struct seq_file *seq, void *v)
2483{
2484	struct net *net = seq_file_single_net(seq);
2485	struct ip_vs_kstats show;
2486
2487/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2488	seq_puts(seq,
2489		 "   Total Incoming Outgoing         Incoming         Outgoing\n");
2490	seq_puts(seq,
2491		 "   Conns  Packets  Packets            Bytes            Bytes\n");
2492
2493	ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s);
2494	seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n",
2495		   (unsigned long long)show.conns,
2496		   (unsigned long long)show.inpkts,
2497		   (unsigned long long)show.outpkts,
2498		   (unsigned long long)show.inbytes,
2499		   (unsigned long long)show.outbytes);
2500
2501/*                01234567 01234567 01234567 0123456701234567 0123456701234567*/
2502	seq_puts(seq,
2503		 " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2504	seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n",
2505		   (unsigned long long)show.cps,
2506		   (unsigned long long)show.inpps,
2507		   (unsigned long long)show.outpps,
2508		   (unsigned long long)show.inbps,
2509		   (unsigned long long)show.outbps);
2510
2511	return 0;
2512}
2513
2514static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
2515{
2516	struct net *net = seq_file_single_net(seq);
2517	struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s;
2518	struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
2519	struct ip_vs_kstats kstats;
2520	int i;
2521
2522/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2523	seq_puts(seq,
2524		 "       Total Incoming Outgoing         Incoming         Outgoing\n");
2525	seq_puts(seq,
2526		 "CPU    Conns  Packets  Packets            Bytes            Bytes\n");
2527
2528	for_each_possible_cpu(i) {
2529		struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
2530		unsigned int start;
2531		u64 conns, inpkts, outpkts, inbytes, outbytes;
2532
2533		do {
2534			start = u64_stats_fetch_begin(&u->syncp);
2535			conns = u64_stats_read(&u->cnt.conns);
2536			inpkts = u64_stats_read(&u->cnt.inpkts);
2537			outpkts = u64_stats_read(&u->cnt.outpkts);
2538			inbytes = u64_stats_read(&u->cnt.inbytes);
2539			outbytes = u64_stats_read(&u->cnt.outbytes);
2540		} while (u64_stats_fetch_retry(&u->syncp, start));
2541
2542		seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n",
2543			   i, (u64)conns, (u64)inpkts,
2544			   (u64)outpkts, (u64)inbytes,
2545			   (u64)outbytes);
2546	}
2547
2548	ip_vs_copy_stats(&kstats, tot_stats);
2549
2550	seq_printf(seq, "  ~ %8LX %8LX %8LX %16LX %16LX\n\n",
2551		   (unsigned long long)kstats.conns,
2552		   (unsigned long long)kstats.inpkts,
2553		   (unsigned long long)kstats.outpkts,
2554		   (unsigned long long)kstats.inbytes,
2555		   (unsigned long long)kstats.outbytes);
2556
2557/*                ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2558	seq_puts(seq,
2559		 "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2560	seq_printf(seq, "    %8LX %8LX %8LX %16LX %16LX\n",
2561		   kstats.cps,
2562		   kstats.inpps,
2563		   kstats.outpps,
2564		   kstats.inbps,
2565		   kstats.outbps);
2566
2567	return 0;
2568}
2569#endif
2570
2571/*
2572 *	Set timeout values for tcp tcpfin udp in the timeout_table.
2573 */
2574static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
2575{
2576#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2577	struct ip_vs_proto_data *pd;
2578#endif
2579
2580	IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
2581		  u->tcp_timeout,
2582		  u->tcp_fin_timeout,
2583		  u->udp_timeout);
2584
2585#ifdef CONFIG_IP_VS_PROTO_TCP
2586	if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) ||
2587	    u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) {
2588		return -EINVAL;
2589	}
2590#endif
2591
2592#ifdef CONFIG_IP_VS_PROTO_UDP
2593	if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ))
2594		return -EINVAL;
2595#endif
2596
2597#ifdef CONFIG_IP_VS_PROTO_TCP
2598	if (u->tcp_timeout) {
2599		pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
2600		pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
2601			= u->tcp_timeout * HZ;
2602	}
2603
2604	if (u->tcp_fin_timeout) {
2605		pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
2606		pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
2607			= u->tcp_fin_timeout * HZ;
2608	}
2609#endif
2610
2611#ifdef CONFIG_IP_VS_PROTO_UDP
2612	if (u->udp_timeout) {
2613		pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
2614		pd->timeout_table[IP_VS_UDP_S_NORMAL]
2615			= u->udp_timeout * HZ;
2616	}
2617#endif
2618	return 0;
2619}
2620
/*
 * Zero-based index of sockopt command @cmd relative to IP_VS_BASE_CTL; used
 * to index the per-command argument length tables.  The argument is
 * parenthesized so that expressions with low-precedence operators (e.g. a
 * conditional expression) expand correctly.
 */
#define CMDID(cmd)		((cmd) - IP_VS_BASE_CTL)
2622
/*
 * Argument layout of the sockopt commands that operate on a destination
 * (IP_VS_SO_SET_ADDDEST, _DELDEST, _EDITDEST): the service description
 * immediately followed by the destination description.
 */
struct ip_vs_svcdest_user {
	struct ip_vs_service_user	s;
	struct ip_vs_dest_user		d;
};
2627
/*
 * Exact argument length expected for each IP_VS_SO_SET_* command, indexed
 * by CMDID(cmd).  Commands without an entry here (e.g. IP_VS_SO_SET_FLUSH)
 * get length 0.  The element type is unsigned char, which is why
 * do_ip_vs_set_ctl() asserts at build time that no argument exceeds 255
 * bytes.
 */
static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = {
	[CMDID(IP_VS_SO_SET_ADD)]         = sizeof(struct ip_vs_service_user),
	[CMDID(IP_VS_SO_SET_EDIT)]        = sizeof(struct ip_vs_service_user),
	[CMDID(IP_VS_SO_SET_DEL)]         = sizeof(struct ip_vs_service_user),
	[CMDID(IP_VS_SO_SET_ADDDEST)]     = sizeof(struct ip_vs_svcdest_user),
	[CMDID(IP_VS_SO_SET_DELDEST)]     = sizeof(struct ip_vs_svcdest_user),
	[CMDID(IP_VS_SO_SET_EDITDEST)]    = sizeof(struct ip_vs_svcdest_user),
	[CMDID(IP_VS_SO_SET_TIMEOUT)]     = sizeof(struct ip_vs_timeout_user),
	[CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user),
	[CMDID(IP_VS_SO_SET_STOPDAEMON)]  = sizeof(struct ip_vs_daemon_user),
	[CMDID(IP_VS_SO_SET_ZERO)]        = sizeof(struct ip_vs_service_user),
};
2640
/*
 * Never instantiated: this union has one member per entry of set_arglen[],
 * with the same type the table takes sizeof() of, so that sizeof(union)
 * is the largest argument any set command can carry.
 */
union ip_vs_set_arglen {
	struct ip_vs_service_user	field_IP_VS_SO_SET_ADD;
	struct ip_vs_service_user	field_IP_VS_SO_SET_EDIT;
	struct ip_vs_service_user	field_IP_VS_SO_SET_DEL;
	struct ip_vs_svcdest_user	field_IP_VS_SO_SET_ADDDEST;
	struct ip_vs_svcdest_user	field_IP_VS_SO_SET_DELDEST;
	struct ip_vs_svcdest_user	field_IP_VS_SO_SET_EDITDEST;
	struct ip_vs_timeout_user	field_IP_VS_SO_SET_TIMEOUT;
	struct ip_vs_daemon_user	field_IP_VS_SO_SET_STARTDAEMON;
	struct ip_vs_daemon_user	field_IP_VS_SO_SET_STOPDAEMON;
	struct ip_vs_service_user	field_IP_VS_SO_SET_ZERO;
};

/* Size of the on-stack argument buffer in do_ip_vs_set_ctl() */
#define MAX_SET_ARGLEN	sizeof(union ip_vs_set_arglen)
2655
2656static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2657				  struct ip_vs_service_user *usvc_compat)
2658{
2659	memset(usvc, 0, sizeof(*usvc));
2660
2661	usvc->af		= AF_INET;
2662	usvc->protocol		= usvc_compat->protocol;
2663	usvc->addr.ip		= usvc_compat->addr;
2664	usvc->port		= usvc_compat->port;
2665	usvc->fwmark		= usvc_compat->fwmark;
2666
2667	/* Deep copy of sched_name is not needed here */
2668	usvc->sched_name	= usvc_compat->sched_name;
2669
2670	usvc->flags		= usvc_compat->flags;
2671	usvc->timeout		= usvc_compat->timeout;
2672	usvc->netmask		= usvc_compat->netmask;
2673}
2674
2675static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2676				   struct ip_vs_dest_user *udest_compat)
2677{
2678	memset(udest, 0, sizeof(*udest));
2679
2680	udest->addr.ip		= udest_compat->addr;
2681	udest->port		= udest_compat->port;
2682	udest->conn_flags	= udest_compat->conn_flags;
2683	udest->weight		= udest_compat->weight;
2684	udest->u_threshold	= udest_compat->u_threshold;
2685	udest->l_threshold	= udest_compat->l_threshold;
2686	udest->af		= AF_INET;
2687	udest->tun_type		= IP_VS_CONN_F_TUNNEL_TYPE_IPIP;
2688}
2689
2690static int
2691do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len)
2692{
2693	struct net *net = sock_net(sk);
2694	int ret;
2695	unsigned char arg[MAX_SET_ARGLEN];
2696	struct ip_vs_service_user *usvc_compat;
2697	struct ip_vs_service_user_kern usvc;
2698	struct ip_vs_service *svc;
2699	struct ip_vs_dest_user *udest_compat;
2700	struct ip_vs_dest_user_kern udest;
2701	struct netns_ipvs *ipvs = net_ipvs(net);
2702
2703	BUILD_BUG_ON(sizeof(arg) > 255);
2704	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2705		return -EPERM;
2706
2707	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
2708		return -EINVAL;
2709	if (len != set_arglen[CMDID(cmd)]) {
2710		IP_VS_DBG(1, "set_ctl: len %u != %u\n",
2711			  len, set_arglen[CMDID(cmd)]);
2712		return -EINVAL;
2713	}
2714
2715	if (copy_from_sockptr(arg, ptr, len) != 0)
2716		return -EFAULT;
2717
2718	/* Handle daemons since they have another lock */
2719	if (cmd == IP_VS_SO_SET_STARTDAEMON ||
2720	    cmd == IP_VS_SO_SET_STOPDAEMON) {
2721		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2722
2723		if (cmd == IP_VS_SO_SET_STARTDAEMON) {
2724			struct ipvs_sync_daemon_cfg cfg;
2725
2726			memset(&cfg, 0, sizeof(cfg));
2727			ret = -EINVAL;
2728			if (strscpy(cfg.mcast_ifn, dm->mcast_ifn,
2729				    sizeof(cfg.mcast_ifn)) <= 0)
2730				return ret;
2731			cfg.syncid = dm->syncid;
2732			ret = start_sync_thread(ipvs, &cfg, dm->state);
2733		} else {
2734			ret = stop_sync_thread(ipvs, dm->state);
2735		}
2736		return ret;
2737	}
2738
2739	mutex_lock(&__ip_vs_mutex);
2740	if (cmd == IP_VS_SO_SET_FLUSH) {
2741		/* Flush the virtual service */
2742		ret = ip_vs_flush(ipvs, false);
2743		goto out_unlock;
2744	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2745		/* Set timeout values for (tcp tcpfin udp) */
2746		ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg);
2747		goto out_unlock;
2748	} else if (!len) {
2749		/* No more commands with len == 0 below */
2750		ret = -EINVAL;
2751		goto out_unlock;
2752	}
2753
2754	usvc_compat = (struct ip_vs_service_user *)arg;
2755	udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
2756
2757	/* We only use the new structs internally, so copy userspace compat
2758	 * structs to extended internal versions */
2759	ip_vs_copy_usvc_compat(&usvc, usvc_compat);
2760	ip_vs_copy_udest_compat(&udest, udest_compat);
2761
2762	if (cmd == IP_VS_SO_SET_ZERO) {
2763		/* if no service address is set, zero counters in all */
2764		if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
2765			ret = ip_vs_zero_all(ipvs);
2766			goto out_unlock;
2767		}
2768	}
2769
2770	if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) &&
2771	    strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) ==
2772	    IP_VS_SCHEDNAME_MAXLEN) {
2773		ret = -EINVAL;
2774		goto out_unlock;
2775	}
2776
2777	/* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
2778	if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
2779	    usvc.protocol != IPPROTO_SCTP) {
2780		pr_err("set_ctl: invalid protocol: %d %pI4:%d\n",
2781		       usvc.protocol, &usvc.addr.ip,
2782		       ntohs(usvc.port));
2783		ret = -EFAULT;
2784		goto out_unlock;
2785	}
2786
2787	/* Lookup the exact service by <protocol, addr, port> or fwmark */
2788	rcu_read_lock();
2789	if (usvc.fwmark == 0)
2790		svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol,
2791					   &usvc.addr, usvc.port);
2792	else
2793		svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark);
2794	rcu_read_unlock();
2795
2796	if (cmd != IP_VS_SO_SET_ADD
2797	    && (svc == NULL || svc->protocol != usvc.protocol)) {
2798		ret = -ESRCH;
2799		goto out_unlock;
2800	}
2801
2802	switch (cmd) {
2803	case IP_VS_SO_SET_ADD:
2804		if (svc != NULL)
2805			ret = -EEXIST;
2806		else
2807			ret = ip_vs_add_service(ipvs, &usvc, &svc);
2808		break;
2809	case IP_VS_SO_SET_EDIT:
2810		ret = ip_vs_edit_service(svc, &usvc);
2811		break;
2812	case IP_VS_SO_SET_DEL:
2813		ret = ip_vs_del_service(svc);
2814		if (!ret)
2815			goto out_unlock;
2816		break;
2817	case IP_VS_SO_SET_ZERO:
2818		ret = ip_vs_zero_service(svc);
2819		break;
2820	case IP_VS_SO_SET_ADDDEST:
2821		ret = ip_vs_add_dest(svc, &udest);
2822		break;
2823	case IP_VS_SO_SET_EDITDEST:
2824		ret = ip_vs_edit_dest(svc, &udest);
2825		break;
2826	case IP_VS_SO_SET_DELDEST:
2827		ret = ip_vs_del_dest(svc, &udest);
2828		break;
2829	default:
2830		WARN_ON_ONCE(1);
2831		ret = -EINVAL;
2832		break;
2833	}
2834
2835  out_unlock:
2836	mutex_unlock(&__ip_vs_mutex);
2837	return ret;
2838}
2839
2840
2841static void
2842ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2843{
2844	struct ip_vs_scheduler *sched;
2845	struct ip_vs_kstats kstats;
2846	char *sched_name;
2847
2848	sched = rcu_dereference_protected(src->scheduler, 1);
2849	sched_name = sched ? sched->name : "none";
2850	dst->protocol = src->protocol;
2851	dst->addr = src->addr.ip;
2852	dst->port = src->port;
2853	dst->fwmark = src->fwmark;
2854	strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
2855	dst->flags = src->flags;
2856	dst->timeout = src->timeout / HZ;
2857	dst->netmask = src->netmask;
2858	dst->num_dests = src->num_dests;
2859	ip_vs_copy_stats(&kstats, &src->stats);
2860	ip_vs_export_stats_user(&dst->stats, &kstats);
2861}
2862
2863static inline int
2864__ip_vs_get_service_entries(struct netns_ipvs *ipvs,
2865			    const struct ip_vs_get_services *get,
2866			    struct ip_vs_get_services __user *uptr)
2867{
2868	int idx, count=0;
2869	struct ip_vs_service *svc;
2870	struct ip_vs_service_entry entry;
2871	int ret = 0;
2872
2873	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2874		hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2875			/* Only expose IPv4 entries to old interface */
2876			if (svc->af != AF_INET || (svc->ipvs != ipvs))
2877				continue;
2878
2879			if (count >= get->num_services)
2880				goto out;
2881			memset(&entry, 0, sizeof(entry));
2882			ip_vs_copy_service(&entry, svc);
2883			if (copy_to_user(&uptr->entrytable[count],
2884					 &entry, sizeof(entry))) {
2885				ret = -EFAULT;
2886				goto out;
2887			}
2888			count++;
2889		}
2890	}
2891
2892	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2893		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2894			/* Only expose IPv4 entries to old interface */
2895			if (svc->af != AF_INET || (svc->ipvs != ipvs))
2896				continue;
2897
2898			if (count >= get->num_services)
2899				goto out;
2900			memset(&entry, 0, sizeof(entry));
2901			ip_vs_copy_service(&entry, svc);
2902			if (copy_to_user(&uptr->entrytable[count],
2903					 &entry, sizeof(entry))) {
2904				ret = -EFAULT;
2905				goto out;
2906			}
2907			count++;
2908		}
2909	}
2910out:
2911	return ret;
2912}
2913
2914static inline int
2915__ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get,
2916			 struct ip_vs_get_dests __user *uptr)
2917{
2918	struct ip_vs_service *svc;
2919	union nf_inet_addr addr = { .ip = get->addr };
2920	int ret = 0;
2921
2922	rcu_read_lock();
2923	if (get->fwmark)
2924		svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark);
2925	else
2926		svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr,
2927					   get->port);
2928	rcu_read_unlock();
2929
2930	if (svc) {
2931		int count = 0;
2932		struct ip_vs_dest *dest;
2933		struct ip_vs_dest_entry entry;
2934		struct ip_vs_kstats kstats;
2935
2936		memset(&entry, 0, sizeof(entry));
2937		list_for_each_entry(dest, &svc->destinations, n_list) {
2938			if (count >= get->num_dests)
2939				break;
2940
2941			/* Cannot expose heterogeneous members via sockopt
2942			 * interface
2943			 */
2944			if (dest->af != svc->af)
2945				continue;
2946
2947			entry.addr = dest->addr.ip;
2948			entry.port = dest->port;
2949			entry.conn_flags = atomic_read(&dest->conn_flags);
2950			entry.weight = atomic_read(&dest->weight);
2951			entry.u_threshold = dest->u_threshold;
2952			entry.l_threshold = dest->l_threshold;
2953			entry.activeconns = atomic_read(&dest->activeconns);
2954			entry.inactconns = atomic_read(&dest->inactconns);
2955			entry.persistconns = atomic_read(&dest->persistconns);
2956			ip_vs_copy_stats(&kstats, &dest->stats);
2957			ip_vs_export_stats_user(&entry.stats, &kstats);
2958			if (copy_to_user(&uptr->entrytable[count],
2959					 &entry, sizeof(entry))) {
2960				ret = -EFAULT;
2961				break;
2962			}
2963			count++;
2964		}
2965	} else
2966		ret = -ESRCH;
2967	return ret;
2968}
2969
2970static inline void
2971__ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
2972{
2973#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2974	struct ip_vs_proto_data *pd;
2975#endif
2976
2977	memset(u, 0, sizeof (*u));
2978
2979#ifdef CONFIG_IP_VS_PROTO_TCP
2980	pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
2981	u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2982	u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2983#endif
2984#ifdef CONFIG_IP_VS_PROTO_UDP
2985	pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
2986	u->udp_timeout =
2987			pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2988#endif
2989}
2990
/*
 * Minimum argument length for each IP_VS_SO_GET_* command, indexed by
 * CMDID(cmd).  do_ip_vs_get_ctl() copies exactly this many bytes from
 * userspace before dispatching; for the variable-sized requests
 * (SERVICES, DESTS) this is only the fixed header.  The 64 for
 * IP_VS_SO_GET_VERSION matches the size of the version string buffer.
 */
static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = {
	[CMDID(IP_VS_SO_GET_VERSION)]  = 64,
	[CMDID(IP_VS_SO_GET_INFO)]     = sizeof(struct ip_vs_getinfo),
	[CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services),
	[CMDID(IP_VS_SO_GET_SERVICE)]  = sizeof(struct ip_vs_service_entry),
	[CMDID(IP_VS_SO_GET_DESTS)]    = sizeof(struct ip_vs_get_dests),
	[CMDID(IP_VS_SO_GET_TIMEOUT)]  = sizeof(struct ip_vs_timeout_user),
	[CMDID(IP_VS_SO_GET_DAEMON)]   = 2 * sizeof(struct ip_vs_daemon_user),
};
3000
/*
 * Never instantiated: this union has one member per entry of get_arglen[],
 * sized like the corresponding table entry, so that sizeof(union) is the
 * largest argument header any get command can carry.
 */
union ip_vs_get_arglen {
	char				field_IP_VS_SO_GET_VERSION[64];
	struct ip_vs_getinfo		field_IP_VS_SO_GET_INFO;
	struct ip_vs_get_services	field_IP_VS_SO_GET_SERVICES;
	struct ip_vs_service_entry	field_IP_VS_SO_GET_SERVICE;
	struct ip_vs_get_dests		field_IP_VS_SO_GET_DESTS;
	struct ip_vs_timeout_user	field_IP_VS_SO_GET_TIMEOUT;
	struct ip_vs_daemon_user	field_IP_VS_SO_GET_DAEMON[2];
};

/* Size of the on-stack argument buffer in do_ip_vs_get_ctl() */
#define MAX_GET_ARGLEN	sizeof(union ip_vs_get_arglen)
3012
3013static int
3014do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
3015{
3016	unsigned char arg[MAX_GET_ARGLEN];
3017	int ret = 0;
3018	unsigned int copylen;
3019	struct net *net = sock_net(sk);
3020	struct netns_ipvs *ipvs = net_ipvs(net);
3021
3022	BUG_ON(!net);
3023	BUILD_BUG_ON(sizeof(arg) > 255);
3024	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3025		return -EPERM;
3026
3027	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
3028		return -EINVAL;
3029
3030	copylen = get_arglen[CMDID(cmd)];
3031	if (*len < (int) copylen) {
3032		IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen);
3033		return -EINVAL;
3034	}
3035
3036	if (copy_from_user(arg, user, copylen) != 0)
3037		return -EFAULT;
3038	/*
3039	 * Handle daemons first since it has its own locking
3040	 */
3041	if (cmd == IP_VS_SO_GET_DAEMON) {
3042		struct ip_vs_daemon_user d[2];
3043
3044		memset(&d, 0, sizeof(d));
3045		mutex_lock(&ipvs->sync_mutex);
3046		if (ipvs->sync_state & IP_VS_STATE_MASTER) {
3047			d[0].state = IP_VS_STATE_MASTER;
3048			strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
3049				sizeof(d[0].mcast_ifn));
3050			d[0].syncid = ipvs->mcfg.syncid;
3051		}
3052		if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
3053			d[1].state = IP_VS_STATE_BACKUP;
3054			strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
3055				sizeof(d[1].mcast_ifn));
3056			d[1].syncid = ipvs->bcfg.syncid;
3057		}
3058		if (copy_to_user(user, &d, sizeof(d)) != 0)
3059			ret = -EFAULT;
3060		mutex_unlock(&ipvs->sync_mutex);
3061		return ret;
3062	}
3063
3064	mutex_lock(&__ip_vs_mutex);
3065	switch (cmd) {
3066	case IP_VS_SO_GET_VERSION:
3067	{
3068		char buf[64];
3069
3070		sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
3071			NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
3072		if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
3073			ret = -EFAULT;
3074			goto out;
3075		}
3076		*len = strlen(buf)+1;
3077	}
3078	break;
3079
3080	case IP_VS_SO_GET_INFO:
3081	{
3082		struct ip_vs_getinfo info;
3083		info.version = IP_VS_VERSION_CODE;
3084		info.size = ip_vs_conn_tab_size;
3085		info.num_services = ipvs->num_services;
3086		if (copy_to_user(user, &info, sizeof(info)) != 0)
3087			ret = -EFAULT;
3088	}
3089	break;
3090
3091	case IP_VS_SO_GET_SERVICES:
3092	{
3093		struct ip_vs_get_services *get;
3094		int size;
3095
3096		get = (struct ip_vs_get_services *)arg;
3097		size = struct_size(get, entrytable, get->num_services);
3098		if (*len != size) {
3099			pr_err("length: %u != %u\n", *len, size);
3100			ret = -EINVAL;
3101			goto out;
3102		}
3103		ret = __ip_vs_get_service_entries(ipvs, get, user);
3104	}
3105	break;
3106
3107	case IP_VS_SO_GET_SERVICE:
3108	{
3109		struct ip_vs_service_entry *entry;
3110		struct ip_vs_service *svc;
3111		union nf_inet_addr addr;
3112
3113		entry = (struct ip_vs_service_entry *)arg;
3114		addr.ip = entry->addr;
3115		rcu_read_lock();
3116		if (entry->fwmark)
3117			svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark);
3118		else
3119			svc = __ip_vs_service_find(ipvs, AF_INET,
3120						   entry->protocol, &addr,
3121						   entry->port);
3122		rcu_read_unlock();
3123		if (svc) {
3124			ip_vs_copy_service(entry, svc);
3125			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
3126				ret = -EFAULT;
3127		} else
3128			ret = -ESRCH;
3129	}
3130	break;
3131
3132	case IP_VS_SO_GET_DESTS:
3133	{
3134		struct ip_vs_get_dests *get;
3135		int size;
3136
3137		get = (struct ip_vs_get_dests *)arg;
3138		size = struct_size(get, entrytable, get->num_dests);
3139		if (*len != size) {
3140			pr_err("length: %u != %u\n", *len, size);
3141			ret = -EINVAL;
3142			goto out;
3143		}
3144		ret = __ip_vs_get_dest_entries(ipvs, get, user);
3145	}
3146	break;
3147
3148	case IP_VS_SO_GET_TIMEOUT:
3149	{
3150		struct ip_vs_timeout_user t;
3151
3152		__ip_vs_get_timeouts(ipvs, &t);
3153		if (copy_to_user(user, &t, sizeof(t)) != 0)
3154			ret = -EFAULT;
3155	}
3156	break;
3157
3158	default:
3159		ret = -EINVAL;
3160	}
3161
3162out:
3163	mutex_unlock(&__ip_vs_mutex);
3164	return ret;
3165}
3166
3167
/*
 * Netfilter sockopt registration for the legacy IPVS interface on PF_INET
 * sockets.  Both option ranges start at IP_VS_BASE_CTL; the *_optmax
 * values are set one past the highest command number.
 */
static struct nf_sockopt_ops ip_vs_sockopts = {
	.pf		= PF_INET,
	.set_optmin	= IP_VS_BASE_CTL,
	.set_optmax	= IP_VS_SO_SET_MAX+1,
	.set		= do_ip_vs_set_ctl,
	.get_optmin	= IP_VS_BASE_CTL,
	.get_optmax	= IP_VS_SO_GET_MAX+1,
	.get		= do_ip_vs_get_ctl,
	.owner		= THIS_MODULE,
};
3178
3179/*
3180 * Generic Netlink interface
3181 */
3182
/*
 * IPVS genetlink family.  Declared here without an initializer so that the
 * message helpers below (e.g. ip_vs_genl_dump_service()) can reference it.
 * NOTE(review): the initialized definition is presumably further down in
 * this file -- confirm.
 */
static struct genl_family ip_vs_genl_family;
3185
/*
 * Policy used for first-level command attributes: the service, destination
 * and daemon descriptions are nested attribute sets, the three timeouts
 * are plain u32 values.
 */
static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
	[IPVS_CMD_ATTR_SERVICE]		= { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_DEST]		= { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_DAEMON]		= { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_TIMEOUT_TCP]	= { .type = NLA_U32 },
	[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]	= { .type = NLA_U32 },
	[IPVS_CMD_ATTR_TIMEOUT_UDP]	= { .type = NLA_U32 },
};
3195
/*
 * Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON.
 * The interface name is a NUL-terminated string of at most
 * IP_VS_IFNAME_MAXLEN - 1 characters; IPVS_DAEMON_ATTR_MCAST_GROUP6 sets
 * only a length (sizeof(struct in6_addr)) and no explicit type.
 */
static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
	[IPVS_DAEMON_ATTR_STATE]	= { .type = NLA_U32 },
	[IPVS_DAEMON_ATTR_MCAST_IFN]	= { .type = NLA_NUL_STRING,
					    .len = IP_VS_IFNAME_MAXLEN - 1 },
	[IPVS_DAEMON_ATTR_SYNC_ID]	= { .type = NLA_U32 },
	[IPVS_DAEMON_ATTR_SYNC_MAXLEN]	= { .type = NLA_U16 },
	[IPVS_DAEMON_ATTR_MCAST_GROUP]	= { .type = NLA_U32 },
	[IPVS_DAEMON_ATTR_MCAST_GROUP6]	= { .len = sizeof(struct in6_addr) },
	[IPVS_DAEMON_ATTR_MCAST_PORT]	= { .type = NLA_U16 },
	[IPVS_DAEMON_ATTR_MCAST_TTL]	= { .type = NLA_U8 },
};
3208
/*
 * Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE.
 * Address and flags are binary blobs bounded by the size of union
 * nf_inet_addr and struct ip_vs_flags respectively.  Note that the two
 * name attributes use different bounds: IP_VS_SCHEDNAME_MAXLEN - 1 for the
 * scheduler name, IP_VS_PENAME_MAXLEN for the PE name.
 */
static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
	[IPVS_SVC_ATTR_AF]		= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_PROTOCOL]	= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_ADDR]		= { .type = NLA_BINARY,
					    .len = sizeof(union nf_inet_addr) },
	[IPVS_SVC_ATTR_PORT]		= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_SCHED_NAME]	= { .type = NLA_NUL_STRING,
					    .len = IP_VS_SCHEDNAME_MAXLEN - 1 },
	[IPVS_SVC_ATTR_PE_NAME]		= { .type = NLA_NUL_STRING,
					    .len = IP_VS_PENAME_MAXLEN },
	[IPVS_SVC_ATTR_FLAGS]		= { .type = NLA_BINARY,
					    .len = sizeof(struct ip_vs_flags) },
	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
};
3227
/*
 * Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST.
 * The address is a binary blob bounded by sizeof(union nf_inet_addr); the
 * statistics are a nested attribute set; everything else is a fixed-width
 * integer.
 */
static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
	[IPVS_DEST_ATTR_ADDR]		= { .type = NLA_BINARY,
					    .len = sizeof(union nf_inet_addr) },
	[IPVS_DEST_ATTR_PORT]		= { .type = NLA_U16 },
	[IPVS_DEST_ATTR_FWD_METHOD]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_WEIGHT]		= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_U_THRESH]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_L_THRESH]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_ACTIVE_CONNS]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_INACT_CONNS]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_PERSIST_CONNS]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_STATS]		= { .type = NLA_NESTED },
	[IPVS_DEST_ATTR_ADDR_FAMILY]	= { .type = NLA_U16 },
	[IPVS_DEST_ATTR_TUN_TYPE]	= { .type = NLA_U8 },
	[IPVS_DEST_ATTR_TUN_PORT]	= { .type = NLA_U16 },
	[IPVS_DEST_ATTR_TUN_FLAGS]	= { .type = NLA_U16 },
};
3246
3247static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
3248				 struct ip_vs_kstats *kstats)
3249{
3250	struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type);
3251
3252	if (!nl_stats)
3253		return -EMSGSIZE;
3254
3255	if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) ||
3256	    nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) ||
3257	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) ||
3258	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes,
3259			      IPVS_STATS_ATTR_PAD) ||
3260	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes,
3261			      IPVS_STATS_ATTR_PAD) ||
3262	    nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) ||
3263	    nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) ||
3264	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) ||
3265	    nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) ||
3266	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps))
3267		goto nla_put_failure;
3268	nla_nest_end(skb, nl_stats);
3269
3270	return 0;
3271
3272nla_put_failure:
3273	nla_nest_cancel(skb, nl_stats);
3274	return -EMSGSIZE;
3275}
3276
3277static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type,
3278				   struct ip_vs_kstats *kstats)
3279{
3280	struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type);
3281
3282	if (!nl_stats)
3283		return -EMSGSIZE;
3284
3285	if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns,
3286			      IPVS_STATS_ATTR_PAD) ||
3287	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts,
3288			      IPVS_STATS_ATTR_PAD) ||
3289	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts,
3290			      IPVS_STATS_ATTR_PAD) ||
3291	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes,
3292			      IPVS_STATS_ATTR_PAD) ||
3293	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes,
3294			      IPVS_STATS_ATTR_PAD) ||
3295	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps,
3296			      IPVS_STATS_ATTR_PAD) ||
3297	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps,
3298			      IPVS_STATS_ATTR_PAD) ||
3299	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps,
3300			      IPVS_STATS_ATTR_PAD) ||
3301	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps,
3302			      IPVS_STATS_ATTR_PAD) ||
3303	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps,
3304			      IPVS_STATS_ATTR_PAD))
3305		goto nla_put_failure;
3306	nla_nest_end(skb, nl_stats);
3307
3308	return 0;
3309
3310nla_put_failure:
3311	nla_nest_cancel(skb, nl_stats);
3312	return -EMSGSIZE;
3313}
3314
3315static int ip_vs_genl_fill_service(struct sk_buff *skb,
3316				   struct ip_vs_service *svc)
3317{
3318	struct ip_vs_scheduler *sched;
3319	struct ip_vs_pe *pe;
3320	struct nlattr *nl_service;
3321	struct ip_vs_flags flags = { .flags = svc->flags,
3322				     .mask = ~0 };
3323	struct ip_vs_kstats kstats;
3324	char *sched_name;
3325
3326	nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
3327	if (!nl_service)
3328		return -EMSGSIZE;
3329
3330	if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
3331		goto nla_put_failure;
3332	if (svc->fwmark) {
3333		if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
3334			goto nla_put_failure;
3335	} else {
3336		if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
3337		    nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
3338		    nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
3339			goto nla_put_failure;
3340	}
3341
3342	sched = rcu_dereference_protected(svc->scheduler, 1);
3343	sched_name = sched ? sched->name : "none";
3344	pe = rcu_dereference_protected(svc->pe, 1);
3345	if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) ||
3346	    (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
3347	    nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
3348	    nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
3349	    nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
3350		goto nla_put_failure;
3351	ip_vs_copy_stats(&kstats, &svc->stats);
3352	if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats))
3353		goto nla_put_failure;
3354	if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats))
3355		goto nla_put_failure;
3356
3357	nla_nest_end(skb, nl_service);
3358
3359	return 0;
3360
3361nla_put_failure:
3362	nla_nest_cancel(skb, nl_service);
3363	return -EMSGSIZE;
3364}
3365
3366static int ip_vs_genl_dump_service(struct sk_buff *skb,
3367				   struct ip_vs_service *svc,
3368				   struct netlink_callback *cb)
3369{
3370	void *hdr;
3371
3372	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3373			  &ip_vs_genl_family, NLM_F_MULTI,
3374			  IPVS_CMD_NEW_SERVICE);
3375	if (!hdr)
3376		return -EMSGSIZE;
3377
3378	if (ip_vs_genl_fill_service(skb, svc) < 0)
3379		goto nla_put_failure;
3380
3381	genlmsg_end(skb, hdr);
3382	return 0;
3383
3384nla_put_failure:
3385	genlmsg_cancel(skb, hdr);
3386	return -EMSGSIZE;
3387}
3388
3389static int ip_vs_genl_dump_services(struct sk_buff *skb,
3390				    struct netlink_callback *cb)
3391{
3392	int idx = 0, i;
3393	int start = cb->args[0];
3394	struct ip_vs_service *svc;
3395	struct net *net = sock_net(skb->sk);
3396	struct netns_ipvs *ipvs = net_ipvs(net);
3397
3398	mutex_lock(&__ip_vs_mutex);
3399	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
3400		hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
3401			if (++idx <= start || (svc->ipvs != ipvs))
3402				continue;
3403			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
3404				idx--;
3405				goto nla_put_failure;
3406			}
3407		}
3408	}
3409
3410	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
3411		hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
3412			if (++idx <= start || (svc->ipvs != ipvs))
3413				continue;
3414			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
3415				idx--;
3416				goto nla_put_failure;
3417			}
3418		}
3419	}
3420
3421nla_put_failure:
3422	mutex_unlock(&__ip_vs_mutex);
3423	cb->args[0] = idx;
3424
3425	return skb->len;
3426}
3427
/*
 * Tell whether @af is an address family this build can handle: AF_INET
 * always, AF_INET6 only when CONFIG_IP_VS_IPV6 is set and
 * ipv6_mod_enabled() reports true.
 */
static bool ip_vs_is_af_valid(int af)
{
	switch (af) {
	case AF_INET:
		return true;
#ifdef CONFIG_IP_VS_IPV6
	case AF_INET6:
		return ipv6_mod_enabled();
#endif
	default:
		return false;
	}
}
3438
3439static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs,
3440				    struct ip_vs_service_user_kern *usvc,
3441				    struct nlattr *nla, bool full_entry,
3442				    struct ip_vs_service **ret_svc)
3443{
3444	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
3445	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
3446	struct ip_vs_service *svc;
3447
3448	/* Parse mandatory identifying service fields first */
3449	if (nla == NULL ||
3450	    nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL))
3451		return -EINVAL;
3452
3453	nla_af		= attrs[IPVS_SVC_ATTR_AF];
3454	nla_protocol	= attrs[IPVS_SVC_ATTR_PROTOCOL];
3455	nla_addr	= attrs[IPVS_SVC_ATTR_ADDR];
3456	nla_port	= attrs[IPVS_SVC_ATTR_PORT];
3457	nla_fwmark	= attrs[IPVS_SVC_ATTR_FWMARK];
3458
3459	if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
3460		return -EINVAL;
3461
3462	memset(usvc, 0, sizeof(*usvc));
3463
3464	usvc->af = nla_get_u16(nla_af);
3465	if (!ip_vs_is_af_valid(usvc->af))
3466		return -EAFNOSUPPORT;
3467
3468	if (nla_fwmark) {
3469		usvc->protocol = IPPROTO_TCP;
3470		usvc->fwmark = nla_get_u32(nla_fwmark);
3471	} else {
3472		usvc->protocol = nla_get_u16(nla_protocol);
3473		nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
3474		usvc->port = nla_get_be16(nla_port);
3475		usvc->fwmark = 0;
3476	}
3477
3478	rcu_read_lock();
3479	if (usvc->fwmark)
3480		svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark);
3481	else
3482		svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol,
3483					   &usvc->addr, usvc->port);
3484	rcu_read_unlock();
3485	*ret_svc = svc;
3486
3487	/* If a full entry was requested, check for the additional fields */
3488	if (full_entry) {
3489		struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
3490			      *nla_netmask;
3491		struct ip_vs_flags flags;
3492
3493		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
3494		nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
3495		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
3496		nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
3497		nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
3498
3499		if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
3500			return -EINVAL;
3501
3502		nla_memcpy(&flags, nla_flags, sizeof(flags));
3503
3504		/* prefill flags from service if it already exists */
3505		if (svc)
3506			usvc->flags = svc->flags;
3507
3508		/* set new flags from userland */
3509		usvc->flags = (usvc->flags & ~flags.mask) |
3510			      (flags.flags & flags.mask);
3511		usvc->sched_name = nla_data(nla_sched);
3512		usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
3513		usvc->timeout = nla_get_u32(nla_timeout);
3514		usvc->netmask = nla_get_be32(nla_netmask);
3515	}
3516
3517	return 0;
3518}
3519
3520static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs,
3521						     struct nlattr *nla)
3522{
3523	struct ip_vs_service_user_kern usvc;
3524	struct ip_vs_service *svc;
3525	int ret;
3526
3527	ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, false, &svc);
3528	return ret ? ERR_PTR(ret) : svc;
3529}
3530
3531static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
3532{
3533	struct nlattr *nl_dest;
3534	struct ip_vs_kstats kstats;
3535
3536	nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST);
3537	if (!nl_dest)
3538		return -EMSGSIZE;
3539
3540	if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
3541	    nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
3542	    nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
3543			(atomic_read(&dest->conn_flags) &
3544			 IP_VS_CONN_F_FWD_MASK)) ||
3545	    nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
3546			atomic_read(&dest->weight)) ||
3547	    nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE,
3548		       dest->tun_type) ||
3549	    nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
3550			 dest->tun_port) ||
3551	    nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS,
3552			dest->tun_flags) ||
3553	    nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
3554	    nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
3555	    nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
3556			atomic_read(&dest->activeconns)) ||
3557	    nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
3558			atomic_read(&dest->inactconns)) ||
3559	    nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
3560			atomic_read(&dest->persistconns)) ||
3561	    nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af))
3562		goto nla_put_failure;
3563	ip_vs_copy_stats(&kstats, &dest->stats);
3564	if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats))
3565		goto nla_put_failure;
3566	if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats))
3567		goto nla_put_failure;
3568
3569	nla_nest_end(skb, nl_dest);
3570
3571	return 0;
3572
3573nla_put_failure:
3574	nla_nest_cancel(skb, nl_dest);
3575	return -EMSGSIZE;
3576}
3577
3578static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
3579				struct netlink_callback *cb)
3580{
3581	void *hdr;
3582
3583	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3584			  &ip_vs_genl_family, NLM_F_MULTI,
3585			  IPVS_CMD_NEW_DEST);
3586	if (!hdr)
3587		return -EMSGSIZE;
3588
3589	if (ip_vs_genl_fill_dest(skb, dest) < 0)
3590		goto nla_put_failure;
3591
3592	genlmsg_end(skb, hdr);
3593	return 0;
3594
3595nla_put_failure:
3596	genlmsg_cancel(skb, hdr);
3597	return -EMSGSIZE;
3598}
3599
3600static int ip_vs_genl_dump_dests(struct sk_buff *skb,
3601				 struct netlink_callback *cb)
3602{
3603	int idx = 0;
3604	int start = cb->args[0];
3605	struct ip_vs_service *svc;
3606	struct ip_vs_dest *dest;
3607	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
3608	struct net *net = sock_net(skb->sk);
3609	struct netns_ipvs *ipvs = net_ipvs(net);
3610
3611	mutex_lock(&__ip_vs_mutex);
3612
3613	/* Try to find the service for which to dump destinations */
3614	if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack))
3615		goto out_err;
3616
3617
3618	svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]);
3619	if (IS_ERR_OR_NULL(svc))
3620		goto out_err;
3621
3622	/* Dump the destinations */
3623	list_for_each_entry(dest, &svc->destinations, n_list) {
3624		if (++idx <= start)
3625			continue;
3626		if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
3627			idx--;
3628			goto nla_put_failure;
3629		}
3630	}
3631
3632nla_put_failure:
3633	cb->args[0] = idx;
3634
3635out_err:
3636	mutex_unlock(&__ip_vs_mutex);
3637
3638	return skb->len;
3639}
3640
3641static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
3642				 struct nlattr *nla, bool full_entry)
3643{
3644	struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
3645	struct nlattr *nla_addr, *nla_port;
3646	struct nlattr *nla_addr_family;
3647
3648	/* Parse mandatory identifying destination fields first */
3649	if (nla == NULL ||
3650	    nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy, NULL))
3651		return -EINVAL;
3652
3653	nla_addr	= attrs[IPVS_DEST_ATTR_ADDR];
3654	nla_port	= attrs[IPVS_DEST_ATTR_PORT];
3655	nla_addr_family	= attrs[IPVS_DEST_ATTR_ADDR_FAMILY];
3656
3657	if (!(nla_addr && nla_port))
3658		return -EINVAL;
3659
3660	memset(udest, 0, sizeof(*udest));
3661
3662	nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
3663	udest->port = nla_get_be16(nla_port);
3664
3665	if (nla_addr_family)
3666		udest->af = nla_get_u16(nla_addr_family);
3667	else
3668		udest->af = 0;
3669
3670	/* If a full entry was requested, check for the additional fields */
3671	if (full_entry) {
3672		struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
3673			      *nla_l_thresh, *nla_tun_type, *nla_tun_port,
3674			      *nla_tun_flags;
3675
3676		nla_fwd		= attrs[IPVS_DEST_ATTR_FWD_METHOD];
3677		nla_weight	= attrs[IPVS_DEST_ATTR_WEIGHT];
3678		nla_u_thresh	= attrs[IPVS_DEST_ATTR_U_THRESH];
3679		nla_l_thresh	= attrs[IPVS_DEST_ATTR_L_THRESH];
3680		nla_tun_type	= attrs[IPVS_DEST_ATTR_TUN_TYPE];
3681		nla_tun_port	= attrs[IPVS_DEST_ATTR_TUN_PORT];
3682		nla_tun_flags	= attrs[IPVS_DEST_ATTR_TUN_FLAGS];
3683
3684		if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
3685			return -EINVAL;
3686
3687		udest->conn_flags = nla_get_u32(nla_fwd)
3688				    & IP_VS_CONN_F_FWD_MASK;
3689		udest->weight = nla_get_u32(nla_weight);
3690		udest->u_threshold = nla_get_u32(nla_u_thresh);
3691		udest->l_threshold = nla_get_u32(nla_l_thresh);
3692
3693		if (nla_tun_type)
3694			udest->tun_type = nla_get_u8(nla_tun_type);
3695
3696		if (nla_tun_port)
3697			udest->tun_port = nla_get_be16(nla_tun_port);
3698
3699		if (nla_tun_flags)
3700			udest->tun_flags = nla_get_u16(nla_tun_flags);
3701	}
3702
3703	return 0;
3704}
3705
3706static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
3707				  struct ipvs_sync_daemon_cfg *c)
3708{
3709	struct nlattr *nl_daemon;
3710
3711	nl_daemon = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON);
3712	if (!nl_daemon)
3713		return -EMSGSIZE;
3714
3715	if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
3716	    nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) ||
3717	    nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) ||
3718	    nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) ||
3719	    nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) ||
3720	    nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl))
3721		goto nla_put_failure;
3722#ifdef CONFIG_IP_VS_IPV6
3723	if (c->mcast_af == AF_INET6) {
3724		if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6,
3725				     &c->mcast_group.in6))
3726			goto nla_put_failure;
3727	} else
3728#endif
3729		if (c->mcast_af == AF_INET &&
3730		    nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP,
3731				    c->mcast_group.ip))
3732			goto nla_put_failure;
3733	nla_nest_end(skb, nl_daemon);
3734
3735	return 0;
3736
3737nla_put_failure:
3738	nla_nest_cancel(skb, nl_daemon);
3739	return -EMSGSIZE;
3740}
3741
3742static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
3743				  struct ipvs_sync_daemon_cfg *c,
3744				  struct netlink_callback *cb)
3745{
3746	void *hdr;
3747	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3748			  &ip_vs_genl_family, NLM_F_MULTI,
3749			  IPVS_CMD_NEW_DAEMON);
3750	if (!hdr)
3751		return -EMSGSIZE;
3752
3753	if (ip_vs_genl_fill_daemon(skb, state, c))
3754		goto nla_put_failure;
3755
3756	genlmsg_end(skb, hdr);
3757	return 0;
3758
3759nla_put_failure:
3760	genlmsg_cancel(skb, hdr);
3761	return -EMSGSIZE;
3762}
3763
3764static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
3765				   struct netlink_callback *cb)
3766{
3767	struct net *net = sock_net(skb->sk);
3768	struct netns_ipvs *ipvs = net_ipvs(net);
3769
3770	mutex_lock(&ipvs->sync_mutex);
3771	if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
3772		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
3773					   &ipvs->mcfg, cb) < 0)
3774			goto nla_put_failure;
3775
3776		cb->args[0] = 1;
3777	}
3778
3779	if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
3780		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
3781					   &ipvs->bcfg, cb) < 0)
3782			goto nla_put_failure;
3783
3784		cb->args[1] = 1;
3785	}
3786
3787nla_put_failure:
3788	mutex_unlock(&ipvs->sync_mutex);
3789
3790	return skb->len;
3791}
3792
3793static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
3794{
3795	struct ipvs_sync_daemon_cfg c;
3796	struct nlattr *a;
3797	int ret;
3798
3799	memset(&c, 0, sizeof(c));
3800	if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
3801	      attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
3802	      attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
3803		return -EINVAL;
3804	strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
3805		sizeof(c.mcast_ifn));
3806	c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]);
3807
3808	a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN];
3809	if (a)
3810		c.sync_maxlen = nla_get_u16(a);
3811
3812	a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP];
3813	if (a) {
3814		c.mcast_af = AF_INET;
3815		c.mcast_group.ip = nla_get_in_addr(a);
3816		if (!ipv4_is_multicast(c.mcast_group.ip))
3817			return -EINVAL;
3818	} else {
3819		a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6];
3820		if (a) {
3821#ifdef CONFIG_IP_VS_IPV6
3822			int addr_type;
3823
3824			c.mcast_af = AF_INET6;
3825			c.mcast_group.in6 = nla_get_in6_addr(a);
3826			addr_type = ipv6_addr_type(&c.mcast_group.in6);
3827			if (!(addr_type & IPV6_ADDR_MULTICAST))
3828				return -EINVAL;
3829#else
3830			return -EAFNOSUPPORT;
3831#endif
3832		}
3833	}
3834
3835	a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT];
3836	if (a)
3837		c.mcast_port = nla_get_u16(a);
3838
3839	a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL];
3840	if (a)
3841		c.mcast_ttl = nla_get_u8(a);
3842
3843	/* The synchronization protocol is incompatible with mixed family
3844	 * services
3845	 */
3846	if (ipvs->mixed_address_family_dests > 0)
3847		return -EINVAL;
3848
3849	ret = start_sync_thread(ipvs, &c,
3850				nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3851	return ret;
3852}
3853
3854static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
3855{
3856	int ret;
3857
3858	if (!attrs[IPVS_DAEMON_ATTR_STATE])
3859		return -EINVAL;
3860
3861	ret = stop_sync_thread(ipvs,
3862			       nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3863	return ret;
3864}
3865
3866static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs)
3867{
3868	struct ip_vs_timeout_user t;
3869
3870	__ip_vs_get_timeouts(ipvs, &t);
3871
3872	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
3873		t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
3874
3875	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
3876		t.tcp_fin_timeout =
3877			nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
3878
3879	if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
3880		t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
3881
3882	return ip_vs_set_timeout(ipvs, &t);
3883}
3884
3885static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
3886{
3887	int ret = -EINVAL, cmd;
3888	struct net *net = sock_net(skb->sk);
3889	struct netns_ipvs *ipvs = net_ipvs(net);
3890
3891	cmd = info->genlhdr->cmd;
3892
3893	if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
3894		struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
3895
3896		if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
3897		    nla_parse_nested_deprecated(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], ip_vs_daemon_policy, info->extack))
3898			goto out;
3899
3900		if (cmd == IPVS_CMD_NEW_DAEMON)
3901			ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs);
3902		else
3903			ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs);
3904	}
3905
3906out:
3907	return ret;
3908}
3909
3910static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3911{
3912	bool need_full_svc = false, need_full_dest = false;
3913	struct ip_vs_service *svc = NULL;
3914	struct ip_vs_service_user_kern usvc;
3915	struct ip_vs_dest_user_kern udest;
3916	int ret = 0, cmd;
3917	struct net *net = sock_net(skb->sk);
3918	struct netns_ipvs *ipvs = net_ipvs(net);
3919
3920	cmd = info->genlhdr->cmd;
3921
3922	mutex_lock(&__ip_vs_mutex);
3923
3924	if (cmd == IPVS_CMD_FLUSH) {
3925		ret = ip_vs_flush(ipvs, false);
3926		goto out;
3927	} else if (cmd == IPVS_CMD_SET_CONFIG) {
3928		ret = ip_vs_genl_set_config(ipvs, info->attrs);
3929		goto out;
3930	} else if (cmd == IPVS_CMD_ZERO &&
3931		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3932		ret = ip_vs_zero_all(ipvs);
3933		goto out;
3934	}
3935
3936	/* All following commands require a service argument, so check if we
3937	 * received a valid one. We need a full service specification when
3938	 * adding / editing a service. Only identifying members otherwise. */
3939	if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
3940		need_full_svc = true;
3941
3942	ret = ip_vs_genl_parse_service(ipvs, &usvc,
3943				       info->attrs[IPVS_CMD_ATTR_SERVICE],
3944				       need_full_svc, &svc);
3945	if (ret)
3946		goto out;
3947
3948	/* Unless we're adding a new service, the service must already exist */
3949	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
3950		ret = -ESRCH;
3951		goto out;
3952	}
3953
3954	/* Destination commands require a valid destination argument. For
3955	 * adding / editing a destination, we need a full destination
3956	 * specification. */
3957	if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
3958	    cmd == IPVS_CMD_DEL_DEST) {
3959		if (cmd != IPVS_CMD_DEL_DEST)
3960			need_full_dest = true;
3961
3962		ret = ip_vs_genl_parse_dest(&udest,
3963					    info->attrs[IPVS_CMD_ATTR_DEST],
3964					    need_full_dest);
3965		if (ret)
3966			goto out;
3967
3968		/* Old protocols did not allow the user to specify address
3969		 * family, so we set it to zero instead.  We also didn't
3970		 * allow heterogeneous pools in the old code, so it's safe
3971		 * to assume that this will have the same address family as
3972		 * the service.
3973		 */
3974		if (udest.af == 0)
3975			udest.af = svc->af;
3976
3977		if (!ip_vs_is_af_valid(udest.af)) {
3978			ret = -EAFNOSUPPORT;
3979			goto out;
3980		}
3981
3982		if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) {
3983			/* The synchronization protocol is incompatible
3984			 * with mixed family services
3985			 */
3986			if (ipvs->sync_state) {
3987				ret = -EINVAL;
3988				goto out;
3989			}
3990
3991			/* Which connection types do we support? */
3992			switch (udest.conn_flags) {
3993			case IP_VS_CONN_F_TUNNEL:
3994				/* We are able to forward this */
3995				break;
3996			default:
3997				ret = -EINVAL;
3998				goto out;
3999			}
4000		}
4001	}
4002
4003	switch (cmd) {
4004	case IPVS_CMD_NEW_SERVICE:
4005		if (svc == NULL)
4006			ret = ip_vs_add_service(ipvs, &usvc, &svc);
4007		else
4008			ret = -EEXIST;
4009		break;
4010	case IPVS_CMD_SET_SERVICE:
4011		ret = ip_vs_edit_service(svc, &usvc);
4012		break;
4013	case IPVS_CMD_DEL_SERVICE:
4014		ret = ip_vs_del_service(svc);
4015		/* do not use svc, it can be freed */
4016		break;
4017	case IPVS_CMD_NEW_DEST:
4018		ret = ip_vs_add_dest(svc, &udest);
4019		break;
4020	case IPVS_CMD_SET_DEST:
4021		ret = ip_vs_edit_dest(svc, &udest);
4022		break;
4023	case IPVS_CMD_DEL_DEST:
4024		ret = ip_vs_del_dest(svc, &udest);
4025		break;
4026	case IPVS_CMD_ZERO:
4027		ret = ip_vs_zero_service(svc);
4028		break;
4029	default:
4030		ret = -EINVAL;
4031	}
4032
4033out:
4034	mutex_unlock(&__ip_vs_mutex);
4035
4036	return ret;
4037}
4038
4039static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
4040{
4041	struct sk_buff *msg;
4042	void *reply;
4043	int ret, cmd, reply_cmd;
4044	struct net *net = sock_net(skb->sk);
4045	struct netns_ipvs *ipvs = net_ipvs(net);
4046
4047	cmd = info->genlhdr->cmd;
4048
4049	if (cmd == IPVS_CMD_GET_SERVICE)
4050		reply_cmd = IPVS_CMD_NEW_SERVICE;
4051	else if (cmd == IPVS_CMD_GET_INFO)
4052		reply_cmd = IPVS_CMD_SET_INFO;
4053	else if (cmd == IPVS_CMD_GET_CONFIG)
4054		reply_cmd = IPVS_CMD_SET_CONFIG;
4055	else {
4056		pr_err("unknown Generic Netlink command\n");
4057		return -EINVAL;
4058	}
4059
4060	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
4061	if (!msg)
4062		return -ENOMEM;
4063
4064	mutex_lock(&__ip_vs_mutex);
4065
4066	reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
4067	if (reply == NULL)
4068		goto nla_put_failure;
4069
4070	switch (cmd) {
4071	case IPVS_CMD_GET_SERVICE:
4072	{
4073		struct ip_vs_service *svc;
4074
4075		svc = ip_vs_genl_find_service(ipvs,
4076					      info->attrs[IPVS_CMD_ATTR_SERVICE]);
4077		if (IS_ERR(svc)) {
4078			ret = PTR_ERR(svc);
4079			goto out_err;
4080		} else if (svc) {
4081			ret = ip_vs_genl_fill_service(msg, svc);
4082			if (ret)
4083				goto nla_put_failure;
4084		} else {
4085			ret = -ESRCH;
4086			goto out_err;
4087		}
4088
4089		break;
4090	}
4091
4092	case IPVS_CMD_GET_CONFIG:
4093	{
4094		struct ip_vs_timeout_user t;
4095
4096		__ip_vs_get_timeouts(ipvs, &t);
4097#ifdef CONFIG_IP_VS_PROTO_TCP
4098		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
4099				t.tcp_timeout) ||
4100		    nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
4101				t.tcp_fin_timeout))
4102			goto nla_put_failure;
4103#endif
4104#ifdef CONFIG_IP_VS_PROTO_UDP
4105		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
4106			goto nla_put_failure;
4107#endif
4108
4109		break;
4110	}
4111
4112	case IPVS_CMD_GET_INFO:
4113		if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
4114				IP_VS_VERSION_CODE) ||
4115		    nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
4116				ip_vs_conn_tab_size))
4117			goto nla_put_failure;
4118		break;
4119	}
4120
4121	genlmsg_end(msg, reply);
4122	ret = genlmsg_reply(msg, info);
4123	goto out;
4124
4125nla_put_failure:
4126	pr_err("not enough space in Netlink message\n");
4127	ret = -EMSGSIZE;
4128
4129out_err:
4130	nlmsg_free(msg);
4131out:
4132	mutex_unlock(&__ip_vs_mutex);
4133
4134	return ret;
4135}
4136
4137
4138static const struct genl_small_ops ip_vs_genl_ops[] = {
4139	{
4140		.cmd	= IPVS_CMD_NEW_SERVICE,
4141		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4142		.flags	= GENL_ADMIN_PERM,
4143		.doit	= ip_vs_genl_set_cmd,
4144	},
4145	{
4146		.cmd	= IPVS_CMD_SET_SERVICE,
4147		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4148		.flags	= GENL_ADMIN_PERM,
4149		.doit	= ip_vs_genl_set_cmd,
4150	},
4151	{
4152		.cmd	= IPVS_CMD_DEL_SERVICE,
4153		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4154		.flags	= GENL_ADMIN_PERM,
4155		.doit	= ip_vs_genl_set_cmd,
4156	},
4157	{
4158		.cmd	= IPVS_CMD_GET_SERVICE,
4159		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4160		.flags	= GENL_ADMIN_PERM,
4161		.doit	= ip_vs_genl_get_cmd,
4162		.dumpit	= ip_vs_genl_dump_services,
4163	},
4164	{
4165		.cmd	= IPVS_CMD_NEW_DEST,
4166		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4167		.flags	= GENL_ADMIN_PERM,
4168		.doit	= ip_vs_genl_set_cmd,
4169	},
4170	{
4171		.cmd	= IPVS_CMD_SET_DEST,
4172		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4173		.flags	= GENL_ADMIN_PERM,
4174		.doit	= ip_vs_genl_set_cmd,
4175	},
4176	{
4177		.cmd	= IPVS_CMD_DEL_DEST,
4178		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4179		.flags	= GENL_ADMIN_PERM,
4180		.doit	= ip_vs_genl_set_cmd,
4181	},
4182	{
4183		.cmd	= IPVS_CMD_GET_DEST,
4184		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4185		.flags	= GENL_ADMIN_PERM,
4186		.dumpit	= ip_vs_genl_dump_dests,
4187	},
4188	{
4189		.cmd	= IPVS_CMD_NEW_DAEMON,
4190		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4191		.flags	= GENL_ADMIN_PERM,
4192		.doit	= ip_vs_genl_set_daemon,
4193	},
4194	{
4195		.cmd	= IPVS_CMD_DEL_DAEMON,
4196		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4197		.flags	= GENL_ADMIN_PERM,
4198		.doit	= ip_vs_genl_set_daemon,
4199	},
4200	{
4201		.cmd	= IPVS_CMD_GET_DAEMON,
4202		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4203		.flags	= GENL_ADMIN_PERM,
4204		.dumpit	= ip_vs_genl_dump_daemons,
4205	},
4206	{
4207		.cmd	= IPVS_CMD_SET_CONFIG,
4208		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4209		.flags	= GENL_ADMIN_PERM,
4210		.doit	= ip_vs_genl_set_cmd,
4211	},
4212	{
4213		.cmd	= IPVS_CMD_GET_CONFIG,
4214		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4215		.flags	= GENL_ADMIN_PERM,
4216		.doit	= ip_vs_genl_get_cmd,
4217	},
4218	{
4219		.cmd	= IPVS_CMD_GET_INFO,
4220		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4221		.flags	= GENL_ADMIN_PERM,
4222		.doit	= ip_vs_genl_get_cmd,
4223	},
4224	{
4225		.cmd	= IPVS_CMD_ZERO,
4226		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4227		.flags	= GENL_ADMIN_PERM,
4228		.doit	= ip_vs_genl_set_cmd,
4229	},
4230	{
4231		.cmd	= IPVS_CMD_FLUSH,
4232		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4233		.flags	= GENL_ADMIN_PERM,
4234		.doit	= ip_vs_genl_set_cmd,
4235	},
4236};
4237
4238static struct genl_family ip_vs_genl_family __ro_after_init = {
4239	.hdrsize	= 0,
4240	.name		= IPVS_GENL_NAME,
4241	.version	= IPVS_GENL_VERSION,
4242	.maxattr	= IPVS_CMD_ATTR_MAX,
4243	.policy = ip_vs_cmd_policy,
4244	.netnsok        = true,         /* Make ipvsadm to work on netns */
4245	.module		= THIS_MODULE,
4246	.small_ops	= ip_vs_genl_ops,
4247	.n_small_ops	= ARRAY_SIZE(ip_vs_genl_ops),
4248	.resv_start_op	= IPVS_CMD_FLUSH + 1,
4249};
4250
4251static int __init ip_vs_genl_register(void)
4252{
4253	return genl_register_family(&ip_vs_genl_family);
4254}
4255
4256static void ip_vs_genl_unregister(void)
4257{
4258	genl_unregister_family(&ip_vs_genl_family);
4259}
4260
4261/* End of Generic Netlink interface definitions */
4262
4263/*
 * Per-netns init/exit functions.
4265 */
4266#ifdef CONFIG_SYSCTL
4267static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
4268{
4269	struct net *net = ipvs->net;
4270	struct ctl_table *tbl;
4271	int idx, ret;
4272	size_t ctl_table_size = ARRAY_SIZE(vs_vars);
4273
4274	atomic_set(&ipvs->dropentry, 0);
4275	spin_lock_init(&ipvs->dropentry_lock);
4276	spin_lock_init(&ipvs->droppacket_lock);
4277	spin_lock_init(&ipvs->securetcp_lock);
4278	INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
4279	INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
4280			  expire_nodest_conn_handler);
4281	ipvs->est_stopped = 0;
4282
4283	if (!net_eq(net, &init_net)) {
4284		tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
4285		if (tbl == NULL)
4286			return -ENOMEM;
4287
4288		/* Don't export sysctls to unprivileged users */
4289		if (net->user_ns != &init_user_ns) {
4290			tbl[0].procname = NULL;
4291			ctl_table_size = 0;
4292		}
4293	} else
4294		tbl = vs_vars;
4295	/* Initialize sysctl defaults */
4296	for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) {
4297		if (tbl[idx].proc_handler == proc_do_defense_mode)
4298			tbl[idx].extra2 = ipvs;
4299	}
4300	idx = 0;
4301	ipvs->sysctl_amemthresh = 1024;
4302	tbl[idx++].data = &ipvs->sysctl_amemthresh;
4303	ipvs->sysctl_am_droprate = 10;
4304	tbl[idx++].data = &ipvs->sysctl_am_droprate;
4305	tbl[idx++].data = &ipvs->sysctl_drop_entry;
4306	tbl[idx++].data = &ipvs->sysctl_drop_packet;
4307#ifdef CONFIG_IP_VS_NFCT
4308	tbl[idx++].data = &ipvs->sysctl_conntrack;
4309#endif
4310	tbl[idx++].data = &ipvs->sysctl_secure_tcp;
4311	ipvs->sysctl_snat_reroute = 1;
4312	tbl[idx++].data = &ipvs->sysctl_snat_reroute;
4313	ipvs->sysctl_sync_ver = 1;
4314	tbl[idx++].data = &ipvs->sysctl_sync_ver;
4315	ipvs->sysctl_sync_ports = 1;
4316	tbl[idx++].data = &ipvs->sysctl_sync_ports;
4317	tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;
4318	ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
4319	tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
4320	ipvs->sysctl_sync_sock_size = 0;
4321	tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
4322	tbl[idx++].data = &ipvs->sysctl_cache_bypass;
4323	tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
4324	tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
4325	tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
4326	tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
4327	ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
4328	ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
4329	tbl[idx].data = &ipvs->sysctl_sync_threshold;
4330	tbl[idx].extra2 = ipvs;
4331	tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
4332	ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
4333	tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
4334	ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
4335	tbl[idx++].data = &ipvs->sysctl_sync_retries;
4336	tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
4337	ipvs->sysctl_pmtu_disc = 1;
4338	tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
4339	tbl[idx++].data = &ipvs->sysctl_backup_only;
4340	ipvs->sysctl_conn_reuse_mode = 1;
4341	tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
4342	tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
4343	tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;
4344	ipvs->sysctl_run_estimation = 1;
4345	tbl[idx].extra2 = ipvs;
4346	tbl[idx++].data = &ipvs->sysctl_run_estimation;
4347
4348	ipvs->est_cpulist_valid = 0;
4349	tbl[idx].extra2 = ipvs;
4350	tbl[idx++].data = &ipvs->sysctl_est_cpulist;
4351
4352	ipvs->sysctl_est_nice = IPVS_EST_NICE;
4353	tbl[idx].extra2 = ipvs;
4354	tbl[idx++].data = &ipvs->sysctl_est_nice;
4355
4356#ifdef CONFIG_IP_VS_DEBUG
4357	/* Global sysctls must be ro in non-init netns */
4358	if (!net_eq(net, &init_net))
4359		tbl[idx++].mode = 0444;
4360#endif
4361
4362	ret = -ENOMEM;
4363	ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl,
4364						  ctl_table_size);
4365	if (!ipvs->sysctl_hdr)
4366		goto err;
4367	ipvs->sysctl_tbl = tbl;
4368
4369	ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s);
4370	if (ret < 0)
4371		goto err;
4372
4373	/* Schedule defense work */
4374	queue_delayed_work(system_long_wq, &ipvs->defense_work,
4375			   DEFENSE_TIMER_PERIOD);
4376
4377	return 0;
4378
4379err:
4380	unregister_net_sysctl_table(ipvs->sysctl_hdr);
4381	if (!net_eq(net, &init_net))
4382		kfree(tbl);
4383	return ret;
4384}
4385
4386static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
4387{
4388	struct net *net = ipvs->net;
4389
4390	cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work);
4391	cancel_delayed_work_sync(&ipvs->defense_work);
4392	cancel_work_sync(&ipvs->defense_work.work);
4393	unregister_net_sysctl_table(ipvs->sysctl_hdr);
4394	ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
4395
4396	if (ipvs->est_cpulist_valid)
4397		free_cpumask_var(ipvs->sysctl_est_cpulist);
4398
4399	if (!net_eq(net, &init_net))
4400		kfree(ipvs->sysctl_tbl);
4401}
4402
4403#else
4404
4405static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; }
4406static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { }
4407
4408#endif
4409
/*
 * Notifier block passed to register_netdevice_notifier() in
 * ip_vs_control_init() and removed again in ip_vs_control_cleanup().
 * Events are delivered to ip_vs_dst_event().
 */
static struct notifier_block ip_vs_dst_notifier = {
	.notifier_call = ip_vs_dst_event,
#ifdef CONFIG_IP_VS_IPV6
	/*
	 * Explicit priority only on IPv6 builds.
	 * NOTE(review): presumably chosen so this callback runs ahead of
	 * the addrconf notifier - confirm against ADDRCONF_NOTIFY_PRIORITY.
	 */
	.priority = ADDRCONF_NOTIFY_PRIORITY + 5,
#endif
};
4416
4417int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
4418{
4419	int ret = -ENOMEM;
4420	int idx;
4421
4422	/* Initialize rs_table */
4423	for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
4424		INIT_HLIST_HEAD(&ipvs->rs_table[idx]);
4425
4426	INIT_LIST_HEAD(&ipvs->dest_trash);
4427	spin_lock_init(&ipvs->dest_trash_lock);
4428	timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0);
4429	atomic_set(&ipvs->ftpsvc_counter, 0);
4430	atomic_set(&ipvs->nullsvc_counter, 0);
4431	atomic_set(&ipvs->conn_out_counter, 0);
4432
4433	INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler);
4434
4435	/* procfs stats */
4436	ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL);
4437	if (!ipvs->tot_stats)
4438		goto out;
4439	if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0)
4440		goto err_tot_stats;
4441
4442#ifdef CONFIG_PROC_FS
4443	if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net,
4444			     &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter)))
4445		goto err_vs;
4446	if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
4447				    ip_vs_stats_show, NULL))
4448		goto err_stats;
4449	if (!proc_create_net_single("ip_vs_stats_percpu", 0,
4450				    ipvs->net->proc_net,
4451				    ip_vs_stats_percpu_show, NULL))
4452		goto err_percpu;
4453#endif
4454
4455	ret = ip_vs_control_net_init_sysctl(ipvs);
4456	if (ret < 0)
4457		goto err;
4458
4459	return 0;
4460
4461err:
4462#ifdef CONFIG_PROC_FS
4463	remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
4464
4465err_percpu:
4466	remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
4467
4468err_stats:
4469	remove_proc_entry("ip_vs", ipvs->net->proc_net);
4470
4471err_vs:
4472#endif
4473	ip_vs_stats_release(&ipvs->tot_stats->s);
4474
4475err_tot_stats:
4476	kfree(ipvs->tot_stats);
4477
4478out:
4479	return ret;
4480}
4481
/*
 * Per-netns teardown counterpart of ip_vs_control_net_init(): cleans up
 * the destination trash, runs the sysctl cleanup, cancels the estimator
 * reload work, removes the procfs entries and hands the total stats to
 * RCU for freeing.
 */
void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
	ip_vs_trash_cleanup(ipvs);
	ip_vs_control_net_cleanup_sysctl(ipvs);
	/* Wait for any queued or running estimator reload before going on */
	cancel_delayed_work_sync(&ipvs->est_reload_work);
#ifdef CONFIG_PROC_FS
	/* Reverse of the creation order in ip_vs_control_net_init() */
	remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
	remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
	remove_proc_entry("ip_vs", ipvs->net->proc_net);
#endif
	/* tot_stats is not freed inline; ip_vs_stats_rcu_free() runs via RCU */
	call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free);
}
4494
4495int __init ip_vs_register_nl_ioctl(void)
4496{
4497	int ret;
4498
4499	ret = nf_register_sockopt(&ip_vs_sockopts);
4500	if (ret) {
4501		pr_err("cannot register sockopt.\n");
4502		goto err_sock;
4503	}
4504
4505	ret = ip_vs_genl_register();
4506	if (ret) {
4507		pr_err("cannot register Generic Netlink interface.\n");
4508		goto err_genl;
4509	}
4510	return 0;
4511
4512err_genl:
4513	nf_unregister_sockopt(&ip_vs_sockopts);
4514err_sock:
4515	return ret;
4516}
4517
/*
 * Undo ip_vs_register_nl_ioctl(): the Generic Netlink interface is
 * unregistered first, then the sockopt handlers (reverse of the
 * registration order).
 */
void ip_vs_unregister_nl_ioctl(void)
{
	ip_vs_genl_unregister();
	nf_unregister_sockopt(&ip_vs_sockopts);
}
4523
4524int __init ip_vs_control_init(void)
4525{
4526	int idx;
4527	int ret;
4528
4529	/* Initialize svc_table, ip_vs_svc_fwm_table */
4530	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4531		INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
4532		INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]);
4533	}
4534
4535	smp_wmb();	/* Do we really need it now ? */
4536
4537	ret = register_netdevice_notifier(&ip_vs_dst_notifier);
4538	if (ret < 0)
4539		return ret;
4540
4541	return 0;
4542}
4543
4544
/*
 * Module-wide counterpart of ip_vs_control_init(): removes the
 * netdevice notifier registered there.
 */
void ip_vs_control_cleanup(void)
{
	unregister_netdevice_notifier(&ip_vs_dst_notifier);
	/* relying on common rcu_barrier() in ip_vs_cleanup() */
}
4550