// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ip_vs_est.c: simple rate estimator for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *
 * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
 *              Network name space (netns) aware.
 *              Global data moved to netns, i.e. struct netns_ipvs
 *              Affected data: est_list and est_lock.
 *              estimation_timer() runs with timer per netns.
 *              get_stats() does the per-CPU summing.
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/sysctl.h>
#include <linux/list.h>
#include <linux/rcupdate_wait.h>

#include <net/ip_vs.h>

/*
  This code estimates rates over a short interval (such as 8
  seconds) for virtual services and real servers. To measure rates over a
  longer interval, it is easy to implement a user-level daemon that
  periodically reads these statistical counters and computes the rate.

  We measure the rate over the last 8 seconds, updating every 2 seconds:

    avgrate = avgrate*(1-W) + rate*W

    where W = 2^(-2)

  NOTES.

  * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10.

  * Netlink users can see 64-bit values but sockopt users are restricted
    to 32-bit values for conns, packets, bps, cps and pps.

  * A lot of code is taken from net/core/gen_estimator.c

  KEY POINTS:
  - cpustats counters are updated per-CPU in SoftIRQ context with BH disabled
  - kthreads read the cpustats to update the estimators (svcs, dests, total)
  - the states of estimators can be read (get stats) or modified (zero stats)
    from process context

  KTHREADS:
  - estimators are added initially to est_temp_list and later kthread 0
    distributes them to one or many kthreads for estimation
  - kthread contexts are created and attached to an array
  - the kthread tasks are started when the first service is added, before that
    the total stats are not estimated
  - when the configuration (cpulist/nice) is changed, the tasks are restarted
    by work (est_reload_work)
  - kthread tasks are stopped while the cpulist is empty
  - the kthread context holds lists of estimators (chains) which are
    processed every 2 seconds
  - as estimators can be added dynamically and in bursts, we try to spread
    them over multiple chains which are estimated at different times
  - on start, kthread 0 enters a calculation phase to determine the chain
    limits and the limit of estimators per kthread
  - est_add_ktid: ktid where to add new ests, can point to an empty slot where
    we should add kt data
 */
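
/* Illustrative example of the scaled EWMA update (values are hypothetical,
 * see ip_vs_chain_estimation() below): with 200 new connections counted in
 * the last 2 seconds the sample and update are
 *
 *	rate = 200 << 9 = 102400	(100 conns/s scaled by 2^10,
 *					 << 9 folds in the division by 2s)
 *	cps += (rate - cps) >> 2	(EWMA with W = 2^(-2))
 *
 * Starting from cps == 0 this yields cps = 25600, which decodes to
 * (25600 + 0x1FF) >> 10 = 25 conns/s in ip_vs_read_estimator().
 * Byte rates use << 4 and are decoded with >> 5 in the same way.
 */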

static struct lock_class_key __ipvs_est_key;

static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs);
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs);

static void ip_vs_chain_estimation(struct hlist_head *chain)
{
	struct ip_vs_estimator *e;
	struct ip_vs_cpu_stats *c;
	struct ip_vs_stats *s;
	u64 rate;

	hlist_for_each_entry_rcu(e, chain, list) {
		u64 conns, inpkts, outpkts, inbytes, outbytes;
		u64 kconns = 0, kinpkts = 0, koutpkts = 0;
		u64 kinbytes = 0, koutbytes = 0;
		unsigned int start;
		int i;

		if (kthread_should_stop())
			break;

		s = container_of(e, struct ip_vs_stats, est);
		for_each_possible_cpu(i) {
			c = per_cpu_ptr(s->cpustats, i);
			do {
				start = u64_stats_fetch_begin(&c->syncp);
				conns = u64_stats_read(&c->cnt.conns);
				inpkts = u64_stats_read(&c->cnt.inpkts);
				outpkts = u64_stats_read(&c->cnt.outpkts);
				inbytes = u64_stats_read(&c->cnt.inbytes);
				outbytes = u64_stats_read(&c->cnt.outbytes);
			} while (u64_stats_fetch_retry(&c->syncp, start));
			kconns += conns;
			kinpkts += inpkts;
			koutpkts += outpkts;
			kinbytes += inbytes;
			koutbytes += outbytes;
		}

		spin_lock(&s->lock);

		s->kstats.conns = kconns;
		s->kstats.inpkts = kinpkts;
		s->kstats.outpkts = koutpkts;
		s->kstats.inbytes = kinbytes;
		s->kstats.outbytes = koutbytes;

		/* scaled by 2^10, but divided by the 2-second interval */
		rate = (s->kstats.conns - e->last_conns) << 9;
		e->last_conns = s->kstats.conns;
		e->cps += ((s64)rate - (s64)e->cps) >> 2;

		rate = (s->kstats.inpkts - e->last_inpkts) << 9;
		e->last_inpkts = s->kstats.inpkts;
		e->inpps += ((s64)rate - (s64)e->inpps) >> 2;

		rate = (s->kstats.outpkts - e->last_outpkts) << 9;
		e->last_outpkts = s->kstats.outpkts;
		e->outpps += ((s64)rate - (s64)e->outpps) >> 2;

		/* scaled by 2^5, but divided by the 2-second interval */
		rate = (s->kstats.inbytes - e->last_inbytes) << 4;
		e->last_inbytes = s->kstats.inbytes;
		e->inbps += ((s64)rate - (s64)e->inbps) >> 2;

		rate = (s->kstats.outbytes - e->last_outbytes) << 4;
		e->last_outbytes = s->kstats.outbytes;
		e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
		spin_unlock(&s->lock);
	}
}

static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row)
{
	struct ip_vs_est_tick_data *td;
	int cid;

	rcu_read_lock();
	td = rcu_dereference(kd->ticks[row]);
	if (!td)
		goto out;
	for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) {
		if (kthread_should_stop())
			break;
		ip_vs_chain_estimation(&td->chains[cid]);
		cond_resched_rcu();
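		/* cond_resched_rcu() may briefly drop the RCU read lock,
		 * allowing the tick data to be replaced or freed, so
		 * re-fetch it before walking the next chain.
		 */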
		td = rcu_dereference(kd->ticks[row]);
		if (!td)
			break;
	}

out:
	rcu_read_unlock();
}

static int ip_vs_estimation_kthread(void *data)
{
	struct ip_vs_est_kt_data *kd = data;
	struct netns_ipvs *ipvs = kd->ipvs;
	int row = kd->est_row;
	unsigned long now;
	int id = kd->id;
	long gap;

	if (id > 0) {
		if (!ipvs->est_chain_max)
			return 0;
	} else {
		if (!ipvs->est_chain_max) {
			ipvs->est_calc_phase = 1;
			/* commit est_calc_phase before reading est_genid */
			smp_mb();
		}

		/* kthread 0 will handle the calc phase */
		if (ipvs->est_calc_phase)
			ip_vs_est_calc_phase(ipvs);
	}

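	/* Main loop (summary): kd->est_timer is the deadline of the next
	 * tick. Each pass sleeps until that deadline (never more than one
	 * IPVS_EST_TICK at a time, resyncing est_timer if it drifted too
	 * far in either direction), estimates one tick row and then
	 * advances the row and the deadline by IPVS_EST_TICK, so all
	 * IPVS_EST_NTICKS rows are visited once per 2-second period.
	 */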
	while (1) {
		if (!id && !hlist_empty(&ipvs->est_temp_list))
			ip_vs_est_drain_temp_list(ipvs);
		set_current_state(TASK_IDLE);
		if (kthread_should_stop())
			break;

		/* before estimation, check if we should sleep */
		now = jiffies;
		gap = kd->est_timer - now;
		if (gap > 0) {
			if (gap > IPVS_EST_TICK) {
				kd->est_timer = now - IPVS_EST_TICK;
				gap = IPVS_EST_TICK;
			}
			schedule_timeout(gap);
		} else {
			__set_current_state(TASK_RUNNING);
			if (gap < -8 * IPVS_EST_TICK)
				kd->est_timer = now;
		}

		if (kd->tick_len[row])
			ip_vs_tick_estimation(kd, row);

		row++;
		if (row >= IPVS_EST_NTICKS)
			row = 0;
		WRITE_ONCE(kd->est_row, row);
		kd->est_timer += IPVS_EST_TICK;
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}

/* Schedule stop/start for kthread tasks */
void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
{
	/* Ignore reloads before first service is added */
	if (!ipvs->enable)
		return;
	ip_vs_est_stopped_recalc(ipvs);
	/* Bump the kthread configuration genid */
	atomic_inc(&ipvs->est_genid);
	queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
}

/* Start kthread task with current configuration */
int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
			    struct ip_vs_est_kt_data *kd)
{
	unsigned long now;
	int ret = 0;
	long gap;

	lockdep_assert_held(&ipvs->est_mutex);

	if (kd->task)
		goto out;
	now = jiffies;
	gap = kd->est_timer - now;
	/* Sync est_timer if task is starting later */
	if (abs(gap) > 4 * IPVS_EST_TICK)
		kd->est_timer = now;
	kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d",
				  ipvs->gen, kd->id);
	if (IS_ERR(kd->task)) {
		ret = PTR_ERR(kd->task);
		kd->task = NULL;
		goto out;
	}

	set_user_nice(kd->task, sysctl_est_nice(ipvs));
	set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs));

	pr_info("starting estimator thread %d...\n", kd->id);
	wake_up_process(kd->task);

out:
	return ret;
}

void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd)
{
	if (kd->task) {
		pr_info("stopping estimator thread %d...\n", kd->id);
		kthread_stop(kd->task);
		kd->task = NULL;
	}
}

/* Apply parameters to kthread */
static void ip_vs_est_set_params(struct netns_ipvs *ipvs,
				 struct ip_vs_est_kt_data *kd)
{
	kd->chain_max = ipvs->est_chain_max;
	/* A single chain per tick is used when RCU preemption is enabled */
	if (IPVS_EST_TICK_CHAINS == 1)
		kd->chain_max *= IPVS_EST_CHAIN_FACTOR;
	kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max;
	kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max;
}
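
/* Capacity hierarchy set above, with hypothetical numbers for illustration:
 * assuming chain_max = 32 and IPVS_EST_TICK_CHAINS = 48, one tick holds at
 * most tick_max = 48 * 32 = 1536 estimators and one kthread at most
 * est_max_count = IPVS_EST_NTICKS * 1536 estimators; once a kthread is
 * full, new estimators go to another (possibly new) kthread slot.
 */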

/* Create and start estimation kthread in a free or new array slot */
static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
{
	struct ip_vs_est_kt_data *kd = NULL;
	int id = ipvs->est_kt_count;
	int ret = -ENOMEM;
	void *arr = NULL;
	int i;

	if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
	    ipvs->enable && ipvs->est_max_threads)
		return -EINVAL;

	mutex_lock(&ipvs->est_mutex);

	for (i = 0; i < id; i++) {
		if (!ipvs->est_kt_arr[i])
			break;
	}
	if (i >= id) {
		arr = krealloc_array(ipvs->est_kt_arr, id + 1,
				     sizeof(struct ip_vs_est_kt_data *),
				     GFP_KERNEL);
		if (!arr)
			goto out;
		ipvs->est_kt_arr = arr;
	} else {
		id = i;
	}

	kd = kzalloc(sizeof(*kd), GFP_KERNEL);
	if (!kd)
		goto out;
	kd->ipvs = ipvs;
	bitmap_fill(kd->avail, IPVS_EST_NTICKS);
	kd->est_timer = jiffies;
	kd->id = id;
	ip_vs_est_set_params(ipvs, kd);

	/* Pre-allocate stats used in calc phase */
	if (!id && !kd->calc_stats) {
		kd->calc_stats = ip_vs_stats_alloc();
		if (!kd->calc_stats)
			goto out;
	}

	/* Start kthread tasks only when services are present */
	if (ipvs->enable && !ip_vs_est_stopped(ipvs)) {
		ret = ip_vs_est_kthread_start(ipvs, kd);
		if (ret < 0)
			goto out;
	}

	if (arr)
		ipvs->est_kt_count++;
	ipvs->est_kt_arr[id] = kd;
	kd = NULL;
	/* Use most recent kthread for new ests */
	ipvs->est_add_ktid = id;
	ret = 0;

out:
	mutex_unlock(&ipvs->est_mutex);
	if (kd) {
		ip_vs_stats_free(kd->calc_stats);
		kfree(kd);
	}

	return ret;
}

/* Select ktid where to add new ests: available, unused or new slot */
static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs)
{
	int ktid, best = ipvs->est_kt_count;
	struct ip_vs_est_kt_data *kd;

	for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) {
		kd = ipvs->est_kt_arr[ktid];
		if (kd) {
			if (kd->est_count < kd->est_max_count) {
				best = ktid;
				break;
			}
		} else if (ktid < best) {
			best = ktid;
		}
	}
	ipvs->est_add_ktid = best;
}

/* Add estimator to current kthread (est_add_ktid) */
static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs,
				   struct ip_vs_estimator *est)
{
	struct ip_vs_est_kt_data *kd = NULL;
	struct ip_vs_est_tick_data *td;
	int ktid, row, crow, cid, ret;
	int delay = est->ktrow;

	BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127,
			 "Too many chains for ktcid");

	if (ipvs->est_add_ktid < ipvs->est_kt_count) {
		kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
		if (kd)
			goto add_est;
	}

	ret = ip_vs_est_add_kthread(ipvs);
	if (ret < 0)
		goto out;
	kd = ipvs->est_kt_arr[ipvs->est_add_ktid];

add_est:
	ktid = kd->id;
	/* For a small number of estimators prefer to use few ticks,
	 * otherwise try to add into the last estimated row.
	 * est_row and add_row point after the row we should use.
	 */
	if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1)
		crow = READ_ONCE(kd->est_row);
	else
		crow = kd->add_row;
	crow += delay;
	if (crow >= IPVS_EST_NTICKS)
		crow -= IPVS_EST_NTICKS;
	/* Assume initial delay ? */
	if (delay >= IPVS_EST_NTICKS - 1) {
		/* Preserve initial delay or decrease it if no space in tick */
		row = crow;
		if (crow < IPVS_EST_NTICKS - 1) {
			crow++;
			row = find_last_bit(kd->avail, crow);
		}
		if (row >= crow)
			row = find_last_bit(kd->avail, IPVS_EST_NTICKS);
	} else {
		/* Preserve delay or increase it if no space in tick */
		row = IPVS_EST_NTICKS;
		if (crow > 0)
			row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow);
		if (row >= IPVS_EST_NTICKS)
			row = find_first_bit(kd->avail, IPVS_EST_NTICKS);
	}

	td = rcu_dereference_protected(kd->ticks[row], 1);
	if (!td) {
		td = kzalloc(sizeof(*td), GFP_KERNEL);
		if (!td) {
			ret = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(kd->ticks[row], td);
	}

	cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS);

	kd->est_count++;
	kd->tick_len[row]++;
	if (!td->chain_len[cid])
		__set_bit(cid, td->present);
	td->chain_len[cid]++;
	est->ktid = ktid;
	est->ktrow = row;
	est->ktcid = cid;
	hlist_add_head_rcu(&est->list, &td->chains[cid]);

	if (td->chain_len[cid] >= kd->chain_max) {
		__set_bit(cid, td->full);
		if (kd->tick_len[row] >= kd->tick_max)
			__clear_bit(row, kd->avail);
	}

	/* Update est_add_ktid to point to first available/empty kt slot */
	if (kd->est_count == kd->est_max_count)
		ip_vs_est_update_ktid(ipvs);

	ret = 0;

out:
	return ret;
}

/* Start estimation for stats */
int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	int ret;

	if (!ipvs->est_max_threads && ipvs->enable)
		ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);

	est->ktid = -1;
	est->ktrow = IPVS_EST_NTICKS - 1;	/* Initial delay */

	/* We prefer this code to be short, so kthread 0 will requeue the
	 * estimator to an available chain. If tasks are disabled, we
	 * do not allocate much memory, just for kt 0.
	 */
	ret = 0;
	if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
		ret = ip_vs_est_add_kthread(ipvs);
	if (ret >= 0)
		hlist_add_head(&est->list, &ipvs->est_temp_list);
	else
		INIT_HLIST_NODE(&est->list);
	return ret;
}

static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
{
	if (kd) {
		if (kd->task) {
			pr_info("stop unused estimator thread %d...\n", kd->id);
			kthread_stop(kd->task);
		}
		ip_vs_stats_free(kd->calc_stats);
		kfree(kd);
	}
}

/* Unlink estimator from chain */
void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	struct ip_vs_est_tick_data *td;
	struct ip_vs_est_kt_data *kd;
	int ktid = est->ktid;
	int row = est->ktrow;
	int cid = est->ktcid;

	/* Failed to add to chain ? */
	if (hlist_unhashed(&est->list))
		return;

	/* On return, estimator can be freed, dequeue it now */

	/* In est_temp_list ? */
	if (ktid < 0) {
		hlist_del(&est->list);
		goto end_kt0;
	}

	hlist_del_rcu(&est->list);
	kd = ipvs->est_kt_arr[ktid];
	td = rcu_dereference_protected(kd->ticks[row], 1);
	__clear_bit(cid, td->full);
	td->chain_len[cid]--;
	if (!td->chain_len[cid])
		__clear_bit(cid, td->present);
	kd->tick_len[row]--;
	__set_bit(row, kd->avail);
	if (!kd->tick_len[row]) {
		RCU_INIT_POINTER(kd->ticks[row], NULL);
		kfree_rcu(td, rcu_head);
	}
	kd->est_count--;
	if (kd->est_count) {
		/* This kt slot can become available just now, prefer it */
		if (ktid < ipvs->est_add_ktid)
			ipvs->est_add_ktid = ktid;
		return;
	}

	if (ktid > 0) {
		mutex_lock(&ipvs->est_mutex);
		ip_vs_est_kthread_destroy(kd);
		ipvs->est_kt_arr[ktid] = NULL;
		if (ktid == ipvs->est_kt_count - 1) {
			ipvs->est_kt_count--;
			while (ipvs->est_kt_count > 1 &&
			       !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
				ipvs->est_kt_count--;
		}
		mutex_unlock(&ipvs->est_mutex);

		/* This slot is now empty, prefer another available kt slot */
		if (ktid == ipvs->est_add_ktid)
			ip_vs_est_update_ktid(ipvs);
	}

end_kt0:
	/* kt 0 is freed after all other kthreads and chains are empty */
	if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
		kd = ipvs->est_kt_arr[0];
		if (!kd || !kd->est_count) {
			mutex_lock(&ipvs->est_mutex);
			if (kd) {
				ip_vs_est_kthread_destroy(kd);
				ipvs->est_kt_arr[0] = NULL;
			}
			ipvs->est_kt_count--;
			mutex_unlock(&ipvs->est_mutex);
			ipvs->est_add_ktid = 0;
		}
	}
}

/* Register all ests from est_temp_list to kthreads */
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
{
	struct ip_vs_estimator *est;

	while (1) {
		int max = 16;

		mutex_lock(&__ip_vs_mutex);

		while (max-- > 0) {
			est = hlist_entry_safe(ipvs->est_temp_list.first,
					       struct ip_vs_estimator, list);
			if (est) {
				if (kthread_should_stop())
					goto unlock;
				hlist_del_init(&est->list);
				if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
					continue;
				est->ktid = -1;
				hlist_add_head(&est->list,
					       &ipvs->est_temp_list);
				/* Abort, some entries will not be estimated
				 * until next attempt
				 */
			}
			goto unlock;
		}
		mutex_unlock(&__ip_vs_mutex);
		cond_resched();
	}

unlock:
	mutex_unlock(&__ip_vs_mutex);
}

/* Calculate limits for all kthreads */
static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
{
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
	struct ip_vs_est_kt_data *kd;
	struct hlist_head chain;
	struct ip_vs_stats *s;
	int cache_factor = 4;
	int i, loops, ntest;
	s32 min_est = 0;
	ktime_t t1, t2;
	int max = 8;
	int ret = 1;
	s64 diff;
	u64 val;

	INIT_HLIST_HEAD(&chain);
	mutex_lock(&__ip_vs_mutex);
	kd = ipvs->est_kt_arr[0];
	mutex_unlock(&__ip_vs_mutex);
	s = kd ? kd->calc_stats : NULL;
	if (!s)
		goto out;
	hlist_add_head(&s->est.list, &chain);

	loops = 1;
	/* Get best result from many tests */
	for (ntest = 0; ntest < 12; ntest++) {
		if (!(ntest & 3)) {
			/* Wait for cpufreq frequency transition */
			wait_event_idle_timeout(wq, kthread_should_stop(),
						HZ / 50);
			if (!ipvs->enable || kthread_should_stop())
				goto stop;
		}

		local_bh_disable();
		rcu_read_lock();

		/* Put stats in cache */
		ip_vs_chain_estimation(&chain);

		t1 = ktime_get();
		for (i = loops * cache_factor; i > 0; i--)
			ip_vs_chain_estimation(&chain);
		t2 = ktime_get();

		rcu_read_unlock();
		local_bh_enable();

		if (!ipvs->enable || kthread_should_stop())
			goto stop;
		cond_resched();

		diff = ktime_to_ns(ktime_sub(t2, t1));
		if (diff <= 1 * NSEC_PER_USEC) {
			/* Do more loops on low time resolution */
			loops *= 2;
			continue;
		}
		if (diff >= NSEC_PER_SEC)
			continue;
		val = diff;
		do_div(val, loops);
		if (!min_est || val < min_est) {
			min_est = val;
			/* goal: 95usec per chain */
			val = 95 * NSEC_PER_USEC;
			if (val >= min_est) {
				do_div(val, min_est);
				max = (int)val;
			} else {
				max = 1;
			}
		}
	}

out:
	if (s)
		hlist_del_init(&s->est.list);
	*chain_max = max;
	return ret;

stop:
	ret = 0;
	goto out;
}
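
/* Worked example for the limits computed above (numbers are hypothetical):
 * min_est is the best-case cost charged per estimator (the timed loop runs
 * loops * cache_factor estimations but divides only by loops, so the
 * cache_factor weighting is built in). With min_est = 2000 ns, the goal of
 * 95 usec per chain gives chain_max = 95000 / 2000 = 47 estimators.
 */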

/* Calculate the parameters and apply them in context of kt #0
 * ECP: est_calc_phase
 * ECM: est_chain_max
 * ECP	ECM	Insert Chain	enable	Description
 * ---------------------------------------------------------------------------
 * 0	0	est_temp_list	0	create kt #0 context
 * 0	0	est_temp_list	0->1	service added, start kthread #0 task
 * 0->1	0	est_temp_list	1	kt task #0 started, enters calc phase
 * 1	0	est_temp_list	1	kt #0: determine est_chain_max,
 *					stop tasks, move ests to est_temp_list
 *					and free kd for kthreads 1..last
 * 1->0	0->N	kt chains	1	ests can go to kthreads
 * 0	N	kt chains	1	drain est_temp_list, create new kthread
 *					contexts, start tasks, estimate
 */
static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
{
	int genid = atomic_read(&ipvs->est_genid);
	struct ip_vs_est_tick_data *td;
	struct ip_vs_est_kt_data *kd;
	struct ip_vs_estimator *est;
	struct ip_vs_stats *stats;
	int id, row, cid, delay;
	bool last, last_td;
	int chain_max;
	int step;

	if (!ip_vs_est_calc_limits(ipvs, &chain_max))
		return;

	mutex_lock(&__ip_vs_mutex);

	/* Stop all other tasks, so that we can immediately move the
	 * estimators to est_temp_list without RCU grace period
	 */
	mutex_lock(&ipvs->est_mutex);
	for (id = 1; id < ipvs->est_kt_count; id++) {
		/* netns clean up started, abort */
		if (!ipvs->enable)
			goto unlock2;
		kd = ipvs->est_kt_arr[id];
		if (!kd)
			continue;
		ip_vs_est_kthread_stop(kd);
	}
	mutex_unlock(&ipvs->est_mutex);

	/* Move all estimators to est_temp_list but carefully,
	 * all estimators and kthread data can be released while
	 * we reschedule. Even for kthread 0.
	 */
	step = 0;

	/* Order entries in est_temp_list in ascending delay, so now
	 * walk delay(desc), id(desc), cid(asc)
	 */
	delay = IPVS_EST_NTICKS;

next_delay:
	delay--;
	if (delay < 0)
		goto end_dequeue;

last_kt:
	/* Destroy contexts backwards */
	id = ipvs->est_kt_count;

next_kt:
	if (!ipvs->enable || kthread_should_stop())
		goto unlock;
	id--;
	if (id < 0)
		goto next_delay;
	kd = ipvs->est_kt_arr[id];
	if (!kd)
		goto next_kt;
	/* kt 0 can exist with empty chains */
	if (!id && kd->est_count <= 1)
		goto next_delay;

	row = kd->est_row + delay;
	if (row >= IPVS_EST_NTICKS)
		row -= IPVS_EST_NTICKS;
	td = rcu_dereference_protected(kd->ticks[row], 1);
	if (!td)
		goto next_kt;

	cid = 0;

walk_chain:
	if (kthread_should_stop())
		goto unlock;
	step++;
	if (!(step & 63)) {
		/* Give a chance for estimators to be added (to est_temp_list)
		 * and deleted (releasing kthread contexts)
		 */
		mutex_unlock(&__ip_vs_mutex);
		cond_resched();
		mutex_lock(&__ip_vs_mutex);

		/* Current kt released ? */
		if (id >= ipvs->est_kt_count)
			goto last_kt;
		if (kd != ipvs->est_kt_arr[id])
			goto next_kt;
		/* Current td released ? */
		if (td != rcu_dereference_protected(kd->ticks[row], 1))
			goto next_kt;
		/* No fatal changes on the current kd and td */
	}
	est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator,
			       list);
	if (!est) {
		cid++;
		if (cid >= IPVS_EST_TICK_CHAINS)
			goto next_kt;
		goto walk_chain;
	}
	/* We can cheat and increase est_count to protect kt 0 context
	 * from release but we prefer to keep the last estimator
	 */
	last = kd->est_count <= 1;
	/* Do not free kt #0 data */
	if (!id && last)
		goto next_delay;
	last_td = kd->tick_len[row] <= 1;
	stats = container_of(est, struct ip_vs_stats, est);
	ip_vs_stop_estimator(ipvs, stats);
	/* Tasks are stopped, move without RCU grace period */
	est->ktid = -1;
	est->ktrow = row - kd->est_row;
	if (est->ktrow < 0)
		est->ktrow += IPVS_EST_NTICKS;
	hlist_add_head(&est->list, &ipvs->est_temp_list);
	/* kd freed ? */
	if (last)
		goto next_kt;
	/* td freed ? */
	if (last_td)
		goto next_kt;
	goto walk_chain;

end_dequeue:
	/* All estimators removed while calculating ? */
	if (!ipvs->est_kt_count)
		goto unlock;
	kd = ipvs->est_kt_arr[0];
	if (!kd)
		goto unlock;
	kd->add_row = kd->est_row;
	ipvs->est_chain_max = chain_max;
	ip_vs_est_set_params(ipvs, kd);

	pr_info("using max %d ests per chain, %d per kthread\n",
		kd->chain_max, kd->est_max_count);

	/* Try to keep tot_stats in kt0, enqueue it early */
	if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) &&
	    ipvs->tot_stats->s.est.ktid == -1) {
		hlist_del(&ipvs->tot_stats->s.est.list);
		hlist_add_head(&ipvs->tot_stats->s.est.list,
			       &ipvs->est_temp_list);
	}

	mutex_lock(&ipvs->est_mutex);

	/* We completed the calc phase, new calc phase not requested */
	if (genid == atomic_read(&ipvs->est_genid))
		ipvs->est_calc_phase = 0;

unlock2:
	mutex_unlock(&ipvs->est_mutex);

unlock:
	mutex_unlock(&__ip_vs_mutex);
}

void ip_vs_zero_estimator(struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	struct ip_vs_kstats *k = &stats->kstats;

	/* reset counters, caller must hold stats->lock */
	est->last_inbytes = k->inbytes;
	est->last_outbytes = k->outbytes;
	est->last_conns = k->conns;
	est->last_inpkts = k->inpkts;
	est->last_outpkts = k->outpkts;
	est->cps = 0;
	est->inpps = 0;
	est->outpps = 0;
	est->inbps = 0;
	est->outbps = 0;
}

/* Get decoded rates */
void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *e = &stats->est;

	dst->cps = (e->cps + 0x1FF) >> 10;
	dst->inpps = (e->inpps + 0x1FF) >> 10;
	dst->outpps = (e->outpps + 0x1FF) >> 10;
	dst->inbps = (e->inbps + 0xF) >> 5;
	dst->outbps = (e->outbps + 0xF) >> 5;
}
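
/* Decoding example (hypothetical stored value): e->inbps == 32768 decodes
 * to 32768 >> 5 = 1024 bytes/s; the added 0xF (0x1FF for the 2^10-scaled
 * packet and connection rates) rounds the result up instead of truncating.
 */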

int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
{
	INIT_HLIST_HEAD(&ipvs->est_temp_list);
	ipvs->est_kt_arr = NULL;
	ipvs->est_max_threads = 0;
	ipvs->est_calc_phase = 0;
	ipvs->est_chain_max = 0;
	ipvs->est_kt_count = 0;
	ipvs->est_add_ktid = 0;
	atomic_set(&ipvs->est_genid, 0);
	atomic_set(&ipvs->est_genid_done, 0);
	__mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key);
	return 0;
}

void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
{
	int i;

	for (i = 0; i < ipvs->est_kt_count; i++)
		ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]);
	kfree(ipvs->est_kt_arr);
	mutex_destroy(&ipvs->est_mutex);
}
