1/*
2 * Floating proportions
3 *
4 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 *
6 * Description:
7 *
8 * The floating proportion is a time derivative with an exponentially decaying
9 * history:
10 *
11 *   p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i)
12 *
13 * Where j is an element from {prop_local}, x_{j} is j's number of events,
14 * and i the time period over which the differential is taken. So d/dt_{-i} is
15 * the differential over the i-th last period.
16 *
17 * The decaying history gives smooth transitions. The time differential carries
18 * the notion of speed.
19 *
20 * The denominator is 2^(1+i) because we want the series to be normalised, ie.
21 *
22 *   \Sum_{i=0} 1/2^(1+i) = 1
23 *
24 * Further more, if we measure time (t) in the same events as x; so that:
25 *
26 *   t = \Sum_{j} x_{j}
27 *
28 * we get that:
29 *
30 *   \Sum_{j} p_{j} = 1
31 *
32 * Writing this in an iterative fashion we get (dropping the 'd's):
33 *
34 *   if (++x_{j}, ++t > period)
35 *     t /= 2;
36 *     for_each (j)
37 *       x_{j} /= 2;
38 *
39 * so that:
40 *
41 *   p_{j} = x_{j} / t;
42 *
43 * We optimize away the '/= 2' for the global time delta by noting that:
44 *
45 *   if (++t > period) t /= 2:
46 *
47 * Can be approximated by:
48 *
49 *   period/2 + (++t % period/2)
50 *
51 * [ Furthermore, when we choose period to be 2^n it can be written in terms of
52 *   binary operations and wraparound artefacts disappear. ]
53 *
54 * Also note that this yields a natural counter of the elapsed periods:
55 *
56 *   c = t / (period/2)
57 *
58 * [ Its monotonic increasing property can be applied to mitigate the wrap-
59 *   around issue. ]
60 *
61 * This allows us to do away with the loop over all prop_locals on each period
62 * expiration. By remembering the period count under which it was last accessed
63 * as c_{j}, we can obtain the number of 'missed' cycles from:
64 *
65 *   c - c_{j}
66 *
67 * We can then lazily catch up to the global period count every time we are
68 * going to use x_{j}, by doing:
69 *
70 *   x_{j} /= 2^(c - c_{j}), c_{j} = c
71 */
72
73#include <linux/proportions.h>
74#include <linux/rcupdate.h>
75
76int prop_descriptor_init(struct prop_descriptor *pd, int shift)
77{
78	int err;
79
80	if (shift > PROP_MAX_SHIFT)
81		shift = PROP_MAX_SHIFT;
82
83	pd->index = 0;
84	pd->pg[0].shift = shift;
85	mutex_init(&pd->mutex);
86	err = percpu_counter_init(&pd->pg[0].events, 0);
87	if (err)
88		goto out;
89
90	err = percpu_counter_init(&pd->pg[1].events, 0);
91	if (err)
92		percpu_counter_destroy(&pd->pg[0].events);
93
94out:
95	return err;
96}
97
98/*
99 * We have two copies, and flip between them to make it seem like an atomic
100 * update. The update is not really atomic wrt the events counter, but
101 * it is internally consistent with the bit layout depending on shift.
102 *
103 * We copy the events count, move the bits around and flip the index.
104 */
105void prop_change_shift(struct prop_descriptor *pd, int shift)
106{
107	int index;
108	int offset;
109	u64 events;
110	unsigned long flags;
111
112	if (shift > PROP_MAX_SHIFT)
113		shift = PROP_MAX_SHIFT;
114
115	mutex_lock(&pd->mutex);
116
117	index = pd->index ^ 1;
118	offset = pd->pg[pd->index].shift - shift;
119	if (!offset)
120		goto out;
121
122	pd->pg[index].shift = shift;
123
124	local_irq_save(flags);
125	events = percpu_counter_sum(&pd->pg[pd->index].events);
126	if (offset < 0)
127		events <<= -offset;
128	else
129		events >>= offset;
130	percpu_counter_set(&pd->pg[index].events, events);
131
132	/*
133	 * ensure the new pg is fully written before the switch
134	 */
135	smp_wmb();
136	pd->index = index;
137	local_irq_restore(flags);
138
139	synchronize_rcu();
140
141out:
142	mutex_unlock(&pd->mutex);
143}
144
145/*
146 * wrap the access to the data in an rcu_read_lock() section;
147 * this is used to track the active references.
148 */
149static struct prop_global *prop_get_global(struct prop_descriptor *pd)
150__acquires(RCU)
151{
152	int index;
153
154	rcu_read_lock();
155	index = pd->index;
156	/*
157	 * match the wmb from vcd_flip()
158	 */
159	smp_rmb();
160	return &pd->pg[index];
161}
162
163static void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg)
164__releases(RCU)
165{
166	rcu_read_unlock();
167}
168
169static void
170prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift)
171{
172	int offset = *pl_shift - new_shift;
173
174	if (!offset)
175		return;
176
177	if (offset < 0)
178		*pl_period <<= -offset;
179	else
180		*pl_period >>= offset;
181
182	*pl_shift = new_shift;
183}
184
185/*
186 * PERCPU
187 */
188
189#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids)))
190
191int prop_local_init_percpu(struct prop_local_percpu *pl)
192{
193	spin_lock_init(&pl->lock);
194	pl->shift = 0;
195	pl->period = 0;
196	return percpu_counter_init(&pl->events, 0);
197}
198
199void prop_local_destroy_percpu(struct prop_local_percpu *pl)
200{
201	percpu_counter_destroy(&pl->events);
202}
203
204/*
205 * Catch up with missed period expirations.
206 *
207 *   until (c_{j} == c)
208 *     x_{j} -= x_{j}/2;
209 *     c_{j}++;
210 */
211static
212void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl)
213{
214	unsigned long period = 1UL << (pg->shift - 1);
215	unsigned long period_mask = ~(period - 1);
216	unsigned long global_period;
217	unsigned long flags;
218
219	global_period = percpu_counter_read(&pg->events);
220	global_period &= period_mask;
221
222	/*
223	 * Fast path - check if the local and global period count still match
224	 * outside of the lock.
225	 */
226	if (pl->period == global_period)
227		return;
228
229	spin_lock_irqsave(&pl->lock, flags);
230	prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
231
232	/*
233	 * For each missed period, we half the local counter.
234	 * basically:
235	 *   pl->events >> (global_period - pl->period);
236	 */
237	period = (global_period - pl->period) >> (pg->shift - 1);
238	if (period < BITS_PER_LONG) {
239		s64 val = percpu_counter_read(&pl->events);
240
241		if (val < (nr_cpu_ids * PROP_BATCH))
242			val = percpu_counter_sum(&pl->events);
243
244		__percpu_counter_add(&pl->events, -val + (val >> period),
245					PROP_BATCH);
246	} else
247		percpu_counter_set(&pl->events, 0);
248
249	pl->period = global_period;
250	spin_unlock_irqrestore(&pl->lock, flags);
251}
252
253/*
254 *   ++x_{j}, ++t
255 */
256void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl)
257{
258	struct prop_global *pg = prop_get_global(pd);
259
260	prop_norm_percpu(pg, pl);
261	__percpu_counter_add(&pl->events, 1, PROP_BATCH);
262	percpu_counter_add(&pg->events, 1);
263	prop_put_global(pd, pg);
264}
265
266/*
267 * identical to __prop_inc_percpu, except that it limits this pl's fraction to
268 * @frac/PROP_FRAC_BASE by ignoring events when this limit has been exceeded.
269 */
270void __prop_inc_percpu_max(struct prop_descriptor *pd,
271			   struct prop_local_percpu *pl, long frac)
272{
273	struct prop_global *pg = prop_get_global(pd);
274
275	prop_norm_percpu(pg, pl);
276
277	if (unlikely(frac != PROP_FRAC_BASE)) {
278		unsigned long period_2 = 1UL << (pg->shift - 1);
279		unsigned long counter_mask = period_2 - 1;
280		unsigned long global_count;
281		long numerator, denominator;
282
283		numerator = percpu_counter_read_positive(&pl->events);
284		global_count = percpu_counter_read(&pg->events);
285		denominator = period_2 + (global_count & counter_mask);
286
287		if (numerator > ((denominator * frac) >> PROP_FRAC_SHIFT))
288			goto out_put;
289	}
290
291	percpu_counter_add(&pl->events, 1);
292	percpu_counter_add(&pg->events, 1);
293
294out_put:
295	prop_put_global(pd, pg);
296}
297
298/*
299 * Obtain a fraction of this proportion
300 *
301 *   p_{j} = x_{j} / (period/2 + t % period/2)
302 */
303void prop_fraction_percpu(struct prop_descriptor *pd,
304		struct prop_local_percpu *pl,
305		long *numerator, long *denominator)
306{
307	struct prop_global *pg = prop_get_global(pd);
308	unsigned long period_2 = 1UL << (pg->shift - 1);
309	unsigned long counter_mask = period_2 - 1;
310	unsigned long global_count;
311
312	prop_norm_percpu(pg, pl);
313	*numerator = percpu_counter_read_positive(&pl->events);
314
315	global_count = percpu_counter_read(&pg->events);
316	*denominator = period_2 + (global_count & counter_mask);
317
318	prop_put_global(pd, pg);
319}
320
321/*
322 * SINGLE
323 */
324
325int prop_local_init_single(struct prop_local_single *pl)
326{
327	spin_lock_init(&pl->lock);
328	pl->shift = 0;
329	pl->period = 0;
330	pl->events = 0;
331	return 0;
332}
333
334void prop_local_destroy_single(struct prop_local_single *pl)
335{
336}
337
338/*
339 * Catch up with missed period expirations.
340 */
341static
342void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl)
343{
344	unsigned long period = 1UL << (pg->shift - 1);
345	unsigned long period_mask = ~(period - 1);
346	unsigned long global_period;
347	unsigned long flags;
348
349	global_period = percpu_counter_read(&pg->events);
350	global_period &= period_mask;
351
352	/*
353	 * Fast path - check if the local and global period count still match
354	 * outside of the lock.
355	 */
356	if (pl->period == global_period)
357		return;
358
359	spin_lock_irqsave(&pl->lock, flags);
360	prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
361	/*
362	 * For each missed period, we half the local counter.
363	 */
364	period = (global_period - pl->period) >> (pg->shift - 1);
365	if (likely(period < BITS_PER_LONG))
366		pl->events >>= period;
367	else
368		pl->events = 0;
369	pl->period = global_period;
370	spin_unlock_irqrestore(&pl->lock, flags);
371}
372
373/*
374 *   ++x_{j}, ++t
375 */
376void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl)
377{
378	struct prop_global *pg = prop_get_global(pd);
379
380	prop_norm_single(pg, pl);
381	pl->events++;
382	percpu_counter_add(&pg->events, 1);
383	prop_put_global(pd, pg);
384}
385
386/*
387 * Obtain a fraction of this proportion
388 *
389 *   p_{j} = x_{j} / (period/2 + t % period/2)
390 */
391void prop_fraction_single(struct prop_descriptor *pd,
392	       	struct prop_local_single *pl,
393		long *numerator, long *denominator)
394{
395	struct prop_global *pg = prop_get_global(pd);
396	unsigned long period_2 = 1UL << (pg->shift - 1);
397	unsigned long counter_mask = period_2 - 1;
398	unsigned long global_count;
399
400	prop_norm_single(pg, pl);
401	*numerator = pl->events;
402
403	global_count = percpu_counter_read(&pg->events);
404	*denominator = period_2 + (global_count & counter_mask);
405
406	prop_put_global(pd, pg);
407}
408