// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2016 Thomas Gleixner.
 * Copyright (C) 2016-2017 Christoph Hellwig.
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/sort.h>
#include <linux/group_cpus.h>

#ifdef CONFIG_SMP

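/*
 * Pick up to @cpus_per_grp CPUs from @nmsk, set them in @irqmsk and clear
 * them in @nmsk, preferring to place SMT siblings of an already picked CPU
 * into the same group.
 */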
static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
				unsigned int cpus_per_grp)
{
	const struct cpumask *siblmsk;
	int cpu, sibl;

	while (cpus_per_grp > 0) {
		cpu = cpumask_first(nmsk);

		/* Should not happen, but be defensive in case @nmsk runs out early */
		if (cpu >= nr_cpu_ids)
			return;

		cpumask_clear_cpu(cpu, nmsk);
		cpumask_set_cpu(cpu, irqmsk);
		cpus_per_grp--;

		/* If the cpu has siblings, use them first */
		siblmsk = topology_sibling_cpumask(cpu);
		for (sibl = -1; cpus_per_grp > 0; ) {
			sibl = cpumask_next(sibl, siblmsk);
			if (sibl >= nr_cpu_ids)
				break;
			if (!cpumask_test_and_clear_cpu(sibl, nmsk))
				continue;
			cpumask_set_cpu(sibl, irqmsk);
			cpus_per_grp--;
		}
	}
}
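
/*
 * Worked example (hypothetical topology, for illustration only): assume
 * nmsk = {0,1,2,3}, where CPUs 0/1 and 2/3 are SMT sibling pairs, and
 * cpus_per_grp = 2.  grp_spread_init_one() picks CPU 0 first, then prefers
 * its sibling CPU 1, so irqmsk becomes {0,1} and nmsk is left as {2,3}:
 * siblings end up in the same group instead of being split across groups.
 */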

static cpumask_var_t *alloc_node_to_cpumask(void)
{
	cpumask_var_t *masks;
	int node;

	masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
	if (!masks)
		return NULL;

	for (node = 0; node < nr_node_ids; node++) {
		if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
			goto out_unwind;
	}

	return masks;

out_unwind:
	while (--node >= 0)
		free_cpumask_var(masks[node]);
	kfree(masks);
	return NULL;
}

static void free_node_to_cpumask(cpumask_var_t *masks)
{
	int node;

	for (node = 0; node < nr_node_ids; node++)
		free_cpumask_var(masks[node]);
	kfree(masks);
}

static void build_node_to_cpumask(cpumask_var_t *masks)
{
	int cpu;

	for_each_possible_cpu(cpu)
		cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}

static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
				const struct cpumask *mask, nodemask_t *nodemsk)
{
	int n, nodes = 0;

	/* Calculate the number of nodes in the supplied affinity mask */
	for_each_node(n) {
		if (cpumask_intersects(mask, node_to_cpumask[n])) {
			node_set(n, *nodemsk);
			nodes++;
		}
	}
	return nodes;
}

struct node_groups {
	unsigned id;

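	/*
	 * Stage-dependent meaning of the union: alloc_nodes_groups() first
	 * records the number of active CPUs on the node in 'ncpus' (used as
	 * the sort key), then overwrites it with the number of groups
	 * assigned to the node in 'ngroups'.
	 */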
	union {
		unsigned ngroups;
		unsigned ncpus;
	};
};

static int ncpus_cmp_func(const void *l, const void *r)
{
	const struct node_groups *ln = l;
	const struct node_groups *rn = r;

	return ln->ncpus - rn->ncpus;
}

/*
 * Allocate a group count for each node, so that for each node:
 *
 * 1) the allocated number is >= 1
 *
 * 2) the allocated number is <= the number of active CPUs on this node
 *
 * The total number of allocated groups may be less than @numgrps when
 * the total number of active CPUs is less than @numgrps.
 *
 * Active CPUs are the CPUs in '@cpu_mask AND @node_to_cpumask[]'
 * for each node.
 */
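/*
 * Concrete instance (hypothetical numbers, for illustration only): with
 * @numgrps = 3 and two active nodes holding 2 and 6 CPUs, the smaller node
 * is allocated 1 group and the larger node 2 groups; with @numgrps = 8 but
 * only 5 active CPUs in total, only 5 groups are actually allocated.
 */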
static void alloc_nodes_groups(unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       const nodemask_t nodemsk,
			       struct cpumask *nmsk,
			       struct node_groups *node_groups)
{
	unsigned n, remaining_ncpus = 0;

	for (n = 0; n < nr_node_ids; n++) {
		node_groups[n].id = n;
		node_groups[n].ncpus = UINT_MAX;
	}

	for_each_node_mask(n, nodemsk) {
		unsigned ncpus;

		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
		ncpus = cpumask_weight(nmsk);

		if (!ncpus)
			continue;
		remaining_ncpus += ncpus;
		node_groups[n].ncpus = ncpus;
	}

	numgrps = min_t(unsigned, remaining_ncpus, numgrps);

	sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
	     ncpus_cmp_func, NULL);

	/*
	 * Allocate groups for each node according to the ratio of this
	 * node's nr_cpus to the remaining un-assigned ncpus. 'numgrps' is
	 * bigger than the number of active numa nodes. Always start the
	 * allocation from the node with the smallest nr_cpus.
	 *
	 * This guarantees that each active node gets allocated at
	 * least one group, and the reasoning is simple: over-allocation
	 * can only happen when a node is assigned exactly one group, so
	 * all other nodes will be allocated >= 1 groups, since 'numgrps'
	 * is bigger than the number of numa nodes.
	 *
	 * One key invariant is that the number of allocated groups for
	 * each node is <= the CPU count of this node:
	 *
	 * 1) suppose there are two nodes: A and B
	 * 	ncpu(X) is the CPU count of node X
	 * 	grps(X) is the group count allocated to node X via this
	 * 	algorithm
	 *
	 * 	ncpu(A) <= ncpu(B)
	 * 	ncpu(A) + ncpu(B) = N
	 * 	grps(A) + grps(B) = G
	 *
	 * 	grps(A) = max(1, round_down(G * ncpu(A) / N))
	 * 	grps(B) = G - grps(A)
	 *
	 * 	both N and G are integers, and 2 <= G <= N, suppose
	 * 	G = N - delta, and 0 <= delta <= N - 2
	 *
	 * 2) obviously grps(A) <= ncpu(A) because:
	 *
	 * 	if grps(A) is 1, then grps(A) <= ncpu(A) given
	 * 	ncpu(A) >= 1
	 *
	 * 	otherwise,
	 * 		grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
	 *
	 * 3) prove that grps(B) <= ncpu(B):
	 *
	 * 	if round_down(G * ncpu(A) / N) == 0, grps(B) won't be
	 * 	over-allocated, so grps(B) <= ncpu(B),
	 *
	 * 	otherwise:
	 *
	 * 	grps(A) =
	 * 		round_down(G * ncpu(A) / N) =
	 * 		round_down((N - delta) * ncpu(A) / N) =
	 * 		round_down((N * ncpu(A) - delta * ncpu(A)) / N)	 >=
	 * 		round_down((N * ncpu(A) - delta * N) / N)	 =
	 * 		ncpu(A) - delta
	 *
	 * 	then:
	 *
	 * 	grps(A) - G >= ncpu(A) - delta - G
	 * 	=>
	 * 	G - grps(A) <= G + delta - ncpu(A)
	 * 	=>
	 * 	grps(B) <= N - ncpu(A)	(since G + delta = N)
	 * 	=>
	 * 	grps(B) <= ncpu(B)
	 *
	 * For three or more nodes, the smallest node can be treated as node
	 * A and the remaining nodes as one big node B, which is exactly how
	 * this algorithm proceeds: 'remaining_ncpus' and 'numgrps' are
	 * re-calculated after each node is handled, so finally for each
	 * node X: grps(X) <= ncpu(X).
	 *
	 */
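	/*
	 * Numeric check of the invariant above (hypothetical numbers, for
	 * illustration only): ncpu(A) = 2, ncpu(B) = 6, so N = 8; with
	 * G = 3 (delta = 5), grps(A) = max(1, round_down(3 * 2 / 8)) = 1
	 * <= ncpu(A), and grps(B) = 3 - 1 = 2 <= ncpu(B), matching the
	 * conclusion of the proof.
	 */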
	for (n = 0; n < nr_node_ids; n++) {
		unsigned ngroups, ncpus;

		if (node_groups[n].ncpus == UINT_MAX)
			continue;

		WARN_ON_ONCE(numgrps == 0);

		ncpus = node_groups[n].ncpus;
		ngroups = max_t(unsigned, 1,
				 numgrps * ncpus / remaining_ncpus);
		WARN_ON_ONCE(ngroups > ncpus);

		node_groups[n].ngroups = ngroups;

		remaining_ncpus -= ncpus;
		numgrps -= ngroups;
	}
}

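/*
 * Spread the CPUs in @cpu_mask over @numgrps groups in @masks, grouping by
 * NUMA node first and by SMT sibling within a node.  Groups are filled
 * starting at @startgrp and wrap around.  Returns the number of groups that
 * were populated, or -ENOMEM on allocation failure.
 */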
static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       struct cpumask *nmsk, struct cpumask *masks)
{
	unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
	unsigned int last_grp = numgrps;
	unsigned int curgrp = startgrp;
	nodemask_t nodemsk = NODE_MASK_NONE;
	struct node_groups *node_groups;

	if (cpumask_empty(cpu_mask))
		return 0;

	nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);

	/*
	 * If the number of nodes in the mask is greater than or equal to the
	 * number of groups, just spread the groups across the nodes.
	 */
	if (numgrps <= nodes) {
		for_each_node_mask(n, nodemsk) {
			/* Ensure that only CPUs which are in both masks are set */
			cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
			cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
			if (++curgrp == last_grp)
				curgrp = 0;
		}
		return numgrps;
	}
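
	/*
	 * Example (hypothetical, for illustration only): numgrps = 2 and
	 * nodes = {0, 1, 2} with startgrp = 0 takes the branch above:
	 * group 0 collects the CPUs of nodes 0 and 2, group 1 the CPUs of
	 * node 1, because curgrp wraps around after reaching last_grp.
	 */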

	node_groups = kcalloc(nr_node_ids,
			       sizeof(struct node_groups),
			       GFP_KERNEL);
	if (!node_groups)
		return -ENOMEM;

	/* allocate group number for each node */
	alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
			   nodemsk, nmsk, node_groups);
	for (i = 0; i < nr_node_ids; i++) {
		unsigned int ncpus, v;
		struct node_groups *nv = &node_groups[i];

		if (nv->ngroups == UINT_MAX)
			continue;

		/* Get the cpus on this node which are in the mask */
		cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
		ncpus = cpumask_weight(nmsk);
		if (!ncpus)
			continue;

		WARN_ON_ONCE(nv->ngroups > ncpus);

		/* Account for rounding errors */
		extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);
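		/*
		 * e.g. (illustrative numbers) ncpus = 5 and nv->ngroups = 2
		 * leave extra_grps = 1, so the first group below gets 3 CPUs
		 * and the second gets 2.
		 */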

		/* Spread allocated groups on CPUs of the current node */
		for (v = 0; v < nv->ngroups; v++, curgrp++) {
			cpus_per_grp = ncpus / nv->ngroups;

			/* Account for extra groups to compensate for rounding errors */
			if (extra_grps) {
				cpus_per_grp++;
				--extra_grps;
			}

			/*
			 * wrapping has to be considered, given 'startgrp'
			 * may start anywhere
			 */
			if (curgrp >= last_grp)
				curgrp = 0;
			grp_spread_init_one(&masks[curgrp], nmsk,
						cpus_per_grp);
		}
		done += nv->ngroups;
	}
	kfree(node_groups);
	return done;
}

/**
 * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
 * @numgrps: number of groups
 *
 * Return: a cpumask array on success, NULL otherwise. Each element of the
 * array holds the CPUs assigned to the corresponding group.
 *
 * Try to put CPUs that are close in terms of CPU and NUMA locality into
 * the same group, using a two-stage grouping:
 *	1) spread present CPUs across the groups evenly first
 *	2) spread the remaining possible CPUs across the groups evenly
 *
 * The resulting grouping guarantees that every possible CPU is covered and
 * that no CPU is assigned to more than one group.
 */
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
	cpumask_var_t *node_to_cpumask;
	cpumask_var_t nmsk, npresmsk;
	int ret = -ENOMEM;
	struct cpumask *masks = NULL;

	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
		return NULL;

	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
		goto fail_nmsk;

	node_to_cpumask = alloc_node_to_cpumask();
	if (!node_to_cpumask)
		goto fail_npresmsk;

	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		goto fail_node_to_cpumask;

	build_node_to_cpumask(node_to_cpumask);

	/*
	 * Make a local cache of 'cpu_present_mask', so the two-stage
	 * spread can observe a consistent 'cpu_present_mask' without
	 * holding the CPU hotplug lock, which reduces the deadlock risk
	 * with the CPU hotplug code.
	 *
	 * CPU hotplug may happen while 'cpu_present_mask' is being read,
	 * and we can live with that: it only affects whether the hotplugged
	 * CPU is handled in the 1st or the 2nd stage, and either way is
	 * correct from the API user's viewpoint, since the two-stage spread
	 * is merely an optimization.
	 */
	cpumask_copy(npresmsk, data_race(cpu_present_mask));

	/* grouping present CPUs first */
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret < 0)
		goto fail_build_affinity;
	nr_present = ret;

	/*
	 * Allocate non-present CPUs starting from the next group to be
	 * handled. If the grouping of present CPUs already exhausted the
	 * group space, assign the non-present CPUs to the already
	 * allocated groups.
	 */
	if (nr_present >= numgrps)
		curgrp = 0;
	else
		curgrp = nr_present;
	cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret >= 0)
		nr_others = ret;

 fail_build_affinity:
	if (ret >= 0)
		WARN_ON(nr_present + nr_others < numgrps);

 fail_node_to_cpumask:
	free_node_to_cpumask(node_to_cpumask);

 fail_npresmsk:
	free_cpumask_var(npresmsk);

 fail_nmsk:
	free_cpumask_var(nmsk);
	if (ret < 0) {
		kfree(masks);
		return NULL;
	}
	return masks;
}
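
/*
 * Usage sketch (illustrative only; the caller shown here is hypothetical and
 * not part of this file): request a number of groups, consume the per-group
 * cpumasks, and free the kcalloc()'d array with kfree() when done.
 *
 *	struct cpumask *masks;
 *	unsigned int i, numgrps = 4;
 *
 *	masks = group_cpus_evenly(numgrps);
 *	if (!masks)
 *		return -ENOMEM;
 *
 *	for (i = 0; i < numgrps; i++)
 *		pr_info("group %u: %*pbl\n", i, cpumask_pr_args(&masks[i]));
 *
 *	kfree(masks);
 */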
#else /* CONFIG_SMP */
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
	struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);

	if (!masks)
		return NULL;

	/* assign all CPUs (i.e. CPU 0) to the 1st group only */
	cpumask_copy(&masks[0], cpu_possible_mask);
	return masks;
}
#endif /* CONFIG_SMP */
EXPORT_SYMBOL_GPL(group_cpus_evenly);