// SPDX-License-Identifier: GPL-2.0
#include <linux/memcontrol.h>
#include <linux/rwsem.h>
#include <linux/shrinker.h>
#include <linux/rculist.h>
#include <trace/events/vmscan.h>

#include "internal.h"

LIST_HEAD(shrinker_list);
DEFINE_MUTEX(shrinker_mutex);

#ifdef CONFIG_MEMCG
static int shrinker_nr_max;

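/*
 * Each (memcg, node) pair has a shrinker_info which holds an array of
 * pointers to shrinker_info_unit.  Every unit covers SHRINKER_UNIT_BITS
 * shrinker ids: a bitmap of shrinkers that may have objects to reclaim
 * plus a per-shrinker deferred count.  shrinker_unit_size() is the size
 * of the pointer array needed to cover nr_items shrinker ids.
 */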
static inline int shrinker_unit_size(int nr_items)
{
	return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *));
}

static inline void shrinker_unit_free(struct shrinker_info *info, int start)
{
	struct shrinker_info_unit **unit;
	int nr, i;

	if (!info)
		return;

	unit = info->unit;
	nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);

	for (i = start; i < nr; i++) {
		if (!unit[i])
			break;

		kfree(unit[i]);
		unit[i] = NULL;
	}
}

static inline int shrinker_unit_alloc(struct shrinker_info *new,
				       struct shrinker_info *old, int nid)
{
	struct shrinker_info_unit *unit;
	int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
	int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0;
	int i;

	for (i = start; i < nr; i++) {
		unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid);
		if (!unit) {
			shrinker_unit_free(new, start);
			return -ENOMEM;
		}

		new->unit[i] = unit;
	}

	return 0;
}

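/*
 * The caller guarantees that there are no concurrent users of this memcg's
 * shrinker_info (memcg teardown, or a failed alloc_shrinker_info()), hence
 * the unconditional rcu_dereference_protected() and the plain kvfree().
 */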
void free_shrinker_info(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct shrinker_info *info;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		info = rcu_dereference_protected(pn->shrinker_info, true);
		shrinker_unit_free(info, 0);
		kvfree(info);
		rcu_assign_pointer(pn->shrinker_info, NULL);
	}
}

int alloc_shrinker_info(struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	int nid, ret = 0;
	int array_size = 0;

	mutex_lock(&shrinker_mutex);
	array_size = shrinker_unit_size(shrinker_nr_max);
	for_each_node(nid) {
		info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid);
		if (!info)
			goto err;
		info->map_nr_max = shrinker_nr_max;
		if (shrinker_unit_alloc(info, NULL, nid))
			goto err;
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
	}
	mutex_unlock(&shrinker_mutex);

	return ret;

err:
	mutex_unlock(&shrinker_mutex);
	free_shrinker_info(memcg);
	return -ENOMEM;
}

static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
						     int nid)
{
	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
					 lockdep_is_held(&shrinker_mutex));
}

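/*
 * Grow the per-node shrinker_info of @memcg so that it can hold
 * @new_nr_max shrinker bits: allocate a bigger pointer array, copy the
 * existing unit pointers, allocate the missing units, then publish the
 * new info and free the old one after an RCU grace period.
 */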
static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
				    int old_size, int new_nr_max)
{
	struct shrinker_info *new, *old;
	struct mem_cgroup_per_node *pn;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		old = shrinker_info_protected(memcg, nid);
		/* Not yet online memcg */
		if (!old)
			return 0;

		/* Already expanded this shrinker_info */
		if (new_nr_max <= old->map_nr_max)
			continue;

		new = kvzalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid);
		if (!new)
			return -ENOMEM;

		new->map_nr_max = new_nr_max;

		memcpy(new->unit, old->unit, old_size);
		if (shrinker_unit_alloc(new, old, nid)) {
			kvfree(new);
			return -ENOMEM;
		}

		rcu_assign_pointer(pn->shrinker_info, new);
		kvfree_rcu(old, rcu);
	}

	return 0;
}

static int expand_shrinker_info(int new_id)
{
	int ret = 0;
	int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
	int new_size, old_size = 0;
	struct mem_cgroup *memcg;

	if (!root_mem_cgroup)
		goto out;

	lockdep_assert_held(&shrinker_mutex);

	new_size = shrinker_unit_size(new_nr_max);
	old_size = shrinker_unit_size(shrinker_nr_max);

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		ret = expand_one_shrinker_info(memcg, new_size, old_size,
					       new_nr_max);
		if (ret) {
			mem_cgroup_iter_break(NULL, memcg);
			goto out;
		}
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
out:
	if (!ret)
		shrinker_nr_max = new_nr_max;

	return ret;
}

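/*
 * A shrinker id maps to a (unit index, bit offset) pair within a node's
 * shrinker_info: shrinker_id = index * SHRINKER_UNIT_BITS + offset.
 */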
static inline int shrinker_id_to_index(int shrinker_id)
{
	return shrinker_id / SHRINKER_UNIT_BITS;
}

static inline int shrinker_id_to_offset(int shrinker_id)
{
	return shrinker_id % SHRINKER_UNIT_BITS;
}

static inline int calc_shrinker_id(int index, int offset)
{
	return index * SHRINKER_UNIT_BITS + offset;
}

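/*
 * Mark @shrinker_id as possibly having reclaimable objects for @memcg on
 * node @nid, so that shrink_slab_memcg() will call it.  The barrier pairs
 * with the one after clear_bit() in shrink_slab_memcg(), ensuring that a
 * bit set for a newly added object is not lost when racing with a shrinker
 * that has just reported SHRINK_EMPTY.
 */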
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct shrinker_info *info;
		struct shrinker_info_unit *unit;

		rcu_read_lock();
		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
		unit = info->unit[shrinker_id_to_index(shrinker_id)];
		if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
			/* Pairs with smp mb in shrink_slab() */
			smp_mb__before_atomic();
			set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
		}
		rcu_read_unlock();
	}
}

static DEFINE_IDR(shrinker_idr);

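/*
 * Memcg-aware shrinkers are assigned a unique id from shrinker_idr; the id
 * indexes the per-memcg unit bitmaps above.  If a new id does not fit into
 * the current maps, they are expanded first.
 */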
static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
	int id, ret = -ENOMEM;

	if (mem_cgroup_disabled())
		return -ENOSYS;

	mutex_lock(&shrinker_mutex);
	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
	if (id < 0)
		goto unlock;

	if (id >= shrinker_nr_max) {
		if (expand_shrinker_info(id)) {
			idr_remove(&shrinker_idr, id);
			goto unlock;
		}
	}
	shrinker->id = id;
	ret = 0;
unlock:
	mutex_unlock(&shrinker_mutex);
	return ret;
}

static void shrinker_memcg_remove(struct shrinker *shrinker)
{
	int id = shrinker->id;

	BUG_ON(id < 0);

	lockdep_assert_held(&shrinker_mutex);

	idr_remove(&shrinker_idr, id);
}

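/*
 * For memcg-aware shrinkers the deferred counts live in the per-memcg
 * shrinker_info_unit rather than in shrinker->nr_deferred.  These helpers
 * read-and-clear or add to that per-memcg, per-node counter.
 */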
static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	struct shrinker_info_unit *unit;
	long nr_deferred;

	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	unit = info->unit[shrinker_id_to_index(shrinker->id)];
	nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0);
	rcu_read_unlock();

	return nr_deferred;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	struct shrinker_info_unit *unit;
	long nr_deferred;

	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	unit = info->unit[shrinker_id_to_index(shrinker->id)];
	nr_deferred =
		atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]);
	rcu_read_unlock();

	return nr_deferred;
}

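/*
 * When a memcg goes offline, transfer its deferred work to the parent so
 * that it is not lost.
 */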
void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
	int nid, index, offset;
	long nr;
	struct mem_cgroup *parent;
	struct shrinker_info *child_info, *parent_info;
	struct shrinker_info_unit *child_unit, *parent_unit;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	/* Prevent concurrent shrinker_info expansion */
	mutex_lock(&shrinker_mutex);
	for_each_node(nid) {
		child_info = shrinker_info_protected(memcg, nid);
		parent_info = shrinker_info_protected(parent, nid);
		for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) {
			child_unit = child_info->unit[index];
			parent_unit = parent_info->unit[index];
			for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
				nr = atomic_long_read(&child_unit->nr_deferred[offset]);
				atomic_long_add(nr, &parent_unit->nr_deferred[offset]);
			}
		}
	}
	mutex_unlock(&shrinker_mutex);
}
#else
static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
	return -ENOSYS;
}

static void shrinker_memcg_remove(struct shrinker *shrinker)
{
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	return 0;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	return 0;
}
#endif /* CONFIG_MEMCG */

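/*
 * Route the deferred count to the right place: the per-memcg counter for
 * memcg-aware shrinkers, otherwise shrinker->nr_deferred (indexed per node
 * for NUMA-aware shrinkers, slot 0 for the rest).
 */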
static long xchg_nr_deferred(struct shrinker *shrinker,
			     struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return xchg_nr_deferred_memcg(nid, shrinker,
					      sc->memcg);

	return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
}

static long add_nr_deferred(long nr, struct shrinker *shrinker,
			    struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return add_nr_deferred_memcg(nr, nid, shrinker,
					     sc->memcg);

	return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}

#define SHRINK_BATCH 128

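/*
 * The scan target computed below is roughly
 *
 *	total_scan = (nr_deferred >> priority) + (freeable >> priority) * 4 / seeks
 *
 * capped at 2 * freeable.  As an illustration, with DEFAULT_SEEKS (2) and
 * the default reclaim priority of 12, the new work added per call is about
 * freeable / 2048, on top of a share of previously deferred work.
 */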
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
				    struct shrinker *shrinker, int priority)
{
	unsigned long freed = 0;
	unsigned long long delta;
	long total_scan;
	long freeable;
	long nr;
	long new_nr;
	long batch_size = shrinker->batch ? shrinker->batch
					  : SHRINK_BATCH;
	long scanned = 0, next_deferred;

	freeable = shrinker->count_objects(shrinker, shrinkctl);
	if (freeable == 0 || freeable == SHRINK_EMPTY)
		return freeable;

	/*
	 * copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
	nr = xchg_nr_deferred(shrinker, shrinkctl);

	if (shrinker->seeks) {
		delta = freeable >> priority;
		delta *= 4;
		do_div(delta, shrinker->seeks);
	} else {
		/*
		 * These objects don't require any IO to create. Trim
		 * them aggressively under memory pressure to keep
		 * them from causing refetches in the IO caches.
		 */
		delta = freeable / 2;
	}

	total_scan = nr >> priority;
	total_scan += delta;
	total_scan = min(total_scan, (2 * freeable));

	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
				   freeable, delta, total_scan, priority);

	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls, but if the slab has less
	 * than batch_size objects in total and we are really tight on memory,
	 * we will try to reclaim all available objects, otherwise we can end
	 * up failing allocations although there are plenty of reclaimable
	 * objects spread over several slabs with usage less than the
	 * batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much as
	 * possible.
	 */
	while (total_scan >= batch_size ||
	       total_scan >= freeable) {
		unsigned long ret;
		unsigned long nr_to_scan = min(batch_size, total_scan);

		shrinkctl->nr_to_scan = nr_to_scan;
		shrinkctl->nr_scanned = nr_to_scan;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
		total_scan -= shrinkctl->nr_scanned;
		scanned += shrinkctl->nr_scanned;

		cond_resched();
	}

	/*
	 * The deferred work is increased by any new work (delta) that wasn't
	 * done, and decreased by the old deferred work that has now been done.
	 *
	 * It is capped at twice the number of freeable items.
	 */
	next_deferred = max_t(long, (nr + delta - scanned), 0);
	next_deferred = min(next_deferred, (2 * freeable));

	/*
	 * move the unused scan count back into the shrinker in a
	 * manner that handles concurrent updates.
	 */
	new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);

	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
	return freed;
}

#ifdef CONFIG_MEMCG
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
			struct mem_cgroup *memcg, int priority)
{
	struct shrinker_info *info;
	unsigned long ret, freed = 0;
	int offset, index = 0;

	if (!mem_cgroup_online(memcg))
		return 0;

	/*
	 * Lockless algorithm of memcg shrink.
	 *
	 * The shrinker_info may be freed asynchronously via RCU in
	 * expand_one_shrinker_info(), so rcu_read_lock() needs to be used
	 * to ensure the existence of the shrinker_info.
	 *
	 * The shrinker_info_unit is never freed unless its corresponding memcg
	 * is destroyed. Here we already hold the refcount of the memcg, so the
	 * memcg will not be destroyed, and of course the shrinker_info_unit
	 * will not be freed.
	 *
	 * So in the memcg shrink:
	 *  step 1: use rcu_read_lock() to guarantee existence of the
	 *          shrinker_info.
	 *  step 2: after getting the shrinker_info_unit we can safely release
	 *          the RCU lock.
	 *  step 3: traverse the bitmap and calculate the shrinker_id.
	 *  step 4: use rcu_read_lock() to guarantee existence of the shrinker.
	 *  step 5: use the shrinker_id to find the shrinker, then use
	 *          shrinker_try_get() to guarantee existence of the shrinker,
	 *          after which we can release the RCU lock and call
	 *          do_shrink_slab(), which may sleep.
	 *  step 6: do shrinker_put() paired with step 5 to put the refcount;
	 *          if the refcount reaches 0, wake up the waiter in
	 *          shrinker_free() by calling complete().
	 *          Note: this differs from the global shrink; here we don't
	 *                need to acquire the RCU lock to guarantee existence of
	 *                the shrinker, because we don't need to use this
	 *                shrinker to traverse the next shrinker in the bitmap.
	 *  step 7: we have already exited the RCU read-side critical section
	 *          before calling do_shrink_slab(), and the shrinker_info may
	 *          be released in expand_one_shrinker_info(), so go back to
	 *          step 1 to reacquire the shrinker_info.
	 */
again:
	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	if (unlikely(!info))
		goto unlock;

	if (index < shrinker_id_to_index(info->map_nr_max)) {
		struct shrinker_info_unit *unit;

		unit = info->unit[index];

		rcu_read_unlock();

		for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
			struct shrink_control sc = {
				.gfp_mask = gfp_mask,
				.nid = nid,
				.memcg = memcg,
			};
			struct shrinker *shrinker;
			int shrinker_id = calc_shrinker_id(index, offset);

			rcu_read_lock();
			shrinker = idr_find(&shrinker_idr, shrinker_id);
			if (unlikely(!shrinker || !shrinker_try_get(shrinker))) {
				clear_bit(offset, unit->map);
				rcu_read_unlock();
				continue;
			}
			rcu_read_unlock();

			/* Call non-slab shrinkers even though kmem is disabled */
			if (!memcg_kmem_online() &&
			    !(shrinker->flags & SHRINKER_NONSLAB)) {
				/* Drop the reference taken by shrinker_try_get() above */
				shrinker_put(shrinker);
				continue;
			}

			ret = do_shrink_slab(&sc, shrinker, priority);
			if (ret == SHRINK_EMPTY) {
				clear_bit(offset, unit->map);
				/*
				 * After the shrinker reported that it had no objects to
				 * free, but before we cleared the corresponding bit in
				 * the memcg shrinker map, a new object might have been
				 * added. To make sure we have the bit set in this
				 * case, we invoke the shrinker one more time and reset
				 * the bit if it reports that it is not empty anymore.
				 * The memory barrier here pairs with the barrier in
				 * set_shrinker_bit():
				 *
				 * list_lru_add()     shrink_slab_memcg()
				 *   list_add_tail()    clear_bit()
				 *   <MB>               <MB>
				 *   set_bit()          do_shrink_slab()
				 */
				smp_mb__after_atomic();
				ret = do_shrink_slab(&sc, shrinker, priority);
				if (ret == SHRINK_EMPTY)
					ret = 0;
				else
					set_shrinker_bit(memcg, nid, shrinker_id);
			}
			freed += ret;
			shrinker_put(shrinker);
		}

		index++;
		goto again;
	}
unlock:
	rcu_read_unlock();
	return freed;
}
#else /* !CONFIG_MEMCG */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
			struct mem_cgroup *memcg, int priority)
{
	return 0;
}
#endif /* CONFIG_MEMCG */

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set;
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority; the number of objects is shifted right by
 * @priority to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
			  int priority)
{
	unsigned long ret, freed = 0;
	struct shrinker *shrinker;

	/*
	 * The root memcg might be allocated even though memcg is disabled
	 * via the "cgroup_disable=memory" boot parameter.  This could make
	 * mem_cgroup_is_root() return false; we would then only run the
	 * memcg slab shrink and skip the global shrink, which may result
	 * in a premature OOM.
	 */
	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

	/*
	 * Lockless algorithm of global shrink.
	 *
	 * In the unregistration step, the shrinker will be freed asynchronously
	 * via RCU after its refcount reaches 0. So both rcu_read_lock() and
	 * shrinker_try_get() can be used to ensure the existence of the shrinker.
	 *
	 * So in the global shrink:
	 *  step 1: use rcu_read_lock() to guarantee existence of the shrinker
	 *          and the validity of the shrinker_list walk.
	 *  step 2: use shrinker_try_get() to try to get the refcount; if
	 *          successful, the existence of the shrinker is also guaranteed,
	 *          so we can release the RCU lock and call do_shrink_slab(),
	 *          which may sleep.
	 *  step 3: we *MUST* reacquire the RCU lock before calling shrinker_put(),
	 *          which ensures that neither this shrinker nor the next shrinker
	 *          will be freed in the next traversal operation.
	 *  step 4: do shrinker_put() paired with step 2 to put the refcount;
	 *          if the refcount reaches 0, wake up the waiter in
	 *          shrinker_free() by calling complete().
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};

		if (!shrinker_try_get(shrinker))
			continue;

		rcu_read_unlock();

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY)
			ret = 0;
		freed += ret;

		rcu_read_lock();
		shrinker_put(shrinker);
	}

	rcu_read_unlock();
	cond_resched();
	return freed;
}

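/*
 * Minimal usage sketch of the allocation API below (demo_count, demo_scan
 * and demo_data are placeholder names, not part of this file):
 *
 *	shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "demo-cache");
 *	if (!shrinker)
 *		return -ENOMEM;
 *	shrinker->count_objects = demo_count;
 *	shrinker->scan_objects = demo_scan;
 *	shrinker->private_data = demo_data;
 *	shrinker_register(shrinker);
 *	...
 *	shrinker_free(shrinker);
 */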
struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
{
	struct shrinker *shrinker;
	unsigned int size;
	va_list ap;
	int err;

	shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL);
	if (!shrinker)
		return NULL;

	va_start(ap, fmt);
	err = shrinker_debugfs_name_alloc(shrinker, fmt, ap);
	va_end(ap);
	if (err)
		goto err_name;

	shrinker->flags = flags | SHRINKER_ALLOCATED;
	shrinker->seeks = DEFAULT_SEEKS;

	if (flags & SHRINKER_MEMCG_AWARE) {
		err = shrinker_memcg_alloc(shrinker);
		if (err == -ENOSYS) {
			/* Memcg is not supported, fall back to a non-memcg-aware shrinker. */
			shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
			goto non_memcg;
		}

		if (err)
			goto err_flags;

		return shrinker;
	}

non_memcg:
	/*
	 * nr_deferred is tracked at the per-memcg level for memcg-aware
	 * shrinkers, so only allocate nr_deferred here in the following cases:
	 *  - non-memcg-aware shrinkers
	 *  - !CONFIG_MEMCG
	 *  - memcg is disabled by the kernel command line
	 */
	size = sizeof(*shrinker->nr_deferred);
	if (flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		goto err_flags;

	return shrinker;

err_flags:
	shrinker_debugfs_name_free(shrinker);
err_name:
	kfree(shrinker);
	return NULL;
}
EXPORT_SYMBOL_GPL(shrinker_alloc);

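/*
 * Make a shrinker set up with shrinker_alloc() visible to reclaim.  Once the
 * initial reference is set below, shrink_slab() may call it at any time.
 */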
void shrinker_register(struct shrinker *shrinker)
{
	if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) {
		pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker\n");
		return;
	}

	mutex_lock(&shrinker_mutex);
	list_add_tail_rcu(&shrinker->list, &shrinker_list);
	shrinker->flags |= SHRINKER_REGISTERED;
	shrinker_debugfs_add(shrinker);
	mutex_unlock(&shrinker_mutex);

	init_completion(&shrinker->done);
	/*
	 * Now the shrinker is fully set up, take the first reference to it to
	 * indicate that lookup operations are now allowed to use it via
	 * shrinker_try_get().
	 */
	refcount_set(&shrinker->refcount, 1);
}
EXPORT_SYMBOL_GPL(shrinker_register);

static void shrinker_free_rcu_cb(struct rcu_head *head)
{
	struct shrinker *shrinker = container_of(head, struct shrinker, rcu);

	kfree(shrinker->nr_deferred);
	kfree(shrinker);
}

void shrinker_free(struct shrinker *shrinker)
{
	struct dentry *debugfs_entry = NULL;
	int debugfs_id;

	if (!shrinker)
		return;

	if (shrinker->flags & SHRINKER_REGISTERED) {
		/* drop the initial refcount */
		shrinker_put(shrinker);
		/*
		 * Wait for all lookups of the shrinker to complete.  After that,
		 * nothing is running or will run this shrinker again, so it can
		 * safely be freed asynchronously via RCU, and so can the
		 * structure it is embedded in (such as a super_block).
		 */
		wait_for_completion(&shrinker->done);
	}

	mutex_lock(&shrinker_mutex);
	if (shrinker->flags & SHRINKER_REGISTERED) {
		/*
		 * Now we can safely remove it from the shrinker_list and then
		 * free it.
		 */
		list_del_rcu(&shrinker->list);
		debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
		shrinker->flags &= ~SHRINKER_REGISTERED;
	}

	shrinker_debugfs_name_free(shrinker);

	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		shrinker_memcg_remove(shrinker);
	mutex_unlock(&shrinker_mutex);

	if (debugfs_entry)
		shrinker_debugfs_remove(debugfs_entry, debugfs_id);

	call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
}
EXPORT_SYMBOL_GPL(shrinker_free);
