1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/memcontrol.h>
3 #include <linux/rwsem.h>
4 #include <linux/shrinker.h>
5 #include <linux/rculist.h>
6 #include <trace/events/vmscan.h>
7
8 #include "internal.h"
9
/*
 * shrinker_list links every registered shrinker: shrinker_register() appends
 * to it and shrinker_free() unlinks from it, both while holding
 * shrinker_mutex. The same mutex also covers the shrinker id allocator and
 * the resizing of the per-memcg shrinker_info in this file.
 */
LIST_HEAD(shrinker_list);
DEFINE_MUTEX(shrinker_mutex);
12
13 #ifdef CONFIG_MEMCG
/*
 * Number of shrinker ids the per-memcg shrinker_info is currently sized for.
 * Read by alloc_shrinker_info() and raised by expand_shrinker_info().
 */
static int shrinker_nr_max;
15
shrinker_unit_size(int nr_items)16 static inline int shrinker_unit_size(int nr_items)
17 {
18 return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *));
19 }
20
shrinker_unit_free(struct shrinker_info * info,int start)21 static inline void shrinker_unit_free(struct shrinker_info *info, int start)
22 {
23 struct shrinker_info_unit **unit;
24 int nr, i;
25
26 if (!info)
27 return;
28
29 unit = info->unit;
30 nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);
31
32 for (i = start; i < nr; i++) {
33 if (!unit[i])
34 break;
35
36 kfree(unit[i]);
37 unit[i] = NULL;
38 }
39 }
40
shrinker_unit_alloc(struct shrinker_info * new,struct shrinker_info * old,int nid)41 static inline int shrinker_unit_alloc(struct shrinker_info *new,
42 struct shrinker_info *old, int nid)
43 {
44 struct shrinker_info_unit *unit;
45 int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
46 int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0;
47 int i;
48
49 for (i = start; i < nr; i++) {
50 unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid);
51 if (!unit) {
52 shrinker_unit_free(new, start);
53 return -ENOMEM;
54 }
55
56 new->unit[i] = unit;
57 }
58
59 return 0;
60 }
61
free_shrinker_info(struct mem_cgroup * memcg)62 void free_shrinker_info(struct mem_cgroup *memcg)
63 {
64 struct mem_cgroup_per_node *pn;
65 struct shrinker_info *info;
66 int nid;
67
68 for_each_node(nid) {
69 pn = memcg->nodeinfo[nid];
70 info = rcu_dereference_protected(pn->shrinker_info, true);
71 shrinker_unit_free(info, 0);
72 kvfree(info);
73 rcu_assign_pointer(pn->shrinker_info, NULL);
74 }
75 }
76
alloc_shrinker_info(struct mem_cgroup * memcg)77 int alloc_shrinker_info(struct mem_cgroup *memcg)
78 {
79 int nid, ret = 0;
80 int array_size = 0;
81
82 mutex_lock(&shrinker_mutex);
83 array_size = shrinker_unit_size(shrinker_nr_max);
84 for_each_node(nid) {
85 struct shrinker_info *info = kvzalloc_node(sizeof(*info) + array_size,
86 GFP_KERNEL, nid);
87 if (!info)
88 goto err;
89 info->map_nr_max = shrinker_nr_max;
90 if (shrinker_unit_alloc(info, NULL, nid)) {
91 kvfree(info);
92 goto err;
93 }
94 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
95 }
96 mutex_unlock(&shrinker_mutex);
97
98 return ret;
99
100 err:
101 mutex_unlock(&shrinker_mutex);
102 free_shrinker_info(memcg);
103 return -ENOMEM;
104 }
105
shrinker_info_protected(struct mem_cgroup * memcg,int nid)106 static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
107 int nid)
108 {
109 return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
110 lockdep_is_held(&shrinker_mutex));
111 }
112
expand_one_shrinker_info(struct mem_cgroup * memcg,int new_size,int old_size,int new_nr_max)113 static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
114 int old_size, int new_nr_max)
115 {
116 struct shrinker_info *new, *old;
117 struct mem_cgroup_per_node *pn;
118 int nid;
119
120 for_each_node(nid) {
121 pn = memcg->nodeinfo[nid];
122 old = shrinker_info_protected(memcg, nid);
123 /* Not yet online memcg */
124 if (!old)
125 return 0;
126
127 /* Already expanded this shrinker_info */
128 if (new_nr_max <= old->map_nr_max)
129 continue;
130
131 new = kvzalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid);
132 if (!new)
133 return -ENOMEM;
134
135 new->map_nr_max = new_nr_max;
136
137 memcpy(new->unit, old->unit, old_size);
138 if (shrinker_unit_alloc(new, old, nid)) {
139 kvfree(new);
140 return -ENOMEM;
141 }
142
143 rcu_assign_pointer(pn->shrinker_info, new);
144 kvfree_rcu(old, rcu);
145 }
146
147 return 0;
148 }
149
expand_shrinker_info(int new_id)150 static int expand_shrinker_info(int new_id)
151 {
152 int ret = 0;
153 int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
154 int new_size, old_size = 0;
155 struct mem_cgroup *memcg;
156
157 if (!root_mem_cgroup)
158 goto out;
159
160 lockdep_assert_held(&shrinker_mutex);
161
162 new_size = shrinker_unit_size(new_nr_max);
163 old_size = shrinker_unit_size(shrinker_nr_max);
164
165 memcg = mem_cgroup_iter(NULL, NULL, NULL);
166 do {
167 ret = expand_one_shrinker_info(memcg, new_size, old_size,
168 new_nr_max);
169 if (ret) {
170 mem_cgroup_iter_break(NULL, memcg);
171 goto out;
172 }
173 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
174 out:
175 if (!ret)
176 shrinker_nr_max = new_nr_max;
177
178 return ret;
179 }
180
shrinker_id_to_index(int shrinker_id)181 static inline int shrinker_id_to_index(int shrinker_id)
182 {
183 return shrinker_id / SHRINKER_UNIT_BITS;
184 }
185
shrinker_id_to_offset(int shrinker_id)186 static inline int shrinker_id_to_offset(int shrinker_id)
187 {
188 return shrinker_id % SHRINKER_UNIT_BITS;
189 }
190
calc_shrinker_id(int index,int offset)191 static inline int calc_shrinker_id(int index, int offset)
192 {
193 return index * SHRINKER_UNIT_BITS + offset;
194 }
195
set_shrinker_bit(struct mem_cgroup * memcg,int nid,int shrinker_id)196 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
197 {
198 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
199 struct shrinker_info *info;
200 struct shrinker_info_unit *unit;
201
202 rcu_read_lock();
203 info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
204 unit = info->unit[shrinker_id_to_index(shrinker_id)];
205 if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
206 /* Pairs with smp mb in shrink_slab() */
207 smp_mb__before_atomic();
208 set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
209 }
210 rcu_read_unlock();
211 }
212 }
213
/*
 * Id allocator for memcg-aware shrinkers; an id maps back to its
 * struct shrinker through idr_find().
 */
static DEFINE_IDR(shrinker_idr);
215
shrinker_memcg_alloc(struct shrinker * shrinker)216 static int shrinker_memcg_alloc(struct shrinker *shrinker)
217 {
218 int id, ret = -ENOMEM;
219
220 if (mem_cgroup_disabled())
221 return -ENOSYS;
222
223 mutex_lock(&shrinker_mutex);
224 id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
225 if (id < 0)
226 goto unlock;
227
228 if (id >= shrinker_nr_max) {
229 if (expand_shrinker_info(id)) {
230 idr_remove(&shrinker_idr, id);
231 goto unlock;
232 }
233 }
234 shrinker->id = id;
235 ret = 0;
236 unlock:
237 mutex_unlock(&shrinker_mutex);
238 return ret;
239 }
240
shrinker_memcg_remove(struct shrinker * shrinker)241 static void shrinker_memcg_remove(struct shrinker *shrinker)
242 {
243 int id = shrinker->id;
244
245 BUG_ON(id < 0);
246
247 lockdep_assert_held(&shrinker_mutex);
248
249 idr_remove(&shrinker_idr, id);
250 }
251
xchg_nr_deferred_memcg(int nid,struct shrinker * shrinker,struct mem_cgroup * memcg)252 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
253 struct mem_cgroup *memcg)
254 {
255 struct shrinker_info *info;
256 struct shrinker_info_unit *unit;
257 long nr_deferred;
258
259 rcu_read_lock();
260 info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
261 unit = info->unit[shrinker_id_to_index(shrinker->id)];
262 nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0);
263 rcu_read_unlock();
264
265 return nr_deferred;
266 }
267
add_nr_deferred_memcg(long nr,int nid,struct shrinker * shrinker,struct mem_cgroup * memcg)268 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
269 struct mem_cgroup *memcg)
270 {
271 struct shrinker_info *info;
272 struct shrinker_info_unit *unit;
273 long nr_deferred;
274
275 rcu_read_lock();
276 info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
277 unit = info->unit[shrinker_id_to_index(shrinker->id)];
278 nr_deferred =
279 atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]);
280 rcu_read_unlock();
281
282 return nr_deferred;
283 }
284
reparent_shrinker_deferred(struct mem_cgroup * memcg)285 void reparent_shrinker_deferred(struct mem_cgroup *memcg)
286 {
287 int nid, index, offset;
288 long nr;
289 struct mem_cgroup *parent;
290 struct shrinker_info *child_info, *parent_info;
291 struct shrinker_info_unit *child_unit, *parent_unit;
292
293 parent = parent_mem_cgroup(memcg);
294 if (!parent)
295 parent = root_mem_cgroup;
296
297 /* Prevent from concurrent shrinker_info expand */
298 mutex_lock(&shrinker_mutex);
299 for_each_node(nid) {
300 child_info = shrinker_info_protected(memcg, nid);
301 parent_info = shrinker_info_protected(parent, nid);
302 for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) {
303 child_unit = child_info->unit[index];
304 parent_unit = parent_info->unit[index];
305 for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
306 nr = atomic_long_read(&child_unit->nr_deferred[offset]);
307 atomic_long_add(nr, &parent_unit->nr_deferred[offset]);
308 }
309 }
310 }
311 mutex_unlock(&shrinker_mutex);
312 }
313 #else
shrinker_memcg_alloc(struct shrinker * shrinker)314 static int shrinker_memcg_alloc(struct shrinker *shrinker)
315 {
316 return -ENOSYS;
317 }
318
/* !CONFIG_MEMCG variant: no id was ever allocated, so nothing to undo. */
static void shrinker_memcg_remove(struct shrinker *shrinker)
{
}
322
/* !CONFIG_MEMCG variant: there is no per-memcg deferred count; returns 0. */
static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	return 0;
}
328
/* !CONFIG_MEMCG variant: there is no per-memcg deferred count; returns 0. */
static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	return 0;
}
334 #endif /* CONFIG_MEMCG */
335
xchg_nr_deferred(struct shrinker * shrinker,struct shrink_control * sc)336 static long xchg_nr_deferred(struct shrinker *shrinker,
337 struct shrink_control *sc)
338 {
339 int nid = sc->nid;
340
341 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
342 nid = 0;
343
344 if (sc->memcg &&
345 (shrinker->flags & SHRINKER_MEMCG_AWARE))
346 return xchg_nr_deferred_memcg(nid, shrinker,
347 sc->memcg);
348
349 return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
350 }
351
352
add_nr_deferred(long nr,struct shrinker * shrinker,struct shrink_control * sc)353 static long add_nr_deferred(long nr, struct shrinker *shrinker,
354 struct shrink_control *sc)
355 {
356 int nid = sc->nid;
357
358 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
359 nid = 0;
360
361 if (sc->memcg &&
362 (shrinker->flags & SHRINKER_MEMCG_AWARE))
363 return add_nr_deferred_memcg(nr, nid, shrinker,
364 sc->memcg);
365
366 return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
367 }
368
/* Batch size do_shrink_slab() uses when a shrinker leaves ->batch at zero. */
#define SHRINK_BATCH 128
370
do_shrink_slab(struct shrink_control * shrinkctl,struct shrinker * shrinker,int priority)371 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
372 struct shrinker *shrinker, int priority)
373 {
374 unsigned long freed = 0;
375 unsigned long long delta;
376 long total_scan;
377 long freeable;
378 long nr;
379 long new_nr;
380 long batch_size = shrinker->batch ? shrinker->batch
381 : SHRINK_BATCH;
382 long scanned = 0, next_deferred;
383
384 freeable = shrinker->count_objects(shrinker, shrinkctl);
385 if (freeable == 0 || freeable == SHRINK_EMPTY)
386 return freeable;
387
388 /*
389 * copy the current shrinker scan count into a local variable
390 * and zero it so that other concurrent shrinker invocations
391 * don't also do this scanning work.
392 */
393 nr = xchg_nr_deferred(shrinker, shrinkctl);
394
395 if (shrinker->seeks) {
396 delta = freeable >> priority;
397 delta *= 4;
398 do_div(delta, shrinker->seeks);
399 } else {
400 /*
401 * These objects don't require any IO to create. Trim
402 * them aggressively under memory pressure to keep
403 * them from causing refetches in the IO caches.
404 */
405 delta = freeable / 2;
406 }
407
408 total_scan = nr >> priority;
409 total_scan += delta;
410 total_scan = min(total_scan, (2 * freeable));
411
412 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
413 freeable, delta, total_scan, priority);
414
415 /*
416 * Normally, we should not scan less than batch_size objects in one
417 * pass to avoid too frequent shrinker calls, but if the slab has less
418 * than batch_size objects in total and we are really tight on memory,
419 * we will try to reclaim all available objects, otherwise we can end
420 * up failing allocations although there are plenty of reclaimable
421 * objects spread over several slabs with usage less than the
422 * batch_size.
423 *
424 * We detect the "tight on memory" situations by looking at the total
425 * number of objects we want to scan (total_scan). If it is greater
426 * than the total number of objects on slab (freeable), we must be
427 * scanning at high prio and therefore should try to reclaim as much as
428 * possible.
429 */
430 while (total_scan >= batch_size ||
431 total_scan >= freeable) {
432 unsigned long ret;
433 unsigned long nr_to_scan = min(batch_size, total_scan);
434
435 shrinkctl->nr_to_scan = nr_to_scan;
436 shrinkctl->nr_scanned = nr_to_scan;
437 ret = shrinker->scan_objects(shrinker, shrinkctl);
438 if (ret == SHRINK_STOP)
439 break;
440 freed += ret;
441
442 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
443 total_scan -= shrinkctl->nr_scanned;
444 scanned += shrinkctl->nr_scanned;
445
446 cond_resched();
447 }
448
449 /*
450 * The deferred work is increased by any new work (delta) that wasn't
451 * done, decreased by old deferred work that was done now.
452 *
453 * And it is capped to two times of the freeable items.
454 */
455 next_deferred = max_t(long, (nr + delta - scanned), 0);
456 next_deferred = min(next_deferred, (2 * freeable));
457
458 /*
459 * move the unused scan count back into the shrinker in a
460 * manner that handles concurrent updates.
461 */
462 new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
463
464 trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
465 return freed;
466 }
467
468 #ifdef CONFIG_MEMCG
/*
 * Run the memcg-aware shrinkers whose bit is set in the shrinker map of
 * @memcg on node @nid.
 *
 * Returns the number of objects freed; 0 when @memcg is not online or has no
 * shrinker_info on this node.
 */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
			struct mem_cgroup *memcg, int priority)
{
	struct shrinker_info *info;
	unsigned long ret, freed = 0;
	int offset, index = 0;

	if (!mem_cgroup_online(memcg))
		return 0;

	/*
	 * lockless algorithm of memcg shrink.
	 *
	 * The shrinker_info may be freed asynchronously via RCU in the
	 * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used
	 * to ensure the existence of the shrinker_info.
	 *
	 * The shrinker_info_unit is never freed unless its corresponding memcg
	 * is destroyed. Here we already hold the refcount of memcg, so the
	 * memcg will not be destroyed, and of course shrinker_info_unit will
	 * not be freed.
	 *
	 * So in the memcg shrink:
	 *  step 1: use rcu_read_lock() to guarantee existence of the
	 *          shrinker_info.
	 *  step 2: after getting shrinker_info_unit we can safely release the
	 *          RCU lock.
	 *  step 3: traverse the bitmap and calculate shrinker_id
	 *  step 4: use rcu_read_lock() to guarantee existence of the shrinker.
	 *  step 5: use shrinker_id to find the shrinker, then use
	 *          shrinker_try_get() to guarantee existence of the shrinker,
	 *          then we can release the RCU lock to do do_shrink_slab() that
	 *          may sleep.
	 *  step 6: do shrinker_put() paired with step 5 to put the refcount,
	 *          if the refcount reaches 0, then wake up the waiter in
	 *          shrinker_free() by calling complete().
	 *          Note: here is different from the global shrink, we don't
	 *                need to acquire the RCU lock to guarantee existence of
	 *                the shrinker, because we don't need to use this
	 *                shrinker to traverse the next shrinker in the bitmap.
	 *  step 7: we have already exited the read-side of rcu critical section
	 *          before calling do_shrink_slab(), the shrinker_info may be
	 *          released in expand_one_shrinker_info(), so go back to step 1
	 *          to reacquire the shrinker_info.
	 */
again:
	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	if (unlikely(!info))
		goto unlock;

	if (index < shrinker_id_to_index(info->map_nr_max)) {
		struct shrinker_info_unit *unit;

		unit = info->unit[index];

		rcu_read_unlock();

		for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
			struct shrink_control sc = {
				.gfp_mask = gfp_mask,
				.nid = nid,
				.memcg = memcg,
			};
			struct shrinker *shrinker;
			int shrinker_id = calc_shrinker_id(index, offset);

			rcu_read_lock();
			shrinker = idr_find(&shrinker_idr, shrinker_id);
			if (unlikely(!shrinker || !shrinker_try_get(shrinker))) {
				clear_bit(offset, unit->map);
				rcu_read_unlock();
				continue;
			}
			rcu_read_unlock();

			/* Call non-slab shrinkers even though kmem is disabled */
			if (!memcg_kmem_online() &&
			    !(shrinker->flags & SHRINKER_NONSLAB)) {
				/*
				 * Drop the reference taken by
				 * shrinker_try_get() above. Skipping the put
				 * would leak it and leave shrinker_free()
				 * waiting forever for the refcount to reach
				 * zero.
				 */
				shrinker_put(shrinker);
				continue;
			}

			ret = do_shrink_slab(&sc, shrinker, priority);
			if (ret == SHRINK_EMPTY) {
				clear_bit(offset, unit->map);
				/*
				 * After the shrinker reported that it had no objects to
				 * free, but before we cleared the corresponding bit in
				 * the memcg shrinker map, a new object might have been
				 * added. To make sure, we have the bit set in this
				 * case, we invoke the shrinker one more time and reset
				 * the bit if it reports that it is not empty anymore.
				 * The memory barrier here pairs with the barrier in
				 * set_shrinker_bit():
				 *
				 * list_lru_add()     shrink_slab_memcg()
				 *   list_add_tail()    clear_bit()
				 *   <MB>               <MB>
				 *   set_bit()          do_shrink_slab()
				 */
				smp_mb__after_atomic();
				ret = do_shrink_slab(&sc, shrinker, priority);
				if (ret == SHRINK_EMPTY)
					ret = 0;
				else
					set_shrinker_bit(memcg, nid, shrinker_id);
			}
			freed += ret;
			shrinker_put(shrinker);
		}

		index++;
		goto again;
	}
unlock:
	rcu_read_unlock();
	return freed;
}
586 #else /* !CONFIG_MEMCG */
/* !CONFIG_MEMCG variant: there are no per-memcg shrinker maps; frees nothing. */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
			struct mem_cgroup *memcg, int priority)
{
	return 0;
}
592 #endif /* CONFIG_MEMCG */
593
594 /**
595 * shrink_slab - shrink slab caches
596 * @gfp_mask: allocation context
597 * @nid: node whose slab caches to target
598 * @memcg: memory cgroup whose slab caches to target
599 * @priority: the reclaim priority
600 *
601 * Call the shrink functions to age shrinkable caches.
602 *
603 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
604 * unaware shrinkers will receive a node id of 0 instead.
605 *
606 * @memcg specifies the memory cgroup to target. Unaware shrinkers
607 * are called only if it is the root cgroup.
608 *
609 * @priority is sc->priority, we take the number of objects and >> by priority
610 * in order to get the scan target.
611 *
612 * Returns the number of reclaimed slab objects.
613 */
shrink_slab(gfp_t gfp_mask,int nid,struct mem_cgroup * memcg,int priority)614 unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
615 int priority)
616 {
617 unsigned long ret, freed = 0;
618 struct shrinker *shrinker;
619
620 /*
621 * The root memcg might be allocated even though memcg is disabled
622 * via "cgroup_disable=memory" boot parameter. This could make
623 * mem_cgroup_is_root() return false, then just run memcg slab
624 * shrink, but skip global shrink. This may result in premature
625 * oom.
626 */
627 if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
628 return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
629
630 /*
631 * lockless algorithm of global shrink.
632 *
633 * In the unregistration setp, the shrinker will be freed asynchronously
634 * via RCU after its refcount reaches 0. So both rcu_read_lock() and
635 * shrinker_try_get() can be used to ensure the existence of the shrinker.
636 *
637 * So in the global shrink:
638 * step 1: use rcu_read_lock() to guarantee existence of the shrinker
639 * and the validity of the shrinker_list walk.
640 * step 2: use shrinker_try_get() to try get the refcount, if successful,
641 * then the existence of the shrinker can also be guaranteed,
642 * so we can release the RCU lock to do do_shrink_slab() that
643 * may sleep.
644 * step 3: *MUST* to reacquire the RCU lock before calling shrinker_put(),
645 * which ensures that neither this shrinker nor the next shrinker
646 * will be freed in the next traversal operation.
647 * step 4: do shrinker_put() paired with step 2 to put the refcount,
648 * if the refcount reaches 0, then wake up the waiter in
649 * shrinker_free() by calling complete().
650 */
651 rcu_read_lock();
652 list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
653 struct shrink_control sc = {
654 .gfp_mask = gfp_mask,
655 .nid = nid,
656 .memcg = memcg,
657 };
658
659 if (!shrinker_try_get(shrinker))
660 continue;
661
662 rcu_read_unlock();
663
664 ret = do_shrink_slab(&sc, shrinker, priority);
665 if (ret == SHRINK_EMPTY)
666 ret = 0;
667 freed += ret;
668
669 rcu_read_lock();
670 shrinker_put(shrinker);
671 }
672
673 rcu_read_unlock();
674 cond_resched();
675 return freed;
676 }
677
shrinker_alloc(unsigned int flags,const char * fmt,...)678 struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
679 {
680 struct shrinker *shrinker;
681 unsigned int size;
682 va_list ap;
683 int err;
684
685 shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL);
686 if (!shrinker)
687 return NULL;
688
689 va_start(ap, fmt);
690 err = shrinker_debugfs_name_alloc(shrinker, fmt, ap);
691 va_end(ap);
692 if (err)
693 goto err_name;
694
695 shrinker->flags = flags | SHRINKER_ALLOCATED;
696 shrinker->seeks = DEFAULT_SEEKS;
697
698 if (flags & SHRINKER_MEMCG_AWARE) {
699 err = shrinker_memcg_alloc(shrinker);
700 if (err == -ENOSYS) {
701 /* Memcg is not supported, fallback to non-memcg-aware shrinker. */
702 shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
703 goto non_memcg;
704 }
705
706 if (err)
707 goto err_flags;
708
709 return shrinker;
710 }
711
712 non_memcg:
713 /*
714 * The nr_deferred is available on per memcg level for memcg aware
715 * shrinkers, so only allocate nr_deferred in the following cases:
716 * - non-memcg-aware shrinkers
717 * - !CONFIG_MEMCG
718 * - memcg is disabled by kernel command line
719 */
720 size = sizeof(*shrinker->nr_deferred);
721 if (flags & SHRINKER_NUMA_AWARE)
722 size *= nr_node_ids;
723
724 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
725 if (!shrinker->nr_deferred)
726 goto err_flags;
727
728 return shrinker;
729
730 err_flags:
731 shrinker_debugfs_name_free(shrinker);
732 err_name:
733 kfree(shrinker);
734 return NULL;
735 }
736 EXPORT_SYMBOL_GPL(shrinker_alloc);
737
shrinker_register(struct shrinker * shrinker)738 void shrinker_register(struct shrinker *shrinker)
739 {
740 if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) {
741 pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker");
742 return;
743 }
744
745 mutex_lock(&shrinker_mutex);
746 list_add_tail_rcu(&shrinker->list, &shrinker_list);
747 shrinker->flags |= SHRINKER_REGISTERED;
748 shrinker_debugfs_add(shrinker);
749 mutex_unlock(&shrinker_mutex);
750
751 init_completion(&shrinker->done);
752 /*
753 * Now the shrinker is fully set up, take the first reference to it to
754 * indicate that lookup operations are now allowed to use it via
755 * shrinker_try_get().
756 */
757 refcount_set(&shrinker->refcount, 1);
758 }
759 EXPORT_SYMBOL_GPL(shrinker_register);
760
shrinker_free_rcu_cb(struct rcu_head * head)761 static void shrinker_free_rcu_cb(struct rcu_head *head)
762 {
763 struct shrinker *shrinker = container_of(head, struct shrinker, rcu);
764
765 kfree(shrinker->nr_deferred);
766 kfree(shrinker);
767 }
768
shrinker_free(struct shrinker * shrinker)769 void shrinker_free(struct shrinker *shrinker)
770 {
771 struct dentry *debugfs_entry = NULL;
772 int debugfs_id;
773
774 if (!shrinker)
775 return;
776
777 if (shrinker->flags & SHRINKER_REGISTERED) {
778 /* drop the initial refcount */
779 shrinker_put(shrinker);
780 /*
781 * Wait for all lookups of the shrinker to complete, after that,
782 * no shrinker is running or will run again, then we can safely
783 * free it asynchronously via RCU and safely free the structure
784 * where the shrinker is located, such as super_block etc.
785 */
786 wait_for_completion(&shrinker->done);
787 }
788
789 mutex_lock(&shrinker_mutex);
790 if (shrinker->flags & SHRINKER_REGISTERED) {
791 /*
792 * Now we can safely remove it from the shrinker_list and then
793 * free it.
794 */
795 list_del_rcu(&shrinker->list);
796 debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
797 shrinker->flags &= ~SHRINKER_REGISTERED;
798 }
799
800 shrinker_debugfs_name_free(shrinker);
801
802 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
803 shrinker_memcg_remove(shrinker);
804 mutex_unlock(&shrinker_mutex);
805
806 if (debugfs_entry)
807 shrinker_debugfs_remove(debugfs_entry, debugfs_id);
808
809 call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
810 }
811 EXPORT_SYMBOL_GPL(shrinker_free);
812