1 // SPDX-License-Identifier: GPL-2.0-or-later
2
3 #include <linux/memcontrol.h>
4 #include <linux/swap.h>
5 #include <linux/mm_inline.h>
6 #include <linux/pagewalk.h>
7 #include <linux/backing-dev.h>
8 #include <linux/swap_cgroup.h>
9 #include <linux/eventfd.h>
10 #include <linux/poll.h>
11 #include <linux/sort.h>
12 #include <linux/file.h>
13 #include <linux/seq_buf.h>
14
15 #include "internal.h"
16 #include "swap.h"
17 #include "memcontrol-v1.h"
18
19 /*
20 * Cgroups above their limits are maintained in a RB-Tree, independent of
21 * their hierarchy representation
22 */
23
24 struct mem_cgroup_tree_per_node {
25 struct rb_root rb_root;
26 struct rb_node *rb_rightmost;
27 spinlock_t lock;
28 };
29
30 struct mem_cgroup_tree {
31 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
32 };
33
34 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
35
36 /*
37 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
38 * limit reclaim to prevent infinite loops, if they ever occur.
39 */
40 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
41 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
42
43 /* Stuff for moving charges at task migration. */
44 /*
45 * Types of charges to be moved.
46 */
47 #define MOVE_ANON 0x1ULL
48 #define MOVE_FILE 0x2ULL
49 #define MOVE_MASK (MOVE_ANON | MOVE_FILE)
50
51 /* "mc" and its members are protected by cgroup_mutex */
52 static struct move_charge_struct {
53 spinlock_t lock; /* for from, to */
54 struct mm_struct *mm;
55 struct mem_cgroup *from;
56 struct mem_cgroup *to;
57 unsigned long flags;
58 unsigned long precharge;
59 unsigned long moved_charge;
60 unsigned long moved_swap;
61 struct task_struct *moving_task; /* a task moving charges */
62 wait_queue_head_t waitq; /* a waitq for other context */
63 } mc = {
64 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
65 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
66 };
67
68 /* for OOM */
69 struct mem_cgroup_eventfd_list {
70 struct list_head list;
71 struct eventfd_ctx *eventfd;
72 };
73
74 /*
75 * cgroup_event represents events which userspace wants to receive.
76 */
77 struct mem_cgroup_event {
78 /*
79 * memcg which the event belongs to.
80 */
81 struct mem_cgroup *memcg;
82 /*
83 * eventfd to signal userspace about the event.
84 */
85 struct eventfd_ctx *eventfd;
86 /*
87 * Each of these is stored in a list by the cgroup.
88 */
89 struct list_head list;
90 /*
91 * register_event() callback will be used to add new userspace
92 * waiter for changes related to this event. Use eventfd_signal()
93 * on eventfd to send notification to userspace.
94 */
95 int (*register_event)(struct mem_cgroup *memcg,
96 struct eventfd_ctx *eventfd, const char *args);
97 /*
98 * unregister_event() callback will be called when userspace closes
99 * the eventfd or when the cgroup is removed. This callback must be set
100 * if you want to provide notification functionality.
101 */
102 void (*unregister_event)(struct mem_cgroup *memcg,
103 struct eventfd_ctx *eventfd);
104 /*
105 * All fields below are needed to unregister the event when
106 * userspace closes the eventfd.
107 */
108 poll_table pt;
109 wait_queue_head_t *wqh;
110 wait_queue_entry_t wait;
111 struct work_struct remove;
112 };
113
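/*
 * Helpers for packing a resource type and a RES_* attribute into the
 * ->private field of a cftype, e.g. MEMFILE_PRIVATE(_MEM, RES_LIMIT):
 * the type lands in bits 16-31 and the attribute in bits 0-15, and
 * MEMFILE_TYPE()/MEMFILE_ATTR() recover them again.
 */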
114 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
115 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
116 #define MEMFILE_ATTR(val) ((val) & 0xffff)
117
118 enum {
119 RES_USAGE,
120 RES_LIMIT,
121 RES_MAX_USAGE,
122 RES_FAILCNT,
123 RES_SOFT_LIMIT,
124 };
125
126 #ifdef CONFIG_LOCKDEP
127 static struct lockdep_map memcg_oom_lock_dep_map = {
128 .name = "memcg_oom_lock",
129 };
130 #endif
131
132 DEFINE_SPINLOCK(memcg_oom_lock);
133
134 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
135 struct mem_cgroup_tree_per_node *mctz,
136 unsigned long new_usage_in_excess)
137 {
138 struct rb_node **p = &mctz->rb_root.rb_node;
139 struct rb_node *parent = NULL;
140 struct mem_cgroup_per_node *mz_node;
141 bool rightmost = true;
142
143 if (mz->on_tree)
144 return;
145
146 mz->usage_in_excess = new_usage_in_excess;
147 if (!mz->usage_in_excess)
148 return;
149 while (*p) {
150 parent = *p;
151 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
152 tree_node);
153 if (mz->usage_in_excess < mz_node->usage_in_excess) {
154 p = &(*p)->rb_left;
155 rightmost = false;
156 } else {
157 p = &(*p)->rb_right;
158 }
159 }
160
161 if (rightmost)
162 mctz->rb_rightmost = &mz->tree_node;
163
164 rb_link_node(&mz->tree_node, parent, p);
165 rb_insert_color(&mz->tree_node, &mctz->rb_root);
166 mz->on_tree = true;
167 }
168
169 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
170 struct mem_cgroup_tree_per_node *mctz)
171 {
172 if (!mz->on_tree)
173 return;
174
175 if (&mz->tree_node == mctz->rb_rightmost)
176 mctz->rb_rightmost = rb_prev(&mz->tree_node);
177
178 rb_erase(&mz->tree_node, &mctz->rb_root);
179 mz->on_tree = false;
180 }
181
182 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
183 struct mem_cgroup_tree_per_node *mctz)
184 {
185 unsigned long flags;
186
187 spin_lock_irqsave(&mctz->lock, flags);
188 __mem_cgroup_remove_exceeded(mz, mctz);
189 spin_unlock_irqrestore(&mctz->lock, flags);
190 }
191
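/*
 * Number of pages charged to @memcg beyond its soft limit, or 0 if the
 * memcg is at or below it.
 */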
192 static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
193 {
194 unsigned long nr_pages = page_counter_read(&memcg->memory);
195 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
196 unsigned long excess = 0;
197
198 if (nr_pages > soft_limit)
199 excess = nr_pages - soft_limit;
200
201 return excess;
202 }
203
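/*
 * Re-position @memcg and all of its ancestors in the per-node soft limit
 * tree for @nid according to their current excess. With MGLRU enabled the
 * tree is not used; we only hint lru_gen when the soft limit is exceeded.
 */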
204 static void memcg1_update_tree(struct mem_cgroup *memcg, int nid)
205 {
206 unsigned long excess;
207 struct mem_cgroup_per_node *mz;
208 struct mem_cgroup_tree_per_node *mctz;
209
210 if (lru_gen_enabled()) {
211 if (soft_limit_excess(memcg))
212 lru_gen_soft_reclaim(memcg, nid);
213 return;
214 }
215
216 mctz = soft_limit_tree.rb_tree_per_node[nid];
217 if (!mctz)
218 return;
219 /*
220 * Necessary to update all ancestors when hierarchy is used,
221 * because their event counter is not touched.
222 */
223 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
224 mz = memcg->nodeinfo[nid];
225 excess = soft_limit_excess(memcg);
226 /*
227 * We have to update the tree if mz is on RB-tree or
228 * mem is over its softlimit.
229 */
230 if (excess || mz->on_tree) {
231 unsigned long flags;
232
233 spin_lock_irqsave(&mctz->lock, flags);
234 /* if on-tree, remove it */
235 if (mz->on_tree)
236 __mem_cgroup_remove_exceeded(mz, mctz);
237 /*
238 * Insert again. mz->usage_in_excess will be updated.
239 * If excess is 0, no tree ops.
240 */
241 __mem_cgroup_insert_exceeded(mz, mctz, excess);
242 spin_unlock_irqrestore(&mctz->lock, flags);
243 }
244 }
245 }
246
247 void memcg1_remove_from_trees(struct mem_cgroup *memcg)
248 {
249 struct mem_cgroup_tree_per_node *mctz;
250 struct mem_cgroup_per_node *mz;
251 int nid;
252
253 for_each_node(nid) {
254 mz = memcg->nodeinfo[nid];
255 mctz = soft_limit_tree.rb_tree_per_node[nid];
256 if (mctz)
257 mem_cgroup_remove_exceeded(mz, mctz);
258 }
259 }
260
261 static struct mem_cgroup_per_node *
262 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
263 {
264 struct mem_cgroup_per_node *mz;
265
266 retry:
267 mz = NULL;
268 if (!mctz->rb_rightmost)
269 goto done; /* Nothing to reclaim from */
270
271 mz = rb_entry(mctz->rb_rightmost,
272 struct mem_cgroup_per_node, tree_node);
273 /*
274 * Remove the node now but someone else can add it back,
275 * we will add it back at the end of reclaim to its correct
276 * position in the tree.
277 */
278 __mem_cgroup_remove_exceeded(mz, mctz);
279 if (!soft_limit_excess(mz->memcg) ||
280 !css_tryget(&mz->memcg->css))
281 goto retry;
282 done:
283 return mz;
284 }
285
286 static struct mem_cgroup_per_node *
287 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
288 {
289 struct mem_cgroup_per_node *mz;
290
291 spin_lock_irq(&mctz->lock);
292 mz = __mem_cgroup_largest_soft_limit_node(mctz);
293 spin_unlock_irq(&mctz->lock);
294 return mz;
295 }
296
297 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
298 pg_data_t *pgdat,
299 gfp_t gfp_mask,
300 unsigned long *total_scanned)
301 {
302 struct mem_cgroup *victim = NULL;
303 int total = 0;
304 int loop = 0;
305 unsigned long excess;
306 unsigned long nr_scanned;
307 struct mem_cgroup_reclaim_cookie reclaim = {
308 .pgdat = pgdat,
309 };
310
311 excess = soft_limit_excess(root_memcg);
312
313 while (1) {
314 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
315 if (!victim) {
316 loop++;
317 if (loop >= 2) {
318 /*
319 * If we have not been able to reclaim
320 * anything, it might be because there are
321 * no reclaimable pages under this hierarchy
322 */
323 if (!total)
324 break;
325 /*
326 * We want to do more targeted reclaim.
327 * excess >> 2 is not too excessive, so we don't
328 * reclaim too much, nor too little, so we don't keep
329 * coming back to reclaim from this cgroup.
330 */
331 if (total >= (excess >> 2) ||
332 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
333 break;
334 }
335 continue;
336 }
337 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
338 pgdat, &nr_scanned);
339 *total_scanned += nr_scanned;
340 if (!soft_limit_excess(root_memcg))
341 break;
342 }
343 mem_cgroup_iter_break(root_memcg, victim);
344 return total;
345 }
346
347 unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
348 gfp_t gfp_mask,
349 unsigned long *total_scanned)
350 {
351 unsigned long nr_reclaimed = 0;
352 struct mem_cgroup_per_node *mz, *next_mz = NULL;
353 unsigned long reclaimed;
354 int loop = 0;
355 struct mem_cgroup_tree_per_node *mctz;
356 unsigned long excess;
357
358 if (lru_gen_enabled())
359 return 0;
360
361 if (order > 0)
362 return 0;
363
364 mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
365
366 /*
367 * Do not even bother to check the largest node if the root
368 * is empty. Do it lockless to prevent lock bouncing. Races
369 * are acceptable as soft limit is best effort anyway.
370 */
371 if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
372 return 0;
373
374 /*
375 * This loop can run for a while, especially if mem_cgroups continuously
376 * keep exceeding their soft limit and putting the system under
377 * pressure.
378 */
379 do {
380 if (next_mz)
381 mz = next_mz;
382 else
383 mz = mem_cgroup_largest_soft_limit_node(mctz);
384 if (!mz)
385 break;
386
387 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
388 gfp_mask, total_scanned);
389 nr_reclaimed += reclaimed;
390 spin_lock_irq(&mctz->lock);
391
392 /*
393 * If we failed to reclaim anything from this memory cgroup
394 * it is time to move on to the next cgroup
395 */
396 next_mz = NULL;
397 if (!reclaimed)
398 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
399
400 excess = soft_limit_excess(mz->memcg);
401 /*
402 * One school of thought says that we should not add
403 * back the node to the tree if reclaim returns 0.
404 * But our reclaim could return 0 simply because, due
405 * to priority, we are exposing a smaller subset of
406 * memory to reclaim from. Consider this as a longer
407 * term TODO.
408 */
409 /* If excess == 0, no tree ops */
410 __mem_cgroup_insert_exceeded(mz, mctz, excess);
411 spin_unlock_irq(&mctz->lock);
412 css_put(&mz->memcg->css);
413 loop++;
414 /*
415 * Could not reclaim anything and there are no more
416 * mem cgroups to try or we seem to be looping without
417 * reclaiming anything.
418 */
419 if (!nr_reclaimed &&
420 (next_mz == NULL ||
421 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
422 break;
423 } while (!nr_reclaimed);
424 if (next_mz)
425 css_put(&next_mz->memcg->css);
426 return nr_reclaimed;
427 }
428
429 /*
430 * A routine for checking whether "mem" is under move_account() or not.
431 *
432 * Checks whether a cgroup is mc.from or mc.to or under the hierarchy of
433 * moving cgroups. This is for waiting at high memory pressure
434 * caused by "move".
435 */
436 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
437 {
438 struct mem_cgroup *from;
439 struct mem_cgroup *to;
440 bool ret = false;
441 /*
442 * Unlike task_move routines, we access mc.to, mc.from not under
443 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
444 */
445 spin_lock(&mc.lock);
446 from = mc.from;
447 to = mc.to;
448 if (!from)
449 goto unlock;
450
451 ret = mem_cgroup_is_descendant(from, memcg) ||
452 mem_cgroup_is_descendant(to, memcg);
453 unlock:
454 spin_unlock(&mc.lock);
455 return ret;
456 }
457
458 bool memcg1_wait_acct_move(struct mem_cgroup *memcg)
459 {
460 if (mc.moving_task && current != mc.moving_task) {
461 if (mem_cgroup_under_move(memcg)) {
462 DEFINE_WAIT(wait);
463 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
464 /* moving charge context might have finished. */
465 if (mc.moving_task)
466 schedule();
467 finish_wait(&mc.waitq, &wait);
468 return true;
469 }
470 }
471 return false;
472 }
473
474 /**
475 * folio_memcg_lock - Bind a folio to its memcg.
476 * @folio: The folio.
477 *
478 * This function prevents unlocked LRU folios from being moved to
479 * another cgroup.
480 *
481 * It ensures lifetime of the bound memcg. The caller is responsible
482 * for the lifetime of the folio.
483 */
484 void folio_memcg_lock(struct folio *folio)
485 {
486 struct mem_cgroup *memcg;
487 unsigned long flags;
488
489 /*
490 * The RCU lock is held throughout the transaction. The fast
491 * path can get away without acquiring the memcg->move_lock
492 * because page moving starts with an RCU grace period.
493 */
494 rcu_read_lock();
495
496 if (mem_cgroup_disabled())
497 return;
498 again:
499 memcg = folio_memcg(folio);
500 if (unlikely(!memcg))
501 return;
502
503 #ifdef CONFIG_PROVE_LOCKING
504 local_irq_save(flags);
505 might_lock(&memcg->move_lock);
506 local_irq_restore(flags);
507 #endif
508
509 if (atomic_read(&memcg->moving_account) <= 0)
510 return;
511
512 spin_lock_irqsave(&memcg->move_lock, flags);
513 if (memcg != folio_memcg(folio)) {
514 spin_unlock_irqrestore(&memcg->move_lock, flags);
515 goto again;
516 }
517
518 /*
519 * When charge migration first begins, we can have multiple
520 * critical sections holding the fast-path RCU lock and one
521 * holding the slowpath move_lock. Track the task who has the
522 * move_lock for folio_memcg_unlock().
523 */
524 memcg->move_lock_task = current;
525 memcg->move_lock_flags = flags;
526 }
527
528 static void __folio_memcg_unlock(struct mem_cgroup *memcg)
529 {
530 if (memcg && memcg->move_lock_task == current) {
531 unsigned long flags = memcg->move_lock_flags;
532
533 memcg->move_lock_task = NULL;
534 memcg->move_lock_flags = 0;
535
536 spin_unlock_irqrestore(&memcg->move_lock, flags);
537 }
538
539 rcu_read_unlock();
540 }
541
542 /**
543 * folio_memcg_unlock - Release the binding between a folio and its memcg.
544 * @folio: The folio.
545 *
546 * This releases the binding created by folio_memcg_lock(). This does
547 * not change the accounting of this folio to its memcg, but it does
548 * permit others to change it.
549 */
550 void folio_memcg_unlock(struct folio *folio)
551 {
552 __folio_memcg_unlock(folio_memcg(folio));
553 }
554
555 #ifdef CONFIG_SWAP
556 /**
557 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
558 * @entry: swap entry to be moved
559 * @from: mem_cgroup which the entry is moved from
560 * @to: mem_cgroup which the entry is moved to
561 *
562 * It succeeds only when the swap_cgroup's record for this entry is the same
563 * as the mem_cgroup's id of @from.
564 *
565 * Returns 0 on success, -EINVAL on failure.
566 *
567 * The caller must have charged to @to, IOW, called page_counter_charge() about
568 * both res and memsw, and called css_get().
569 */
570 static int mem_cgroup_move_swap_account(swp_entry_t entry,
571 struct mem_cgroup *from, struct mem_cgroup *to)
572 {
573 unsigned short old_id, new_id;
574
575 old_id = mem_cgroup_id(from);
576 new_id = mem_cgroup_id(to);
577
578 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
579 mod_memcg_state(from, MEMCG_SWAP, -1);
580 mod_memcg_state(to, MEMCG_SWAP, 1);
581 return 0;
582 }
583 return -EINVAL;
584 }
585 #else
586 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
587 struct mem_cgroup *from, struct mem_cgroup *to)
588 {
589 return -EINVAL;
590 }
591 #endif
592
593 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
594 struct cftype *cft)
595 {
596 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
597 }
598
599 #ifdef CONFIG_MMU
600 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
601 struct cftype *cft, u64 val)
602 {
603 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
604
605 pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
606 "Please report your usecase to linux-mm@kvack.org if you "
607 "depend on this functionality.\n");
608
609 if (val & ~MOVE_MASK)
610 return -EINVAL;
611
612 /*
613 * No kind of locking is needed in here, because ->can_attach() will
614 * check this value once in the beginning of the process, and then carry
615 * on with stale data. This means that changes to this value will only
616 * affect task migrations starting after the change.
617 */
618 memcg->move_charge_at_immigrate = val;
619 return 0;
620 }
621 #else
622 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
623 struct cftype *cft, u64 val)
624 {
625 return -ENOSYS;
626 }
627 #endif
628
629 #ifdef CONFIG_MMU
630 /* Handlers for move charge at task migration. */
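/*
 * Pre-charge @count pages to mc.to: first try one bulk charge without
 * direct reclaim, then fall back to charging page by page with reclaim
 * but without retries. Successfully charged pages are accounted in
 * mc.precharge for later use by the move.
 */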
631 static int mem_cgroup_do_precharge(unsigned long count)
632 {
633 int ret;
634
635 /* Try a single bulk charge without reclaim first, kswapd may wake */
636 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
637 if (!ret) {
638 mc.precharge += count;
639 return ret;
640 }
641
642 /* Try charges one by one with reclaim, but do not retry */
643 while (count--) {
644 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
645 if (ret)
646 return ret;
647 mc.precharge++;
648 cond_resched();
649 }
650 return 0;
651 }
652
653 union mc_target {
654 struct folio *folio;
655 swp_entry_t ent;
656 };
657
658 enum mc_target_type {
659 MC_TARGET_NONE = 0,
660 MC_TARGET_PAGE,
661 MC_TARGET_SWAP,
662 MC_TARGET_DEVICE,
663 };
664
665 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
666 unsigned long addr, pte_t ptent)
667 {
668 struct page *page = vm_normal_page(vma, addr, ptent);
669
670 if (!page)
671 return NULL;
672 if (PageAnon(page)) {
673 if (!(mc.flags & MOVE_ANON))
674 return NULL;
675 } else {
676 if (!(mc.flags & MOVE_FILE))
677 return NULL;
678 }
679 get_page(page);
680
681 return page;
682 }
683
684 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
685 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
686 pte_t ptent, swp_entry_t *entry)
687 {
688 struct page *page = NULL;
689 swp_entry_t ent = pte_to_swp_entry(ptent);
690
691 if (!(mc.flags & MOVE_ANON))
692 return NULL;
693
694 /*
695 * Handle device private pages that are not accessible by the CPU, but
696 * stored as special swap entries in the page table.
697 */
698 if (is_device_private_entry(ent)) {
699 page = pfn_swap_entry_to_page(ent);
700 if (!get_page_unless_zero(page))
701 return NULL;
702 return page;
703 }
704
705 if (non_swap_entry(ent))
706 return NULL;
707
708 /*
709 * Because swap_cache_get_folio() updates some statistics counter,
710 * we call find_get_page() with swapper_space directly.
711 */
712 page = find_get_page(swap_address_space(ent), swap_cache_index(ent));
713 entry->val = ent.val;
714
715 return page;
716 }
717 #else
718 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
719 pte_t ptent, swp_entry_t *entry)
720 {
721 return NULL;
722 }
723 #endif
724
725 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
726 unsigned long addr, pte_t ptent)
727 {
728 unsigned long index;
729 struct folio *folio;
730
731 if (!vma->vm_file) /* anonymous vma */
732 return NULL;
733 if (!(mc.flags & MOVE_FILE))
734 return NULL;
735
736 /* folio is moved even if it's not RSS of this task(page-faulted). */
737 /* shmem/tmpfs may report page out on swap: account for that too. */
738 index = linear_page_index(vma, addr);
739 folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
740 if (IS_ERR(folio))
741 return NULL;
742 return folio_file_page(folio, index);
743 }
744
745 static void memcg1_check_events(struct mem_cgroup *memcg, int nid);
746 static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages);
747
748 /**
749 * mem_cgroup_move_account - move account of the folio
750 * @folio: The folio.
751 * @compound: charge the page as compound or small page
752 * @from: mem_cgroup which the folio is moved from.
753 * @to: mem_cgroup which the folio is moved to. @from != @to.
754 *
755 * The folio must be locked and not on the LRU.
756 *
757 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
758 * from old cgroup.
759 */
760 static int mem_cgroup_move_account(struct folio *folio,
761 bool compound,
762 struct mem_cgroup *from,
763 struct mem_cgroup *to)
764 {
765 struct lruvec *from_vec, *to_vec;
766 struct pglist_data *pgdat;
767 unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
768 int nid, ret;
769
770 VM_BUG_ON(from == to);
771 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
772 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
773 VM_BUG_ON(compound && !folio_test_large(folio));
774
775 ret = -EINVAL;
776 if (folio_memcg(folio) != from)
777 goto out;
778
779 pgdat = folio_pgdat(folio);
780 from_vec = mem_cgroup_lruvec(from, pgdat);
781 to_vec = mem_cgroup_lruvec(to, pgdat);
782
783 folio_memcg_lock(folio);
784
785 if (folio_test_anon(folio)) {
786 if (folio_mapped(folio)) {
787 __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
788 __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
789 if (folio_test_pmd_mappable(folio)) {
790 __mod_lruvec_state(from_vec, NR_ANON_THPS,
791 -nr_pages);
792 __mod_lruvec_state(to_vec, NR_ANON_THPS,
793 nr_pages);
794 }
795 }
796 } else {
797 __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
798 __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
799
800 if (folio_test_swapbacked(folio)) {
801 __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
802 __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
803 }
804
805 if (folio_mapped(folio)) {
806 __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
807 __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
808 }
809
810 if (folio_test_dirty(folio)) {
811 struct address_space *mapping = folio_mapping(folio);
812
813 if (mapping_can_writeback(mapping)) {
814 __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
815 -nr_pages);
816 __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
817 nr_pages);
818 }
819 }
820 }
821
822 #ifdef CONFIG_SWAP
823 if (folio_test_swapcache(folio)) {
824 __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages);
825 __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages);
826 }
827 #endif
828 if (folio_test_writeback(folio)) {
829 __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
830 __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
831 }
832
833 /*
834 * All state has been migrated, let's switch to the new memcg.
835 *
836 * It is safe to change page's memcg here because the page
837 * is referenced, charged, isolated, and locked: we can't race
838 * with (un)charging, migration, LRU putback, or anything else
839 * that would rely on a stable page's memory cgroup.
840 *
841 * Note that folio_memcg_lock is a memcg lock, not a page lock,
842 * to save space. As soon as we switch page's memory cgroup to a
843 * new memcg that isn't locked, the above state can change
844 * concurrently again. Make sure we're truly done with it.
845 */
846 smp_mb();
847
848 css_get(&to->css);
849 css_put(&from->css);
850
851 /* Warning should never happen, so don't worry about refcount non-0 */
852 WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
853 folio->memcg_data = (unsigned long)to;
854
855 __folio_memcg_unlock(from);
856
857 ret = 0;
858 nid = folio_nid(folio);
859
860 local_irq_disable();
861 memcg1_charge_statistics(to, nr_pages);
862 memcg1_check_events(to, nid);
863 memcg1_charge_statistics(from, -nr_pages);
864 memcg1_check_events(from, nid);
865 local_irq_enable();
866 out:
867 return ret;
868 }
869
870 /**
871 * get_mctgt_type - get target type of moving charge
872 * @vma: the vma the pte to be checked belongs to
873 * @addr: the address corresponding to the pte to be checked
874 * @ptent: the pte to be checked
875 * @target: the pointer where the target folio or swap entry will be stored (can be NULL)
876 *
877 * Context: Called with pte lock held.
878 * Return:
879 * * MC_TARGET_NONE - If the pte is not a target for move charge.
880 * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for
881 * move charge. If @target is not NULL, the folio is stored in target->folio
882 * with extra refcnt taken (Caller should release it).
883 * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a
884 * target for charge migration. If @target is not NULL, the entry is
885 * stored in target->ent.
886 * * MC_TARGET_DEVICE - Like MC_TARGET_PAGE but page is device memory and
887 * thus not on the lru. For now such page is charged like a regular page
888 * would be as it is just special memory taking the place of a regular page.
889 * See Documentation/vm/hmm.txt and include/linux/hmm.h
890 */
891 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
892 unsigned long addr, pte_t ptent, union mc_target *target)
893 {
894 struct page *page = NULL;
895 struct folio *folio;
896 enum mc_target_type ret = MC_TARGET_NONE;
897 swp_entry_t ent = { .val = 0 };
898
899 if (pte_present(ptent))
900 page = mc_handle_present_pte(vma, addr, ptent);
901 else if (pte_none_mostly(ptent))
902 /*
903 * PTE markers should be treated as a none pte here, separated
904 * from other swap handling below.
905 */
906 page = mc_handle_file_pte(vma, addr, ptent);
907 else if (is_swap_pte(ptent))
908 page = mc_handle_swap_pte(vma, ptent, &ent);
909
910 if (page)
911 folio = page_folio(page);
912 if (target && page) {
913 if (!folio_trylock(folio)) {
914 folio_put(folio);
915 return ret;
916 }
917 /*
918 * page_mapped() must be stable during the move. This
919 * pte is locked, so if it's present, the page cannot
920 * become unmapped. If it isn't, we have only partial
921 * control over the mapped state: the page lock will
922 * prevent new faults against pagecache and swapcache,
923 * so an unmapped page cannot become mapped. However,
924 * if the page is already mapped elsewhere, it can
925 * unmap, and there is nothing we can do about it.
926 * Alas, skip moving the page in this case.
927 */
928 if (!pte_present(ptent) && page_mapped(page)) {
929 folio_unlock(folio);
930 folio_put(folio);
931 return ret;
932 }
933 }
934
935 if (!page && !ent.val)
936 return ret;
937 if (page) {
938 /*
939 * Do only loose check w/o serialization.
940 * mem_cgroup_move_account() checks the page is valid or
941 * not under LRU exclusion.
942 */
943 if (folio_memcg(folio) == mc.from) {
944 ret = MC_TARGET_PAGE;
945 if (folio_is_device_private(folio) ||
946 folio_is_device_coherent(folio))
947 ret = MC_TARGET_DEVICE;
948 if (target)
949 target->folio = folio;
950 }
951 if (!ret || !target) {
952 if (target)
953 folio_unlock(folio);
954 folio_put(folio);
955 }
956 }
957 /*
958 * There is a swap entry and a page doesn't exist or isn't charged.
959 * But we cannot move a tail-page in a THP.
960 */
961 if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
962 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
963 ret = MC_TARGET_SWAP;
964 if (target)
965 target->ent = ent;
966 }
967 return ret;
968 }
969
970 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
971 /*
972 * We don't consider PMD mapped swapping or file mapped pages because THP does
973 * not support them for now.
974 * Caller should make sure that pmd_trans_huge(pmd) is true.
975 */
976 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
977 unsigned long addr, pmd_t pmd, union mc_target *target)
978 {
979 struct page *page = NULL;
980 struct folio *folio;
981 enum mc_target_type ret = MC_TARGET_NONE;
982
983 if (unlikely(is_swap_pmd(pmd))) {
984 VM_BUG_ON(thp_migration_supported() &&
985 !is_pmd_migration_entry(pmd));
986 return ret;
987 }
988 page = pmd_page(pmd);
989 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
990 folio = page_folio(page);
991 if (!(mc.flags & MOVE_ANON))
992 return ret;
993 if (folio_memcg(folio) == mc.from) {
994 ret = MC_TARGET_PAGE;
995 if (target) {
996 folio_get(folio);
997 if (!folio_trylock(folio)) {
998 folio_put(folio);
999 return MC_TARGET_NONE;
1000 }
1001 target->folio = folio;
1002 }
1003 }
1004 return ret;
1005 }
1006 #else
1007 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
1008 unsigned long addr, pmd_t pmd, union mc_target *target)
1009 {
1010 return MC_TARGET_NONE;
1011 }
1012 #endif
1013
1014 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
1015 unsigned long addr, unsigned long end,
1016 struct mm_walk *walk)
1017 {
1018 struct vm_area_struct *vma = walk->vma;
1019 pte_t *pte;
1020 spinlock_t *ptl;
1021
1022 ptl = pmd_trans_huge_lock(pmd, vma);
1023 if (ptl) {
1024 /*
1025 * Note there cannot be MC_TARGET_DEVICE for now as we do not
1026 * support transparent huge pages with MEMORY_DEVICE_PRIVATE but
1027 * this might change.
1028 */
1029 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
1030 mc.precharge += HPAGE_PMD_NR;
1031 spin_unlock(ptl);
1032 return 0;
1033 }
1034
1035 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1036 if (!pte)
1037 return 0;
1038 for (; addr != end; pte++, addr += PAGE_SIZE)
1039 if (get_mctgt_type(vma, addr, ptep_get(pte), NULL))
1040 mc.precharge++; /* increment precharge temporarily */
1041 pte_unmap_unlock(pte - 1, ptl);
1042 cond_resched();
1043
1044 return 0;
1045 }
1046
1047 static const struct mm_walk_ops precharge_walk_ops = {
1048 .pmd_entry = mem_cgroup_count_precharge_pte_range,
1049 .walk_lock = PGWALK_RDLOCK,
1050 };
1051
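/*
 * Walk the whole address space of @mm and count the pages and swap
 * entries that would be moved, then hand the total back to the caller
 * while resetting mc.precharge.
 */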
1052 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
1053 {
1054 unsigned long precharge;
1055
1056 mmap_read_lock(mm);
1057 walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL);
1058 mmap_read_unlock(mm);
1059
1060 precharge = mc.precharge;
1061 mc.precharge = 0;
1062
1063 return precharge;
1064 }
1065
1066 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
1067 {
1068 unsigned long precharge = mem_cgroup_count_precharge(mm);
1069
1070 VM_BUG_ON(mc.moving_task);
1071 mc.moving_task = current;
1072 return mem_cgroup_do_precharge(precharge);
1073 }
1074
1075 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
1076 static void __mem_cgroup_clear_mc(void)
1077 {
1078 struct mem_cgroup *from = mc.from;
1079 struct mem_cgroup *to = mc.to;
1080
1081 /* we must uncharge all the leftover precharges from mc.to */
1082 if (mc.precharge) {
1083 mem_cgroup_cancel_charge(mc.to, mc.precharge);
1084 mc.precharge = 0;
1085 }
1086 /*
1087 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
1088 * we must uncharge here.
1089 */
1090 if (mc.moved_charge) {
1091 mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
1092 mc.moved_charge = 0;
1093 }
1094 /* we must fixup refcnts and charges */
1095 if (mc.moved_swap) {
1096 /* uncharge swap account from the old cgroup */
1097 if (!mem_cgroup_is_root(mc.from))
1098 page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
1099
1100 mem_cgroup_id_put_many(mc.from, mc.moved_swap);
1101
1102 /*
1103 * we charged both to->memory and to->memsw, so we
1104 * should uncharge to->memory.
1105 */
1106 if (!mem_cgroup_is_root(mc.to))
1107 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
1108
1109 mc.moved_swap = 0;
1110 }
1111 memcg1_oom_recover(from);
1112 memcg1_oom_recover(to);
1113 wake_up_all(&mc.waitq);
1114 }
1115
1116 static void mem_cgroup_clear_mc(void)
1117 {
1118 struct mm_struct *mm = mc.mm;
1119
1120 /*
1121 * we must clear moving_task before waking up waiters at the end of
1122 * task migration.
1123 */
1124 mc.moving_task = NULL;
1125 __mem_cgroup_clear_mc();
1126 spin_lock(&mc.lock);
1127 mc.from = NULL;
1128 mc.to = NULL;
1129 mc.mm = NULL;
1130 spin_unlock(&mc.lock);
1131
1132 mmput(mm);
1133 }
1134
1135 int memcg1_can_attach(struct cgroup_taskset *tset)
1136 {
1137 struct cgroup_subsys_state *css;
1138 struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
1139 struct mem_cgroup *from;
1140 struct task_struct *leader, *p;
1141 struct mm_struct *mm;
1142 unsigned long move_flags;
1143 int ret = 0;
1144
1145 /* charge immigration isn't supported on the default hierarchy */
1146 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1147 return 0;
1148
1149 /*
1150 * Multi-process migrations only happen on the default hierarchy
1151 * where charge immigration is not used. Perform charge
1152 * immigration if @tset contains a leader and whine if there are
1153 * multiple.
1154 */
1155 p = NULL;
1156 cgroup_taskset_for_each_leader(leader, css, tset) {
1157 WARN_ON_ONCE(p);
1158 p = leader;
1159 memcg = mem_cgroup_from_css(css);
1160 }
1161 if (!p)
1162 return 0;
1163
1164 /*
1165 * We are now committed to this value whatever it is. Changes in this
1166 * tunable will only affect upcoming migrations, not the current one.
1167 * So we need to save it, and keep it going.
1168 */
1169 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
1170 if (!move_flags)
1171 return 0;
1172
1173 from = mem_cgroup_from_task(p);
1174
1175 VM_BUG_ON(from == memcg);
1176
1177 mm = get_task_mm(p);
1178 if (!mm)
1179 return 0;
1180 /* We move charges only when we move an owner of the mm */
1181 if (mm->owner == p) {
1182 VM_BUG_ON(mc.from);
1183 VM_BUG_ON(mc.to);
1184 VM_BUG_ON(mc.precharge);
1185 VM_BUG_ON(mc.moved_charge);
1186 VM_BUG_ON(mc.moved_swap);
1187
1188 spin_lock(&mc.lock);
1189 mc.mm = mm;
1190 mc.from = from;
1191 mc.to = memcg;
1192 mc.flags = move_flags;
1193 spin_unlock(&mc.lock);
1194 /* We set mc.moving_task later */
1195
1196 ret = mem_cgroup_precharge_mc(mm);
1197 if (ret)
1198 mem_cgroup_clear_mc();
1199 } else {
1200 mmput(mm);
1201 }
1202 return ret;
1203 }
1204
1205 void memcg1_cancel_attach(struct cgroup_taskset *tset)
1206 {
1207 if (mc.to)
1208 mem_cgroup_clear_mc();
1209 }
1210
1211 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
1212 unsigned long addr, unsigned long end,
1213 struct mm_walk *walk)
1214 {
1215 int ret = 0;
1216 struct vm_area_struct *vma = walk->vma;
1217 pte_t *pte;
1218 spinlock_t *ptl;
1219 enum mc_target_type target_type;
1220 union mc_target target;
1221 struct folio *folio;
1222 bool tried_split_before = false;
1223
1224 retry_pmd:
1225 ptl = pmd_trans_huge_lock(pmd, vma);
1226 if (ptl) {
1227 if (mc.precharge < HPAGE_PMD_NR) {
1228 spin_unlock(ptl);
1229 return 0;
1230 }
1231 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
1232 if (target_type == MC_TARGET_PAGE) {
1233 folio = target.folio;
1234 /*
1235 * Deferred split queue locking depends on memcg,
1236 * and unqueue is unsafe unless folio refcount is 0:
1237 * split or skip if on the queue? first try to split.
1238 */
1239 if (!list_empty(&folio->_deferred_list)) {
1240 spin_unlock(ptl);
1241 if (!tried_split_before)
1242 split_folio(folio);
1243 folio_unlock(folio);
1244 folio_put(folio);
1245 if (tried_split_before)
1246 return 0;
1247 tried_split_before = true;
1248 goto retry_pmd;
1249 }
1250 /*
1251 * So long as that pmd lock is held, the folio cannot
1252 * be racily added to the _deferred_list, because
1253 * __folio_remove_rmap() will find !partially_mapped.
1254 */
1255 if (folio_isolate_lru(folio)) {
1256 if (!mem_cgroup_move_account(folio, true,
1257 mc.from, mc.to)) {
1258 mc.precharge -= HPAGE_PMD_NR;
1259 mc.moved_charge += HPAGE_PMD_NR;
1260 }
1261 folio_putback_lru(folio);
1262 }
1263 folio_unlock(folio);
1264 folio_put(folio);
1265 } else if (target_type == MC_TARGET_DEVICE) {
1266 folio = target.folio;
1267 if (!mem_cgroup_move_account(folio, true,
1268 mc.from, mc.to)) {
1269 mc.precharge -= HPAGE_PMD_NR;
1270 mc.moved_charge += HPAGE_PMD_NR;
1271 }
1272 folio_unlock(folio);
1273 folio_put(folio);
1274 }
1275 spin_unlock(ptl);
1276 return 0;
1277 }
1278
1279 retry:
1280 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
1281 if (!pte)
1282 return 0;
1283 for (; addr != end; addr += PAGE_SIZE) {
1284 pte_t ptent = ptep_get(pte++);
1285 bool device = false;
1286 swp_entry_t ent;
1287
1288 if (!mc.precharge)
1289 break;
1290
1291 switch (get_mctgt_type(vma, addr, ptent, &target)) {
1292 case MC_TARGET_DEVICE:
1293 device = true;
1294 fallthrough;
1295 case MC_TARGET_PAGE:
1296 folio = target.folio;
1297 /*
1298 * We can have a part of the split pmd here. Moving it
1299 * can be done but it would be too convoluted so simply
1300 * ignore such a partial THP and keep it in original
1301 * memcg. There should be somebody mapping the head.
1302 */
1303 if (folio_test_large(folio))
1304 goto put;
1305 if (!device && !folio_isolate_lru(folio))
1306 goto put;
1307 if (!mem_cgroup_move_account(folio, false,
1308 mc.from, mc.to)) {
1309 mc.precharge--;
1310 /* we uncharge from mc.from later. */
1311 mc.moved_charge++;
1312 }
1313 if (!device)
1314 folio_putback_lru(folio);
1315 put: /* get_mctgt_type() gets & locks the page */
1316 folio_unlock(folio);
1317 folio_put(folio);
1318 break;
1319 case MC_TARGET_SWAP:
1320 ent = target.ent;
1321 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
1322 mc.precharge--;
1323 mem_cgroup_id_get_many(mc.to, 1);
1324 /* we fixup other refcnts and charges later. */
1325 mc.moved_swap++;
1326 }
1327 break;
1328 default:
1329 break;
1330 }
1331 }
1332 pte_unmap_unlock(pte - 1, ptl);
1333 cond_resched();
1334
1335 if (addr != end) {
1336 /*
1337 * We have consumed all precharges we got in can_attach().
1338 * We try to charge one by one, but don't do any additional
1339 * charges to mc.to if we have failed to charge once in the attach()
1340 * phase.
1341 */
1342 ret = mem_cgroup_do_precharge(1);
1343 if (!ret)
1344 goto retry;
1345 }
1346
1347 return ret;
1348 }
1349
1350 static const struct mm_walk_ops charge_walk_ops = {
1351 .pmd_entry = mem_cgroup_move_charge_pte_range,
1352 .walk_lock = PGWALK_RDLOCK,
1353 };
1354
1355 static void mem_cgroup_move_charge(void)
1356 {
1357 lru_add_drain_all();
1358 /*
1359 * Signal folio_memcg_lock() to take the memcg's move_lock
1360 * while we're moving its pages to another memcg. Then wait
1361 * for already started RCU-only updates to finish.
1362 */
1363 atomic_inc(&mc.from->moving_account);
1364 synchronize_rcu();
1365 retry:
1366 if (unlikely(!mmap_read_trylock(mc.mm))) {
1367 /*
1368 * Someone who is holding the mmap_lock might be waiting in
1369 * waitq. So we cancel all extra charges, wake up all waiters,
1370 * and retry. Because we cancel precharges, we might not be able
1371 * to move enough charges, but moving charge is a best-effort
1372 * feature anyway, so it wouldn't be a big problem.
1373 */
1374 __mem_cgroup_clear_mc();
1375 cond_resched();
1376 goto retry;
1377 }
1378 /*
1379 * When we have consumed all precharges and failed in doing
1380 * additional charge, the page walk just aborts.
1381 */
1382 walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL);
1383 mmap_read_unlock(mc.mm);
1384 atomic_dec(&mc.from->moving_account);
1385 }
1386
1387 void memcg1_move_task(void)
1388 {
1389 if (mc.to) {
1390 mem_cgroup_move_charge();
1391 mem_cgroup_clear_mc();
1392 }
1393 }
1394
1395 #else /* !CONFIG_MMU */
1396 int memcg1_can_attach(struct cgroup_taskset *tset)
1397 {
1398 return 0;
1399 }
1400 void memcg1_cancel_attach(struct cgroup_taskset *tset)
1401 {
1402 }
1403 void memcg1_move_task(void)
1404 {
1405 }
1406 #endif
1407
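/*
 * Signal every eventfd whose threshold was crossed since the last check,
 * in either direction, and update current_threshold to match the new
 * usage. @swap selects the memsw thresholds instead of the memory ones.
 */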
1408 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
1409 {
1410 struct mem_cgroup_threshold_ary *t;
1411 unsigned long usage;
1412 int i;
1413
1414 rcu_read_lock();
1415 if (!swap)
1416 t = rcu_dereference(memcg->thresholds.primary);
1417 else
1418 t = rcu_dereference(memcg->memsw_thresholds.primary);
1419
1420 if (!t)
1421 goto unlock;
1422
1423 usage = mem_cgroup_usage(memcg, swap);
1424
1425 /*
1426 * current_threshold points to threshold just below or equal to usage.
1427 * If it's not true, a threshold was crossed after last
1428 * call of __mem_cgroup_threshold().
1429 */
1430 i = t->current_threshold;
1431
1432 /*
1433 * Iterate backward over array of thresholds starting from
1434 * current_threshold and check if a threshold is crossed.
1435 * If none of thresholds below usage is crossed, we read
1436 * only one element of the array here.
1437 */
1438 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
1439 eventfd_signal(t->entries[i].eventfd);
1440
1441 /* i = current_threshold + 1 */
1442 i++;
1443
1444 /*
1445 * Iterate forward over array of thresholds starting from
1446 * current_threshold+1 and check if a threshold is crossed.
1447 * If none of thresholds above usage is crossed, we read
1448 * only one element of the array here.
1449 */
1450 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
1451 eventfd_signal(t->entries[i].eventfd);
1452
1453 /* Update current_threshold */
1454 t->current_threshold = i - 1;
1455 unlock:
1456 rcu_read_unlock();
1457 }
1458
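/* Check thresholds for @memcg and every ancestor up to the root. */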
1459 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
1460 {
1461 while (memcg) {
1462 __mem_cgroup_threshold(memcg, false);
1463 if (do_memsw_account())
1464 __mem_cgroup_threshold(memcg, true);
1465
1466 memcg = parent_mem_cgroup(memcg);
1467 }
1468 }
1469
1470 /* Cgroup1: threshold notifications & softlimit tree updates */
1471 struct memcg1_events_percpu {
1472 unsigned long nr_page_events;
1473 unsigned long targets[MEM_CGROUP_NTARGETS];
1474 };
1475
1476 static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
1477 {
1478 /* pagein of a big page is an event. So, ignore page size */
1479 if (nr_pages > 0)
1480 __count_memcg_events(memcg, PGPGIN, 1);
1481 else {
1482 __count_memcg_events(memcg, PGPGOUT, 1);
1483 nr_pages = -nr_pages; /* for event */
1484 }
1485
1486 __this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages);
1487 }
1488
1489 #define THRESHOLDS_EVENTS_TARGET 128
1490 #define SOFTLIMIT_EVENTS_TARGET 1024
1491
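/*
 * Returns true roughly once per THRESHOLDS_EVENTS_TARGET (or
 * SOFTLIMIT_EVENTS_TARGET) page events on this CPU, using the same
 * wrap-safe comparison as time_after().
 */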
1492 static bool memcg1_event_ratelimit(struct mem_cgroup *memcg,
1493 enum mem_cgroup_events_target target)
1494 {
1495 unsigned long val, next;
1496
1497 val = __this_cpu_read(memcg->events_percpu->nr_page_events);
1498 next = __this_cpu_read(memcg->events_percpu->targets[target]);
1499 /* from time_after() in jiffies.h */
1500 if ((long)(next - val) < 0) {
1501 switch (target) {
1502 case MEM_CGROUP_TARGET_THRESH:
1503 next = val + THRESHOLDS_EVENTS_TARGET;
1504 break;
1505 case MEM_CGROUP_TARGET_SOFTLIMIT:
1506 next = val + SOFTLIMIT_EVENTS_TARGET;
1507 break;
1508 default:
1509 break;
1510 }
1511 __this_cpu_write(memcg->events_percpu->targets[target], next);
1512 return true;
1513 }
1514 return false;
1515 }
1516
1517 /*
1518 * Check events in order.
1519 *
1520 */
1521 static void memcg1_check_events(struct mem_cgroup *memcg, int nid)
1522 {
1523 if (IS_ENABLED(CONFIG_PREEMPT_RT))
1524 return;
1525
1526 /* threshold event is triggered in finer grain than soft limit */
1527 if (unlikely(memcg1_event_ratelimit(memcg,
1528 MEM_CGROUP_TARGET_THRESH))) {
1529 bool do_softlimit;
1530
1531 do_softlimit = memcg1_event_ratelimit(memcg,
1532 MEM_CGROUP_TARGET_SOFTLIMIT);
1533 mem_cgroup_threshold(memcg);
1534 if (unlikely(do_softlimit))
1535 memcg1_update_tree(memcg, nid);
1536 }
1537 }
1538
1539 void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
1540 {
1541 unsigned long flags;
1542
1543 local_irq_save(flags);
1544 memcg1_charge_statistics(memcg, folio_nr_pages(folio));
1545 memcg1_check_events(memcg, folio_nid(folio));
1546 local_irq_restore(flags);
1547 }
1548
1549 void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg)
1550 {
1551 /*
1552 * Interrupts should be disabled here because the caller holds the
1553 * i_pages lock which is taken with interrupts-off. It is
1554 * important here to have the interrupts disabled because it is the
1555 * only synchronisation we have for updating the per-CPU variables.
1556 */
1557 preempt_disable_nested();
1558 VM_WARN_ON_IRQS_ENABLED();
1559 memcg1_charge_statistics(memcg, -folio_nr_pages(folio));
1560 preempt_enable_nested();
1561 memcg1_check_events(memcg, folio_nid(folio));
1562 }
1563
1564 void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
1565 unsigned long nr_memory, int nid)
1566 {
1567 unsigned long flags;
1568
1569 local_irq_save(flags);
1570 __count_memcg_events(memcg, PGPGOUT, pgpgout);
1571 __this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory);
1572 memcg1_check_events(memcg, nid);
1573 local_irq_restore(flags);
1574 }
1575
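/* sort() comparator: order thresholds ascending by value. */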
1576 static int compare_thresholds(const void *a, const void *b)
1577 {
1578 const struct mem_cgroup_threshold *_a = a;
1579 const struct mem_cgroup_threshold *_b = b;
1580
1581 if (_a->threshold > _b->threshold)
1582 return 1;
1583
1584 if (_a->threshold < _b->threshold)
1585 return -1;
1586
1587 return 0;
1588 }
1589
1590 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
1591 {
1592 struct mem_cgroup_eventfd_list *ev;
1593
1594 spin_lock(&memcg_oom_lock);
1595
1596 list_for_each_entry(ev, &memcg->oom_notify, list)
1597 eventfd_signal(ev->eventfd);
1598
1599 spin_unlock(&memcg_oom_lock);
1600 return 0;
1601 }
1602
1603 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
1604 {
1605 struct mem_cgroup *iter;
1606
1607 for_each_mem_cgroup_tree(iter, memcg)
1608 mem_cgroup_oom_notify_cb(iter);
1609 }
1610
1611 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
1612 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
1613 {
1614 struct mem_cgroup_thresholds *thresholds;
1615 struct mem_cgroup_threshold_ary *new;
1616 unsigned long threshold;
1617 unsigned long usage;
1618 int i, size, ret;
1619
1620 ret = page_counter_memparse(args, "-1", &threshold);
1621 if (ret)
1622 return ret;
1623
1624 mutex_lock(&memcg->thresholds_lock);
1625
1626 if (type == _MEM) {
1627 thresholds = &memcg->thresholds;
1628 usage = mem_cgroup_usage(memcg, false);
1629 } else if (type == _MEMSWAP) {
1630 thresholds = &memcg->memsw_thresholds;
1631 usage = mem_cgroup_usage(memcg, true);
1632 } else
1633 BUG();
1634
1635 /* Check if a threshold crossed before adding a new one */
1636 if (thresholds->primary)
1637 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
1638
1639 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
1640
1641 /* Allocate memory for new array of thresholds */
1642 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
1643 if (!new) {
1644 ret = -ENOMEM;
1645 goto unlock;
1646 }
1647 new->size = size;
1648
1649 /* Copy thresholds (if any) to new array */
1650 if (thresholds->primary)
1651 memcpy(new->entries, thresholds->primary->entries,
1652 flex_array_size(new, entries, size - 1));
1653
1654 /* Add new threshold */
1655 new->entries[size - 1].eventfd = eventfd;
1656 new->entries[size - 1].threshold = threshold;
1657
1658 /* Sort thresholds. Registering of new threshold isn't time-critical */
1659 sort(new->entries, size, sizeof(*new->entries),
1660 compare_thresholds, NULL);
1661
1662 /* Find current threshold */
1663 new->current_threshold = -1;
1664 for (i = 0; i < size; i++) {
1665 if (new->entries[i].threshold <= usage) {
1666 /*
1667 * new->current_threshold will not be used until
1668 * rcu_assign_pointer(), so it's safe to increment
1669 * it here.
1670 */
1671 ++new->current_threshold;
1672 } else
1673 break;
1674 }
1675
1676 /* Free old spare buffer and save old primary buffer as spare */
1677 kfree(thresholds->spare);
1678 thresholds->spare = thresholds->primary;
1679
1680 rcu_assign_pointer(thresholds->primary, new);
1681
1682 /* To be sure that nobody uses thresholds */
1683 synchronize_rcu();
1684
1685 unlock:
1686 mutex_unlock(&memcg->thresholds_lock);
1687
1688 return ret;
1689 }
1690
1691 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
1692 struct eventfd_ctx *eventfd, const char *args)
1693 {
1694 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
1695 }
1696
1697 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
1698 struct eventfd_ctx *eventfd, const char *args)
1699 {
1700 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
1701 }
1702
1703 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
1704 struct eventfd_ctx *eventfd, enum res_type type)
1705 {
1706 struct mem_cgroup_thresholds *thresholds;
1707 struct mem_cgroup_threshold_ary *new;
1708 unsigned long usage;
1709 int i, j, size, entries;
1710
1711 mutex_lock(&memcg->thresholds_lock);
1712
1713 if (type == _MEM) {
1714 thresholds = &memcg->thresholds;
1715 usage = mem_cgroup_usage(memcg, false);
1716 } else if (type == _MEMSWAP) {
1717 thresholds = &memcg->memsw_thresholds;
1718 usage = mem_cgroup_usage(memcg, true);
1719 } else
1720 BUG();
1721
1722 if (!thresholds->primary)
1723 goto unlock;
1724
1725 /* Check if a threshold crossed before removing */
1726 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
1727
1728 /* Calculate the new number of thresholds */
1729 size = entries = 0;
1730 for (i = 0; i < thresholds->primary->size; i++) {
1731 if (thresholds->primary->entries[i].eventfd != eventfd)
1732 size++;
1733 else
1734 entries++;
1735 }
1736
1737 new = thresholds->spare;
1738
1739 /* If no items related to eventfd have been cleared, nothing to do */
1740 if (!entries)
1741 goto unlock;
1742
1743 /* Set thresholds array to NULL if we don't have thresholds */
1744 if (!size) {
1745 kfree(new);
1746 new = NULL;
1747 goto swap_buffers;
1748 }
1749
1750 new->size = size;
1751
1752 /* Copy thresholds and find current threshold */
1753 new->current_threshold = -1;
1754 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
1755 if (thresholds->primary->entries[i].eventfd == eventfd)
1756 continue;
1757
1758 new->entries[j] = thresholds->primary->entries[i];
1759 if (new->entries[j].threshold <= usage) {
1760 /*
1761 * new->current_threshold will not be used
1762 * until rcu_assign_pointer(), so it's safe to increment
1763 * it here.
1764 */
1765 ++new->current_threshold;
1766 }
1767 j++;
1768 }
1769
1770 swap_buffers:
1771 /* Swap primary and spare array */
1772 thresholds->spare = thresholds->primary;
1773
1774 rcu_assign_pointer(thresholds->primary, new);
1775
1776 /* To be sure that nobody uses thresholds */
1777 synchronize_rcu();
1778
1779 /* If all events are unregistered, free the spare array */
1780 if (!new) {
1781 kfree(thresholds->spare);
1782 thresholds->spare = NULL;
1783 }
1784 unlock:
1785 mutex_unlock(&memcg->thresholds_lock);
1786 }
1787
1788 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
1789 struct eventfd_ctx *eventfd)
1790 {
1791 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
1792 }
1793
1794 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
1795 struct eventfd_ctx *eventfd)
1796 {
1797 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
1798 }
1799
1800 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
1801 struct eventfd_ctx *eventfd, const char *args)
1802 {
1803 struct mem_cgroup_eventfd_list *event;
1804
1805 event = kmalloc(sizeof(*event), GFP_KERNEL);
1806 if (!event)
1807 return -ENOMEM;
1808
1809 spin_lock(&memcg_oom_lock);
1810
1811 event->eventfd = eventfd;
1812 list_add(&event->list, &memcg->oom_notify);
1813
1814 /* already in OOM ? */
1815 if (memcg->under_oom)
1816 eventfd_signal(eventfd);
1817 spin_unlock(&memcg_oom_lock);
1818
1819 return 0;
1820 }
1821
1822 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
1823 struct eventfd_ctx *eventfd)
1824 {
1825 struct mem_cgroup_eventfd_list *ev, *tmp;
1826
1827 spin_lock(&memcg_oom_lock);
1828
1829 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
1830 if (ev->eventfd == eventfd) {
1831 list_del(&ev->list);
1832 kfree(ev);
1833 }
1834 }
1835
1836 spin_unlock(&memcg_oom_lock);
1837 }
1838
1839 /*
1840 * DO NOT USE IN NEW FILES.
1841 *
1842 * "cgroup.event_control" implementation.
1843 *
1844 * This is way over-engineered. It tries to support fully configurable
1845 * events for each user. Such level of flexibility is completely
1846 * unnecessary especially in the light of the planned unified hierarchy.
1847 *
1848 * Please deprecate this and replace with something simpler if at all
1849 * possible.
1850 */
1851
1852 /*
1853 * Unregister event and free resources.
1854 *
1855 * Gets called from workqueue.
1856 */
1857 static void memcg_event_remove(struct work_struct *work)
1858 {
1859 struct mem_cgroup_event *event =
1860 container_of(work, struct mem_cgroup_event, remove);
1861 struct mem_cgroup *memcg = event->memcg;
1862
1863 remove_wait_queue(event->wqh, &event->wait);
1864
1865 event->unregister_event(memcg, event->eventfd);
1866
1867 /* Notify userspace the event is going away. */
1868 eventfd_signal(event->eventfd);
1869
1870 eventfd_ctx_put(event->eventfd);
1871 kfree(event);
1872 css_put(&memcg->css);
1873 }
1874
1875 /*
1876 * Gets called on EPOLLHUP on eventfd when user closes it.
1877 *
1878 * Called with wqh->lock held and interrupts disabled.
1879 */
1880 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
1881 int sync, void *key)
1882 {
1883 struct mem_cgroup_event *event =
1884 container_of(wait, struct mem_cgroup_event, wait);
1885 struct mem_cgroup *memcg = event->memcg;
1886 __poll_t flags = key_to_poll(key);
1887
1888 if (flags & EPOLLHUP) {
1889 /*
1890 * If the event has been detached at cgroup removal, we
1891 * can simply return knowing the other side will cleanup
1892 * for us.
1893 *
1894 * We can't race against event freeing since the other
1895 * side will require wqh->lock via remove_wait_queue(),
1896 * which we hold.
1897 */
1898 spin_lock(&memcg->event_list_lock);
1899 if (!list_empty(&event->list)) {
1900 list_del_init(&event->list);
1901 /*
1902 * We are in atomic context, but cgroup_event_remove()
1903 * may sleep, so we have to call it in workqueue.
1904 */
1905 schedule_work(&event->remove);
1906 }
1907 spin_unlock(&memcg->event_list_lock);
1908 }
1909
1910 return 0;
1911 }
1912
1913 static void memcg_event_ptable_queue_proc(struct file *file,
1914 wait_queue_head_t *wqh, poll_table *pt)
1915 {
1916 struct mem_cgroup_event *event =
1917 container_of(pt, struct mem_cgroup_event, pt);
1918
1919 event->wqh = wqh;
1920 add_wait_queue(wqh, &event->wait);
1921 }
1922
1923 /*
1924 * DO NOT USE IN NEW FILES.
1925 *
1926 * Parse input and register new cgroup event handler.
1927 *
1928 * Input must be in format '<event_fd> <control_fd> <args>'.
1929 * Interpretation of args is defined by control file implementation.
1930 */
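/*
 * Illustrative sketch only (not part of this file): assuming a cgroup1
 * memory controller mounted at /sys/fs/cgroup/memory and a hypothetical
 * group "mygrp", a userspace threshold registration that ends up in
 * memcg_write_event_control() could look roughly like:
 *
 *	int efd = eventfd(0, 0);
 *	int cfd = open("/sys/fs/cgroup/memory/mygrp/memory.usage_in_bytes",
 *		       O_RDONLY);
 *	int ctl = open("/sys/fs/cgroup/memory/mygrp/cgroup.event_control",
 *		       O_WRONLY);
 *	uint64_t cnt;
 *
 *	dprintf(ctl, "%d %d 536870912", efd, cfd);	// 512M threshold
 *	read(efd, &cnt, sizeof(cnt));			// blocks until crossed
 *
 * The path and threshold value above are made up for the example.
 */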
1931 static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
1932 char *buf, size_t nbytes, loff_t off)
1933 {
1934 struct cgroup_subsys_state *css = of_css(of);
1935 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
1936 struct mem_cgroup_event *event;
1937 struct cgroup_subsys_state *cfile_css;
1938 unsigned int efd, cfd;
1939 struct fd efile;
1940 struct fd cfile;
1941 struct dentry *cdentry;
1942 const char *name;
1943 char *endp;
1944 int ret;
1945
1946 if (IS_ENABLED(CONFIG_PREEMPT_RT))
1947 return -EOPNOTSUPP;
1948
1949 buf = strstrip(buf);
1950
1951 efd = simple_strtoul(buf, &endp, 10);
1952 if (*endp != ' ')
1953 return -EINVAL;
1954 buf = endp + 1;
1955
1956 cfd = simple_strtoul(buf, &endp, 10);
1957 if (*endp == '\0')
1958 buf = endp;
1959 else if (*endp == ' ')
1960 buf = endp + 1;
1961 else
1962 return -EINVAL;
1963
1964 event = kzalloc(sizeof(*event), GFP_KERNEL);
1965 if (!event)
1966 return -ENOMEM;
1967
1968 event->memcg = memcg;
1969 INIT_LIST_HEAD(&event->list);
1970 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
1971 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
1972 INIT_WORK(&event->remove, memcg_event_remove);
1973
1974 efile = fdget(efd);
1975 if (!fd_file(efile)) {
1976 ret = -EBADF;
1977 goto out_kfree;
1978 }
1979
1980 event->eventfd = eventfd_ctx_fileget(fd_file(efile));
1981 if (IS_ERR(event->eventfd)) {
1982 ret = PTR_ERR(event->eventfd);
1983 goto out_put_efile;
1984 }
1985
1986 cfile = fdget(cfd);
1987 if (!fd_file(cfile)) {
1988 ret = -EBADF;
1989 goto out_put_eventfd;
1990 }
1991
1992 	/* the process needs read permission on the control file */
1993 /* AV: shouldn't we check that it's been opened for read instead? */
1994 ret = file_permission(fd_file(cfile), MAY_READ);
1995 if (ret < 0)
1996 goto out_put_cfile;
1997
1998 /*
1999 * The control file must be a regular cgroup1 file. As a regular cgroup
2000 * file can't be renamed, it's safe to access its name afterwards.
2001 */
2002 cdentry = fd_file(cfile)->f_path.dentry;
2003 if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
2004 ret = -EINVAL;
2005 goto out_put_cfile;
2006 }
2007
2008 /*
2009 * Determine the event callbacks and set them in @event. This used
2010 * to be done via struct cftype but cgroup core no longer knows
2011 * about these events. The following is crude but the whole thing
2012 * is for compatibility anyway.
2013 *
2014 * DO NOT ADD NEW FILES.
2015 */
2016 name = cdentry->d_name.name;
2017
2018 if (!strcmp(name, "memory.usage_in_bytes")) {
2019 event->register_event = mem_cgroup_usage_register_event;
2020 event->unregister_event = mem_cgroup_usage_unregister_event;
2021 } else if (!strcmp(name, "memory.oom_control")) {
2022 pr_warn_once("oom_control is deprecated and will be removed. "
2023 "Please report your usecase to linux-mm-@kvack.org"
2024 " if you depend on this functionality. \n");
2025 event->register_event = mem_cgroup_oom_register_event;
2026 event->unregister_event = mem_cgroup_oom_unregister_event;
2027 } else if (!strcmp(name, "memory.pressure_level")) {
2028 pr_warn_once("pressure_level is deprecated and will be removed. "
2029 "Please report your usecase to linux-mm-@kvack.org "
2030 "if you depend on this functionality. \n");
2031 event->register_event = vmpressure_register_event;
2032 event->unregister_event = vmpressure_unregister_event;
2033 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
2034 event->register_event = memsw_cgroup_usage_register_event;
2035 event->unregister_event = memsw_cgroup_usage_unregister_event;
2036 } else {
2037 ret = -EINVAL;
2038 goto out_put_cfile;
2039 }
2040
2041 /*
2042 	 * Verify that @cfile belongs to @css. Also, remaining events are
2043 * automatically removed on cgroup destruction but the removal is
2044 * asynchronous, so take an extra ref on @css.
2045 */
2046 cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
2047 &memory_cgrp_subsys);
2048 ret = -EINVAL;
2049 if (IS_ERR(cfile_css))
2050 goto out_put_cfile;
2051 if (cfile_css != css) {
2052 css_put(cfile_css);
2053 goto out_put_cfile;
2054 }
2055
2056 ret = event->register_event(memcg, event->eventfd, buf);
2057 if (ret)
2058 goto out_put_css;
2059
2060 vfs_poll(fd_file(efile), &event->pt);
2061
2062 spin_lock_irq(&memcg->event_list_lock);
2063 list_add(&event->list, &memcg->event_list);
2064 spin_unlock_irq(&memcg->event_list_lock);
2065
2066 fdput(cfile);
2067 fdput(efile);
2068
2069 return nbytes;
2070
2071 out_put_css:
2072 css_put(css);
2073 out_put_cfile:
2074 fdput(cfile);
2075 out_put_eventfd:
2076 eventfd_ctx_put(event->eventfd);
2077 out_put_efile:
2078 fdput(efile);
2079 out_kfree:
2080 kfree(event);
2081
2082 return ret;
2083 }
2084
2085 void memcg1_memcg_init(struct mem_cgroup *memcg)
2086 {
2087 INIT_LIST_HEAD(&memcg->oom_notify);
2088 mutex_init(&memcg->thresholds_lock);
2089 spin_lock_init(&memcg->move_lock);
2090 INIT_LIST_HEAD(&memcg->event_list);
2091 spin_lock_init(&memcg->event_list_lock);
2092 }
2093
2094 void memcg1_css_offline(struct mem_cgroup *memcg)
2095 {
2096 struct mem_cgroup_event *event, *tmp;
2097
2098 /*
2099 * Unregister events and notify userspace.
2100 	 * Notify userspace about cgroup removal only after rmdir of the cgroup
2101 	 * directory, to avoid a race between userspace and kernelspace.
2102 */
2103 spin_lock_irq(&memcg->event_list_lock);
2104 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
2105 list_del_init(&event->list);
2106 schedule_work(&event->remove);
2107 }
2108 spin_unlock_irq(&memcg->event_list_lock);
2109 }
2110
2111 /*
2112  * Check whether the OOM killer is already running under our hierarchy.
2113  * If someone else is already running it, return false.
2114 */
2115 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
2116 {
2117 struct mem_cgroup *iter, *failed = NULL;
2118
2119 spin_lock(&memcg_oom_lock);
2120
2121 for_each_mem_cgroup_tree(iter, memcg) {
2122 if (iter->oom_lock) {
2123 /*
2124 			 * This subtree of our hierarchy is already locked,
2125 			 * so we cannot take the lock for ourselves.
2126 */
2127 failed = iter;
2128 mem_cgroup_iter_break(memcg, iter);
2129 break;
2130 } else
2131 iter->oom_lock = true;
2132 }
2133
2134 if (failed) {
2135 /*
2136 		 * OK, we failed to lock the whole subtree, so we have to
2137 		 * undo what we set so far, up to the failing subtree.
2138 */
2139 for_each_mem_cgroup_tree(iter, memcg) {
2140 if (iter == failed) {
2141 mem_cgroup_iter_break(memcg, iter);
2142 break;
2143 }
2144 iter->oom_lock = false;
2145 }
2146 } else
2147 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
2148
2149 spin_unlock(&memcg_oom_lock);
2150
2151 return !failed;
2152 }
2153
2154 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
2155 {
2156 struct mem_cgroup *iter;
2157
2158 spin_lock(&memcg_oom_lock);
2159 mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
2160 for_each_mem_cgroup_tree(iter, memcg)
2161 iter->oom_lock = false;
2162 spin_unlock(&memcg_oom_lock);
2163 }
2164
2165 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
2166 {
2167 struct mem_cgroup *iter;
2168
2169 spin_lock(&memcg_oom_lock);
2170 for_each_mem_cgroup_tree(iter, memcg)
2171 iter->under_oom++;
2172 spin_unlock(&memcg_oom_lock);
2173 }
2174
2175 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
2176 {
2177 struct mem_cgroup *iter;
2178
2179 /*
2180 * Be careful about under_oom underflows because a child memcg
2181 * could have been added after mem_cgroup_mark_under_oom.
2182 */
2183 spin_lock(&memcg_oom_lock);
2184 for_each_mem_cgroup_tree(iter, memcg)
2185 if (iter->under_oom > 0)
2186 iter->under_oom--;
2187 spin_unlock(&memcg_oom_lock);
2188 }
2189
2190 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
2191
2192 struct oom_wait_info {
2193 struct mem_cgroup *memcg;
2194 wait_queue_entry_t wait;
2195 };
2196
2197 static int memcg_oom_wake_function(wait_queue_entry_t *wait,
2198 unsigned mode, int sync, void *arg)
2199 {
2200 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
2201 struct mem_cgroup *oom_wait_memcg;
2202 struct oom_wait_info *oom_wait_info;
2203
2204 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2205 oom_wait_memcg = oom_wait_info->memcg;
2206
2207 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
2208 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
2209 return 0;
2210 return autoremove_wake_function(wait, mode, sync, arg);
2211 }
2212
2213 void memcg1_oom_recover(struct mem_cgroup *memcg)
2214 {
2215 /*
2216 * For the following lockless ->under_oom test, the only required
2217 * guarantee is that it must see the state asserted by an OOM when
2218 * this function is called as a result of userland actions
2219 * triggered by the notification of the OOM. This is trivially
2220 * achieved by invoking mem_cgroup_mark_under_oom() before
2221 * triggering notification.
2222 */
2223 if (memcg && memcg->under_oom)
2224 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
2225 }
2226
2227 /**
2228 * mem_cgroup_oom_synchronize - complete memcg OOM handling
2229 * @handle: actually kill/wait or just clean up the OOM state
2230 *
2231 * This has to be called at the end of a page fault if the memcg OOM
2232 * handler was enabled.
2233 *
2234 * Memcg supports userspace OOM handling where failed allocations must
2235 * sleep on a waitqueue until the userspace task resolves the
2236 * situation. Sleeping directly in the charge context with all kinds
2237 * of locks held is not a good idea, instead we remember an OOM state
2238 * in the task and mem_cgroup_oom_synchronize() has to be called at
2239 * the end of the page fault to complete the OOM handling.
2240 *
2241 * Returns %true if an ongoing memcg OOM situation was detected and
2242 * completed, %false otherwise.
2243 */
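/*
 * Simplified sketch of the expected call pattern (the authoritative flow
 * lives in the charge path and mm/oom_kill.c): memcg1_oom_prepare() below
 * records the OOM state in current->memcg_in_oom while the charge fails,
 * and the page fault exit path then completes it, roughly:
 *
 *	pagefault_out_of_memory()
 *		if (mem_cgroup_oom_synchronize(true))
 *			return;		// memcg OOM handled, fault is retried
 */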
2244 bool mem_cgroup_oom_synchronize(bool handle)
2245 {
2246 struct mem_cgroup *memcg = current->memcg_in_oom;
2247 struct oom_wait_info owait;
2248 bool locked;
2249
2250 /* OOM is global, do not handle */
2251 if (!memcg)
2252 return false;
2253
2254 if (!handle)
2255 goto cleanup;
2256
2257 owait.memcg = memcg;
2258 owait.wait.flags = 0;
2259 owait.wait.func = memcg_oom_wake_function;
2260 owait.wait.private = current;
2261 INIT_LIST_HEAD(&owait.wait.entry);
2262
2263 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2264 mem_cgroup_mark_under_oom(memcg);
2265
2266 locked = mem_cgroup_oom_trylock(memcg);
2267
2268 if (locked)
2269 mem_cgroup_oom_notify(memcg);
2270
2271 schedule();
2272 mem_cgroup_unmark_under_oom(memcg);
2273 finish_wait(&memcg_oom_waitq, &owait.wait);
2274
2275 if (locked)
2276 mem_cgroup_oom_unlock(memcg);
2277 cleanup:
2278 current->memcg_in_oom = NULL;
2279 css_put(&memcg->css);
2280 return true;
2281 }
2282
2283
2284 bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked)
2285 {
2286 /*
2287 * We are in the middle of the charge context here, so we
2288 * don't want to block when potentially sitting on a callstack
2289 * that holds all kinds of filesystem and mm locks.
2290 *
2291 * cgroup1 allows disabling the OOM killer and waiting for outside
2292 * handling until the charge can succeed; remember the context and put
2293 * the task to sleep at the end of the page fault when all locks are
2294 * released.
2295 *
2296 	 * On the other hand, the in-kernel OOM killer allows for async victim
2297 	 * memory reclaim (oom_reaper), which means that we are not solely
2298 	 * relying on the OOM victim to make forward progress, so we can
2299 	 * invoke the OOM killer here.
2300 *
2301 * Please note that mem_cgroup_out_of_memory might fail to find a
2302 * victim and then we have to bail out from the charge path.
2303 */
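	/*
	 * Illustrative userspace counterpart (paths and values are examples
	 * only): the cgroup1 OOM killer can be disabled and handled from
	 * userspace roughly as follows:
	 *
	 *	echo 1 > /sys/fs/cgroup/memory/mygrp/memory.oom_control
	 *	# register an eventfd on memory.oom_control via
	 *	# cgroup.event_control, then on each notification either raise
	 *	# the limit or kill something, e.g.:
	 *	echo 2G > /sys/fs/cgroup/memory/mygrp/memory.limit_in_bytes
	 *
	 * While oom_kill_disable is set, tasks that hit the limit from a user
	 * fault sleep in mem_cgroup_oom_synchronize() until
	 * memcg1_oom_recover() wakes them.
	 */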
2304 if (READ_ONCE(memcg->oom_kill_disable)) {
2305 if (current->in_user_fault) {
2306 css_get(&memcg->css);
2307 current->memcg_in_oom = memcg;
2308 }
2309 return false;
2310 }
2311
2312 mem_cgroup_mark_under_oom(memcg);
2313
2314 *locked = mem_cgroup_oom_trylock(memcg);
2315
2316 if (*locked)
2317 mem_cgroup_oom_notify(memcg);
2318
2319 mem_cgroup_unmark_under_oom(memcg);
2320
2321 return true;
2322 }
2323
2324 void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked)
2325 {
2326 if (locked)
2327 mem_cgroup_oom_unlock(memcg);
2328 }
2329
2330 static DEFINE_MUTEX(memcg_max_mutex);
2331
2332 static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
2333 unsigned long max, bool memsw)
2334 {
2335 bool enlarge = false;
2336 bool drained = false;
2337 int ret;
2338 bool limits_invariant;
2339 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
2340
2341 do {
2342 if (signal_pending(current)) {
2343 ret = -EINTR;
2344 break;
2345 }
2346
2347 mutex_lock(&memcg_max_mutex);
2348 /*
2349 * Make sure that the new limit (memsw or memory limit) doesn't
2350 * break our basic invariant rule memory.max <= memsw.max.
2351 */
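		/*
		 * Example (values illustrative): with memory.limit_in_bytes
		 * set to 1G, an attempt to set memory.memsw.limit_in_bytes to
		 * 512M fails here with -EINVAL, since memsw.max would drop
		 * below memory.max.
		 */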
2352 limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
2353 max <= memcg->memsw.max;
2354 if (!limits_invariant) {
2355 mutex_unlock(&memcg_max_mutex);
2356 ret = -EINVAL;
2357 break;
2358 }
2359 if (max > counter->max)
2360 enlarge = true;
2361 ret = page_counter_set_max(counter, max);
2362 mutex_unlock(&memcg_max_mutex);
2363
2364 if (!ret)
2365 break;
2366
2367 if (!drained) {
2368 drain_all_stock(memcg);
2369 drained = true;
2370 continue;
2371 }
2372
2373 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
2374 memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
2375 ret = -EBUSY;
2376 break;
2377 }
2378 } while (true);
2379
2380 if (!ret && enlarge)
2381 memcg1_oom_recover(memcg);
2382
2383 return ret;
2384 }
2385
2386 /*
2387 * Reclaims as many pages from the given memcg as possible.
2388 *
2389 * Caller is responsible for holding css reference for memcg.
2390 */
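/*
 * Triggered from userspace through the cgroup1 control file, e.g.
 * (path illustrative):
 *
 *	echo 0 > /sys/fs/cgroup/memory/mygrp/memory.force_empty
 *
 * The written value is ignored by mem_cgroup_force_empty_write(); any
 * successful write starts the reclaim below.
 */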
2391 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
2392 {
2393 int nr_retries = MAX_RECLAIM_RETRIES;
2394
2395 	/* we call try-to-free pages to make this cgroup empty */
2396 lru_add_drain_all();
2397
2398 drain_all_stock(memcg);
2399
2400 /* try to free all pages in this cgroup */
2401 while (nr_retries && page_counter_read(&memcg->memory)) {
2402 if (signal_pending(current))
2403 return -EINTR;
2404
2405 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
2406 MEMCG_RECLAIM_MAY_SWAP, NULL))
2407 nr_retries--;
2408 }
2409
2410 return 0;
2411 }
2412
2413 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
2414 char *buf, size_t nbytes,
2415 loff_t off)
2416 {
2417 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2418
2419 if (mem_cgroup_is_root(memcg))
2420 return -EINVAL;
2421 return mem_cgroup_force_empty(memcg) ?: nbytes;
2422 }
2423
2424 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
2425 struct cftype *cft)
2426 {
2427 return 1;
2428 }
2429
2430 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
2431 struct cftype *cft, u64 val)
2432 {
2433 if (val == 1)
2434 return 0;
2435
2436 pr_warn_once("Non-hierarchical mode is deprecated. "
2437 "Please report your usecase to linux-mm@kvack.org if you "
2438 "depend on this functionality.\n");
2439
2440 return -EINVAL;
2441 }
2442
2443 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
2444 struct cftype *cft)
2445 {
2446 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2447 struct page_counter *counter;
2448
2449 switch (MEMFILE_TYPE(cft->private)) {
2450 case _MEM:
2451 counter = &memcg->memory;
2452 break;
2453 case _MEMSWAP:
2454 counter = &memcg->memsw;
2455 break;
2456 case _KMEM:
2457 counter = &memcg->kmem;
2458 break;
2459 case _TCP:
2460 counter = &memcg->tcpmem;
2461 break;
2462 default:
2463 BUG();
2464 }
2465
2466 switch (MEMFILE_ATTR(cft->private)) {
2467 case RES_USAGE:
2468 if (counter == &memcg->memory)
2469 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
2470 if (counter == &memcg->memsw)
2471 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
2472 return (u64)page_counter_read(counter) * PAGE_SIZE;
2473 case RES_LIMIT:
2474 return (u64)counter->max * PAGE_SIZE;
2475 case RES_MAX_USAGE:
2476 return (u64)counter->watermark * PAGE_SIZE;
2477 case RES_FAILCNT:
2478 return counter->failcnt;
2479 case RES_SOFT_LIMIT:
2480 return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
2481 default:
2482 BUG();
2483 }
2484 }
2485
2486 /*
2487 * This function doesn't do anything useful. Its only job is to provide a read
2488 * handler for a file so that cgroup_file_mode() will add read permissions.
2489 */
2490 static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m,
2491 __always_unused void *v)
2492 {
2493 return -EINVAL;
2494 }
2495
2496 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
2497 {
2498 int ret;
2499
2500 mutex_lock(&memcg_max_mutex);
2501
2502 ret = page_counter_set_max(&memcg->tcpmem, max);
2503 if (ret)
2504 goto out;
2505
2506 if (!memcg->tcpmem_active) {
2507 /*
2508 * The active flag needs to be written after the static_key
2509 * update. This is what guarantees that the socket activation
2510 * function is the last one to run. See mem_cgroup_sk_alloc()
2511 * for details, and note that we don't mark any socket as
2512 * belonging to this memcg until that flag is up.
2513 *
2514 * We need to do this, because static_keys will span multiple
2515 * sites, but we can't control their order. If we mark a socket
2516 * as accounted, but the accounting functions are not patched in
2517 * yet, we'll lose accounting.
2518 *
2519 * We never race with the readers in mem_cgroup_sk_alloc(),
2520 		 * because when this value changes, the code to process it is not
2521 * patched in yet.
2522 */
2523 static_branch_inc(&memcg_sockets_enabled_key);
2524 memcg->tcpmem_active = true;
2525 }
2526 out:
2527 mutex_unlock(&memcg_max_mutex);
2528 return ret;
2529 }
2530
2531 /*
2532  * Write handler for the RES_LIMIT and
2533  * RES_SOFT_LIMIT control files.
2534 */
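/*
 * Accepted input is whatever page_counter_memparse() understands: a byte
 * count with an optional K/M/G suffix, or "-1" for unlimited (mapped to
 * PAGE_COUNTER_MAX). For example (path illustrative):
 *
 *	echo 1G > /sys/fs/cgroup/memory/mygrp/memory.limit_in_bytes
 *	echo -1 > /sys/fs/cgroup/memory/mygrp/memory.limit_in_bytes
 */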
2535 static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
2536 char *buf, size_t nbytes, loff_t off)
2537 {
2538 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2539 unsigned long nr_pages;
2540 int ret;
2541
2542 buf = strstrip(buf);
2543 ret = page_counter_memparse(buf, "-1", &nr_pages);
2544 if (ret)
2545 return ret;
2546
2547 switch (MEMFILE_ATTR(of_cft(of)->private)) {
2548 case RES_LIMIT:
2549 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
2550 ret = -EINVAL;
2551 break;
2552 }
2553 switch (MEMFILE_TYPE(of_cft(of)->private)) {
2554 case _MEM:
2555 ret = mem_cgroup_resize_max(memcg, nr_pages, false);
2556 break;
2557 case _MEMSWAP:
2558 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
2559 break;
2560 case _KMEM:
2561 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
2562 "Writing any value to this file has no effect. "
2563 "Please report your usecase to linux-mm@kvack.org if you "
2564 "depend on this functionality.\n");
2565 ret = 0;
2566 break;
2567 case _TCP:
2568 pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. "
2569 "Please report your usecase to linux-mm@kvack.org if you "
2570 "depend on this functionality.\n");
2571 ret = memcg_update_tcp_max(memcg, nr_pages);
2572 break;
2573 }
2574 break;
2575 case RES_SOFT_LIMIT:
2576 if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
2577 ret = -EOPNOTSUPP;
2578 } else {
2579 pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. "
2580 "Please report your usecase to linux-mm@kvack.org if you "
2581 "depend on this functionality.\n");
2582 WRITE_ONCE(memcg->soft_limit, nr_pages);
2583 ret = 0;
2584 }
2585 break;
2586 }
2587 return ret ?: nbytes;
2588 }
2589
2590 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
2591 size_t nbytes, loff_t off)
2592 {
2593 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2594 struct page_counter *counter;
2595
2596 switch (MEMFILE_TYPE(of_cft(of)->private)) {
2597 case _MEM:
2598 counter = &memcg->memory;
2599 break;
2600 case _MEMSWAP:
2601 counter = &memcg->memsw;
2602 break;
2603 case _KMEM:
2604 counter = &memcg->kmem;
2605 break;
2606 case _TCP:
2607 counter = &memcg->tcpmem;
2608 break;
2609 default:
2610 BUG();
2611 }
2612
2613 switch (MEMFILE_ATTR(of_cft(of)->private)) {
2614 case RES_MAX_USAGE:
2615 page_counter_reset_watermark(counter);
2616 break;
2617 case RES_FAILCNT:
2618 counter->failcnt = 0;
2619 break;
2620 default:
2621 BUG();
2622 }
2623
2624 return nbytes;
2625 }
2626
2627 #ifdef CONFIG_NUMA
2628
2629 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
2630 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
2631 #define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
2632
2633 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
2634 int nid, unsigned int lru_mask, bool tree)
2635 {
2636 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
2637 unsigned long nr = 0;
2638 enum lru_list lru;
2639
2640 VM_BUG_ON((unsigned)nid >= nr_node_ids);
2641
2642 for_each_lru(lru) {
2643 if (!(BIT(lru) & lru_mask))
2644 continue;
2645 if (tree)
2646 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
2647 else
2648 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
2649 }
2650 return nr;
2651 }
2652
2653 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
2654 unsigned int lru_mask,
2655 bool tree)
2656 {
2657 unsigned long nr = 0;
2658 enum lru_list lru;
2659
2660 for_each_lru(lru) {
2661 if (!(BIT(lru) & lru_mask))
2662 continue;
2663 if (tree)
2664 nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
2665 else
2666 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
2667 }
2668 return nr;
2669 }
2670
2671 static int memcg_numa_stat_show(struct seq_file *m, void *v)
2672 {
2673 struct numa_stat {
2674 const char *name;
2675 unsigned int lru_mask;
2676 };
2677
2678 static const struct numa_stat stats[] = {
2679 { "total", LRU_ALL },
2680 { "file", LRU_ALL_FILE },
2681 { "anon", LRU_ALL_ANON },
2682 { "unevictable", BIT(LRU_UNEVICTABLE) },
2683 };
2684 const struct numa_stat *stat;
2685 int nid;
2686 struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
2687
2688 mem_cgroup_flush_stats(memcg);
2689
2690 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
2691 seq_printf(m, "%s=%lu", stat->name,
2692 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
2693 false));
2694 for_each_node_state(nid, N_MEMORY)
2695 seq_printf(m, " N%d=%lu", nid,
2696 mem_cgroup_node_nr_lru_pages(memcg, nid,
2697 stat->lru_mask, false));
2698 seq_putc(m, '\n');
2699 }
2700
2701 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
2702
2703 seq_printf(m, "hierarchical_%s=%lu", stat->name,
2704 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
2705 true));
2706 for_each_node_state(nid, N_MEMORY)
2707 seq_printf(m, " N%d=%lu", nid,
2708 mem_cgroup_node_nr_lru_pages(memcg, nid,
2709 stat->lru_mask, true));
2710 seq_putc(m, '\n');
2711 }
2712
2713 return 0;
2714 }
2715 #endif /* CONFIG_NUMA */
2716
2717 static const unsigned int memcg1_stats[] = {
2718 NR_FILE_PAGES,
2719 NR_ANON_MAPPED,
2720 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2721 NR_ANON_THPS,
2722 #endif
2723 NR_SHMEM,
2724 NR_FILE_MAPPED,
2725 NR_FILE_DIRTY,
2726 NR_WRITEBACK,
2727 WORKINGSET_REFAULT_ANON,
2728 WORKINGSET_REFAULT_FILE,
2729 #ifdef CONFIG_SWAP
2730 MEMCG_SWAP,
2731 NR_SWAPCACHE,
2732 #endif
2733 };
2734
2735 static const char *const memcg1_stat_names[] = {
2736 "cache",
2737 "rss",
2738 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2739 "rss_huge",
2740 #endif
2741 "shmem",
2742 "mapped_file",
2743 "dirty",
2744 "writeback",
2745 "workingset_refault_anon",
2746 "workingset_refault_file",
2747 #ifdef CONFIG_SWAP
2748 "swap",
2749 "swapcached",
2750 #endif
2751 };
2752
2753 /* Universal VM events cgroup1 shows, original sort order */
2754 static const unsigned int memcg1_events[] = {
2755 PGPGIN,
2756 PGPGOUT,
2757 PGFAULT,
2758 PGMAJFAULT,
2759 };
2760
2761 void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
2762 {
2763 unsigned long memory, memsw;
2764 struct mem_cgroup *mi;
2765 unsigned int i;
2766
2767 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
2768
2769 mem_cgroup_flush_stats(memcg);
2770
2771 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
2772 unsigned long nr;
2773
2774 nr = memcg_page_state_local_output(memcg, memcg1_stats[i]);
2775 seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr);
2776 }
2777
2778 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
2779 seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
2780 memcg_events_local(memcg, memcg1_events[i]));
2781
2782 for (i = 0; i < NR_LRU_LISTS; i++)
2783 seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
2784 memcg_page_state_local(memcg, NR_LRU_BASE + i) *
2785 PAGE_SIZE);
2786
2787 /* Hierarchical information */
2788 memory = memsw = PAGE_COUNTER_MAX;
2789 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
2790 memory = min(memory, READ_ONCE(mi->memory.max));
2791 memsw = min(memsw, READ_ONCE(mi->memsw.max));
2792 }
2793 seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
2794 (u64)memory * PAGE_SIZE);
2795 seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
2796 (u64)memsw * PAGE_SIZE);
2797
2798 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
2799 unsigned long nr;
2800
2801 nr = memcg_page_state_output(memcg, memcg1_stats[i]);
2802 seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
2803 (u64)nr);
2804 }
2805
2806 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
2807 seq_buf_printf(s, "total_%s %llu\n",
2808 vm_event_name(memcg1_events[i]),
2809 (u64)memcg_events(memcg, memcg1_events[i]));
2810
2811 for (i = 0; i < NR_LRU_LISTS; i++)
2812 seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
2813 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
2814 PAGE_SIZE);
2815
2816 #ifdef CONFIG_DEBUG_VM
2817 {
2818 pg_data_t *pgdat;
2819 struct mem_cgroup_per_node *mz;
2820 unsigned long anon_cost = 0;
2821 unsigned long file_cost = 0;
2822
2823 for_each_online_pgdat(pgdat) {
2824 mz = memcg->nodeinfo[pgdat->node_id];
2825
2826 anon_cost += mz->lruvec.anon_cost;
2827 file_cost += mz->lruvec.file_cost;
2828 }
2829 seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
2830 seq_buf_printf(s, "file_cost %lu\n", file_cost);
2831 }
2832 #endif
2833 }
2834
2835 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
2836 struct cftype *cft)
2837 {
2838 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2839
2840 return mem_cgroup_swappiness(memcg);
2841 }
2842
2843 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
2844 struct cftype *cft, u64 val)
2845 {
2846 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2847
2848 if (val > MAX_SWAPPINESS)
2849 return -EINVAL;
2850
2851 if (!mem_cgroup_is_root(memcg))
2852 WRITE_ONCE(memcg->swappiness, val);
2853 else
2854 WRITE_ONCE(vm_swappiness, val);
2855
2856 return 0;
2857 }
2858
2859 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
2860 {
2861 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
2862
2863 seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable));
2864 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
2865 seq_printf(sf, "oom_kill %lu\n",
2866 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
2867 return 0;
2868 }
2869
2870 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
2871 struct cftype *cft, u64 val)
2872 {
2873 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2874
2875 pr_warn_once("oom_control is deprecated and will be removed. "
2876 "Please report your usecase to linux-mm-@kvack.org if you "
2877 "depend on this functionality. \n");
2878
2879 	/* cannot be set on the root cgroup; only 0 and 1 are allowed */
2880 if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
2881 return -EINVAL;
2882
2883 WRITE_ONCE(memcg->oom_kill_disable, val);
2884 if (!val)
2885 memcg1_oom_recover(memcg);
2886
2887 return 0;
2888 }
2889
2890 #ifdef CONFIG_SLUB_DEBUG
2891 static int mem_cgroup_slab_show(struct seq_file *m, void *p)
2892 {
2893 /*
2894 * Deprecated.
2895 	 * Please take a look at tools/cgroup/memcg_slabinfo.py.
2896 */
2897 return 0;
2898 }
2899 #endif
2900
2901 struct cftype mem_cgroup_legacy_files[] = {
2902 {
2903 .name = "usage_in_bytes",
2904 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
2905 .read_u64 = mem_cgroup_read_u64,
2906 },
2907 {
2908 .name = "max_usage_in_bytes",
2909 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
2910 .write = mem_cgroup_reset,
2911 .read_u64 = mem_cgroup_read_u64,
2912 },
2913 {
2914 .name = "limit_in_bytes",
2915 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
2916 .write = mem_cgroup_write,
2917 .read_u64 = mem_cgroup_read_u64,
2918 },
2919 {
2920 .name = "soft_limit_in_bytes",
2921 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
2922 .write = mem_cgroup_write,
2923 .read_u64 = mem_cgroup_read_u64,
2924 },
2925 {
2926 .name = "failcnt",
2927 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
2928 .write = mem_cgroup_reset,
2929 .read_u64 = mem_cgroup_read_u64,
2930 },
2931 {
2932 .name = "stat",
2933 .seq_show = memory_stat_show,
2934 },
2935 {
2936 .name = "force_empty",
2937 .write = mem_cgroup_force_empty_write,
2938 },
2939 {
2940 .name = "use_hierarchy",
2941 .write_u64 = mem_cgroup_hierarchy_write,
2942 .read_u64 = mem_cgroup_hierarchy_read,
2943 },
2944 {
2945 .name = "cgroup.event_control", /* XXX: for compat */
2946 .write = memcg_write_event_control,
2947 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
2948 },
2949 {
2950 .name = "swappiness",
2951 .read_u64 = mem_cgroup_swappiness_read,
2952 .write_u64 = mem_cgroup_swappiness_write,
2953 },
2954 {
2955 .name = "move_charge_at_immigrate",
2956 .read_u64 = mem_cgroup_move_charge_read,
2957 .write_u64 = mem_cgroup_move_charge_write,
2958 },
2959 {
2960 .name = "oom_control",
2961 .seq_show = mem_cgroup_oom_control_read,
2962 .write_u64 = mem_cgroup_oom_control_write,
2963 },
2964 {
2965 .name = "pressure_level",
2966 .seq_show = mem_cgroup_dummy_seq_show,
2967 },
2968 #ifdef CONFIG_NUMA
2969 {
2970 .name = "numa_stat",
2971 .seq_show = memcg_numa_stat_show,
2972 },
2973 #endif
2974 {
2975 .name = "kmem.limit_in_bytes",
2976 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
2977 .write = mem_cgroup_write,
2978 .read_u64 = mem_cgroup_read_u64,
2979 },
2980 {
2981 .name = "kmem.usage_in_bytes",
2982 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
2983 .read_u64 = mem_cgroup_read_u64,
2984 },
2985 {
2986 .name = "kmem.failcnt",
2987 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
2988 .write = mem_cgroup_reset,
2989 .read_u64 = mem_cgroup_read_u64,
2990 },
2991 {
2992 .name = "kmem.max_usage_in_bytes",
2993 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
2994 .write = mem_cgroup_reset,
2995 .read_u64 = mem_cgroup_read_u64,
2996 },
2997 #ifdef CONFIG_SLUB_DEBUG
2998 {
2999 .name = "kmem.slabinfo",
3000 .seq_show = mem_cgroup_slab_show,
3001 },
3002 #endif
3003 {
3004 .name = "kmem.tcp.limit_in_bytes",
3005 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
3006 .write = mem_cgroup_write,
3007 .read_u64 = mem_cgroup_read_u64,
3008 },
3009 {
3010 .name = "kmem.tcp.usage_in_bytes",
3011 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
3012 .read_u64 = mem_cgroup_read_u64,
3013 },
3014 {
3015 .name = "kmem.tcp.failcnt",
3016 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
3017 .write = mem_cgroup_reset,
3018 .read_u64 = mem_cgroup_read_u64,
3019 },
3020 {
3021 .name = "kmem.tcp.max_usage_in_bytes",
3022 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
3023 .write = mem_cgroup_reset,
3024 .read_u64 = mem_cgroup_read_u64,
3025 },
3026 { }, /* terminate */
3027 };
3028
3029 struct cftype memsw_files[] = {
3030 {
3031 .name = "memsw.usage_in_bytes",
3032 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
3033 .read_u64 = mem_cgroup_read_u64,
3034 },
3035 {
3036 .name = "memsw.max_usage_in_bytes",
3037 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
3038 .write = mem_cgroup_reset,
3039 .read_u64 = mem_cgroup_read_u64,
3040 },
3041 {
3042 .name = "memsw.limit_in_bytes",
3043 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
3044 .write = mem_cgroup_write,
3045 .read_u64 = mem_cgroup_read_u64,
3046 },
3047 {
3048 .name = "memsw.failcnt",
3049 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
3050 .write = mem_cgroup_reset,
3051 .read_u64 = mem_cgroup_read_u64,
3052 },
3053 { }, /* terminate */
3054 };
3055
3056 void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages)
3057 {
3058 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
3059 if (nr_pages > 0)
3060 page_counter_charge(&memcg->kmem, nr_pages);
3061 else
3062 page_counter_uncharge(&memcg->kmem, -nr_pages);
3063 }
3064 }
3065
3066 bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
3067 gfp_t gfp_mask)
3068 {
3069 struct page_counter *fail;
3070
3071 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
3072 memcg->tcpmem_pressure = 0;
3073 return true;
3074 }
3075 memcg->tcpmem_pressure = 1;
3076 if (gfp_mask & __GFP_NOFAIL) {
3077 page_counter_charge(&memcg->tcpmem, nr_pages);
3078 return true;
3079 }
3080 return false;
3081 }
3082
3083 bool memcg1_alloc_events(struct mem_cgroup *memcg)
3084 {
3085 memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu,
3086 GFP_KERNEL_ACCOUNT);
3087 return !!memcg->events_percpu;
3088 }
3089
3090 void memcg1_free_events(struct mem_cgroup *memcg)
3091 {
3092 if (memcg->events_percpu)
3093 free_percpu(memcg->events_percpu);
3094 }
3095
3096 static int __init memcg1_init(void)
3097 {
3098 int node;
3099
3100 for_each_node(node) {
3101 struct mem_cgroup_tree_per_node *rtpn;
3102
3103 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
3104
3105 rtpn->rb_root = RB_ROOT;
3106 rtpn->rb_rightmost = NULL;
3107 spin_lock_init(&rtpn->lock);
3108 soft_limit_tree.rb_tree_per_node[node] = rtpn;
3109 }
3110
3111 return 0;
3112 }
3113 subsys_initcall(memcg1_init);
3114