Lines Matching +full:lock +full:- +full:offset

1 // SPDX-License-Identifier: GPL-2.0-only
22 #include <linux/blk-cgroup.h>
31 #include <linux/backing-dev.h>
58 static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
62 struct swap_info_struct *si, unsigned long offset);
75 /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
77 static int least_priority = -1;
85 static const char Bad_offset[] = "Bad swap offset entry ";
86 static const char Unused_offset[] = "Unused swap offset entry ";
100 * This uses its own lock instead of swap_lock because when a
101 * swap_info_struct changes between not-full/full, it needs to
102 * add/remove itself to/from this list, but the swap_info_struct->lock
104 * before any swap_info_struct->lock.
141 /* Reclaim directly, bypass the slot cache and don't touch device lock */
145 unsigned long offset, int nr_pages) in swap_is_has_cache() argument
147 unsigned char *map = si->swap_map + offset; in swap_is_has_cache()
160 unsigned long offset, int nr_pages, bool *has_cache) in swap_is_last_map() argument
162 unsigned char *map = si->swap_map + offset; in swap_is_last_map()
184 unsigned long offset, unsigned long flags) in __try_to_reclaim_swap() argument
186 swp_entry_t entry = swp_entry(si->type, offset); in __try_to_reclaim_swap()
198 ret = -nr_pages; in __try_to_reclaim_swap()
202 * called by vmscan.c at reclaiming folios. So we hold a folio lock in __try_to_reclaim_swap()
210 /* offset could point to the middle of a large folio */ in __try_to_reclaim_swap()
211 entry = folio->swap; in __try_to_reclaim_swap()
212 offset = swp_offset(entry); in __try_to_reclaim_swap()
225 ci = lock_cluster_or_swap_info(si, offset); in __try_to_reclaim_swap()
226 need_reclaim = swap_is_has_cache(si, offset, nr_pages); in __try_to_reclaim_swap()
239 xa_lock_irq(&address_space->i_pages); in __try_to_reclaim_swap()
241 xa_unlock_irq(&address_space->i_pages); in __try_to_reclaim_swap()
245 spin_lock(&si->lock); in __try_to_reclaim_swap()
250 spin_unlock(&si->lock); in __try_to_reclaim_swap()
261 struct rb_node *rb = rb_first(&sis->swap_extent_root); in first_se()
267 struct rb_node *rb = rb_next(&se->rb_node); in next_se()
273 * to allow the swap device to optimize its wear-levelling.
284 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); in discard_swap()
285 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); in discard_swap()
287 err = blkdev_issue_discard(si->bdev, start_block, in discard_swap()
295 start_block = se->start_block << (PAGE_SHIFT - 9); in discard_swap()
296 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); in discard_swap()
298 err = blkdev_issue_discard(si->bdev, start_block, in discard_swap()
305 return err; /* That will often be -EOPNOTSUPP */ in discard_swap()
309 offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) in offset_to_swap_extent() argument
314 rb = sis->swap_extent_root.rb_node; in offset_to_swap_extent()
317 if (offset < se->start_page) in offset_to_swap_extent()
318 rb = rb->rb_left; in offset_to_swap_extent()
319 else if (offset >= se->start_page + se->nr_pages) in offset_to_swap_extent()
320 rb = rb->rb_right; in offset_to_swap_extent()
330 struct swap_info_struct *sis = swp_swap_info(folio->swap); in swap_folio_sector()
333 pgoff_t offset; in swap_folio_sector() local
335 offset = swp_offset(folio->swap); in swap_folio_sector()
336 se = offset_to_swap_extent(sis, offset); in swap_folio_sector()
337 sector = se->start_block + (offset - se->start_page); in swap_folio_sector()
338 return sector << (PAGE_SHIFT - 9); in swap_folio_sector()
343 * to allow the swap device to optimize its wear-levelling.
351 pgoff_t offset = start_page - se->start_page; in discard_swap_cluster() local
352 sector_t start_block = se->start_block + offset; in discard_swap_cluster()
353 sector_t nr_blocks = se->nr_pages - offset; in discard_swap_cluster()
358 nr_pages -= nr_blocks; in discard_swap_cluster()
360 start_block <<= PAGE_SHIFT - 9; in discard_swap_cluster()
361 nr_blocks <<= PAGE_SHIFT - 9; in discard_swap_cluster()
362 if (blkdev_issue_discard(si->bdev, start_block, in discard_swap_cluster()
387 return info->flags & CLUSTER_FLAG_FREE; in cluster_is_free()
393 return ci - si->cluster_info; in cluster_index()
403 unsigned long offset) in lock_cluster() argument
407 ci = si->cluster_info; in lock_cluster()
409 ci += offset / SWAPFILE_CLUSTER; in lock_cluster()
410 spin_lock(&ci->lock); in lock_cluster()
418 spin_unlock(&ci->lock); in unlock_cluster()
423 * swap_cluster_info if SSD-style cluster-based locking is in place.
426 struct swap_info_struct *si, unsigned long offset) in lock_cluster_or_swap_info() argument
430 /* Try to use fine-grained SSD-style locking if available: */ in lock_cluster_or_swap_info()
431 ci = lock_cluster(si, offset); in lock_cluster_or_swap_info()
434 spin_lock(&si->lock); in lock_cluster_or_swap_info()
445 spin_unlock(&si->lock); in unlock_cluster_or_swap_info()
455 * si->swap_map directly. To make sure the discarding cluster isn't in swap_cluster_schedule_discard()
459 memset(si->swap_map + idx * SWAPFILE_CLUSTER, in swap_cluster_schedule_discard()
462 VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); in swap_cluster_schedule_discard()
463 list_move_tail(&ci->list, &si->discard_clusters); in swap_cluster_schedule_discard()
464 ci->flags = 0; in swap_cluster_schedule_discard()
465 schedule_work(&si->discard_work); in swap_cluster_schedule_discard()
470 lockdep_assert_held(&si->lock); in __free_cluster()
471 lockdep_assert_held(&ci->lock); in __free_cluster()
473 if (ci->flags) in __free_cluster()
474 list_move_tail(&ci->list, &si->free_clusters); in __free_cluster()
476 list_add_tail(&ci->list, &si->free_clusters); in __free_cluster()
477 ci->flags = CLUSTER_FLAG_FREE; in __free_cluster()
478 ci->order = 0; in __free_cluster()
483 * will be added to free cluster list. caller should hold si->lock.
490 while (!list_empty(&si->discard_clusters)) { in swap_do_scheduled_discard()
491 ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); in swap_do_scheduled_discard()
492 list_del(&ci->list); in swap_do_scheduled_discard()
494 spin_unlock(&si->lock); in swap_do_scheduled_discard()
499 spin_lock(&si->lock); in swap_do_scheduled_discard()
500 spin_lock(&ci->lock); in swap_do_scheduled_discard()
502 memset(si->swap_map + idx * SWAPFILE_CLUSTER, in swap_do_scheduled_discard()
504 spin_unlock(&ci->lock); in swap_do_scheduled_discard()
514 spin_lock(&si->lock); in swap_discard_work()
516 spin_unlock(&si->lock); in swap_discard_work()
524 complete(&si->comp); in swap_users_ref_free()
529 VM_BUG_ON(ci->count != 0); in free_cluster()
530 lockdep_assert_held(&si->lock); in free_cluster()
531 lockdep_assert_held(&ci->lock); in free_cluster()
533 if (ci->flags & CLUSTER_FLAG_FRAG) in free_cluster()
534 si->frag_cluster_nr[ci->order]--; in free_cluster()
541 if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == in free_cluster()
565 ci->count++; in inc_cluster_info_page()
567 VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); in inc_cluster_info_page()
568 VM_BUG_ON(ci->flags); in inc_cluster_info_page()
579 if (!si->cluster_info) in dec_cluster_info_page()
582 VM_BUG_ON(ci->count < nr_pages); in dec_cluster_info_page()
584 lockdep_assert_held(&si->lock); in dec_cluster_info_page()
585 lockdep_assert_held(&ci->lock); in dec_cluster_info_page()
586 ci->count -= nr_pages; in dec_cluster_info_page()
588 if (!ci->count) { in dec_cluster_info_page()
593 if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { in dec_cluster_info_page()
594 VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); in dec_cluster_info_page()
595 if (ci->flags & CLUSTER_FLAG_FRAG) in dec_cluster_info_page()
596 si->frag_cluster_nr[ci->order]--; in dec_cluster_info_page()
597 list_move_tail(&ci->list, &si->nonfull_clusters[ci->order]); in dec_cluster_info_page()
598 ci->flags = CLUSTER_FLAG_NONFULL; in dec_cluster_info_page()
606 unsigned char *map = si->swap_map; in cluster_reclaim_range()
607 unsigned long offset; in cluster_reclaim_range() local
609 spin_unlock(&ci->lock); in cluster_reclaim_range()
610 spin_unlock(&si->lock); in cluster_reclaim_range()
612 for (offset = start; offset < end; offset++) { in cluster_reclaim_range()
613 switch (READ_ONCE(map[offset])) { in cluster_reclaim_range()
617 if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) in cluster_reclaim_range()
625 spin_lock(&si->lock); in cluster_reclaim_range()
626 spin_lock(&ci->lock); in cluster_reclaim_range()
630 * could have been be freed while we are not holding the lock. in cluster_reclaim_range()
632 for (offset = start; offset < end; offset++) in cluster_reclaim_range()
633 if (READ_ONCE(map[offset])) in cluster_reclaim_range()
643 unsigned long offset, end = start + nr_pages; in cluster_scan_range() local
644 unsigned char *map = si->swap_map; in cluster_scan_range()
647 for (offset = start; offset < end; offset++) { in cluster_scan_range()
648 switch (READ_ONCE(map[offset])) { in cluster_scan_range()
673 if (!(si->flags & SWP_WRITEOK)) in cluster_alloc_range()
678 list_move_tail(&ci->list, &si->nonfull_clusters[order]); in cluster_alloc_range()
679 ci->flags = CLUSTER_FLAG_NONFULL; in cluster_alloc_range()
681 ci->order = order; in cluster_alloc_range()
684 memset(si->swap_map + start, usage, nr_pages); in cluster_alloc_range()
686 ci->count += nr_pages; in cluster_alloc_range()
688 if (ci->count == SWAPFILE_CLUSTER) { in cluster_alloc_range()
689 VM_BUG_ON(!(ci->flags & in cluster_alloc_range()
691 if (ci->flags & CLUSTER_FLAG_FRAG) in cluster_alloc_range()
692 si->frag_cluster_nr[ci->order]--; in cluster_alloc_range()
693 list_move_tail(&ci->list, &si->full_clusters); in cluster_alloc_range()
694 ci->flags = CLUSTER_FLAG_FULL; in cluster_alloc_range()
700 static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, in alloc_swap_scan_cluster() argument
704 unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); in alloc_swap_scan_cluster()
705 unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); in alloc_swap_scan_cluster()
711 end -= nr_pages; in alloc_swap_scan_cluster()
713 ci = lock_cluster(si, offset); in alloc_swap_scan_cluster()
714 if (ci->count + nr_pages > SWAPFILE_CLUSTER) { in alloc_swap_scan_cluster()
715 offset = SWAP_NEXT_INVALID; in alloc_swap_scan_cluster()
719 while (offset <= end) { in alloc_swap_scan_cluster()
720 if (cluster_scan_range(si, ci, offset, nr_pages)) { in alloc_swap_scan_cluster()
721 if (!cluster_alloc_range(si, ci, offset, usage, order)) { in alloc_swap_scan_cluster()
722 offset = SWAP_NEXT_INVALID; in alloc_swap_scan_cluster()
725 *foundp = offset; in alloc_swap_scan_cluster()
726 if (ci->count == SWAPFILE_CLUSTER) { in alloc_swap_scan_cluster()
727 offset = SWAP_NEXT_INVALID; in alloc_swap_scan_cluster()
730 offset += nr_pages; in alloc_swap_scan_cluster()
733 offset += nr_pages; in alloc_swap_scan_cluster()
735 if (offset > end) in alloc_swap_scan_cluster()
736 offset = SWAP_NEXT_INVALID; in alloc_swap_scan_cluster()
739 return offset; in alloc_swap_scan_cluster()
746 unsigned long offset, end; in swap_reclaim_full_clusters() local
748 unsigned char *map = si->swap_map; in swap_reclaim_full_clusters()
752 to_scan = si->inuse_pages / SWAPFILE_CLUSTER; in swap_reclaim_full_clusters()
754 while (!list_empty(&si->full_clusters)) { in swap_reclaim_full_clusters()
755 ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); in swap_reclaim_full_clusters()
756 list_move_tail(&ci->list, &si->full_clusters); in swap_reclaim_full_clusters()
757 offset = cluster_offset(si, ci); in swap_reclaim_full_clusters()
758 end = min(si->max, offset + SWAPFILE_CLUSTER); in swap_reclaim_full_clusters()
759 to_scan--; in swap_reclaim_full_clusters()
761 spin_unlock(&si->lock); in swap_reclaim_full_clusters()
762 while (offset < end) { in swap_reclaim_full_clusters()
763 if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { in swap_reclaim_full_clusters()
764 nr_reclaim = __try_to_reclaim_swap(si, offset, in swap_reclaim_full_clusters()
767 offset += abs(nr_reclaim); in swap_reclaim_full_clusters()
771 offset++; in swap_reclaim_full_clusters()
773 spin_lock(&si->lock); in swap_reclaim_full_clusters()
786 spin_lock(&si->lock); in swap_reclaim_work()
788 spin_unlock(&si->lock); in swap_reclaim_work()
801 unsigned int offset, found = 0; in cluster_alloc_swap_entry() local
804 lockdep_assert_held(&si->lock); in cluster_alloc_swap_entry()
805 cluster = this_cpu_ptr(si->percpu_cluster); in cluster_alloc_swap_entry()
806 offset = cluster->next[order]; in cluster_alloc_swap_entry()
807 if (offset) { in cluster_alloc_swap_entry()
808 offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); in cluster_alloc_swap_entry()
813 if (!list_empty(&si->free_clusters)) { in cluster_alloc_swap_entry()
814 ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); in cluster_alloc_swap_entry()
815 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); in cluster_alloc_swap_entry()
820 VM_BUG_ON((si->flags & SWP_WRITEOK) && !found); in cluster_alloc_swap_entry()
831 while (!list_empty(&si->nonfull_clusters[order])) { in cluster_alloc_swap_entry()
832 ci = list_first_entry(&si->nonfull_clusters[order], in cluster_alloc_swap_entry()
834 list_move_tail(&ci->list, &si->frag_clusters[order]); in cluster_alloc_swap_entry()
835 ci->flags = CLUSTER_FLAG_FRAG; in cluster_alloc_swap_entry()
836 si->frag_cluster_nr[order]++; in cluster_alloc_swap_entry()
837 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), in cluster_alloc_swap_entry()
849 while (frags < si->frag_cluster_nr[order]) { in cluster_alloc_swap_entry()
850 ci = list_first_entry(&si->frag_clusters[order], in cluster_alloc_swap_entry()
854 * high order allocation or moved here due to per-CPU usage, in cluster_alloc_swap_entry()
857 list_move_tail(&ci->list, &si->frag_clusters[order]); in cluster_alloc_swap_entry()
858 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), in cluster_alloc_swap_entry()
870 if (!list_empty(&si->discard_clusters)) { in cluster_alloc_swap_entry()
874 * reread cluster_next_cpu since we dropped si->lock in cluster_alloc_swap_entry()
887 * allocation, but reclaim may drop si->lock and race with another user. in cluster_alloc_swap_entry()
889 while (!list_empty(&si->frag_clusters[o])) { in cluster_alloc_swap_entry()
890 ci = list_first_entry(&si->frag_clusters[o], in cluster_alloc_swap_entry()
892 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), in cluster_alloc_swap_entry()
898 while (!list_empty(&si->nonfull_clusters[o])) { in cluster_alloc_swap_entry()
899 ci = list_first_entry(&si->nonfull_clusters[o], in cluster_alloc_swap_entry()
901 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), in cluster_alloc_swap_entry()
909 cluster->next[order] = offset; in cluster_alloc_swap_entry()
917 assert_spin_locked(&si->lock); in __del_from_avail_list()
919 plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]); in __del_from_avail_list()
929 static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, in swap_range_alloc() argument
932 unsigned int end = offset + nr_entries - 1; in swap_range_alloc()
934 if (offset == si->lowest_bit) in swap_range_alloc()
935 si->lowest_bit += nr_entries; in swap_range_alloc()
936 if (end == si->highest_bit) in swap_range_alloc()
937 WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); in swap_range_alloc()
938 WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries); in swap_range_alloc()
939 if (si->inuse_pages == si->pages) { in swap_range_alloc()
940 si->lowest_bit = si->max; in swap_range_alloc()
941 si->highest_bit = 0; in swap_range_alloc()
944 if (si->cluster_info && vm_swap_full()) in swap_range_alloc()
945 schedule_work(&si->reclaim_work); in swap_range_alloc()
955 plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); in add_to_avail_list()
959 static void swap_range_free(struct swap_info_struct *si, unsigned long offset, in swap_range_free() argument
962 unsigned long begin = offset; in swap_range_free()
963 unsigned long end = offset + nr_entries - 1; in swap_range_free()
968 * Use atomic clear_bit operations only on zeromap instead of non-atomic in swap_range_free()
972 clear_bit(offset + i, si->zeromap); in swap_range_free()
974 if (offset < si->lowest_bit) in swap_range_free()
975 si->lowest_bit = offset; in swap_range_free()
976 if (end > si->highest_bit) { in swap_range_free()
977 bool was_full = !si->highest_bit; in swap_range_free()
979 WRITE_ONCE(si->highest_bit, end); in swap_range_free()
980 if (was_full && (si->flags & SWP_WRITEOK)) in swap_range_free()
983 if (si->flags & SWP_BLKDEV) in swap_range_free()
985 si->bdev->bd_disk->fops->swap_slot_free_notify; in swap_range_free()
988 while (offset <= end) { in swap_range_free()
989 arch_swap_invalidate_page(si->type, offset); in swap_range_free()
991 swap_slot_free_notify(si->bdev, offset); in swap_range_free()
992 offset++; in swap_range_free()
994 clear_shadow_from_swap_cache(si->type, begin, end); in swap_range_free()
997 * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 in swap_range_free()
1002 WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); in swap_range_free()
1009 if (!(si->flags & SWP_SOLIDSTATE)) { in set_cluster_next()
1010 si->cluster_next = next; in set_cluster_next()
1014 prev = this_cpu_read(*si->cluster_next_cpu); in set_cluster_next()
1017 * another trunk randomly to avoid lock contention on swap in set_cluster_next()
1023 if (si->highest_bit <= si->lowest_bit) in set_cluster_next()
1025 next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit); in set_cluster_next()
1027 next = max_t(unsigned int, next, si->lowest_bit); in set_cluster_next()
1029 this_cpu_write(*si->cluster_next_cpu, next); in set_cluster_next()
1033 unsigned long offset) in swap_offset_available_and_locked() argument
1035 if (data_race(!si->swap_map[offset])) { in swap_offset_available_and_locked()
1036 spin_lock(&si->lock); in swap_offset_available_and_locked()
1040 if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { in swap_offset_available_and_locked()
1041 spin_lock(&si->lock); in swap_offset_available_and_locked()
1054 VM_BUG_ON(!si->cluster_info); in cluster_alloc_swap()
1056 si->flags += SWP_SCANNING; in cluster_alloc_swap()
1059 unsigned long offset = cluster_alloc_swap_entry(si, order, usage); in cluster_alloc_swap() local
1061 if (!offset) in cluster_alloc_swap()
1063 slots[n_ret++] = swp_entry(si->type, offset); in cluster_alloc_swap()
1066 si->flags -= SWP_SCANNING; in cluster_alloc_swap()
1075 unsigned long offset; in scan_swap_map_slots() local
1086 * way, however, we resort to first-free allocation, starting in scan_swap_map_slots()
1089 * overall disk seek times between swap pages. -- sct in scan_swap_map_slots()
1090 * But we do now try to find an empty cluster. -Andrea in scan_swap_map_slots()
1109 if (!(si->flags & SWP_BLKDEV) || !si->cluster_info) in scan_swap_map_slots()
1113 if (si->cluster_info) in scan_swap_map_slots()
1116 si->flags += SWP_SCANNING; in scan_swap_map_slots()
1119 scan_base = si->cluster_next; in scan_swap_map_slots()
1120 offset = scan_base; in scan_swap_map_slots()
1122 if (unlikely(!si->cluster_nr--)) { in scan_swap_map_slots()
1123 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { in scan_swap_map_slots()
1124 si->cluster_nr = SWAPFILE_CLUSTER - 1; in scan_swap_map_slots()
1128 spin_unlock(&si->lock); in scan_swap_map_slots()
1134 scan_base = offset = si->lowest_bit; in scan_swap_map_slots()
1135 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; in scan_swap_map_slots()
1138 for (; last_in_cluster <= READ_ONCE(si->highest_bit); offset++) { in scan_swap_map_slots()
1139 if (si->swap_map[offset]) in scan_swap_map_slots()
1140 last_in_cluster = offset + SWAPFILE_CLUSTER; in scan_swap_map_slots()
1141 else if (offset == last_in_cluster) { in scan_swap_map_slots()
1142 spin_lock(&si->lock); in scan_swap_map_slots()
1143 offset -= SWAPFILE_CLUSTER - 1; in scan_swap_map_slots()
1144 si->cluster_next = offset; in scan_swap_map_slots()
1145 si->cluster_nr = SWAPFILE_CLUSTER - 1; in scan_swap_map_slots()
1148 if (unlikely(--latency_ration < 0)) { in scan_swap_map_slots()
1154 offset = scan_base; in scan_swap_map_slots()
1155 spin_lock(&si->lock); in scan_swap_map_slots()
1156 si->cluster_nr = SWAPFILE_CLUSTER - 1; in scan_swap_map_slots()
1160 if (!(si->flags & SWP_WRITEOK)) in scan_swap_map_slots()
1162 if (!si->highest_bit) in scan_swap_map_slots()
1164 if (offset > si->highest_bit) in scan_swap_map_slots()
1165 scan_base = offset = si->lowest_bit; in scan_swap_map_slots()
1167 /* reuse swap entry of cache-only swap if not busy. */ in scan_swap_map_slots()
1168 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { in scan_swap_map_slots()
1170 spin_unlock(&si->lock); in scan_swap_map_slots()
1171 swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); in scan_swap_map_slots()
1172 spin_lock(&si->lock); in scan_swap_map_slots()
1179 if (si->swap_map[offset]) { in scan_swap_map_slots()
1185 memset(si->swap_map + offset, usage, nr_pages); in scan_swap_map_slots()
1187 swap_range_alloc(si, offset, nr_pages); in scan_swap_map_slots()
1188 slots[n_ret++] = swp_entry(si->type, offset); in scan_swap_map_slots()
1191 if ((n_ret == nr) || (offset >= si->highest_bit)) in scan_swap_map_slots()
1197 if (unlikely(--latency_ration < 0)) { in scan_swap_map_slots()
1200 spin_unlock(&si->lock); in scan_swap_map_slots()
1202 spin_lock(&si->lock); in scan_swap_map_slots()
1206 if (si->cluster_nr && !si->swap_map[++offset]) { in scan_swap_map_slots()
1207 /* non-ssd case, still more slots in cluster? */ in scan_swap_map_slots()
1208 --si->cluster_nr; in scan_swap_map_slots()
1214 * try to scan a little more quickly with lock held unless we in scan_swap_map_slots()
1220 if (offset < scan_base) in scan_swap_map_slots()
1223 scan_limit = si->highest_bit; in scan_swap_map_slots()
1224 for (; offset <= scan_limit && --latency_ration > 0; in scan_swap_map_slots()
1225 offset++) { in scan_swap_map_slots()
1226 if (!si->swap_map[offset]) in scan_swap_map_slots()
1233 set_cluster_next(si, offset + 1); in scan_swap_map_slots()
1234 si->flags -= SWP_SCANNING; in scan_swap_map_slots()
1239 spin_unlock(&si->lock); in scan_swap_map_slots()
1240 while (++offset <= READ_ONCE(si->highest_bit)) { in scan_swap_map_slots()
1241 if (unlikely(--latency_ration < 0)) { in scan_swap_map_slots()
1246 if (swap_offset_available_and_locked(si, offset)) in scan_swap_map_slots()
1249 offset = si->lowest_bit; in scan_swap_map_slots()
1250 while (offset < scan_base) { in scan_swap_map_slots()
1251 if (unlikely(--latency_ration < 0)) { in scan_swap_map_slots()
1256 if (swap_offset_available_and_locked(si, offset)) in scan_swap_map_slots()
1258 offset++; in scan_swap_map_slots()
1260 spin_lock(&si->lock); in scan_swap_map_slots()
1263 si->flags -= SWP_SCANNING; in scan_swap_map_slots()
1291 /* requeue si to after same-priority siblings */ in get_swap_pages()
1292 plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); in get_swap_pages()
1294 spin_lock(&si->lock); in get_swap_pages()
1295 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { in get_swap_pages()
1297 if (plist_node_empty(&si->avail_lists[node])) { in get_swap_pages()
1298 spin_unlock(&si->lock); in get_swap_pages()
1301 WARN(!si->highest_bit, in get_swap_pages()
1303 si->type); in get_swap_pages()
1304 WARN(!(si->flags & SWP_WRITEOK), in get_swap_pages()
1306 si->type); in get_swap_pages()
1308 spin_unlock(&si->lock); in get_swap_pages()
1313 spin_unlock(&si->lock); in get_swap_pages()
1322 * and since scan_swap_map_slots() can drop the si->lock, in get_swap_pages()
1326 * si->lock. Since we dropped the swap_avail_lock, the in get_swap_pages()
1331 if (plist_node_empty(&next->avail_lists[node])) in get_swap_pages()
1339 atomic_long_add((long)(n_goal - n_ret) * size, in get_swap_pages()
1348 unsigned long offset; in _swap_info_get() local
1355 if (data_race(!(si->flags & SWP_USED))) in _swap_info_get()
1357 offset = swp_offset(entry); in _swap_info_get()
1358 if (offset >= si->max) in _swap_info_get()
1360 if (data_race(!si->swap_map[swp_offset(entry)])) in _swap_info_get()
1388 spin_unlock(&q->lock); in swap_info_get_cont()
1390 spin_lock(&p->lock); in swap_info_get_cont()
1396 unsigned long offset, in __swap_entry_free_locked() argument
1402 count = si->swap_map[offset]; in __swap_entry_free_locked()
1418 if (swap_count_continued(si, offset, count)) in __swap_entry_free_locked()
1423 count--; in __swap_entry_free_locked()
1428 WRITE_ONCE(si->swap_map[offset], usage); in __swap_entry_free_locked()
1430 WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE); in __swap_entry_free_locked()
1443 * RCU reader side lock (including any spinlock) is sufficient to
1477 unsigned long offset; in get_swap_device() local
1484 if (!percpu_ref_tryget_live(&si->users)) in get_swap_device()
1487 * Guarantee the si->users are checked before accessing other in get_swap_device()
1494 offset = swp_offset(entry); in get_swap_device()
1495 if (offset >= si->max) in get_swap_device()
1505 percpu_ref_put(&si->users); in get_swap_device()
1513 unsigned long offset = swp_offset(entry); in __swap_entry_free() local
1516 ci = lock_cluster_or_swap_info(si, offset); in __swap_entry_free()
1517 usage = __swap_entry_free_locked(si, offset, 1); in __swap_entry_free()
1528 unsigned long offset = swp_offset(entry); in __swap_entries_free() local
1535 if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1) in __swap_entries_free()
1538 if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) in __swap_entries_free()
1541 ci = lock_cluster_or_swap_info(si, offset); in __swap_entries_free()
1542 if (!swap_is_last_map(si, offset, nr, &has_cache)) { in __swap_entries_free()
1547 WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); in __swap_entries_free()
1552 zswap_invalidate(swp_entry(si->type, offset + i)); in __swap_entries_free()
1553 spin_lock(&si->lock); in __swap_entries_free()
1555 spin_unlock(&si->lock); in __swap_entries_free()
1561 if (data_race(si->swap_map[offset + i])) { in __swap_entries_free()
1562 count = __swap_entry_free(si, swp_entry(type, offset + i)); in __swap_entries_free()
1579 unsigned long offset = swp_offset(entry); in swap_entry_range_free() local
1580 unsigned char *map = si->swap_map + offset; in swap_entry_range_free()
1584 ci = lock_cluster(si, offset); in swap_entry_range_free()
1593 swap_range_free(si, offset, nr_pages); in swap_entry_range_free()
1597 unsigned long offset, int nr_pages, in cluster_swap_free_nr() argument
1604 ci = lock_cluster_or_swap_info(si, offset); in cluster_swap_free_nr()
1608 if (!__swap_entry_free_locked(si, offset + i, usage)) in cluster_swap_free_nr()
1614 free_swap_slot(swp_entry(si->type, offset + i)); in cluster_swap_free_nr()
1618 ci = lock_cluster_or_swap_info(si, offset); in cluster_swap_free_nr()
1620 offset += nr; in cluster_swap_free_nr()
1621 nr_pages -= nr; in cluster_swap_free_nr()
1634 unsigned long offset = swp_offset(entry); in swap_free_nr() local
1641 nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); in swap_free_nr()
1642 cluster_swap_free_nr(sis, offset, nr, 1); in swap_free_nr()
1643 offset += nr; in swap_free_nr()
1644 nr_pages -= nr; in swap_free_nr()
1653 unsigned long offset = swp_offset(entry); in put_swap_folio() local
1662 ci = lock_cluster_or_swap_info(si, offset); in put_swap_folio()
1663 if (size > 1 && swap_is_has_cache(si, offset, size)) { in put_swap_folio()
1665 spin_lock(&si->lock); in put_swap_folio()
1667 spin_unlock(&si->lock); in put_swap_folio()
1671 if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { in put_swap_folio()
1674 if (i == size - 1) in put_swap_folio()
1676 lock_cluster_or_swap_info(si, offset); in put_swap_folio()
1686 return (int)swp_type(*e1) - (int)swp_type(*e2); in swp_entry_cmp()
1701 * Sort swap entries by swap device, so each lock is only taken once. in swapcache_free_entries()
1714 spin_unlock(&p->lock); in swapcache_free_entries()
1720 pgoff_t offset = swp_offset(entry); in __swap_count() local
1722 return swap_count(si->swap_map[offset]); in __swap_count()
1732 pgoff_t offset = swp_offset(entry); in swap_swapcount() local
1736 ci = lock_cluster_or_swap_info(si, offset); in swap_swapcount()
1737 count = swap_count(si->swap_map[offset]); in swap_swapcount()
1752 pgoff_t offset; in swp_swapcount() local
1759 offset = swp_offset(entry); in swp_swapcount()
1761 ci = lock_cluster_or_swap_info(si, offset); in swp_swapcount()
1763 count = swap_count(si->swap_map[offset]); in swp_swapcount()
1770 page = vmalloc_to_page(si->swap_map + offset); in swp_swapcount()
1771 offset &= ~PAGE_MASK; in swp_swapcount()
1777 tmp_count = map[offset]; in swp_swapcount()
1792 unsigned char *map = si->swap_map; in swap_page_trans_huge_swapped()
1795 unsigned long offset = round_down(roffset, nr_pages); in swap_page_trans_huge_swapped() local
1799 ci = lock_cluster_or_swap_info(si, offset); in swap_page_trans_huge_swapped()
1806 if (swap_count(map[offset + i])) { in swap_page_trans_huge_swapped()
1818 swp_entry_t entry = folio->swap; in folio_swapped()
1842 * - most probably a call from __try_to_reclaim_swap() while in folio_swapcache_freeable()
1844 * but conceivably even a call from memory reclaim - will free in folio_swapcache_freeable()
1861 * folio_free_swap() - Free the swap space used for this folio.
1882 * free_swap_and_cache_nr() - Release reference on range of swap entries and
1889 * offset range is defined by [entry.offset, entry.offset + nr).
1897 unsigned long offset; in free_swap_and_cache_nr() local
1906 if (WARN_ON(end_offset > si->max)) in free_swap_and_cache_nr()
1915 * Short-circuit the below loop if none of the entries had their in free_swap_and_cache_nr()
1926 * latter will get a reference and lock the folio for every individual in free_swap_and_cache_nr()
1930 for (offset = start_offset; offset < end_offset; offset += nr) { in free_swap_and_cache_nr()
1932 if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { in free_swap_and_cache_nr()
1941 nr = __try_to_reclaim_swap(si, offset, in free_swap_and_cache_nr()
1946 nr = -nr; in free_swap_and_cache_nr()
1947 nr = ALIGN(offset + 1, nr) - offset; in free_swap_and_cache_nr()
1966 spin_lock(&si->lock); in get_swap_page_of_type()
1967 if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0)) in get_swap_page_of_type()
1969 spin_unlock(&si->lock); in get_swap_page_of_type()
1977 * @offset - number of the PAGE_SIZE-sized block of the device, starting
1982 int swap_type_of(dev_t device, sector_t offset) in swap_type_of() argument
1987 return -1; in swap_type_of()
1993 if (!(sis->flags & SWP_WRITEOK)) in swap_type_of()
1996 if (device == sis->bdev->bd_dev) { in swap_type_of()
1999 if (se->start_block == offset) { in swap_type_of()
2006 return -ENODEV; in swap_type_of()
2017 if (!(sis->flags & SWP_WRITEOK)) in find_first_swap()
2019 *device = sis->bdev->bd_dev; in find_first_swap()
2024 return -ENODEV; in find_first_swap()
2028 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
2031 sector_t swapdev_block(int type, pgoff_t offset) in swapdev_block() argument
2036 if (!si || !(si->flags & SWP_WRITEOK)) in swapdev_block()
2038 se = offset_to_swap_extent(si, offset); in swapdev_block()
2039 return se->start_block + (offset - se->start_page); in swapdev_block()
2056 spin_lock(&sis->lock); in count_swap_pages()
2057 if (sis->flags & SWP_WRITEOK) { in count_swap_pages()
2058 n = sis->pages; in count_swap_pages()
2060 n -= sis->inuse_pages; in count_swap_pages()
2062 spin_unlock(&sis->lock); in count_swap_pages()
2076 * just let do_wp_page work it out if a write is requested later - to
2092 return -ENOMEM; in unuse_pte()
2093 else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { in unuse_pte()
2102 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); in unuse_pte()
2114 dec_mm_counter(vma->vm_mm, MM_SWAPENTS); in unuse_pte()
2132 dec_mm_counter(vma->vm_mm, MM_SWAPENTS); in unuse_pte()
2133 inc_mm_counter(vma->vm_mm, MM_ANONPAGES); in unuse_pte()
2162 new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); in unuse_pte()
2168 set_pte_at(vma->vm_mm, addr, pte, new_pte); in unuse_pte()
2190 unsigned long offset; in unuse_pte_range() local
2211 offset = swp_offset(entry); in unuse_pte_range()
2228 swp_count = READ_ONCE(si->swap_map[offset]); in unuse_pte_range()
2231 return -ENOMEM; in unuse_pte_range()
2318 addr = vma->vm_start; in unuse_vma()
2319 end = vma->vm_end; in unuse_vma()
2321 pgd = pgd_offset(vma->vm_mm, addr); in unuse_vma()
2341 if (vma->anon_vma && !is_vm_hugetlb_page(vma)) { in unuse_mm()
2370 for (i = prev + 1; i < si->max; i++) { in find_next_to_unuse()
2371 count = READ_ONCE(si->swap_map[i]); in find_next_to_unuse()
2378 if (i == si->max) in find_next_to_unuse()
2395 if (!READ_ONCE(si->inuse_pages)) in try_to_unuse()
2408 while (READ_ONCE(si->inuse_pages) && in try_to_unuse()
2410 (p = p->next) != &init_mm.mmlist) { in try_to_unuse()
2436 while (READ_ONCE(si->inuse_pages) && in try_to_unuse()
2447 * swap cache just before we acquired the page lock. The folio in try_to_unuse()
2469 * and robust (though cpu-intensive) just to keep retrying. in try_to_unuse()
2471 if (READ_ONCE(si->inuse_pages)) { in try_to_unuse()
2474 return -EINTR; in try_to_unuse()
2480 * after swap_range_free() reduces si->inuse_pages to 0. in try_to_unuse()
2490 * added to the mmlist just after page_duplicate - before would be racy.
2498 if (swap_info[type]->inuse_pages) in drain_mmlist()
2511 while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { in destroy_swap_extents()
2512 struct rb_node *rb = sis->swap_extent_root.rb_node; in destroy_swap_extents()
2515 rb_erase(rb, &sis->swap_extent_root); in destroy_swap_extents()
2519 if (sis->flags & SWP_ACTIVATED) { in destroy_swap_extents()
2520 struct file *swap_file = sis->swap_file; in destroy_swap_extents()
2521 struct address_space *mapping = swap_file->f_mapping; in destroy_swap_extents()
2523 sis->flags &= ~SWP_ACTIVATED; in destroy_swap_extents()
2524 if (mapping->a_ops->swap_deactivate) in destroy_swap_extents()
2525 mapping->a_ops->swap_deactivate(swap_file); in destroy_swap_extents()
2539 struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; in add_swap_extent()
2549 link = &parent->rb_right; in add_swap_extent()
2554 BUG_ON(se->start_page + se->nr_pages != start_page); in add_swap_extent()
2555 if (se->start_block + se->nr_pages == start_block) { in add_swap_extent()
2557 se->nr_pages += nr_pages; in add_swap_extent()
2565 return -ENOMEM; in add_swap_extent()
2566 new_se->start_page = start_page; in add_swap_extent()
2567 new_se->nr_pages = nr_pages; in add_swap_extent()
2568 new_se->start_block = start_block; in add_swap_extent()
2570 rb_link_node(&new_se->rb_node, parent, link); in add_swap_extent()
2571 rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); in add_swap_extent()
2593 * requirements, they are simply tossed out - we will never use those blocks
2600 * Typically it is in the 1-4 megabyte range. So we can have hundreds of
2601 * extents in the rbtree. - akpm.
2605 struct file *swap_file = sis->swap_file; in setup_swap_extents()
2606 struct address_space *mapping = swap_file->f_mapping; in setup_swap_extents()
2607 struct inode *inode = mapping->host; in setup_swap_extents()
2610 if (S_ISBLK(inode->i_mode)) { in setup_swap_extents()
2611 ret = add_swap_extent(sis, 0, sis->max, 0); in setup_swap_extents()
2612 *span = sis->pages; in setup_swap_extents()
2616 if (mapping->a_ops->swap_activate) { in setup_swap_extents()
2617 ret = mapping->a_ops->swap_activate(sis, swap_file, span); in setup_swap_extents()
2620 sis->flags |= SWP_ACTIVATED; in setup_swap_extents()
2621 if ((sis->flags & SWP_FS_OPS) && in setup_swap_extents()
2624 return -ENOMEM; in setup_swap_extents()
2636 if (si->bdev) in swap_node()
2637 bdev = si->bdev; in swap_node()
2639 bdev = si->swap_file->f_inode->i_sb->s_bdev; in swap_node()
2641 return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; in swap_node()
2652 si->prio = prio; in setup_swap_info()
2654 si->prio = --least_priority; in setup_swap_info()
2657 * low-to-high, while swap ordering is high-to-low in setup_swap_info()
2659 si->list.prio = -si->prio; in setup_swap_info()
2661 if (si->prio >= 0) in setup_swap_info()
2662 si->avail_lists[i].prio = -si->prio; in setup_swap_info()
2665 si->avail_lists[i].prio = 1; in setup_swap_info()
2667 si->avail_lists[i].prio = -si->prio; in setup_swap_info()
2670 si->swap_map = swap_map; in setup_swap_info()
2671 si->cluster_info = cluster_info; in setup_swap_info()
2672 si->zeromap = zeromap; in setup_swap_info()
2677 si->flags |= SWP_WRITEOK; in _enable_swap_info()
2678 atomic_long_add(si->pages, &nr_swap_pages); in _enable_swap_info()
2679 total_swap_pages += si->pages; in _enable_swap_info()
2685 * which on removal of any swap_info_struct with an auto-assigned in _enable_swap_info()
2686 * (i.e. negative) priority increments the auto-assigned priority in _enable_swap_info()
2687 * of any lower-priority swap_info_structs. in _enable_swap_info()
2692 plist_add(&si->list, &swap_active_head); in _enable_swap_info()
2695 if (si->highest_bit) in _enable_swap_info()
2705 spin_lock(&si->lock); in enable_swap_info()
2707 spin_unlock(&si->lock); in enable_swap_info()
2712 percpu_ref_resurrect(&si->users); in enable_swap_info()
2714 spin_lock(&si->lock); in enable_swap_info()
2716 spin_unlock(&si->lock); in enable_swap_info()
2723 spin_lock(&si->lock); in reinsert_swap_info()
2724 setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap); in reinsert_swap_info()
2726 spin_unlock(&si->lock); in reinsert_swap_info()
2758 return -EPERM; in SYSCALL_DEFINE1()
2760 BUG_ON(!current->mm); in SYSCALL_DEFINE1()
2771 mapping = victim->f_mapping; in SYSCALL_DEFINE1()
2774 if (p->flags & SWP_WRITEOK) { in SYSCALL_DEFINE1()
2775 if (p->swap_file->f_mapping == mapping) { in SYSCALL_DEFINE1()
2782 err = -EINVAL; in SYSCALL_DEFINE1()
2786 if (!security_vm_enough_memory_mm(current->mm, p->pages)) in SYSCALL_DEFINE1()
2787 vm_unacct_memory(p->pages); in SYSCALL_DEFINE1()
2789 err = -ENOMEM; in SYSCALL_DEFINE1()
2793 spin_lock(&p->lock); in SYSCALL_DEFINE1()
2795 if (p->prio < 0) { in SYSCALL_DEFINE1()
2800 si->prio++; in SYSCALL_DEFINE1()
2801 si->list.prio--; in SYSCALL_DEFINE1()
2803 if (si->avail_lists[nid].prio != 1) in SYSCALL_DEFINE1()
2804 si->avail_lists[nid].prio--; in SYSCALL_DEFINE1()
2809 plist_del(&p->list, &swap_active_head); in SYSCALL_DEFINE1()
2810 atomic_long_sub(p->pages, &nr_swap_pages); in SYSCALL_DEFINE1()
2811 total_swap_pages -= p->pages; in SYSCALL_DEFINE1()
2812 p->flags &= ~SWP_WRITEOK; in SYSCALL_DEFINE1()
2813 spin_unlock(&p->lock); in SYSCALL_DEFINE1()
2819 err = try_to_unuse(p->type); in SYSCALL_DEFINE1()
2823 /* re-insert swap space back into swap_list */ in SYSCALL_DEFINE1()
2834 * operations protected by RCU reader side lock (including any in SYSCALL_DEFINE1()
2839 percpu_ref_kill(&p->users); in SYSCALL_DEFINE1()
2841 wait_for_completion(&p->comp); in SYSCALL_DEFINE1()
2843 flush_work(&p->discard_work); in SYSCALL_DEFINE1()
2844 flush_work(&p->reclaim_work); in SYSCALL_DEFINE1()
2847 if (p->flags & SWP_CONTINUED) in SYSCALL_DEFINE1()
2850 if (!p->bdev || !bdev_nonrot(p->bdev)) in SYSCALL_DEFINE1()
2855 spin_lock(&p->lock); in SYSCALL_DEFINE1()
2859 p->highest_bit = 0; /* cuts scans short */ in SYSCALL_DEFINE1()
2860 while (p->flags >= SWP_SCANNING) { in SYSCALL_DEFINE1()
2861 spin_unlock(&p->lock); in SYSCALL_DEFINE1()
2865 spin_lock(&p->lock); in SYSCALL_DEFINE1()
2868 swap_file = p->swap_file; in SYSCALL_DEFINE1()
2869 p->swap_file = NULL; in SYSCALL_DEFINE1()
2870 p->max = 0; in SYSCALL_DEFINE1()
2871 swap_map = p->swap_map; in SYSCALL_DEFINE1()
2872 p->swap_map = NULL; in SYSCALL_DEFINE1()
2873 zeromap = p->zeromap; in SYSCALL_DEFINE1()
2874 p->zeromap = NULL; in SYSCALL_DEFINE1()
2875 cluster_info = p->cluster_info; in SYSCALL_DEFINE1()
2876 p->cluster_info = NULL; in SYSCALL_DEFINE1()
2877 spin_unlock(&p->lock); in SYSCALL_DEFINE1()
2879 arch_swap_invalidate_area(p->type); in SYSCALL_DEFINE1()
2880 zswap_swapoff(p->type); in SYSCALL_DEFINE1()
2882 free_percpu(p->percpu_cluster); in SYSCALL_DEFINE1()
2883 p->percpu_cluster = NULL; in SYSCALL_DEFINE1()
2884 free_percpu(p->cluster_next_cpu); in SYSCALL_DEFINE1()
2885 p->cluster_next_cpu = NULL; in SYSCALL_DEFINE1()
2890 swap_cgroup_swapoff(p->type); in SYSCALL_DEFINE1()
2891 exit_swap_address_space(p->type); in SYSCALL_DEFINE1()
2893 inode = mapping->host; in SYSCALL_DEFINE1()
2896 inode->i_flags &= ~S_SWAPFILE; in SYSCALL_DEFINE1()
2903 * not hold p->lock after we cleared its SWP_WRITEOK. in SYSCALL_DEFINE1()
2906 p->flags = 0; in SYSCALL_DEFINE1()
2923 struct seq_file *seq = file->private_data; in swaps_poll()
2927 if (seq->poll_event != atomic_read(&proc_poll_event)) { in swaps_poll()
2928 seq->poll_event = atomic_read(&proc_poll_event); in swaps_poll()
2948 if (!(si->flags & SWP_USED) || !si->swap_map) in swap_start()
2950 if (!--l) in swap_start()
2965 type = si->type + 1; in swap_next()
2969 if (!(si->flags & SWP_USED) || !si->swap_map) in swap_next()
2994 bytes = K(si->pages); in swap_show()
2995 inuse = K(READ_ONCE(si->inuse_pages)); in swap_show()
2997 file = si->swap_file; in swap_show()
3000 len < 40 ? 40 - len : 1, " ", in swap_show()
3001 S_ISBLK(file_inode(file)->i_mode) ? in swap_show()
3005 si->prio); in swap_show()
3025 seq = file->private_data; in swaps_open()
3026 seq->poll_event = atomic_read(&proc_poll_event); in swaps_open()
3065 return ERR_PTR(-ENOMEM); in alloc_swap_info()
3067 if (percpu_ref_init(&p->users, swap_users_ref_free, in alloc_swap_info()
3070 return ERR_PTR(-ENOMEM); in alloc_swap_info()
3075 if (!(swap_info[type]->flags & SWP_USED)) in alloc_swap_info()
3080 percpu_ref_exit(&p->users); in alloc_swap_info()
3082 return ERR_PTR(-EPERM); in alloc_swap_info()
3085 p->type = type; in alloc_swap_info()
3097 * would be relying on p->type to remain valid. in alloc_swap_info()
3100 p->swap_extent_root = RB_ROOT; in alloc_swap_info()
3101 plist_node_init(&p->list, 0); in alloc_swap_info()
3103 plist_node_init(&p->avail_lists[i], 0); in alloc_swap_info()
3104 p->flags = SWP_USED; in alloc_swap_info()
3107 percpu_ref_exit(&defer->users); in alloc_swap_info()
3110 spin_lock_init(&p->lock); in alloc_swap_info()
3111 spin_lock_init(&p->cont_lock); in alloc_swap_info()
3112 init_completion(&p->comp); in alloc_swap_info()
3119 if (S_ISBLK(inode->i_mode)) { in claim_swapfile()
3120 si->bdev = I_BDEV(inode); in claim_swapfile()
3126 if (bdev_is_zoned(si->bdev)) in claim_swapfile()
3127 return -EINVAL; in claim_swapfile()
3128 si->flags |= SWP_BLKDEV; in claim_swapfile()
3129 } else if (S_ISREG(inode->i_mode)) { in claim_swapfile()
3130 si->bdev = inode->i_sb->s_bdev; in claim_swapfile()
3140 * 1) the number of bits for the swap offset in the swp_entry_t type, and
3145 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
3146 * decoded to a swp_entry_t again, and finally the swap offset is
3174 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { in read_swap_header()
3175 pr_err("Unable to find swap-space signature\n"); in read_swap_header()
3180 if (swab32(swap_header->info.version) == 1) { in read_swap_header()
3181 swab32s(&swap_header->info.version); in read_swap_header()
3182 swab32s(&swap_header->info.last_page); in read_swap_header()
3183 swab32s(&swap_header->info.nr_badpages); in read_swap_header()
3184 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) in read_swap_header()
3186 for (i = 0; i < swap_header->info.nr_badpages; i++) in read_swap_header()
3187 swab32s(&swap_header->info.badpages[i]); in read_swap_header()
3189 /* Check the swap header's sub-version */ in read_swap_header()
3190 if (swap_header->info.version != 1) { in read_swap_header()
3192 swap_header->info.version); in read_swap_header()
3196 si->lowest_bit = 1; in read_swap_header()
3197 si->cluster_next = 1; in read_swap_header()
3198 si->cluster_nr = 0; in read_swap_header()
3201 last_page = swap_header->info.last_page; in read_swap_header()
3203 pr_warn("Empty swap-file\n"); in read_swap_header()
3212 /* p->max is an unsigned int: don't overflow it */ in read_swap_header()
3216 si->highest_bit = maxpages - 1; in read_swap_header()
3225 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) in read_swap_header()
3227 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) in read_swap_header()
3250 nr_good_pages = maxpages - 1; /* omit header page */ in setup_swap_map_and_extents()
3252 for (i = 0; i < swap_header->info.nr_badpages; i++) { in setup_swap_map_and_extents()
3253 unsigned int page_nr = swap_header->info.badpages[i]; in setup_swap_map_and_extents()
3254 if (page_nr == 0 || page_nr > swap_header->info.last_page) in setup_swap_map_and_extents()
3255 return -EINVAL; in setup_swap_map_and_extents()
3258 nr_good_pages--; in setup_swap_map_and_extents()
3264 si->max = maxpages; in setup_swap_map_and_extents()
3265 si->pages = nr_good_pages; in setup_swap_map_and_extents()
3269 nr_good_pages = si->pages; in setup_swap_map_and_extents()
3272 pr_warn("Empty swap-file\n"); in setup_swap_map_and_extents()
3273 return -EINVAL; in setup_swap_map_and_extents()
3284 unsigned long col = si->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; in setup_clusters()
3287 int cpu, err = -ENOMEM; in setup_clusters()
3294 spin_lock_init(&cluster_info[i].lock); in setup_clusters()
3296 si->cluster_next_cpu = alloc_percpu(unsigned int); in setup_clusters()
3297 if (!si->cluster_next_cpu) in setup_clusters()
3302 per_cpu(*si->cluster_next_cpu, cpu) = in setup_clusters()
3303 get_random_u32_inclusive(1, si->highest_bit); in setup_clusters()
3305 si->percpu_cluster = alloc_percpu(struct percpu_cluster); in setup_clusters()
3306 if (!si->percpu_cluster) in setup_clusters()
3312 cluster = per_cpu_ptr(si->percpu_cluster, cpu); in setup_clusters()
3314 cluster->next[i] = SWAP_NEXT_INVALID; in setup_clusters()
3325 for (i = 0; i < swap_header->info.nr_badpages; i++) in setup_clusters()
3327 swap_header->info.badpages[i]); in setup_clusters()
3331 INIT_LIST_HEAD(&si->free_clusters); in setup_clusters()
3332 INIT_LIST_HEAD(&si->full_clusters); in setup_clusters()
3333 INIT_LIST_HEAD(&si->discard_clusters); in setup_clusters()
3336 INIT_LIST_HEAD(&si->nonfull_clusters[i]); in setup_clusters()
3337 INIT_LIST_HEAD(&si->frag_clusters[i]); in setup_clusters()
3338 si->frag_cluster_nr[i] = 0; in setup_clusters()
3353 if (ci->count) { in setup_clusters()
3354 ci->flags = CLUSTER_FLAG_NONFULL; in setup_clusters()
3355 list_add_tail(&ci->list, &si->nonfull_clusters[0]); in setup_clusters()
3358 ci->flags = CLUSTER_FLAG_FREE; in setup_clusters()
3359 list_add_tail(&ci->list, &si->free_clusters); in setup_clusters()
3392 return -EINVAL; in SYSCALL_DEFINE2()
3395 return -EPERM; in SYSCALL_DEFINE2()
3398 return -ENOMEM; in SYSCALL_DEFINE2()
3404 INIT_WORK(&si->discard_work, swap_discard_work); in SYSCALL_DEFINE2()
3405 INIT_WORK(&si->reclaim_work, swap_reclaim_work); in SYSCALL_DEFINE2()
3420 si->swap_file = swap_file; in SYSCALL_DEFINE2()
3421 mapping = swap_file->f_mapping; in SYSCALL_DEFINE2()
3422 dentry = swap_file->f_path.dentry; in SYSCALL_DEFINE2()
3423 inode = mapping->host; in SYSCALL_DEFINE2()
3431 error = -ENOENT; in SYSCALL_DEFINE2()
3435 error = -EBUSY; in SYSCALL_DEFINE2()
3442 if (!mapping->a_ops->read_folio) { in SYSCALL_DEFINE2()
3443 error = -EINVAL; in SYSCALL_DEFINE2()
3455 error = -EINVAL; in SYSCALL_DEFINE2()
3462 error = -ENOMEM; in SYSCALL_DEFINE2()
3466 error = swap_cgroup_swapon(si->type, maxpages); in SYSCALL_DEFINE2()
3484 error = -ENOMEM; in SYSCALL_DEFINE2()
3488 if (si->bdev && bdev_stable_writes(si->bdev)) in SYSCALL_DEFINE2()
3489 si->flags |= SWP_STABLE_WRITES; in SYSCALL_DEFINE2()
3491 if (si->bdev && bdev_synchronous(si->bdev)) in SYSCALL_DEFINE2()
3492 si->flags |= SWP_SYNCHRONOUS_IO; in SYSCALL_DEFINE2()
3494 if (si->bdev && bdev_nonrot(si->bdev)) { in SYSCALL_DEFINE2()
3495 si->flags |= SWP_SOLIDSTATE; in SYSCALL_DEFINE2()
3509 si->bdev && bdev_max_discard_sectors(si->bdev)) { in SYSCALL_DEFINE2()
3516 si->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | in SYSCALL_DEFINE2()
3521 * either do single-time area discards only, or to just in SYSCALL_DEFINE2()
3522 * perform discards for released swap page-clusters. in SYSCALL_DEFINE2()
3523 * Now it's time to adjust the p->flags accordingly. in SYSCALL_DEFINE2()
3526 si->flags &= ~SWP_PAGE_DISCARD; in SYSCALL_DEFINE2()
3528 si->flags &= ~SWP_AREA_DISCARD; in SYSCALL_DEFINE2()
3530 /* issue a swapon-time discard if it's still required */ in SYSCALL_DEFINE2()
3531 if (si->flags & SWP_AREA_DISCARD) { in SYSCALL_DEFINE2()
3539 error = init_swap_address_space(si->type, maxpages); in SYSCALL_DEFINE2()
3543 error = zswap_swapon(si->type, maxpages); in SYSCALL_DEFINE2()
3551 inode->i_flags |= S_SWAPFILE; in SYSCALL_DEFINE2()
3554 inode->i_flags &= ~S_SWAPFILE; in SYSCALL_DEFINE2()
3559 prio = -1; in SYSCALL_DEFINE2()
3566 K(si->pages), name->name, si->prio, nr_extents, in SYSCALL_DEFINE2()
3568 (si->flags & SWP_SOLIDSTATE) ? "SS" : "", in SYSCALL_DEFINE2()
3569 (si->flags & SWP_DISCARDABLE) ? "D" : "", in SYSCALL_DEFINE2()
3570 (si->flags & SWP_AREA_DISCARD) ? "s" : "", in SYSCALL_DEFINE2()
3571 (si->flags & SWP_PAGE_DISCARD) ? "c" : ""); in SYSCALL_DEFINE2()
3580 zswap_swapoff(si->type); in SYSCALL_DEFINE2()
3582 exit_swap_address_space(si->type); in SYSCALL_DEFINE2()
3586 free_percpu(si->percpu_cluster); in SYSCALL_DEFINE2()
3587 si->percpu_cluster = NULL; in SYSCALL_DEFINE2()
3588 free_percpu(si->cluster_next_cpu); in SYSCALL_DEFINE2()
3589 si->cluster_next_cpu = NULL; in SYSCALL_DEFINE2()
3592 swap_cgroup_swapoff(si->type); in SYSCALL_DEFINE2()
3594 si->swap_file = NULL; in SYSCALL_DEFINE2()
3595 si->flags = 0; in SYSCALL_DEFINE2()
3625 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) in si_swapinfo()
3626 nr_to_be_unused += READ_ONCE(si->inuse_pages); in si_swapinfo()
3628 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; in si_swapinfo()
3629 val->totalswap = total_swap_pages + nr_to_be_unused; in si_swapinfo()
3637 * - success -> 0
3638 * - swp_entry is invalid -> EINVAL
3639 * - swp_entry is migration entry -> EINVAL
3640 * - swap-cache reference is requested but there is already one. -> EEXIST
3641 * - swap-cache reference is requested but the entry is not used. -> ENOENT
3642 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
3648 unsigned long offset; in __swap_duplicate() local
3655 offset = swp_offset(entry); in __swap_duplicate()
3656 VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); in __swap_duplicate()
3658 ci = lock_cluster_or_swap_info(si, offset); in __swap_duplicate()
3662 count = si->swap_map[offset + i]; in __swap_duplicate()
3666 * swap entry could be SWAP_MAP_BAD. Check here with lock held. in __swap_duplicate()
3669 err = -ENOENT; in __swap_duplicate()
3677 err = -ENOENT; in __swap_duplicate()
3680 err = -EEXIST; in __swap_duplicate()
3682 err = -EINVAL; in __swap_duplicate()
3690 count = si->swap_map[offset + i]; in __swap_duplicate()
3698 else if (swap_count_continued(si, offset + i, count)) in __swap_duplicate()
3705 err = -ENOMEM; in __swap_duplicate()
3709 WRITE_ONCE(si->swap_map[offset + i], count | has_cache); in __swap_duplicate()
3728 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
3730 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
3737 while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) in swap_duplicate()
3747 * -EEXIST means there is a swap cache.
3757 unsigned long offset = swp_offset(entry); in swapcache_clear() local
3759 cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE); in swapcache_clear()
3768 * out-of-line methods to avoid include hell.
3772 return swp_swap_info(folio->swap)->swap_file->f_mapping; in swapcache_mapping()
3778 return swap_cache_index(folio->swap); in __folio_swap_cache_index()
3783 * add_swap_count_continuation - called when a swap count is duplicated
3804 pgoff_t offset; in add_swap_count_continuation() local
3822 spin_lock(&si->lock); in add_swap_count_continuation()
3824 offset = swp_offset(entry); in add_swap_count_continuation()
3826 ci = lock_cluster(si, offset); in add_swap_count_continuation()
3828 count = swap_count(si->swap_map[offset]); in add_swap_count_continuation()
3834 * over-provisioning. in add_swap_count_continuation()
3840 ret = -ENOMEM; in add_swap_count_continuation()
3844 head = vmalloc_to_page(si->swap_map + offset); in add_swap_count_continuation()
3845 offset &= ~PAGE_MASK; in add_swap_count_continuation()
3847 spin_lock(&si->cont_lock); in add_swap_count_continuation()
3854 INIT_LIST_HEAD(&head->lru); in add_swap_count_continuation()
3856 si->flags |= SWP_CONTINUED; in add_swap_count_continuation()
3859 list_for_each_entry(list_page, &head->lru, lru) { in add_swap_count_continuation()
3869 map = kmap_local_page(list_page) + offset; in add_swap_count_continuation()
3881 list_add_tail(&page->lru, &head->lru); in add_swap_count_continuation()
3884 spin_unlock(&si->cont_lock); in add_swap_count_continuation()
3887 spin_unlock(&si->lock); in add_swap_count_continuation()
3896 * swap_count_continued - when the original swap_map count is incremented
3902 * lock.
3905 pgoff_t offset, unsigned char count) in swap_count_continued() argument
3912 head = vmalloc_to_page(si->swap_map + offset); in swap_count_continued()
3918 spin_lock(&si->cont_lock); in swap_count_continued()
3919 offset &= ~PAGE_MASK; in swap_count_continued()
3921 map = kmap_local_page(page) + offset; in swap_count_continued()
3934 map = kmap_local_page(page) + offset; in swap_count_continued()
3943 map = kmap_local_page(page) + offset; in swap_count_continued()
3949 map = kmap_local_page(page) + offset; in swap_count_continued()
3964 map = kmap_local_page(page) + offset; in swap_count_continued()
3967 *map -= 1; in swap_count_continued()
3972 map = kmap_local_page(page) + offset; in swap_count_continued()
3980 spin_unlock(&si->cont_lock); in swap_count_continued()
3985 * free_swap_count_continuations - swapoff free all the continuation pages
3990 pgoff_t offset; in free_swap_count_continuations() local
3992 for (offset = 0; offset < si->max; offset += PAGE_SIZE) { in free_swap_count_continuations()
3994 head = vmalloc_to_page(si->swap_map + offset); in free_swap_count_continuations()
3998 list_for_each_entry_safe(page, next, &head->lru, lru) { in free_swap_count_continuations()
3999 list_del(&page->lru); in free_swap_count_continuations()
4023 * lock. in __folio_throttle_swaprate()
4025 if (current->throttle_disk) in __folio_throttle_swaprate()
4031 if (si->bdev) { in __folio_throttle_swaprate()
4032 blkcg_schedule_throttle(si->bdev->bd_disk, true); in __folio_throttle_swaprate()
4048 return -ENOMEM; in swapfile_init()