authorAndrew Morton <akpm@linux-foundation.org>2024-04-08 13:40:13 -0700
committerAndrew Morton <akpm@linux-foundation.org>2024-04-08 13:40:13 -0700
commitdc439d3fc6f3661f5f7cfd08a59d5507d079a340 (patch)
treeb6c6cbb4e39b94310d61214b8a65e5a8b77237be
parentec3a259c71e17cfcefa54f5647e906bf57c1cbef (diff)
foo
-rw-r--r--patches/mm-add-per-order-mthp-anon_swpout-and-anon_swpout_fallback-counters.patch6
-rw-r--r--patches/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.patch283
-rw-r--r--patches/mm-swap-allow-storage-of-all-mthp-orders.patch430
-rw-r--r--patches/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.patch408
-rw-r--r--patches/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.patch271
-rw-r--r--patches/mm-swap-simplify-struct-percpu_cluster.patch140
-rw-r--r--patches/mm-swap-update-get_swap_pages-to-take-folio-order.patch116
-rw-r--r--patches/mm-vmscan-avoid-split-during-shrink_folio_list.patch77
-rw-r--r--pc/devel-series7
-rw-r--r--pc/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.pc4
-rw-r--r--pc/mm-swap-allow-storage-of-all-mthp-orders.pc2
-rw-r--r--pc/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.pc6
-rw-r--r--pc/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.pc3
-rw-r--r--pc/mm-swap-simplify-struct-percpu_cluster.pc2
-rw-r--r--pc/mm-swap-update-get_swap_pages-to-take-folio-order.pc3
-rw-r--r--pc/mm-vmscan-avoid-split-during-shrink_folio_list.pc1
-rw-r--r--txt/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.txt41
-rw-r--r--txt/mm-set-pageblock_order-to-hpage_pmd_order-in-case-with-config_hugetlb_page-but-thp-enabled.txt1
-rw-r--r--txt/mm-swap-allow-storage-of-all-mthp-orders.txt60
-rw-r--r--txt/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.txt41
-rw-r--r--txt/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.txt107
-rw-r--r--txt/mm-swap-simplify-struct-percpu_cluster.txt41
-rw-r--r--txt/mm-swap-update-get_swap_pages-to-take-folio-order.txt24
-rw-r--r--txt/mm-userfaultfd-allow-hugetlb-change-protection-upon-poison-entry.txt4
-rw-r--r--txt/mm-vmscan-avoid-split-during-shrink_folio_list.txt32
25 files changed, 2105 insertions, 5 deletions
diff --git a/patches/mm-add-per-order-mthp-anon_swpout-and-anon_swpout_fallback-counters.patch b/patches/mm-add-per-order-mthp-anon_swpout-and-anon_swpout_fallback-counters.patch
index 85dbca063..b09222878 100644
--- a/patches/mm-add-per-order-mthp-anon_swpout-and-anon_swpout_fallback-counters.patch
+++ b/patches/mm-add-per-order-mthp-anon_swpout-and-anon_swpout_fallback-counters.patch
@@ -80,9 +80,9 @@ Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
--- a/mm/vmscan.c~mm-add-per-order-mthp-anon_swpout-and-anon_swpout_fallback-counters
+++ a/mm/vmscan.c
-@@ -1230,6 +1230,9 @@ retry:
- count_vm_event(
- THP_SWPOUT_FALLBACK);
+@@ -1225,6 +1225,9 @@ retry:
+ THP_SWPOUT_FALLBACK, 1);
+ count_vm_event(THP_SWPOUT_FALLBACK);
}
+ if (nr_pages > 0)
+ count_mthp_stat(get_order(nr_pages * PAGE_SIZE),
diff --git a/patches/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.patch b/patches/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.patch
new file mode 100644
index 000000000..71c0e7571
--- /dev/null
+++ b/patches/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.patch
@@ -0,0 +1,283 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: madvise: avoid split during MADV_PAGEOUT and MADV_COLD
+Date: Mon, 8 Apr 2024 19:39:46 +0100
+
+Rework madvise_cold_or_pageout_pte_range() to avoid splitting any large
+folio that is fully and contiguously mapped in the pageout/cold vm range.
+This change means that large folios will be maintained all the way to swap
+storage. This both improves performance during swap-out, by eliding the
+cost of splitting the folio, and sets us up nicely for maintaining the
+large folio when it is swapped back in (to be covered in a separate
+series).
+
+Folios that are not fully mapped in the target range are still split, but
+note that the behavior is changed so that if the split fails for any reason
+(folio locked, shared, etc.) we now leave it as is, move to the next pte
+in the range and continue work on the following folios. Previously any
+failure of this sort would cause the entire operation to give up and no
+folios mapped at higher addresses were paged out or made cold. Given
+large folios are becoming more common, this old behavior would likely have
+led to wasted opportunities.
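+
+As a side note, the reworked scan now steps the PTE loop by a whole batch
+at a time instead of one entry per iteration. A minimal user-space sketch
+of that loop shape is below; it is illustrative only (not part of this
+patch) and uses made-up names and a toy "folio id" table:
+
+	#include <stdio.h>
+
+	#define PAGE_SIZE 4096UL
+
+	/* How many consecutive slots starting at i map the same folio? */
+	static int batch_len(const int *tbl, int i, int max)
+	{
+		int n = 1;
+
+		while (n < max && tbl[i + n] == tbl[i])
+			n++;
+		return n;
+	}
+
+	int main(void)
+	{
+		int tbl[] = { 1, 1, 1, 2, 3, 3 };	/* toy folio id per pte */
+		unsigned long addr = 0;
+		int i, nr, max = 6;
+
+		/* advance by the whole batch, like the reworked loop */
+		for (i = 0; i < max; i += nr, addr += nr * PAGE_SIZE) {
+			nr = batch_len(tbl, i, max - i);
+			printf("folio %d: %d pte(s) at %#lx\n", tbl[i], nr, addr);
+		}
+		return 0;
+	}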
+
+While we are at it, change the code that clears young from the ptes to use
+ptep_test_and_clear_young(), via the new mkold_ptes() batch helper
+function. This is more efficient than get_and_clear/modify/set, especially
+for contpte mappings on arm64, where the old approach would require
+unfolding/refolding and the new approach can be done in place.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-8-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: Barry Song <v-songbaohua@oppo.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ include/linux/pgtable.h | 30 +++++++++++++
+ mm/internal.h | 12 ++++-
+ mm/madvise.c | 87 +++++++++++++++++++++-----------------
+ mm/memory.c | 4 -
+ 4 files changed, 92 insertions(+), 41 deletions(-)
+
+--- a/include/linux/pgtable.h~mm-madvise-avoid-split-during-madv_pageout-and-madv_cold
++++ a/include/linux/pgtable.h
+@@ -361,6 +361,36 @@ static inline int ptep_test_and_clear_yo
+ }
+ #endif
+
++#ifndef mkold_ptes
++/**
++ * mkold_ptes - Mark PTEs that map consecutive pages of the same folio as old.
++ * @vma: VMA the pages are mapped into.
++ * @addr: Address the first page is mapped at.
++ * @ptep: Page table pointer for the first entry.
++ * @nr: Number of entries to mark old.
++ *
++ * May be overridden by the architecture; otherwise, implemented as a simple
++ * loop over ptep_test_and_clear_young().
++ *
++ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
++ * some PTEs might be write-protected.
++ *
++ * Context: The caller holds the page table lock. The PTEs map consecutive
++ * pages that belong to the same folio. The PTEs are all in the same PMD.
++ */
++static inline void mkold_ptes(struct vm_area_struct *vma, unsigned long addr,
++ pte_t *ptep, unsigned int nr)
++{
++ for (;;) {
++ ptep_test_and_clear_young(vma, addr, ptep);
++ if (--nr == 0)
++ break;
++ ptep++;
++ addr += PAGE_SIZE;
++ }
++}
++#endif
++
+ #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
+ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+--- a/mm/internal.h~mm-madvise-avoid-split-during-madv_pageout-and-madv_cold
++++ a/mm/internal.h
+@@ -130,6 +130,8 @@ static inline pte_t __pte_batch_clear_ig
+ * @flags: Flags to modify the PTE batch semantics.
+ * @any_writable: Optional pointer to indicate whether any entry except the
+ * first one is writable.
++ * @any_young: Optional pointer to indicate whether any entry except the
++ * first one is young.
+ *
+ * Detect a PTE batch: consecutive (present) PTEs that map consecutive
+ * pages of the same large folio.
+@@ -145,16 +147,18 @@ static inline pte_t __pte_batch_clear_ig
+ */
+ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
+ pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
+- bool *any_writable)
++ bool *any_writable, bool *any_young)
+ {
+ unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
+ const pte_t *end_ptep = start_ptep + max_nr;
+ pte_t expected_pte, *ptep;
+- bool writable;
++ bool writable, young;
+ int nr;
+
+ if (any_writable)
+ *any_writable = false;
++ if (any_young)
++ *any_young = false;
+
+ VM_WARN_ON_FOLIO(!pte_present(pte), folio);
+ VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
+@@ -168,6 +172,8 @@ static inline int folio_pte_batch(struct
+ pte = ptep_get(ptep);
+ if (any_writable)
+ writable = !!pte_write(pte);
++ if (any_young)
++ young = !!pte_young(pte);
+ pte = __pte_batch_clear_ignored(pte, flags);
+
+ if (!pte_same(pte, expected_pte))
+@@ -183,6 +189,8 @@ static inline int folio_pte_batch(struct
+
+ if (any_writable)
+ *any_writable |= writable;
++ if (any_young)
++ *any_young |= young;
+
+ nr = pte_batch_hint(ptep, pte);
+ expected_pte = pte_advance_pfn(expected_pte, nr);
+--- a/mm/madvise.c~mm-madvise-avoid-split-during-madv_pageout-and-madv_cold
++++ a/mm/madvise.c
+@@ -336,6 +336,7 @@ static int madvise_cold_or_pageout_pte_r
+ LIST_HEAD(folio_list);
+ bool pageout_anon_only_filter;
+ unsigned int batch_count = 0;
++ int nr;
+
+ if (fatal_signal_pending(current))
+ return -EINTR;
+@@ -423,7 +424,8 @@ restart:
+ return 0;
+ flush_tlb_batched_pending(mm);
+ arch_enter_lazy_mmu_mode();
+- for (; addr < end; pte++, addr += PAGE_SIZE) {
++ for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
++ nr = 1;
+ ptent = ptep_get(pte);
+
+ if (++batch_count == SWAP_CLUSTER_MAX) {
+@@ -447,55 +449,66 @@ restart:
+ continue;
+
+ /*
+- * Creating a THP page is expensive so split it only if we
+- * are sure it's worth. Split it if we are only owner.
++ * If we encounter a large folio, only split it if it is not
++ * fully mapped within the range we are operating on. Otherwise
++ * leave it as is so that it can be swapped out whole. If we
++ * fail to split a folio, leave it in place and advance to the
++ * next pte in the range.
+ */
+ if (folio_test_large(folio)) {
+- int err;
+-
+- if (folio_likely_mapped_shared(folio))
+- break;
+- if (pageout_anon_only_filter && !folio_test_anon(folio))
+- break;
+- if (!folio_trylock(folio))
+- break;
+- folio_get(folio);
+- arch_leave_lazy_mmu_mode();
+- pte_unmap_unlock(start_pte, ptl);
+- start_pte = NULL;
+- err = split_folio(folio);
+- folio_unlock(folio);
+- folio_put(folio);
+- if (err)
+- break;
+- start_pte = pte =
+- pte_offset_map_lock(mm, pmd, addr, &ptl);
+- if (!start_pte)
+- break;
+- arch_enter_lazy_mmu_mode();
+- pte--;
+- addr -= PAGE_SIZE;
+- continue;
++ const fpb_t fpb_flags = FPB_IGNORE_DIRTY |
++ FPB_IGNORE_SOFT_DIRTY;
++ int max_nr = (end - addr) / PAGE_SIZE;
++ bool any_young;
++
++ nr = folio_pte_batch(folio, addr, pte, ptent, max_nr,
++ fpb_flags, NULL, &any_young);
++ if (any_young)
++ ptent = pte_mkyoung(ptent);
++
++ if (nr < folio_nr_pages(folio)) {
++ int err;
++
++ if (folio_likely_mapped_shared(folio))
++ continue;
++ if (pageout_anon_only_filter && !folio_test_anon(folio))
++ continue;
++ if (!folio_trylock(folio))
++ continue;
++ folio_get(folio);
++ arch_leave_lazy_mmu_mode();
++ pte_unmap_unlock(start_pte, ptl);
++ start_pte = NULL;
++ err = split_folio(folio);
++ folio_unlock(folio);
++ folio_put(folio);
++ start_pte = pte =
++ pte_offset_map_lock(mm, pmd, addr, &ptl);
++ if (!start_pte)
++ break;
++ arch_enter_lazy_mmu_mode();
++ if (!err)
++ nr = 0;
++ continue;
++ }
+ }
+
+ /*
+ * Do not interfere with other mappings of this folio and
+- * non-LRU folio.
++ * non-LRU folio. If we have a large folio at this point, we
++ * know it is fully mapped so if its mapcount is the same as its
++ * number of pages, it must be exclusive.
+ */
+- if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
++ if (!folio_test_lru(folio) ||
++ folio_mapcount(folio) != folio_nr_pages(folio))
+ continue;
+
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
+ continue;
+
+- VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
+-
+ if (!pageout && pte_young(ptent)) {
+- ptent = ptep_get_and_clear_full(mm, addr, pte,
+- tlb->fullmm);
+- ptent = pte_mkold(ptent);
+- set_pte_at(mm, addr, pte, ptent);
+- tlb_remove_tlb_entry(tlb, pte, addr);
++ mkold_ptes(vma, addr, pte, nr);
++ tlb_remove_tlb_entries(tlb, pte, nr, addr);
+ }
+
+ /*
+--- a/mm/memory.c~mm-madvise-avoid-split-during-madv_pageout-and-madv_cold
++++ a/mm/memory.c
+@@ -989,7 +989,7 @@ copy_present_ptes(struct vm_area_struct
+ flags |= FPB_IGNORE_SOFT_DIRTY;
+
+ nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
+- &any_writable);
++ &any_writable, NULL);
+ folio_ref_add(folio, nr);
+ if (folio_test_anon(folio)) {
+ if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
+@@ -1559,7 +1559,7 @@ static inline int zap_present_ptes(struc
+ */
+ if (unlikely(folio_test_large(folio) && max_nr != 1)) {
+ nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
+- NULL);
++ NULL, NULL);
+
+ zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
+ addr, details, rss, force_flush,
+_
diff --git a/patches/mm-swap-allow-storage-of-all-mthp-orders.patch b/patches/mm-swap-allow-storage-of-all-mthp-orders.patch
new file mode 100644
index 000000000..bd5e5808a
--- /dev/null
+++ b/patches/mm-swap-allow-storage-of-all-mthp-orders.patch
@@ -0,0 +1,430 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: allow storage of all mTHP orders
+Date: Mon, 8 Apr 2024 19:39:44 +0100
+
+Multi-size THP enables performance improvements by allocating large,
+pte-mapped folios for anonymous memory. However, I've observed that on an
+arm64 system running a parallel workload (e.g. kernel compilation) across
+many cores, under high memory pressure, the speed regresses. This is due
+to bottlenecking on the increased number of TLBIs required by all the
+extra folio splitting when the large folios are swapped out.
+
+Therefore, solve this regression by adding support for swapping out mTHP
+without needing to split the folio, just like is already done for
+PMD-sized THP. This change only applies when CONFIG_THP_SWAP is enabled,
+and when the swap backing store is a non-rotating block device. These are
+the same constraints as for the existing PMD-sized THP swap-out support.
+
+Note that no attempt is made to swap-in (m)THP here - this is still done
+page-by-page, like for PMD-sized THP. But swapping-out mTHP is a
+prerequisite for swapping-in mTHP.
+
+The main change here is to improve the swap entry allocator so that it can
+allocate any power-of-2 number of contiguous entries between [1, (1 <<
+PMD_ORDER)]. This is done by allocating a cluster for each distinct order
+and allocating sequentially from it until the cluster is full. This
+ensures that we don't need to search the map and we get no fragmentation
+due to alignment padding for different orders in the cluster. If there is
+no current cluster for a given order, we attempt to allocate a free
+cluster from the list. If there are no free clusters, we fail the
+allocation and the caller can fall back to splitting the folio and
+allocating individual entries (as per the existing PMD-sized THP fallback).
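+
+To illustrate the "allocate sequentially within a naturally aligned
+cluster" idea, here is a minimal user-space sketch; it is illustrative
+only (not part of this patch). SWAPFILE_CLUSTER is 512 for 4K pages with
+a 2M PMD, and the real code works on si->swap_map under the cluster lock:
+
+	#include <stdio.h>
+
+	#define SWAPFILE_CLUSTER 512
+
+	/*
+	 * Find the first naturally aligned run of (1 << order) free
+	 * entries inside one cluster, stepping a whole run at a time.
+	 * Returns the run's offset, or -1 if the cluster has no room
+	 * left for this order.
+	 */
+	static int find_run(const char *map, int base, int order)
+	{
+		int nr = 1 << order, off, i;
+
+		for (off = base; off < base + SWAPFILE_CLUSTER; off += nr) {
+			for (i = 0; i < nr; i++)
+				if (map[off + i])
+					break;
+			if (i == nr)
+				return off;
+		}
+		return -1;
+	}
+
+	int main(void)
+	{
+		char map[SWAPFILE_CLUSTER] = { 0 };
+
+		map[0] = 1;	/* one order-0 entry already in use */
+		printf("%d\n", find_run(map, 0, 4));	/* 16 */
+		return 0;
+	}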
+
+The per-order current clusters are maintained per-cpu using the existing
+infrastructure. This is done to avoid interleaving pages from different
+tasks, which would prevent IO from being batched. This is already done for
+the order-0 allocations so we follow the same pattern.
+
+As is done for order-0 per-cpu clusters, the scanner can now steal order-0
+entries from any per-cpu-per-order reserved cluster. This ensures that
+when the swap file is getting full, space doesn't get tied up in the
+per-cpu reserves.
+
+This change only modifies swap to be able to accept any order mTHP. It
+doesn't yet change the callers to elide the actual split. That will be
+done in separate changes.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-6-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ include/linux/swap.h | 8 +-
+ mm/swapfile.c | 162 +++++++++++++++++++++++------------------
+ 2 files changed, 98 insertions(+), 72 deletions(-)
+
+--- a/include/linux/swap.h~mm-swap-allow-storage-of-all-mthp-orders
++++ a/include/linux/swap.h
+@@ -268,13 +268,19 @@ struct swap_cluster_info {
+ */
+ #define SWAP_NEXT_INVALID 0
+
++#ifdef CONFIG_THP_SWAP
++#define SWAP_NR_ORDERS (PMD_ORDER + 1)
++#else
++#define SWAP_NR_ORDERS 1
++#endif
++
+ /*
+ * We assign a cluster to each CPU, so each CPU can allocate swap entry from
+ * its own cluster and swapout sequentially. The purpose is to optimize swapout
+ * throughput.
+ */
+ struct percpu_cluster {
+- unsigned int next; /* Likely next allocation offset */
++ unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
+ };
+
+ struct swap_cluster_list {
+--- a/mm/swapfile.c~mm-swap-allow-storage-of-all-mthp-orders
++++ a/mm/swapfile.c
+@@ -551,10 +551,12 @@ static void free_cluster(struct swap_inf
+
+ /*
+ * The cluster corresponding to page_nr will be used. The cluster will be
+- * removed from free cluster list and its usage counter will be increased.
++ * removed from free cluster list and its usage counter will be increased by
++ * count.
+ */
+-static void inc_cluster_info_page(struct swap_info_struct *p,
+- struct swap_cluster_info *cluster_info, unsigned long page_nr)
++static void add_cluster_info_page(struct swap_info_struct *p,
++ struct swap_cluster_info *cluster_info, unsigned long page_nr,
++ unsigned long count)
+ {
+ unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+
+@@ -563,9 +565,19 @@ static void inc_cluster_info_page(struct
+ if (cluster_is_free(&cluster_info[idx]))
+ alloc_cluster(p, idx);
+
+- VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
++ VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER);
+ cluster_set_count(&cluster_info[idx],
+- cluster_count(&cluster_info[idx]) + 1);
++ cluster_count(&cluster_info[idx]) + count);
++}
++
++/*
++ * The cluster corresponding to page_nr will be used. The cluster will be
++ * removed from free cluster list and its usage counter will be increased by 1.
++ */
++static void inc_cluster_info_page(struct swap_info_struct *p,
++ struct swap_cluster_info *cluster_info, unsigned long page_nr)
++{
++ add_cluster_info_page(p, cluster_info, page_nr, 1);
+ }
+
+ /*
+@@ -595,7 +607,7 @@ static void dec_cluster_info_page(struct
+ */
+ static bool
+ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
+- unsigned long offset)
++ unsigned long offset, int order)
+ {
+ struct percpu_cluster *percpu_cluster;
+ bool conflict;
+@@ -609,24 +621,39 @@ scan_swap_map_ssd_cluster_conflict(struc
+ return false;
+
+ percpu_cluster = this_cpu_ptr(si->percpu_cluster);
+- percpu_cluster->next = SWAP_NEXT_INVALID;
++ percpu_cluster->next[order] = SWAP_NEXT_INVALID;
++ return true;
++}
++
++static inline bool swap_range_empty(char *swap_map, unsigned int start,
++ unsigned int nr_pages)
++{
++ unsigned int i;
++
++ for (i = 0; i < nr_pages; i++) {
++ if (swap_map[start + i])
++ return false;
++ }
++
+ return true;
+ }
+
+ /*
+- * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
+- * might involve allocating a new cluster for current CPU too.
++ * Try to get swap entries with specified order from current cpu's swap entry
++ * pool (a cluster). This might involve allocating a new cluster for current CPU
++ * too.
+ */
+ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
+- unsigned long *offset, unsigned long *scan_base)
++ unsigned long *offset, unsigned long *scan_base, int order)
+ {
++ unsigned int nr_pages = 1 << order;
+ struct percpu_cluster *cluster;
+ struct swap_cluster_info *ci;
+ unsigned int tmp, max;
+
+ new_cluster:
+ cluster = this_cpu_ptr(si->percpu_cluster);
+- tmp = cluster->next;
++ tmp = cluster->next[order];
+ if (tmp == SWAP_NEXT_INVALID) {
+ if (!cluster_list_empty(&si->free_clusters)) {
+ tmp = cluster_next(&si->free_clusters.head) *
+@@ -647,26 +674,27 @@ new_cluster:
+
+ /*
+ * Other CPUs can use our cluster if they can't find a free cluster,
+- * check if there is still free entry in the cluster
++ * check if there is still free entry in the cluster, maintaining
++ * natural alignment.
+ */
+ max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER));
+ if (tmp < max) {
+ ci = lock_cluster(si, tmp);
+ while (tmp < max) {
+- if (!si->swap_map[tmp])
++ if (swap_range_empty(si->swap_map, tmp, nr_pages))
+ break;
+- tmp++;
++ tmp += nr_pages;
+ }
+ unlock_cluster(ci);
+ }
+ if (tmp >= max) {
+- cluster->next = SWAP_NEXT_INVALID;
++ cluster->next[order] = SWAP_NEXT_INVALID;
+ goto new_cluster;
+ }
+ *offset = tmp;
+ *scan_base = tmp;
+- tmp += 1;
+- cluster->next = tmp < max ? tmp : SWAP_NEXT_INVALID;
++ tmp += nr_pages;
++ cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID;
+ return true;
+ }
+
+@@ -796,13 +824,14 @@ static bool swap_offset_available_and_lo
+
+ static int scan_swap_map_slots(struct swap_info_struct *si,
+ unsigned char usage, int nr,
+- swp_entry_t slots[])
++ swp_entry_t slots[], int order)
+ {
+ struct swap_cluster_info *ci;
+ unsigned long offset;
+ unsigned long scan_base;
+ unsigned long last_in_cluster = 0;
+ int latency_ration = LATENCY_LIMIT;
++ unsigned int nr_pages = 1 << order;
+ int n_ret = 0;
+ bool scanned_many = false;
+
+@@ -817,6 +846,25 @@ static int scan_swap_map_slots(struct sw
+ * And we let swap pages go all over an SSD partition. Hugh
+ */
+
++ if (order > 0) {
++ /*
++ * Should not even be attempting large allocations when huge
++ * page swap is disabled. Warn and fail the allocation.
++ */
++ if (!IS_ENABLED(CONFIG_THP_SWAP) ||
++ nr_pages > SWAPFILE_CLUSTER) {
++ VM_WARN_ON_ONCE(1);
++ return 0;
++ }
++
++ /*
++ * Swapfile is not block device or not using clusters so unable
++ * to allocate large entries.
++ */
++ if (!(si->flags & SWP_BLKDEV) || !si->cluster_info)
++ return 0;
++ }
++
+ si->flags += SWP_SCANNING;
+ /*
+ * Use percpu scan base for SSD to reduce lock contention on
+@@ -831,8 +879,11 @@ static int scan_swap_map_slots(struct sw
+
+ /* SSD algorithm */
+ if (si->cluster_info) {
+- if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
++ if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) {
++ if (order > 0)
++ goto no_page;
+ goto scan;
++ }
+ } else if (unlikely(!si->cluster_nr--)) {
+ if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
+ si->cluster_nr = SWAPFILE_CLUSTER - 1;
+@@ -874,13 +925,16 @@ static int scan_swap_map_slots(struct sw
+
+ checks:
+ if (si->cluster_info) {
+- while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
++ while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) {
+ /* take a break if we already got some slots */
+ if (n_ret)
+ goto done;
+ if (!scan_swap_map_try_ssd_cluster(si, &offset,
+- &scan_base))
++ &scan_base, order)) {
++ if (order > 0)
++ goto no_page;
+ goto scan;
++ }
+ }
+ }
+ if (!(si->flags & SWP_WRITEOK))
+@@ -911,11 +965,11 @@ checks:
+ else
+ goto done;
+ }
+- WRITE_ONCE(si->swap_map[offset], usage);
+- inc_cluster_info_page(si, si->cluster_info, offset);
++ memset(si->swap_map + offset, usage, nr_pages);
++ add_cluster_info_page(si, si->cluster_info, offset, nr_pages);
+ unlock_cluster(ci);
+
+- swap_range_alloc(si, offset, 1);
++ swap_range_alloc(si, offset, nr_pages);
+ slots[n_ret++] = swp_entry(si->type, offset);
+
+ /* got enough slots or reach max slots? */
+@@ -936,8 +990,10 @@ checks:
+
+ /* try to get more slots in cluster */
+ if (si->cluster_info) {
+- if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
++ if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order))
+ goto checks;
++ if (order > 0)
++ goto done;
+ } else if (si->cluster_nr && !si->swap_map[++offset]) {
+ /* non-ssd case, still more slots in cluster? */
+ --si->cluster_nr;
+@@ -964,11 +1020,13 @@ checks:
+ }
+
+ done:
+- set_cluster_next(si, offset + 1);
++ if (order == 0)
++ set_cluster_next(si, offset + 1);
+ si->flags -= SWP_SCANNING;
+ return n_ret;
+
+ scan:
++ VM_WARN_ON(order > 0);
+ spin_unlock(&si->lock);
+ while (++offset <= READ_ONCE(si->highest_bit)) {
+ if (unlikely(--latency_ration < 0)) {
+@@ -997,38 +1055,6 @@ no_page:
+ return n_ret;
+ }
+
+-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
+-{
+- unsigned long idx;
+- struct swap_cluster_info *ci;
+- unsigned long offset;
+-
+- /*
+- * Should not even be attempting cluster allocations when huge
+- * page swap is disabled. Warn and fail the allocation.
+- */
+- if (!IS_ENABLED(CONFIG_THP_SWAP)) {
+- VM_WARN_ON_ONCE(1);
+- return 0;
+- }
+-
+- if (cluster_list_empty(&si->free_clusters))
+- return 0;
+-
+- idx = cluster_list_first(&si->free_clusters);
+- offset = idx * SWAPFILE_CLUSTER;
+- ci = lock_cluster(si, offset);
+- alloc_cluster(si, idx);
+- cluster_set_count(ci, SWAPFILE_CLUSTER);
+-
+- memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER);
+- unlock_cluster(ci);
+- swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
+- *slot = swp_entry(si->type, offset);
+-
+- return 1;
+-}
+-
+ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
+ {
+ unsigned long offset = idx * SWAPFILE_CLUSTER;
+@@ -1051,9 +1077,6 @@ int get_swap_pages(int n_goal, swp_entry
+ int n_ret = 0;
+ int node;
+
+- /* Only single cluster request supported */
+- WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
+-
+ spin_lock(&swap_avail_lock);
+
+ avail_pgs = atomic_long_read(&nr_swap_pages) / size;
+@@ -1089,14 +1112,10 @@ start_over:
+ spin_unlock(&si->lock);
+ goto nextsi;
+ }
+- if (size == SWAPFILE_CLUSTER) {
+- if (si->flags & SWP_BLKDEV)
+- n_ret = swap_alloc_cluster(si, swp_entries);
+- } else
+- n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+- n_goal, swp_entries);
++ n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
++ n_goal, swp_entries, order);
+ spin_unlock(&si->lock);
+- if (n_ret || size == SWAPFILE_CLUSTER)
++ if (n_ret || size > 1)
+ goto check_out;
+ cond_resched();
+
+@@ -1673,7 +1692,7 @@ swp_entry_t get_swap_page_of_type(int ty
+
+ /* This is called for allocating swap entry, not cache */
+ spin_lock(&si->lock);
+- if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry))
++ if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
+ atomic_long_dec(&nr_swap_pages);
+ spin_unlock(&si->lock);
+ fail:
+@@ -3127,7 +3146,7 @@ SYSCALL_DEFINE2(swapon, const char __use
+ p->flags |= SWP_SYNCHRONOUS_IO;
+
+ if (p->bdev && bdev_nonrot(p->bdev)) {
+- int cpu;
++ int cpu, i;
+ unsigned long ci, nr_cluster;
+
+ p->flags |= SWP_SOLIDSTATE;
+@@ -3165,7 +3184,8 @@ SYSCALL_DEFINE2(swapon, const char __use
+ struct percpu_cluster *cluster;
+
+ cluster = per_cpu_ptr(p->percpu_cluster, cpu);
+- cluster->next = SWAP_NEXT_INVALID;
++ for (i = 0; i < SWAP_NR_ORDERS; i++)
++ cluster->next[i] = SWAP_NEXT_INVALID;
+ }
+ } else {
+ atomic_inc(&nr_rotate_swap);
+_
diff --git a/patches/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.patch b/patches/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.patch
new file mode 100644
index 000000000..655016bb0
--- /dev/null
+++ b/patches/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.patch
@@ -0,0 +1,408 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: free_swap_and_cache_nr() as batched free_swap_and_cache()
+Date: Mon, 8 Apr 2024 19:39:41 +0100
+
+Now that we no longer have a convenient flag in the cluster to determine
+if a folio is large, free_swap_and_cache() will take a reference and lock
+a large folio much more often, which could lead to contention and (e.g.)
+failure to split large folios, etc.
+
+Let's solve that problem by batch freeing swap and cache with a new
+function, free_swap_and_cache_nr(), to free a contiguous range of swap
+entries together. This allows us to first drop a reference to each swap
+slot before we try to release the cache folio. This means we only try to
+release the folio once, only taking the reference and lock once - much
+better than the previous 512 times for the 2M THP case.
+
+Contiguous swap entries are gathered in zap_pte_range() and
+madvise_free_pte_range() in a similar way to how present ptes are already
+gathered in zap_pte_range().
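+
+A minimal user-space sketch of the gathering step is below; it is
+illustrative only (not part of this patch) and reduces a swap pte to just
+its offset, whereas the real swap_pte_batch() also requires matching swap
+type and swp pte bits:
+
+	#include <stdio.h>
+
+	/*
+	 * Count how many consecutive slots hold swap entries with
+	 * consecutive offsets, starting at slot 0 and looking at no
+	 * more than max_nr slots.
+	 */
+	static int swap_batch(const unsigned long *offsets, int max_nr)
+	{
+		unsigned long expected = offsets[0] + 1;
+		int i;
+
+		for (i = 1; i < max_nr; i++) {
+			if (offsets[i] != expected)
+				break;
+			expected++;
+		}
+		return i;
+	}
+
+	int main(void)
+	{
+		unsigned long slots[] = { 100, 101, 102, 200 };
+
+		printf("%d\n", swap_batch(slots, 4));	/* 3 */
+		return 0;
+	}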
+
+While we are at it, let's simplify by converting the return type of both
+functions to void. The return value was used only by zap_pte_range() to
+print a bad pte, and was ignored by everyone else, so the extra reporting
+wasn't exactly guaranteed. We will still get the warning with most of the
+information from get_swap_device(). With the batch version, we wouldn't
+know which pte was bad anyway so could print the wrong one.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-3-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ include/linux/pgtable.h | 29 +++++++++++
+ include/linux/swap.h | 12 +++-
+ mm/internal.h | 63 ++++++++++++++++++++++++
+ mm/madvise.c | 12 +++-
+ mm/memory.c | 13 ++---
+ mm/swapfile.c | 97 ++++++++++++++++++++++++++++++--------
+ 6 files changed, 195 insertions(+), 31 deletions(-)
+
+--- a/include/linux/pgtable.h~mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache
++++ a/include/linux/pgtable.h
+@@ -708,6 +708,35 @@ static inline void pte_clear_not_present
+ }
+ #endif
+
++#ifndef clear_not_present_full_ptes
++/**
++ * clear_not_present_full_ptes - Clear multiple not present PTEs which are
++ * consecutive in the pgtable.
++ * @mm: Address space the ptes represent.
++ * @addr: Address of the first pte.
++ * @ptep: Page table pointer for the first entry.
++ * @nr: Number of entries to clear.
++ * @full: Whether we are clearing a full mm.
++ *
++ * May be overridden by the architecture; otherwise, implemented as a simple
++ * loop over pte_clear_not_present_full().
++ *
++ * Context: The caller holds the page table lock. The PTEs are all not present.
++ * The PTEs are all in the same PMD.
++ */
++static inline void clear_not_present_full_ptes(struct mm_struct *mm,
++ unsigned long addr, pte_t *ptep, unsigned int nr, int full)
++{
++ for (;;) {
++ pte_clear_not_present_full(mm, addr, ptep, full);
++ if (--nr == 0)
++ break;
++ ptep++;
++ addr += PAGE_SIZE;
++ }
++}
++#endif
++
+ #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
+ extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
+ unsigned long address,
+--- a/include/linux/swap.h~mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache
++++ a/include/linux/swap.h
+@@ -468,7 +468,7 @@ extern int swap_duplicate(swp_entry_t);
+ extern int swapcache_prepare(swp_entry_t);
+ extern void swap_free(swp_entry_t);
+ extern void swapcache_free_entries(swp_entry_t *entries, int n);
+-extern int free_swap_and_cache(swp_entry_t);
++extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
+ int swap_type_of(dev_t device, sector_t offset);
+ int find_first_swap(dev_t *device);
+ extern unsigned int count_swap_pages(int, int);
+@@ -517,8 +517,9 @@ static inline void put_swap_device(struc
+ #define free_pages_and_swap_cache(pages, nr) \
+ release_pages((pages), (nr));
+
+-/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
+-#define free_swap_and_cache(e) is_pfn_swap_entry(e)
++static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
++{
++}
+
+ static inline void free_swap_cache(struct folio *folio)
+ {
+@@ -586,6 +587,11 @@ static inline int add_swap_extent(struct
+ }
+ #endif /* CONFIG_SWAP */
+
++static inline void free_swap_and_cache(swp_entry_t entry)
++{
++ free_swap_and_cache_nr(entry, 1);
++}
++
+ #ifdef CONFIG_MEMCG
+ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
+ {
+--- a/mm/internal.h~mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache
++++ a/mm/internal.h
+@@ -11,6 +11,8 @@
+ #include <linux/mm.h>
+ #include <linux/pagemap.h>
+ #include <linux/rmap.h>
++#include <linux/swap.h>
++#include <linux/swapops.h>
+ #include <linux/tracepoint-defs.h>
+
+ struct folio_batch;
+@@ -189,6 +191,67 @@ static inline int folio_pte_batch(struct
+
+ return min(ptep - start_ptep, max_nr);
+ }
++
++/**
++ * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
++ * @pte: The initial pte state; is_swap_pte(pte) must be true.
++ *
++ * Increments the swap offset, while maintaining all other fields, including
++ * swap type, and any swp pte bits. The resulting pte is returned.
++ */
++static inline pte_t pte_next_swp_offset(pte_t pte)
++{
++ swp_entry_t entry = pte_to_swp_entry(pte);
++ pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
++ swp_offset(entry) + 1));
++
++ if (pte_swp_soft_dirty(pte))
++ new = pte_swp_mksoft_dirty(new);
++ if (pte_swp_exclusive(pte))
++ new = pte_swp_mkexclusive(new);
++ if (pte_swp_uffd_wp(pte))
++ new = pte_swp_mkuffd_wp(new);
++
++ return new;
++}
++
++/**
++ * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
++ * @start_ptep: Page table pointer for the first entry.
++ * @max_nr: The maximum number of table entries to consider.
++ * @pte: Page table entry for the first entry.
++ *
++ * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs
++ * containing swap entries all with consecutive offsets and targeting the same
++ * swap type, all with matching swp pte bits.
++ *
++ * max_nr must be at least one and must be limited by the caller so scanning
++ * cannot exceed a single page table.
++ *
++ * Return: the number of table entries in the batch.
++ */
++static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
++{
++ pte_t expected_pte = pte_next_swp_offset(pte);
++ const pte_t *end_ptep = start_ptep + max_nr;
++ pte_t *ptep = start_ptep + 1;
++
++ VM_WARN_ON(max_nr < 1);
++ VM_WARN_ON(!is_swap_pte(pte));
++ VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
++
++ while (ptep < end_ptep) {
++ pte = ptep_get(ptep);
++
++ if (!pte_same(pte, expected_pte))
++ break;
++
++ expected_pte = pte_next_swp_offset(expected_pte);
++ ptep++;
++ }
++
++ return ptep - start_ptep;
++}
+ #endif /* CONFIG_MMU */
+
+ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
+--- a/mm/madvise.c~mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache
++++ a/mm/madvise.c
+@@ -628,6 +628,7 @@ static int madvise_free_pte_range(pmd_t
+ struct folio *folio;
+ int nr_swap = 0;
+ unsigned long next;
++ int nr, max_nr;
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd))
+@@ -640,7 +641,8 @@ static int madvise_free_pte_range(pmd_t
+ return 0;
+ flush_tlb_batched_pending(mm);
+ arch_enter_lazy_mmu_mode();
+- for (; addr != end; pte++, addr += PAGE_SIZE) {
++ for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
++ nr = 1;
+ ptent = ptep_get(pte);
+
+ if (pte_none(ptent))
+@@ -655,9 +657,11 @@ static int madvise_free_pte_range(pmd_t
+
+ entry = pte_to_swp_entry(ptent);
+ if (!non_swap_entry(entry)) {
+- nr_swap--;
+- free_swap_and_cache(entry);
+- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
++ max_nr = (end - addr) / PAGE_SIZE;
++ nr = swap_pte_batch(pte, max_nr, ptent);
++ nr_swap -= nr;
++ free_swap_and_cache_nr(entry, nr);
++ clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+ } else if (is_hwpoison_entry(entry) ||
+ is_poisoned_swp_entry(entry)) {
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+--- a/mm/memory.c~mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache
++++ a/mm/memory.c
+@@ -1637,12 +1637,13 @@ static unsigned long zap_pte_range(struc
+ folio_remove_rmap_pte(folio, page, vma);
+ folio_put(folio);
+ } else if (!non_swap_entry(entry)) {
+- /* Genuine swap entry, hence a private anon page */
++ max_nr = (end - addr) / PAGE_SIZE;
++ nr = swap_pte_batch(pte, max_nr, ptent);
++ /* Genuine swap entries, hence a private anon pages */
+ if (!should_zap_cows(details))
+ continue;
+- rss[MM_SWAPENTS]--;
+- if (unlikely(!free_swap_and_cache(entry)))
+- print_bad_pte(vma, addr, ptent, NULL);
++ rss[MM_SWAPENTS] -= nr;
++ free_swap_and_cache_nr(entry, nr);
+ } else if (is_migration_entry(entry)) {
+ folio = pfn_swap_entry_folio(entry);
+ if (!should_zap_folio(details, folio))
+@@ -1665,8 +1666,8 @@ static unsigned long zap_pte_range(struc
+ pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
+ WARN_ON_ONCE(1);
+ }
+- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+- zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
++ clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
++ zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
+ } while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
+
+ add_mm_rss_vec(mm, rss);
+--- a/mm/swapfile.c~mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache
++++ a/mm/swapfile.c
+@@ -130,7 +130,11 @@ static inline unsigned char swap_count(u
+ /* Reclaim the swap entry if swap is getting full*/
+ #define TTRS_FULL 0x4
+
+-/* returns 1 if swap entry is freed */
++/*
++ * returns number of pages in the folio that backs the swap entry. If positive,
++ * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
++ * folio was associated with the swap entry.
++ */
+ static int __try_to_reclaim_swap(struct swap_info_struct *si,
+ unsigned long offset, unsigned long flags)
+ {
+@@ -155,6 +159,7 @@ static int __try_to_reclaim_swap(struct
+ ret = folio_free_swap(folio);
+ folio_unlock(folio);
+ }
++ ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio);
+ folio_put(folio);
+ return ret;
+ }
+@@ -895,7 +900,7 @@ checks:
+ swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
+ spin_lock(&si->lock);
+ /* entry was freed successfully, try to use this again */
+- if (swap_was_freed)
++ if (swap_was_freed > 0)
+ goto checks;
+ goto scan; /* check next one */
+ }
+@@ -1572,32 +1577,88 @@ bool folio_free_swap(struct folio *folio
+ return true;
+ }
+
+-/*
+- * Free the swap entry like above, but also try to
+- * free the page cache entry if it is the last user.
++/**
++ * free_swap_and_cache_nr() - Release reference on range of swap entries and
++ * reclaim their cache if no more references remain.
++ * @entry: First entry of range.
++ * @nr: Number of entries in range.
++ *
++ * For each swap entry in the contiguous range, release a reference. If any swap
++ * entries become free, try to reclaim their underlying folios, if present. The
++ * offset range is defined by [entry.offset, entry.offset + nr).
+ */
+-int free_swap_and_cache(swp_entry_t entry)
++void free_swap_and_cache_nr(swp_entry_t entry, int nr)
+ {
+- struct swap_info_struct *p;
++ const unsigned long start_offset = swp_offset(entry);
++ const unsigned long end_offset = start_offset + nr;
++ unsigned int type = swp_type(entry);
++ struct swap_info_struct *si;
++ bool any_only_cache = false;
++ unsigned long offset;
+ unsigned char count;
+
+ if (non_swap_entry(entry))
+- return 1;
++ return;
++
++ si = get_swap_device(entry);
++ if (!si)
++ return;
++
++ if (WARN_ON(end_offset > si->max))
++ goto out;
+
+- p = get_swap_device(entry);
+- if (p) {
+- if (WARN_ON(data_race(!p->swap_map[swp_offset(entry)]))) {
+- put_swap_device(p);
+- return 0;
++ /*
++ * First free all entries in the range.
++ */
++ for (offset = start_offset; offset < end_offset; offset++) {
++ if (data_race(si->swap_map[offset])) {
++ count = __swap_entry_free(si, swp_entry(type, offset));
++ if (count == SWAP_HAS_CACHE)
++ any_only_cache = true;
++ } else {
++ WARN_ON_ONCE(1);
+ }
++ }
++
++ /*
++ * Short-circuit the below loop if none of the entries had their
++ * reference drop to zero.
++ */
++ if (!any_only_cache)
++ goto out;
+
+- count = __swap_entry_free(p, entry);
+- if (count == SWAP_HAS_CACHE)
+- __try_to_reclaim_swap(p, swp_offset(entry),
++ /*
++ * Now go back over the range trying to reclaim the swap cache. This is
++ * more efficient for large folios because we will only try to reclaim
++ * the swap once per folio in the common case. If we do
++ * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
++ * latter will get a reference and lock the folio for every individual
++ * page but will only succeed once the swap slot for every subpage is
++ * zero.
++ */
++ for (offset = start_offset; offset < end_offset; offset += nr) {
++ nr = 1;
++ if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
++ /*
++ * Folios are always naturally aligned in swap so
++ * advance forward to the next boundary. Zero means no
++ * folio was found for the swap entry, so advance by 1
++ * in this case. Negative value means folio was found
++ * but could not be reclaimed. Here we can still advance
++ * to the next boundary.
++ */
++ nr = __try_to_reclaim_swap(si, offset,
+ TTRS_UNMAPPED | TTRS_FULL);
+- put_swap_device(p);
++ if (nr == 0)
++ nr = 1;
++ else if (nr < 0)
++ nr = -nr;
++ nr = ALIGN(offset + 1, nr) - offset;
++ }
+ }
+- return p != NULL;
++
++out:
++ put_swap_device(si);
+ }
+
+ #ifdef CONFIG_HIBERNATION
+_
diff --git a/patches/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.patch b/patches/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.patch
new file mode 100644
index 000000000..aeb4e608e
--- /dev/null
+++ b/patches/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.patch
@@ -0,0 +1,271 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: remove CLUSTER_FLAG_HUGE from swap_cluster_info:flags
+Date: Mon, 8 Apr 2024 19:39:40 +0100
+
+Patch series "Swap-out mTHP without splitting", v7.
+
+This series adds support for swapping out multi-size THP (mTHP) without
+needing to first split the large folio via
+split_huge_page_to_list_to_order(). It closely follows the approach
+already used to swap-out PMD-sized THP.
+
+There are a couple of reasons for swapping out mTHP without splitting:
+
+ - Performance: It is expensive to split a large folio and under
+ extreme memory pressure some workloads regressed performance when
+ using 64K mTHP vs 4K small folios because of this extra cost in the
+ swap-out path. This series not only eliminates the regression but
+ makes it faster to swap out 64K mTHP vs 4K small folios.
+
+ - Memory fragmentation avoidance: If we can avoid splitting a large
+ folio memory is less likely to become fragmented, making it easier to
+ re-allocate a large folio in future.
+
+ - Performance: Enables a separate series [7] to swap-in whole mTHPs,
+ which means we won't lose the TLB-efficiency benefits of mTHP once the
+ memory has been through a swap cycle.
+
+I've done what I thought was the smallest change possible, and as a
+result, this approach is only employed when the swap is backed by a
+non-rotating block device (just as PMD-sized THP is supported today).
+Discussion against the RFC concluded that this is sufficient.
+
+
+Performance Testing
+===================
+
+I've run some swap performance tests on Ampere Altra VM (arm64) with 8
+CPUs. The VM is set up with a 35G block ram device as the swap device and
+the test is run from inside a memcg limited to 40G memory. I've then run
+`usemem` from vm-scalability with 70 processes, each allocating and
+writing 1G of memory. I've repeated everything 6 times and taken the mean
+performance improvement relative to 4K page baseline:
+
+| alloc size | baseline | + this series |
+| | mm-unstable (~v6.9-rc1) | |
+|:-----------|------------------------:|------------------------:|
+| 4K Page | 0.0% | 1.3% |
+| 64K THP | -13.6% | 46.3% |
+| 2M THP | 91.4% | 89.6% |
+
+So with this change, the 64K swap performance goes from a 14% regression to a
+46% improvement. While 2M shows a small regression, I'm confident that this is
+just noise.
+
+[1] https://lore.kernel.org/linux-mm/20231010142111.3997780-1-ryan.roberts@arm.com/
+[2] https://lore.kernel.org/linux-mm/20231017161302.2518826-1-ryan.roberts@arm.com/
+[3] https://lore.kernel.org/linux-mm/20231025144546.577640-1-ryan.roberts@arm.com/
+[4] https://lore.kernel.org/linux-mm/20240311150058.1122862-1-ryan.roberts@arm.com/
+[5] https://lore.kernel.org/linux-mm/20240327144537.4165578-1-ryan.roberts@arm.com/
+[6] https://lore.kernel.org/linux-mm/20240403114032.1162100-1-ryan.roberts@arm.com/
+[7] https://lore.kernel.org/linux-mm/20240304081348.197341-1-21cnbao@gmail.com/
+[8] https://lore.kernel.org/linux-mm/CAGsJ_4yMOow27WDvN2q=E4HAtDd2PJ=OQ5Pj9DG+6FLWwNuXUw@mail.gmail.com/
+[9] https://lore.kernel.org/linux-mm/579d5127-c763-4001-9625-4563a9316ac3@redhat.com/
+
+
+This patch (of 7):
+
+As preparation for supporting small-sized THP in the swap-out path,
+without first needing to split to order-0, remove CLUSTER_FLAG_HUGE,
+which, when present, always implies PMD-sized THP, which is the same as
+the cluster size.
+
+The only use of the flag was to determine whether a swap entry refers to a
+single page or a PMD-sized THP in swap_page_trans_huge_swapped(). Instead
+of relying on the flag, we now pass in order, which originates from the
+folio's order. This allows the logic to work for folios of any order.
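+
+A minimal user-space sketch of the order-based check is below; it is
+illustrative only (not part of this patch), reduces swap counts to a plain
+byte map, and relies on folios being naturally aligned in swap:
+
+	#include <stdio.h>
+
+	#define round_down(x, y)	((x) & ~((y) - 1))	/* y is a power of 2 */
+
+	/* Is any entry of the folio containing 'roffset' still in use? */
+	static int any_entry_in_use(const unsigned char *map,
+				    unsigned long roffset, int order)
+	{
+		unsigned long nr_pages = 1UL << order;
+		unsigned long offset = round_down(roffset, nr_pages);
+		unsigned long i;
+
+		for (i = 0; i < nr_pages; i++)
+			if (map[offset + i])
+				return 1;
+		return 0;
+	}
+
+	int main(void)
+	{
+		unsigned char map[16] = { 0 };
+
+		map[5] = 1;
+		printf("%d %d\n",
+		       any_entry_in_use(map, 7, 2),	/* 1: checks 4..7 */
+		       any_entry_in_use(map, 11, 2));	/* 0: checks 8..11 */
+		return 0;
+	}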
+
+The one snag is that one of the swap_page_trans_huge_swapped() call sites
+does not have the folio. But it was only being called there to shortcut a
+call to __try_to_reclaim_swap() in some cases. __try_to_reclaim_swap() gets
+the folio and (via some other functions) calls
+swap_page_trans_huge_swapped(). So I've removed the problematic call site
+and believe the new logic should be functionally equivalent.
+
+That said, removing the fast path means that we will take a reference and
+trylock a large folio much more often, which we would like to avoid. The
+next patch will solve this.
+
+Removing CLUSTER_FLAG_HUGE also means we can remove split_swap_cluster()
+which used to be called during folio splitting, since
+split_swap_cluster()'s only job was to remove the flag.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-1-ryan.roberts@arm.com
+Link: https://lkml.kernel.org/r/20240408183946.2991168-2-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Acked-by: Chris Li <chrisl@kernel.org>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ include/linux/swap.h | 10 --------
+ mm/huge_memory.c | 3 --
+ mm/swapfile.c | 47 ++++++-----------------------------------
+ 3 files changed, 8 insertions(+), 52 deletions(-)
+
+--- a/include/linux/swap.h~mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags
++++ a/include/linux/swap.h
+@@ -259,7 +259,6 @@ struct swap_cluster_info {
+ };
+ #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
+ #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
+-#define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */
+
+ /*
+ * We assign a cluster to each CPU, so each CPU can allocate swap entry from
+@@ -587,15 +586,6 @@ static inline int add_swap_extent(struct
+ }
+ #endif /* CONFIG_SWAP */
+
+-#ifdef CONFIG_THP_SWAP
+-extern int split_swap_cluster(swp_entry_t entry);
+-#else
+-static inline int split_swap_cluster(swp_entry_t entry)
+-{
+- return 0;
+-}
+-#endif
+-
+ #ifdef CONFIG_MEMCG
+ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
+ {
+--- a/mm/huge_memory.c~mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags
++++ a/mm/huge_memory.c
+@@ -2844,9 +2844,6 @@ static void __split_huge_page(struct pag
+ shmem_uncharge(folio->mapping->host, nr_dropped);
+ remap_page(folio, nr);
+
+- if (folio_test_swapcache(folio))
+- split_swap_cluster(folio->swap);
+-
+ /*
+ * set page to its compound_head when split to non order-0 pages, so
+ * we can skip unlocking it below, since PG_locked is transferred to
+--- a/mm/swapfile.c~mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags
++++ a/mm/swapfile.c
+@@ -343,18 +343,6 @@ static inline void cluster_set_null(stru
+ info->data = 0;
+ }
+
+-static inline bool cluster_is_huge(struct swap_cluster_info *info)
+-{
+- if (IS_ENABLED(CONFIG_THP_SWAP))
+- return info->flags & CLUSTER_FLAG_HUGE;
+- return false;
+-}
+-
+-static inline void cluster_clear_huge(struct swap_cluster_info *info)
+-{
+- info->flags &= ~CLUSTER_FLAG_HUGE;
+-}
+-
+ static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
+ unsigned long offset)
+ {
+@@ -1027,7 +1015,7 @@ static int swap_alloc_cluster(struct swa
+ offset = idx * SWAPFILE_CLUSTER;
+ ci = lock_cluster(si, offset);
+ alloc_cluster(si, idx);
+- cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
++ cluster_set_count(ci, SWAPFILE_CLUSTER);
+
+ memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER);
+ unlock_cluster(ci);
+@@ -1365,7 +1353,6 @@ void put_swap_folio(struct folio *folio,
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ if (size == SWAPFILE_CLUSTER) {
+- VM_BUG_ON(!cluster_is_huge(ci));
+ map = si->swap_map + offset;
+ for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+ val = map[i];
+@@ -1373,7 +1360,6 @@ void put_swap_folio(struct folio *folio,
+ if (val == SWAP_HAS_CACHE)
+ free_entries++;
+ }
+- cluster_clear_huge(ci);
+ if (free_entries == SWAPFILE_CLUSTER) {
+ unlock_cluster_or_swap_info(si, ci);
+ spin_lock(&si->lock);
+@@ -1395,23 +1381,6 @@ void put_swap_folio(struct folio *folio,
+ unlock_cluster_or_swap_info(si, ci);
+ }
+
+-#ifdef CONFIG_THP_SWAP
+-int split_swap_cluster(swp_entry_t entry)
+-{
+- struct swap_info_struct *si;
+- struct swap_cluster_info *ci;
+- unsigned long offset = swp_offset(entry);
+-
+- si = _swap_info_get(entry);
+- if (!si)
+- return -EBUSY;
+- ci = lock_cluster(si, offset);
+- cluster_clear_huge(ci);
+- unlock_cluster(ci);
+- return 0;
+-}
+-#endif
+-
+ static int swp_entry_cmp(const void *ent1, const void *ent2)
+ {
+ const swp_entry_t *e1 = ent1, *e2 = ent2;
+@@ -1519,22 +1488,23 @@ out:
+ }
+
+ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
+- swp_entry_t entry)
++ swp_entry_t entry, int order)
+ {
+ struct swap_cluster_info *ci;
+ unsigned char *map = si->swap_map;
++ unsigned int nr_pages = 1 << order;
+ unsigned long roffset = swp_offset(entry);
+- unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
++ unsigned long offset = round_down(roffset, nr_pages);
+ int i;
+ bool ret = false;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+- if (!ci || !cluster_is_huge(ci)) {
++ if (!ci || nr_pages == 1) {
+ if (swap_count(map[roffset]))
+ ret = true;
+ goto unlock_out;
+ }
+- for (i = 0; i < SWAPFILE_CLUSTER; i++) {
++ for (i = 0; i < nr_pages; i++) {
+ if (swap_count(map[offset + i])) {
+ ret = true;
+ break;
+@@ -1556,7 +1526,7 @@ static bool folio_swapped(struct folio *
+ if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
+ return swap_swapcount(si, entry) != 0;
+
+- return swap_page_trans_huge_swapped(si, entry);
++ return swap_page_trans_huge_swapped(si, entry, folio_order(folio));
+ }
+
+ /**
+@@ -1622,8 +1592,7 @@ int free_swap_and_cache(swp_entry_t entr
+ }
+
+ count = __swap_entry_free(p, entry);
+- if (count == SWAP_HAS_CACHE &&
+- !swap_page_trans_huge_swapped(p, entry))
++ if (count == SWAP_HAS_CACHE)
+ __try_to_reclaim_swap(p, swp_offset(entry),
+ TTRS_UNMAPPED | TTRS_FULL);
+ put_swap_device(p);
+_
diff --git a/patches/mm-swap-simplify-struct-percpu_cluster.patch b/patches/mm-swap-simplify-struct-percpu_cluster.patch
new file mode 100644
index 000000000..f53ce5982
--- /dev/null
+++ b/patches/mm-swap-simplify-struct-percpu_cluster.patch
@@ -0,0 +1,140 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: simplify struct percpu_cluster
+Date: Mon, 8 Apr 2024 19:39:42 +0100
+
+struct percpu_cluster stores the index of the cpu's current cluster and the
+offset of the next entry that will be allocated for the cpu. These two
+pieces of information are redundant because the cluster index is just
+(offset / SWAPFILE_CLUSTER). The only reason for explicitly keeping the
+cluster index is because the structure used for it also has a flag to
+indicate "no cluster". However, this data structure also contains a spin
+lock, which is never used in this context; as a side effect, the code
+copies the spinlock_t structure, which is questionable coding practice in
+my view.
+
+So let's clean this up and store only the next offset, and use a sentinel
+value (SWAP_NEXT_INVALID) to indicate "no cluster". SWAP_NEXT_INVALID is
+chosen to be 0, because 0 will never be seen legitimately; the first page
+in the swap file is the swap header, which is always marked bad to prevent
+it from being allocated as an entry. This also prevents the cluster to
+which it belongs being marked free, so it will never appear on the free
+list.
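+
+A tiny user-space sketch of the redundancy being removed is below; it is
+illustrative only (not part of this patch). The cluster index is fully
+determined by the next-allocation offset, and offset 0 can double as the
+"no cluster" sentinel:
+
+	#include <assert.h>
+	#include <stdio.h>
+
+	#define SWAPFILE_CLUSTER	512	/* 4K pages, 2M PMD */
+	#define SWAP_NEXT_INVALID	0	/* swap header, never allocated */
+
+	static unsigned int cluster_idx(unsigned int next)
+	{
+		assert(next != SWAP_NEXT_INVALID);
+		return next / SWAPFILE_CLUSTER;
+	}
+
+	int main(void)
+	{
+		printf("%u %u\n", cluster_idx(513), cluster_idx(1024));	/* 1 2 */
+		return 0;
+	}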
+
+This change saves 16 bytes per cpu. And given we are shortly going to
+extend this mechanism to be per-cpu-AND-per-order, we will end up saving
+16 * 9 = 144 bytes per cpu, which adds up if you have 256 cpus in the
+system.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-4-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ include/linux/swap.h | 9 ++++++++-
+ mm/swapfile.c | 22 +++++++++++-----------
+ 2 files changed, 19 insertions(+), 12 deletions(-)
+
+--- a/include/linux/swap.h~mm-swap-simplify-struct-percpu_cluster
++++ a/include/linux/swap.h
+@@ -261,12 +261,19 @@ struct swap_cluster_info {
+ #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
+
+ /*
++ * The first page in the swap file is the swap header, which is always marked
++ * bad to prevent it from being allocated as an entry. This also prevents the
++ * cluster to which it belongs being marked free. Therefore 0 is safe to use as
++ * a sentinel to indicate next is not valid in percpu_cluster.
++ */
++#define SWAP_NEXT_INVALID 0
++
++/*
+ * We assign a cluster to each CPU, so each CPU can allocate swap entry from
+ * its own cluster and swapout sequentially. The purpose is to optimize swapout
+ * throughput.
+ */
+ struct percpu_cluster {
+- struct swap_cluster_info index; /* Current cluster index */
+ unsigned int next; /* Likely next allocation offset */
+ };
+
+--- a/mm/swapfile.c~mm-swap-simplify-struct-percpu_cluster
++++ a/mm/swapfile.c
+@@ -609,7 +609,7 @@ scan_swap_map_ssd_cluster_conflict(struc
+ return false;
+
+ percpu_cluster = this_cpu_ptr(si->percpu_cluster);
+- cluster_set_null(&percpu_cluster->index);
++ percpu_cluster->next = SWAP_NEXT_INVALID;
+ return true;
+ }
+
+@@ -622,14 +622,14 @@ static bool scan_swap_map_try_ssd_cluste
+ {
+ struct percpu_cluster *cluster;
+ struct swap_cluster_info *ci;
+- unsigned long tmp, max;
++ unsigned int tmp, max;
+
+ new_cluster:
+ cluster = this_cpu_ptr(si->percpu_cluster);
+- if (cluster_is_null(&cluster->index)) {
++ tmp = cluster->next;
++ if (tmp == SWAP_NEXT_INVALID) {
+ if (!cluster_list_empty(&si->free_clusters)) {
+- cluster->index = si->free_clusters.head;
+- cluster->next = cluster_next(&cluster->index) *
++ tmp = cluster_next(&si->free_clusters.head) *
+ SWAPFILE_CLUSTER;
+ } else if (!cluster_list_empty(&si->discard_clusters)) {
+ /*
+@@ -649,9 +649,7 @@ new_cluster:
+ * Other CPUs can use our cluster if they can't find a free cluster,
+ * check if there is still free entry in the cluster
+ */
+- tmp = cluster->next;
+- max = min_t(unsigned long, si->max,
+- (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
++ max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER));
+ if (tmp < max) {
+ ci = lock_cluster(si, tmp);
+ while (tmp < max) {
+@@ -662,12 +660,13 @@ new_cluster:
+ unlock_cluster(ci);
+ }
+ if (tmp >= max) {
+- cluster_set_null(&cluster->index);
++ cluster->next = SWAP_NEXT_INVALID;
+ goto new_cluster;
+ }
+- cluster->next = tmp + 1;
+ *offset = tmp;
+ *scan_base = tmp;
++ tmp += 1;
++ cluster->next = tmp < max ? tmp : SWAP_NEXT_INVALID;
+ return true;
+ }
+
+@@ -3163,8 +3162,9 @@ SYSCALL_DEFINE2(swapon, const char __use
+ }
+ for_each_possible_cpu(cpu) {
+ struct percpu_cluster *cluster;
++
+ cluster = per_cpu_ptr(p->percpu_cluster, cpu);
+- cluster_set_null(&cluster->index);
++ cluster->next = SWAP_NEXT_INVALID;
+ }
+ } else {
+ atomic_inc(&nr_rotate_swap);
+_
diff --git a/patches/mm-swap-update-get_swap_pages-to-take-folio-order.patch b/patches/mm-swap-update-get_swap_pages-to-take-folio-order.patch
new file mode 100644
index 000000000..75ef345ae
--- /dev/null
+++ b/patches/mm-swap-update-get_swap_pages-to-take-folio-order.patch
@@ -0,0 +1,116 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: update get_swap_pages() to take folio order
+Date: Mon, 8 Apr 2024 19:39:43 +0100
+
+We are about to allow swap storage of any mTHP size. To prepare for that,
+let's change get_swap_pages() to take a folio order parameter instead of
+nr_pages. This makes the interface self-documenting; a power-of-2 number
+of pages must be provided. We will also need the order internally so this
+simplifies accessing it.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-5-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ include/linux/swap.h | 2 +-
+ mm/swap_slots.c | 6 +++---
+ mm/swapfile.c | 13 +++++++------
+ 3 files changed, 11 insertions(+), 10 deletions(-)
+
+--- a/include/linux/swap.h~mm-swap-update-get_swap_pages-to-take-folio-order
++++ a/include/linux/swap.h
+@@ -468,7 +468,7 @@ swp_entry_t folio_alloc_swap(struct foli
+ bool folio_free_swap(struct folio *folio);
+ void put_swap_folio(struct folio *folio, swp_entry_t entry);
+ extern swp_entry_t get_swap_page_of_type(int);
+-extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
++extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order);
+ extern int add_swap_count_continuation(swp_entry_t, gfp_t);
+ extern void swap_shmem_alloc(swp_entry_t);
+ extern int swap_duplicate(swp_entry_t);
+--- a/mm/swapfile.c~mm-swap-update-get_swap_pages-to-take-folio-order
++++ a/mm/swapfile.c
+@@ -278,15 +278,15 @@ static void discard_swap_cluster(struct
+ #ifdef CONFIG_THP_SWAP
+ #define SWAPFILE_CLUSTER HPAGE_PMD_NR
+
+-#define swap_entry_size(size) (size)
++#define swap_entry_order(order) (order)
+ #else
+ #define SWAPFILE_CLUSTER 256
+
+ /*
+- * Define swap_entry_size() as constant to let compiler to optimize
++ * Define swap_entry_order() as constant to let compiler to optimize
+ * out some code if !CONFIG_THP_SWAP
+ */
+-#define swap_entry_size(size) 1
++#define swap_entry_order(order) 0
+ #endif
+ #define LATENCY_LIMIT 256
+
+@@ -1042,9 +1042,10 @@ static void swap_free_cluster(struct swa
+ swap_range_free(si, offset, SWAPFILE_CLUSTER);
+ }
+
+-int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
++int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
+ {
+- unsigned long size = swap_entry_size(entry_size);
++ int order = swap_entry_order(entry_order);
++ unsigned long size = 1 << order;
+ struct swap_info_struct *si, *next;
+ long avail_pgs;
+ int n_ret = 0;
+@@ -1349,7 +1350,7 @@ void put_swap_folio(struct folio *folio,
+ unsigned char *map;
+ unsigned int i, free_entries = 0;
+ unsigned char val;
+- int size = swap_entry_size(folio_nr_pages(folio));
++ int size = 1 << swap_entry_order(folio_order(folio));
+
+ si = _swap_info_get(entry);
+ if (!si)
+--- a/mm/swap_slots.c~mm-swap-update-get_swap_pages-to-take-folio-order
++++ a/mm/swap_slots.c
+@@ -264,7 +264,7 @@ static int refill_swap_slots_cache(struc
+ cache->cur = 0;
+ if (swap_slot_cache_active)
+ cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
+- cache->slots, 1);
++ cache->slots, 0);
+
+ return cache->nr;
+ }
+@@ -311,7 +311,7 @@ swp_entry_t folio_alloc_swap(struct foli
+
+ if (folio_test_large(folio)) {
+ if (IS_ENABLED(CONFIG_THP_SWAP))
+- get_swap_pages(1, &entry, folio_nr_pages(folio));
++ get_swap_pages(1, &entry, folio_order(folio));
+ goto out;
+ }
+
+@@ -343,7 +343,7 @@ repeat:
+ goto out;
+ }
+
+- get_swap_pages(1, &entry, 1);
++ get_swap_pages(1, &entry, 0);
+ out:
+ if (mem_cgroup_try_charge_swap(folio, entry)) {
+ put_swap_folio(folio, entry);
+_
diff --git a/patches/mm-vmscan-avoid-split-during-shrink_folio_list.patch b/patches/mm-vmscan-avoid-split-during-shrink_folio_list.patch
new file mode 100644
index 000000000..544090bfc
--- /dev/null
+++ b/patches/mm-vmscan-avoid-split-during-shrink_folio_list.patch
@@ -0,0 +1,77 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: vmscan: avoid split during shrink_folio_list()
+Date: Mon, 8 Apr 2024 19:39:45 +0100
+
+Now that swap supports storing all mTHP sizes, avoid splitting large
+folios before swap-out. This benefits performance of the swap-out path by
+eliding split_folio_to_list(), which is expensive, and also sets us up for
+swapping in large folios in a future series.
+
+If the folio is partially mapped, we continue to split it since we want to
+avoid the extra IO overhead and storage of writing out pages
+unnecessarily.
+
+THP_SWPOUT and THP_SWPOUT_FALLBACK counters should continue to count
+events only for PMD-mappable folios to avoid user confusion. THP_SWPOUT
+already has the appropriate guard. Add a guard for THP_SWPOUT_FALLBACK.
+It may be appropriate to add per-size counters in future.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-7-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Barry Song <v-songbaohua@oppo.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ mm/vmscan.c | 20 ++++++++++----------
+ 1 file changed, 10 insertions(+), 10 deletions(-)
+
+--- a/mm/vmscan.c~mm-vmscan-avoid-split-during-shrink_folio_list
++++ a/mm/vmscan.c
+@@ -1206,25 +1206,25 @@ retry:
+ if (!can_split_folio(folio, NULL))
+ goto activate_locked;
+ /*
+- * Split folios without a PMD map right
+- * away. Chances are some or all of the
+- * tail pages can be freed without IO.
++ * Split partially mapped folios right away.
++ * We can free the unmapped pages without IO.
+ */
+- if (!folio_entire_mapcount(folio) &&
+- split_folio_to_list(folio,
+- folio_list))
++ if (data_race(!list_empty(&folio->_deferred_list)) &&
++ split_folio_to_list(folio, folio_list))
+ goto activate_locked;
+ }
+ if (!add_to_swap(folio)) {
+ if (!folio_test_large(folio))
+ goto activate_locked_split;
+ /* Fallback to swap normal pages */
+- if (split_folio_to_list(folio,
+- folio_list))
++ if (split_folio_to_list(folio, folio_list))
+ goto activate_locked;
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+- count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1);
+- count_vm_event(THP_SWPOUT_FALLBACK);
++ if (nr_pages >= HPAGE_PMD_NR) {
++ count_memcg_folio_events(folio,
++ THP_SWPOUT_FALLBACK, 1);
++ count_vm_event(THP_SWPOUT_FALLBACK);
++ }
+ #endif
+ if (!add_to_swap(folio))
+ goto activate_locked_split;
+_
diff --git a/pc/devel-series b/pc/devel-series
index 1ac0257b5..5cfaa541e 100644
--- a/pc/devel-series
+++ b/pc/devel-series
@@ -455,6 +455,13 @@ proc-convert-smaps_pmd_entry-to-use-a-folio.patch
#
mm-page_alloc-use-the-correct-thp-order-for-thp-pcp.patch
#
+mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.patch
+mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.patch
+mm-swap-simplify-struct-percpu_cluster.patch
+mm-swap-update-get_swap_pages-to-take-folio-order.patch
+mm-swap-allow-storage-of-all-mthp-orders.patch
+mm-vmscan-avoid-split-during-shrink_folio_list.patch
+mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.patch
#
#arm64-mm-cleanup-__do_page_fault.patch: https://lkml.kernel.org/r/20240407171902.5958-A-hca@linux.ibm.com
arm64-mm-cleanup-__do_page_fault.patch
diff --git a/pc/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.pc b/pc/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.pc
new file mode 100644
index 000000000..ac995ae95
--- /dev/null
+++ b/pc/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.pc
@@ -0,0 +1,4 @@
+include/linux/pgtable.h
+mm/internal.h
+mm/madvise.c
+mm/memory.c
diff --git a/pc/mm-swap-allow-storage-of-all-mthp-orders.pc b/pc/mm-swap-allow-storage-of-all-mthp-orders.pc
new file mode 100644
index 000000000..3bce48932
--- /dev/null
+++ b/pc/mm-swap-allow-storage-of-all-mthp-orders.pc
@@ -0,0 +1,2 @@
+include/linux/swap.h
+mm/swapfile.c
diff --git a/pc/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.pc b/pc/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.pc
new file mode 100644
index 000000000..f89ab82c9
--- /dev/null
+++ b/pc/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.pc
@@ -0,0 +1,6 @@
+include/linux/pgtable.h
+include/linux/swap.h
+mm/internal.h
+mm/madvise.c
+mm/memory.c
+mm/swapfile.c
diff --git a/pc/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.pc b/pc/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.pc
new file mode 100644
index 000000000..d45dac10c
--- /dev/null
+++ b/pc/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.pc
@@ -0,0 +1,3 @@
+include/linux/swap.h
+mm/huge_memory.c
+mm/swapfile.c
diff --git a/pc/mm-swap-simplify-struct-percpu_cluster.pc b/pc/mm-swap-simplify-struct-percpu_cluster.pc
new file mode 100644
index 000000000..3bce48932
--- /dev/null
+++ b/pc/mm-swap-simplify-struct-percpu_cluster.pc
@@ -0,0 +1,2 @@
+include/linux/swap.h
+mm/swapfile.c
diff --git a/pc/mm-swap-update-get_swap_pages-to-take-folio-order.pc b/pc/mm-swap-update-get_swap_pages-to-take-folio-order.pc
new file mode 100644
index 000000000..f2bd0b484
--- /dev/null
+++ b/pc/mm-swap-update-get_swap_pages-to-take-folio-order.pc
@@ -0,0 +1,3 @@
+include/linux/swap.h
+mm/swapfile.c
+mm/swap_slots.c
diff --git a/pc/mm-vmscan-avoid-split-during-shrink_folio_list.pc b/pc/mm-vmscan-avoid-split-during-shrink_folio_list.pc
new file mode 100644
index 000000000..40d089036
--- /dev/null
+++ b/pc/mm-vmscan-avoid-split-during-shrink_folio_list.pc
@@ -0,0 +1 @@
+mm/vmscan.c
diff --git a/txt/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.txt b/txt/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.txt
new file mode 100644
index 000000000..cac0e7b9b
--- /dev/null
+++ b/txt/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.txt
@@ -0,0 +1,41 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: madvise: avoid split during MADV_PAGEOUT and MADV_COLD
+Date: Mon, 8 Apr 2024 19:39:46 +0100
+
+Rework madvise_cold_or_pageout_pte_range() to avoid splitting any large
+folio that is fully and contiguously mapped in the pageout/cold vm range.
+This change means that large folios will be maintained all the way to swap
+storage. This both improves performance during swap-out, by eliding the
+cost of splitting the folio, and sets us up nicely for maintaining the
+large folio when it is swapped back in (to be covered in a separate
+series).
+
+Folios that are not fully mapped in the target range are still split, but
+note that behavior is changed so that if the split fails for any reason
+(folio locked, shared, etc.) we now leave it as is, move to the next pte
+in the range, and continue work on the remaining folios. Previously, any
+failure of this sort would cause the entire operation to give up, and no
+folios mapped at higher addresses were paged out or made cold. Given that
+large folios are becoming more common, this old behavior would likely
+have led to wasted opportunities.
+
+While we are at it, change the code that clears young from the ptes to use
+ptep_test_and_clear_young(), via the new mkold_ptes() batch helper
+function. This is more efficient than get_and_clear/modify/set, especially
+for contpte mappings on arm64, where the old approach would require
+unfolding/refolding and the new approach can be done in place.
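+
+Purely as an illustrative sketch of the idea (the actual mkold_ptes()
+added by this patch may differ in signature and detail), a generic batch
+helper built on ptep_test_and_clear_young() could look roughly like:
+
+	static inline void mkold_ptes(struct vm_area_struct *vma,
+				      unsigned long addr, pte_t *ptep,
+				      unsigned int nr)
+	{
+		/* Clear the young bit on nr contiguous ptes, in place. */
+		for (; nr-- > 0; ptep++, addr += PAGE_SIZE)
+			ptep_test_and_clear_young(vma, addr, ptep);
+	}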
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-8-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: Barry Song <v-songbaohua@oppo.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
diff --git a/txt/mm-set-pageblock_order-to-hpage_pmd_order-in-case-with-config_hugetlb_page-but-thp-enabled.txt b/txt/mm-set-pageblock_order-to-hpage_pmd_order-in-case-with-config_hugetlb_page-but-thp-enabled.txt
index eac7c7335..7d6935c1e 100644
--- a/txt/mm-set-pageblock_order-to-hpage_pmd_order-in-case-with-config_hugetlb_page-but-thp-enabled.txt
+++ b/txt/mm-set-pageblock_order-to-hpage_pmd_order-in-case-with-config_hugetlb_page-but-thp-enabled.txt
@@ -12,4 +12,5 @@ Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Zi Yan <ziy@nvidia.com>
+Acked-by: David Hildenbrand <david@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
diff --git a/txt/mm-swap-allow-storage-of-all-mthp-orders.txt b/txt/mm-swap-allow-storage-of-all-mthp-orders.txt
new file mode 100644
index 000000000..71e43fef0
--- /dev/null
+++ b/txt/mm-swap-allow-storage-of-all-mthp-orders.txt
@@ -0,0 +1,60 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: allow storage of all mTHP orders
+Date: Mon, 8 Apr 2024 19:39:44 +0100
+
+Multi-size THP enables performance improvements by allocating large,
+pte-mapped folios for anonymous memory. However I've observed that on an
+arm64 system running a parallel workload (e.g. kernel compilation) across
+many cores, under high memory pressure, the speed regresses. This is due
+to bottlenecking on the increased number of TLBIs added due to all the
+extra folio splitting when the large folios are swapped out.
+
+Therefore, solve this regression by adding support for swapping out mTHP
+without needing to split the folio, just like is already done for
+PMD-sized THP. This change only applies when CONFIG_THP_SWAP is enabled,
+and when the swap backing store is a non-rotating block device. These are
+the same constraints as for the existing PMD-sized THP swap-out support.
+
+Note that no attempt is made to swap-in (m)THP here - this is still done
+page-by-page, like for PMD-sized THP. But swapping-out mTHP is a
+prerequisite for swapping-in mTHP.
+
+The main change here is to improve the swap entry allocator so that it can
+allocate any power-of-2 number of contiguous entries between [1, (1 <<
+PMD_ORDER)]. This is done by allocating a cluster for each distinct order
+and allocating sequentially from it until the cluster is full. This
+ensures that we don't need to search the map and we get no fragmentation
+due to alignment padding for different orders in the cluster. If there is
+no current cluster for a given order, we attempt to allocate a free
+cluster from the list. If there are no free clusters, we fail the
+allocation and the caller can fall back to splitting the folio and
+allocating individual entries (as per the existing PMD-sized THP fallback).
+
+The per-order current clusters are maintained per-cpu using the existing
+infrastructure. This is done to avoid interleaving pages from different
+tasks, which would prevent IO being batched. This is already done for the
+order-0 allocations so we follow the same pattern.
+
+As is done for order-0 per-cpu clusters, the scanner now can steal order-0
+entries from any per-cpu-per-order reserved cluster. This ensures that
+when the swap file is getting full, space doesn't get tied up in the
+per-cpu reserves.
+
+This change only modifies swap to be able to accept any order mTHP. It
+doesn't change the callers to elide doing the actual split. That will be
+done in separate changes.
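+
+Conceptually, the per-cpu cluster tracking simply becomes per-order as
+well. A rough sketch of the shape of the data structure (the constant
+name and exact layout here are illustrative, not necessarily what the
+patch uses):
+
+	/* One "next allocation offset" per order, still one per cpu. */
+	#define SWAP_NR_ORDERS	(PMD_ORDER + 1)
+
+	struct percpu_cluster {
+		unsigned int next[SWAP_NR_ORDERS];
+	};
+
+	/* Allocation then consults the slot for the folio's order, e.g. */
+	/*   offset = this_cpu_ptr(si->percpu_cluster)->next[order];     */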
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-6-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
diff --git a/txt/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.txt b/txt/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.txt
new file mode 100644
index 000000000..c6b497aff
--- /dev/null
+++ b/txt/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.txt
@@ -0,0 +1,41 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: free_swap_and_cache_nr() as batched free_swap_and_cache()
+Date: Mon, 8 Apr 2024 19:39:41 +0100
+
+Now that we no longer have a convenient flag in the cluster to determine
+if a folio is large, free_swap_and_cache() will take a reference and lock
+a large folio much more often, which could lead to contention and (e.g.)
+failure to split large folios, etc.
+
+Let's solve that problem by batch freeing swap and cache with a new
+function, free_swap_and_cache_nr(), to free a contiguous range of swap
+entries together. This allows us to first drop a reference to each swap
+slot before we try to release the cache folio. This means we only try to
+release the folio once, only taking the reference and lock once - much
+better than the previous 512 times for the 2M THP case.
+
+Contiguous swap entries are gathered in zap_pte_range() and
+madvise_free_pte_range() in a similar way to how present ptes are already
+gathered in zap_pte_range().
+
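+A much-simplified sketch of the batching idea (locking, error handling
+and several details are omitted, so treat this as an outline rather than
+the code the patch actually adds):
+
+	void free_swap_and_cache_nr(swp_entry_t entry, int nr)
+	{
+		unsigned long offset = swp_offset(entry);
+		struct swap_info_struct *si = get_swap_device(entry);
+		int i;
+
+		if (!si)
+			return;
+
+		/* Drop one swap-count reference per entry in the range... */
+		for (i = 0; i < nr; i++)
+			__swap_entry_free(si, swp_entry(swp_type(entry),
+							offset + i));
+
+		/* ...then try to reclaim the swap cache once for the range. */
+		__try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL);
+
+		put_swap_device(si);
+	}
+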
+While we are at it, let's simplify by converting the return type of both
+functions to void. The return value was used only by zap_pte_range() to
+print a bad pte, and was ignored by everyone else, so the extra reporting
+added little value. We will still get the warning, with most of the
+information, from get_swap_device(). With the batch version, we wouldn't
+know which pte was bad anyway, so we could print the wrong one.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-3-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
diff --git a/txt/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.txt b/txt/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.txt
new file mode 100644
index 000000000..8f12a5894
--- /dev/null
+++ b/txt/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.txt
@@ -0,0 +1,107 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: remove CLUSTER_FLAG_HUGE from swap_cluster_info:flags
+Date: Mon, 8 Apr 2024 19:39:40 +0100
+
+Patch series "Swap-out mTHP without splitting", v7.
+
+This series adds support for swapping out multi-size THP (mTHP) without
+needing to first split the large folio via
+split_huge_page_to_list_to_order(). It closely follows the approach
+already used to swap-out PMD-sized THP.
+
+There are a couple of reasons for swapping out mTHP without splitting:
+
+ - Performance: It is expensive to split a large folio and under
+ extreme memory pressure some workloads regressed performance when
+ using 64K mTHP vs 4K small folios because of this extra cost in the
+ swap-out path. This series not only eliminates the regression but
+ makes it faster to swap out 64K mTHP vs 4K small folios.
+
+ - Memory fragmentation avoidance: If we can avoid splitting a large
+   folio, memory is less likely to become fragmented, making it easier to
+ re-allocate a large folio in future.
+
+ - Performance: Enables a separate series [7] to swap-in whole mTHPs,
+ which means we won't lose the TLB-efficiency benefits of mTHP once the
+ memory has been through a swap cycle.
+
+I've done what I thought was the smallest change possible, and as a
+result, this approach is only employed when the swap is backed by a
+non-rotating block device (just as PMD-sized THP is supported today).
+Discussion against the RFC concluded that this is sufficient.
+
+
+Performance Testing
+===================
+
+I've run some swap performance tests on Ampere Altra VM (arm64) with 8
+CPUs. The VM is set up with a 35G block ram device as the swap device and
+the test is run from inside a memcg limited to 40G memory. I've then run
+`usemem` from vm-scalability with 70 processes, each allocating and
+writing 1G of memory. I've repeated everything 6 times and taken the mean
+performance improvement relative to 4K page baseline:
+
+| alloc size | baseline | + this series |
+| | mm-unstable (~v6.9-rc1) | |
+|:-----------|------------------------:|------------------------:|
+| 4K Page | 0.0% | 1.3% |
+| 64K THP | -13.6% | 46.3% |
+| 2M THP | 91.4% | 89.6% |
+
+So with this change, the 64K swap performance goes from a 14% regression to a
+46% improvement. While 2M shows a small regression, I'm confident that this is
+just noise.
+
+[1] https://lore.kernel.org/linux-mm/20231010142111.3997780-1-ryan.roberts@arm.com/
+[2] https://lore.kernel.org/linux-mm/20231017161302.2518826-1-ryan.roberts@arm.com/
+[3] https://lore.kernel.org/linux-mm/20231025144546.577640-1-ryan.roberts@arm.com/
+[4] https://lore.kernel.org/linux-mm/20240311150058.1122862-1-ryan.roberts@arm.com/
+[5] https://lore.kernel.org/linux-mm/20240327144537.4165578-1-ryan.roberts@arm.com/
+[6] https://lore.kernel.org/linux-mm/20240403114032.1162100-1-ryan.roberts@arm.com/
+[7] https://lore.kernel.org/linux-mm/20240304081348.197341-1-21cnbao@gmail.com/
+[8] https://lore.kernel.org/linux-mm/CAGsJ_4yMOow27WDvN2q=E4HAtDd2PJ=OQ5Pj9DG+6FLWwNuXUw@mail.gmail.com/
+[9] https://lore.kernel.org/linux-mm/579d5127-c763-4001-9625-4563a9316ac3@redhat.com/
+
+
+This patch (of 7):
+
+As preparation for supporting multi-size THP (mTHP) in the swap-out path,
+without first needing to split to order-0, remove CLUSTER_FLAG_HUGE,
+which, when present, always implies a PMD-sized THP (the same size as the
+cluster).
+
+The only use of the flag was to determine whether a swap entry refers to a
+single page or a PMD-sized THP in swap_page_trans_huge_swapped(). Instead
+of relying on the flag, we now pass in order, which originates from the
+folio's order. This allows the logic to work for folios of any order.
+
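+A hedged sketch of the shape of that change (the real function also takes
+the cluster lock and rounds the offset to the folio boundary):
+
+	static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
+						 swp_entry_t entry, int order)
+	{
+		unsigned long offset = swp_offset(entry);
+		unsigned long nr_pages = 1 << order;	/* was: cluster flag */
+		unsigned long i;
+
+		/* Any still-in-use entry in the folio's range counts. */
+		for (i = 0; i < nr_pages; i++)
+			if (swap_count(si->swap_map[offset + i]))
+				return true;
+
+		return false;
+	}
+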
+The one snag is that one of the swap_page_trans_huge_swapped() call sites
+does not have the folio. But it was only being called there to shortcut a
+call to __try_to_reclaim_swap() in some cases. __try_to_reclaim_swap() gets
+the folio and (via some other functions) calls
+swap_page_trans_huge_swapped(). So I've removed the problematic call site
+and believe the new logic should be functionally equivalent.
+
+That said, removing the fast path means that we will take a reference and
+trylock a large folio much more often, which we would like to avoid. The
+next patch will solve this.
+
+Removing CLUSTER_FLAG_HUGE also means we can remove split_swap_cluster()
+which used to be called during folio splitting, since
+split_swap_cluster()'s only job was to remove the flag.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-1-ryan.roberts@arm.com
+Link: https://lkml.kernel.org/r/20240408183946.2991168-2-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Acked-by: Chris Li <chrisl@kernel.org>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
diff --git a/txt/mm-swap-simplify-struct-percpu_cluster.txt b/txt/mm-swap-simplify-struct-percpu_cluster.txt
new file mode 100644
index 000000000..5819bb69a
--- /dev/null
+++ b/txt/mm-swap-simplify-struct-percpu_cluster.txt
@@ -0,0 +1,41 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: simplify struct percpu_cluster
+Date: Mon, 8 Apr 2024 19:39:42 +0100
+
+struct percpu_cluster stores the index of cpu's current cluster and the
+offset of the next entry that will be allocated for the cpu. These two
+pieces of information are redundant because the cluster index is just
+(offset / SWAPFILE_CLUSTER). The only reason for explicitly keeping the
+cluster index is because the structure used for it also has a flag to
+indicate "no cluster". However this data structure also contains a spin
+lock, which is never used in this context, as a side effect the code
+copies the spinlock_t structure, which is questionable coding practice in
+my view.
+
+So let's clean this up and store only the next offset, and use a sentinel
+value (SWAP_NEXT_INVALID) to indicate "no cluster". SWAP_NEXT_INVALID is
+chosen to be 0, because 0 will never be seen legitimately: the first page
+in the swap file is the swap header, which is always marked bad to prevent
+it from being allocated as an entry. This also prevents the cluster to
+which it belongs from being marked free, so it will never appear on the
+free list.
+
+This change saves 16 bytes per cpu. And given we are shortly going to
+extend this mechanism to be per-cpu-AND-per-order, we will end up saving
+16 * 9 = 144 bytes per cpu, which adds up if you have 256 cpus in the
+system.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-4-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
diff --git a/txt/mm-swap-update-get_swap_pages-to-take-folio-order.txt b/txt/mm-swap-update-get_swap_pages-to-take-folio-order.txt
new file mode 100644
index 000000000..3498d553c
--- /dev/null
+++ b/txt/mm-swap-update-get_swap_pages-to-take-folio-order.txt
@@ -0,0 +1,24 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: update get_swap_pages() to take folio order
+Date: Mon, 8 Apr 2024 19:39:43 +0100
+
+We are about to allow swap storage of any mTHP size. To prepare for that,
+let's change get_swap_pages() to take a folio order parameter instead of
+nr_pages. This makes the interface self-documenting; a power-of-2 number
+of pages must be provided. We will also need the order internally so this
+simplifies accessing it.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-5-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
diff --git a/txt/mm-userfaultfd-allow-hugetlb-change-protection-upon-poison-entry.txt b/txt/mm-userfaultfd-allow-hugetlb-change-protection-upon-poison-entry.txt
index 3c9c70090..9d2bdcd03 100644
--- a/txt/mm-userfaultfd-allow-hugetlb-change-protection-upon-poison-entry.txt
+++ b/txt/mm-userfaultfd-allow-hugetlb-change-protection-upon-poison-entry.txt
@@ -23,9 +23,9 @@ so no chance of swapin errors).
Link: https://lkml.kernel.org/r/20240405231920.1772199-1-peterx@redhat.com
Link: https://lore.kernel.org/r/000000000000920d5e0615602dd1@google.com
-Reported-by: syzbot+b07c8ac8eee3d4d8440f@syzkaller.appspotmail.com
Fixes: fc71884a5f59 ("mm: userfaultfd: add new UFFDIO_POISON ioctl")
Signed-off-by: Peter Xu <peterx@redhat.com>
+Reported-by: syzbot+b07c8ac8eee3d4d8440f@syzkaller.appspotmail.com
+Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
-Cc: David Hildenbrand <david@redhat.com>
Cc: <stable@vger.kernel.org> [6.6+]
diff --git a/txt/mm-vmscan-avoid-split-during-shrink_folio_list.txt b/txt/mm-vmscan-avoid-split-during-shrink_folio_list.txt
new file mode 100644
index 000000000..3c042cc0e
--- /dev/null
+++ b/txt/mm-vmscan-avoid-split-during-shrink_folio_list.txt
@@ -0,0 +1,32 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: vmscan: avoid split during shrink_folio_list()
+Date: Mon, 8 Apr 2024 19:39:45 +0100
+
+Now that swap supports storing all mTHP sizes, avoid splitting large
+folios before swap-out. This benefits performance of the swap-out path by
+eliding split_folio_to_list(), which is expensive, and also sets us up for
+swapping in large folios in a future series.
+
+If the folio is partially mapped, we continue to split it since we want to
+avoid the extra IO overhead and storage of writing out pages
+unnecessarily.
+
+THP_SWPOUT and THP_SWPOUT_FALLBACK counters should continue to count
+events only for PMD-mappable folios to avoid user confusion. THP_SWPOUT
+already has the appropriate guard. Add a guard for THP_SWPOUT_FALLBACK.
+It may be appropriate to add per-size counters in future.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-7-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Barry Song <v-songbaohua@oppo.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>