authorAndrew Morton <akpm@linux-foundation.org>2024-04-08 13:40:13 -0700
committerAndrew Morton <akpm@linux-foundation.org>2024-04-08 13:40:13 -0700
commitdc439d3fc6f3661f5f7cfd08a59d5507d079a340 (patch)
treeb6c6cbb4e39b94310d61214b8a65e5a8b77237be
parentec3a259c71e17cfcefa54f5647e906bf57c1cbef (diff)
foo
-rw-r--r--patches/mm-add-per-order-mthp-anon_swpout-and-anon_swpout_fallback-counters.patch6
-rw-r--r--patches/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.patch283
-rw-r--r--patches/mm-swap-allow-storage-of-all-mthp-orders.patch430
-rw-r--r--patches/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.patch408
-rw-r--r--patches/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.patch271
-rw-r--r--patches/mm-swap-simplify-struct-percpu_cluster.patch140
-rw-r--r--patches/mm-swap-update-get_swap_pages-to-take-folio-order.patch116
-rw-r--r--patches/mm-vmscan-avoid-split-during-shrink_folio_list.patch77
-rw-r--r--pc/devel-series7
-rw-r--r--pc/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.pc4
-rw-r--r--pc/mm-swap-allow-storage-of-all-mthp-orders.pc2
-rw-r--r--pc/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.pc6
-rw-r--r--pc/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.pc3
-rw-r--r--pc/mm-swap-simplify-struct-percpu_cluster.pc2
-rw-r--r--pc/mm-swap-update-get_swap_pages-to-take-folio-order.pc3
-rw-r--r--pc/mm-vmscan-avoid-split-during-shrink_folio_list.pc1
-rw-r--r--txt/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.txt41
-rw-r--r--txt/mm-set-pageblock_order-to-hpage_pmd_order-in-case-with-config_hugetlb_page-but-thp-enabled.txt1
-rw-r--r--txt/mm-swap-allow-storage-of-all-mthp-orders.txt60
-rw-r--r--txt/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.txt41
-rw-r--r--txt/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.txt107
-rw-r--r--txt/mm-swap-simplify-struct-percpu_cluster.txt41
-rw-r--r--txt/mm-swap-update-get_swap_pages-to-take-folio-order.txt24
-rw-r--r--txt/mm-userfaultfd-allow-hugetlb-change-protection-upon-poison-entry.txt4
-rw-r--r--txt/mm-vmscan-avoid-split-during-shrink_folio_list.txt32
25 files changed, 2105 insertions, 5 deletions
diff --git a/patches/mm-add-per-order-mthp-anon_swpout-and-anon_swpout_fallback-counters.patch b/patches/mm-add-per-order-mthp-anon_swpout-and-anon_swpout_fallback-counters.patch
index 85dbca063..b09222878 100644
--- a/patches/mm-add-per-order-mthp-anon_swpout-and-anon_swpout_fallback-counters.patch
+++ b/patches/mm-add-per-order-mthp-anon_swpout-and-anon_swpout_fallback-counters.patch
@@ -80,9 +80,9 @@ Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
--- a/mm/vmscan.c~mm-add-per-order-mthp-anon_swpout-and-anon_swpout_fallback-counters
+++ a/mm/vmscan.c
-@@ -1230,6 +1230,9 @@ retry:
- count_vm_event(
- THP_SWPOUT_FALLBACK);
+@@ -1225,6 +1225,9 @@ retry:
+ THP_SWPOUT_FALLBACK, 1);
+ count_vm_event(THP_SWPOUT_FALLBACK);
}
+ if (nr_pages > 0)
+ count_mthp_stat(get_order(nr_pages * PAGE_SIZE),
diff --git a/patches/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.patch b/patches/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.patch
new file mode 100644
index 000000000..71c0e7571
--- /dev/null
+++ b/patches/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.patch
@@ -0,0 +1,283 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: madvise: avoid split during MADV_PAGEOUT and MADV_COLD
+Date: Mon, 8 Apr 2024 19:39:46 +0100
+
+Rework madvise_cold_or_pageout_pte_range() to avoid splitting any large
+folio that is fully and contiguously mapped in the pageout/cold vm range.
+This change means that large folios will be maintained all the way to swap
+storage. This both improves performance during swap-out, by eliding the
+cost of splitting the folio, and sets us up nicely for maintaining the
+large folio when it is swapped back in (to be covered in a separate
+series).
+
+Folios that are not fully mapped in the target range are still split, but
+note that the behavior is changed so that if the split fails for any reason
+(folio locked, shared, etc.) we now leave it as is, move to the next pte
+in the range and continue work on the following folios. Previously any
+failure of this sort would cause the entire operation to give up and no
+folios mapped at higher addresses were paged out or made cold. Given
+large folios are becoming more common, this old behavior would likely have
+led to wasted opportunities.
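+
+As a side note, the reworked scan now steps the PTE loop by a whole batch
+at a time instead of one entry per iteration. A minimal user-space sketch
+of that loop shape is below; it is illustrative only (not part of this
+patch) and uses made-up names and a toy "folio id" table:
+
+	#include <stdio.h>
+
+	#define PAGE_SIZE 4096UL
+
+	/* How many consecutive slots starting at i map the same folio? */
+	static int batch_len(const int *tbl, int i, int max)
+	{
+		int n = 1;
+
+		while (n < max && tbl[i + n] == tbl[i])
+			n++;
+		return n;
+	}
+
+	int main(void)
+	{
+		int tbl[] = { 1, 1, 1, 2, 3, 3 };	/* toy folio id per pte */
+		unsigned long addr = 0;
+		int i, nr, max = 6;
+
+		/* advance by the whole batch, like the reworked loop */
+		for (i = 0; i < max; i += nr, addr += nr * PAGE_SIZE) {
+			nr = batch_len(tbl, i, max - i);
+			printf("folio %d: %d pte(s) at %#lx\n", tbl[i], nr, addr);
+		}
+		return 0;
+	}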
+
+While we are at it, change the code that clears young from the ptes to use
+ptep_test_and_clear_young(), via the new mkold_ptes() batch helper
+function. This is more efficient than get_and_clear/modify/set, especially
+for contpte mappings on arm64, where the old approach would require
+unfolding/refolding and the new approach can be done in place.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-8-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: Barry Song <v-songbaohua@oppo.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ include/linux/pgtable.h | 30 +++++++++++++
+ mm/internal.h | 12 ++++-
+ mm/madvise.c | 87 +++++++++++++++++++++-----------------
+ mm/memory.c | 4 -
+ 4 files changed, 92 insertions(+), 41 deletions(-)
+
+--- a/include/linux/pgtable.h~mm-madvise-avoid-split-during-madv_pageout-and-madv_cold
++++ a/include/linux/pgtable.h
+@@ -361,6 +361,36 @@ static inline int ptep_test_and_clear_yo
+ }
+ #endif
+
++#ifndef mkold_ptes
++/**
++ * mkold_ptes - Mark PTEs that map consecutive pages of the same folio as old.
++ * @vma: VMA the pages are mapped into.
++ * @addr: Address the first page is mapped at.
++ * @ptep: Page table pointer for the first entry.
++ * @nr: Number of entries to mark old.
++ *
++ * May be overridden by the architecture; otherwise, implemented as a simple
++ * loop over ptep_test_and_clear_young().
++ *
++ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
++ * some PTEs might be write-protected.
++ *
++ * Context: The caller holds the page table lock. The PTEs map consecutive
++ * pages that belong to the same folio. The PTEs are all in the same PMD.
++ */
++static inline void mkold_ptes(struct vm_area_struct *vma, unsigned long addr,
++ pte_t *ptep, unsigned int nr)
++{
++ for (;;) {
++ ptep_test_and_clear_young(vma, addr, ptep);
++ if (--nr == 0)
++ break;
++ ptep++;
++ addr += PAGE_SIZE;
++ }
++}
++#endif
++
+ #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
+ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+--- a/mm/internal.h~mm-madvise-avoid-split-during-madv_pageout-and-madv_cold
++++ a/mm/internal.h
+@@ -130,6 +130,8 @@ static inline pte_t __pte_batch_clear_ig
+ * @flags: Flags to modify the PTE batch semantics.
+ * @any_writable: Optional pointer to indicate whether any entry except the
+ * first one is writable.
++ * @any_young: Optional pointer to indicate whether any entry except the
++ * first one is young.
+ *
+ * Detect a PTE batch: consecutive (present) PTEs that map consecutive
+ * pages of the same large folio.
+@@ -145,16 +147,18 @@ static inline pte_t __pte_batch_clear_ig
+ */
+ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
+ pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
+- bool *any_writable)
++ bool *any_writable, bool *any_young)
+ {
+ unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
+ const pte_t *end_ptep = start_ptep + max_nr;
+ pte_t expected_pte, *ptep;
+- bool writable;
++ bool writable, young;
+ int nr;
+
+ if (any_writable)
+ *any_writable = false;
++ if (any_young)
++ *any_young = false;
+
+ VM_WARN_ON_FOLIO(!pte_present(pte), folio);
+ VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
+@@ -168,6 +172,8 @@ static inline int folio_pte_batch(struct
+ pte = ptep_get(ptep);
+ if (any_writable)
+ writable = !!pte_write(pte);
++ if (any_young)
++ young = !!pte_young(pte);
+ pte = __pte_batch_clear_ignored(pte, flags);
+
+ if (!pte_same(pte, expected_pte))
+@@ -183,6 +189,8 @@ static inline int folio_pte_batch(struct
+
+ if (any_writable)
+ *any_writable |= writable;
++ if (any_young)
++ *any_young |= young;
+
+ nr = pte_batch_hint(ptep, pte);
+ expected_pte = pte_advance_pfn(expected_pte, nr);
+--- a/mm/madvise.c~mm-madvise-avoid-split-during-madv_pageout-and-madv_cold
++++ a/mm/madvise.c
+@@ -336,6 +336,7 @@ static int madvise_cold_or_pageout_pte_r
+ LIST_HEAD(folio_list);
+ bool pageout_anon_only_filter;
+ unsigned int batch_count = 0;
++ int nr;
+
+ if (fatal_signal_pending(current))
+ return -EINTR;
+@@ -423,7 +424,8 @@ restart:
+ return 0;
+ flush_tlb_batched_pending(mm);
+ arch_enter_lazy_mmu_mode();
+- for (; addr < end; pte++, addr += PAGE_SIZE) {
++ for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
++ nr = 1;
+ ptent = ptep_get(pte);
+
+ if (++batch_count == SWAP_CLUSTER_MAX) {
+@@ -447,55 +449,66 @@ restart:
+ continue;
+
+ /*
+- * Creating a THP page is expensive so split it only if we
+- * are sure it's worth. Split it if we are only owner.
++ * If we encounter a large folio, only split it if it is not
++ * fully mapped within the range we are operating on. Otherwise
++ * leave it as is so that it can be swapped out whole. If we
++ * fail to split a folio, leave it in place and advance to the
++ * next pte in the range.
+ */
+ if (folio_test_large(folio)) {
+- int err;
+-
+- if (folio_likely_mapped_shared(folio))
+- break;
+- if (pageout_anon_only_filter && !folio_test_anon(folio))
+- break;
+- if (!folio_trylock(folio))
+- break;
+- folio_get(folio);
+- arch_leave_lazy_mmu_mode();
+- pte_unmap_unlock(start_pte, ptl);
+- start_pte = NULL;
+- err = split_folio(folio);
+- folio_unlock(folio);
+- folio_put(folio);
+- if (err)
+- break;
+- start_pte = pte =
+- pte_offset_map_lock(mm, pmd, addr, &ptl);
+- if (!start_pte)
+- break;
+- arch_enter_lazy_mmu_mode();
+- pte--;
+- addr -= PAGE_SIZE;
+- continue;
++ const fpb_t fpb_flags = FPB_IGNORE_DIRTY |
++ FPB_IGNORE_SOFT_DIRTY;
++ int max_nr = (end - addr) / PAGE_SIZE;
++ bool any_young;
++
++ nr = folio_pte_batch(folio, addr, pte, ptent, max_nr,
++ fpb_flags, NULL, &any_young);
++ if (any_young)
++ ptent = pte_mkyoung(ptent);
++
++ if (nr < folio_nr_pages(folio)) {
++ int err;
++
++ if (folio_likely_mapped_shared(folio))
++ continue;
++ if (pageout_anon_only_filter && !folio_test_anon(folio))
++ continue;
++ if (!folio_trylock(folio))
++ continue;
++ folio_get(folio);
++ arch_leave_lazy_mmu_mode();
++ pte_unmap_unlock(start_pte, ptl);
++ start_pte = NULL;
++ err = split_folio(folio);
++ folio_unlock(folio);
++ folio_put(folio);
++ start_pte = pte =
++ pte_offset_map_lock(mm, pmd, addr, &ptl);
++ if (!start_pte)
++ break;
++ arch_enter_lazy_mmu_mode();
++ if (!err)
++ nr = 0;
++ continue;
++ }
+ }
+
+ /*
+ * Do not interfere with other mappings of this folio and
+- * non-LRU folio.
++ * non-LRU folio. If we have a large folio at this point, we
++ * know it is fully mapped so if its mapcount is the same as its
++ * number of pages, it must be exclusive.
+ */
+- if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
++ if (!folio_test_lru(folio) ||
++ folio_mapcount(folio) != folio_nr_pages(folio))
+ continue;
+
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
+ continue;
+
+- VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
+-
+ if (!pageout && pte_young(ptent)) {
+- ptent = ptep_get_and_clear_full(mm, addr, pte,
+- tlb->fullmm);
+- ptent = pte_mkold(ptent);
+- set_pte_at(mm, addr, pte, ptent);
+- tlb_remove_tlb_entry(tlb, pte, addr);
++ mkold_ptes(vma, addr, pte, nr);
++ tlb_remove_tlb_entries(tlb, pte, nr, addr);
+ }
+
+ /*
+--- a/mm/memory.c~mm-madvise-avoid-split-during-madv_pageout-and-madv_cold
++++ a/mm/memory.c
+@@ -989,7 +989,7 @@ copy_present_ptes(struct vm_area_struct
+ flags |= FPB_IGNORE_SOFT_DIRTY;
+
+ nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
+- &any_writable);
++ &any_writable, NULL);
+ folio_ref_add(folio, nr);
+ if (folio_test_anon(folio)) {
+ if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
+@@ -1559,7 +1559,7 @@ static inline int zap_present_ptes(struc
+ */
+ if (unlikely(folio_test_large(folio) && max_nr != 1)) {
+ nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
+- NULL);
++ NULL, NULL);
+
+ zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
+ addr, details, rss, force_flush,
+_
diff --git a/patches/mm-swap-allow-storage-of-all-mthp-orders.patch b/patches/mm-swap-allow-storage-of-all-mthp-orders.patch
new file mode 100644
index 000000000..bd5e5808a
--- /dev/null
+++ b/patches/mm-swap-allow-storage-of-all-mthp-orders.patch
@@ -0,0 +1,430 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: allow storage of all mTHP orders
+Date: Mon, 8 Apr 2024 19:39:44 +0100
+
+Multi-size THP enables performance improvements by allocating large,
+pte-mapped folios for anonymous memory. However, I've observed that on an
+arm64 system running a parallel workload (e.g. kernel compilation) across
+many cores, under high memory pressure, the speed regresses. This is due
+to bottlenecking on the increased number of TLBIs required by all the
+extra folio splitting when the large folios are swapped out.
+
+Therefore, solve this regression by adding support for swapping out mTHP
+without needing to split the folio, just like is already done for
+PMD-sized THP. This change only applies when CONFIG_THP_SWAP is enabled,
+and when the swap backing store is a non-rotating block device. These are
+the same constraints as for the existing PMD-sized THP swap-out support.
+
+Note that no attempt is made to swap-in (m)THP here - this is still done
+page-by-page, like for PMD-sized THP. But swapping-out mTHP is a
+prerequisite for swapping-in mTHP.
+
+The main change here is to improve the swap entry allocator so that it can
+allocate any power-of-2 number of contiguous entries between [1, (1 <<
+PMD_ORDER)]. This is done by allocating a cluster for each distinct order
+and allocating sequentially from it until the cluster is full. This
+ensures that we don't need to search the map and we get no fragmentation
+due to alignment padding for different orders in the cluster. If there is
+no current cluster for a given order, we attempt to allocate a free
+cluster from the list. If there are no free clusters, we fail the
+allocation and the caller can fall back to splitting the folio and
+allocating individual entries (as per the existing PMD-sized THP fallback).
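+
+To illustrate the "allocate sequentially within a naturally aligned
+cluster" idea, here is a minimal user-space sketch; it is illustrative
+only (not part of this patch). SWAPFILE_CLUSTER is 512 for 4K pages with
+a 2M PMD, and the real code works on si->swap_map under the cluster lock:
+
+	#include <stdio.h>
+
+	#define SWAPFILE_CLUSTER 512
+
+	/*
+	 * Find the first naturally aligned run of (1 << order) free
+	 * entries inside one cluster, stepping a whole run at a time.
+	 * Returns the run's offset, or -1 if the cluster has no room
+	 * left for this order.
+	 */
+	static int find_run(const char *map, int base, int order)
+	{
+		int nr = 1 << order, off, i;
+
+		for (off = base; off < base + SWAPFILE_CLUSTER; off += nr) {
+			for (i = 0; i < nr; i++)
+				if (map[off + i])
+					break;
+			if (i == nr)
+				return off;
+		}
+		return -1;
+	}
+
+	int main(void)
+	{
+		char map[SWAPFILE_CLUSTER] = { 0 };
+
+		map[0] = 1;	/* one order-0 entry already in use */
+		printf("%d\n", find_run(map, 0, 4));	/* 16 */
+		return 0;
+	}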
+
+The per-order current clusters are maintained per-cpu using the existing
+infrastructure. This is done to avoid interleaving pages from different
+tasks, which would prevent IO from being batched. This is already done for
+the order-0 allocations so we follow the same pattern.
+
+As is done for order-0 per-cpu clusters, the scanner can now steal order-0
+entries from any per-cpu-per-order reserved cluster. This ensures that
+when the swap file is getting full, space doesn't get tied up in the
+per-cpu reserves.
+
+This change only modifies swap to be able to accept any order mTHP. It
+doesn't yet change the callers to elide the actual split. That will be
+done in separate changes.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-6-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ include/linux/swap.h | 8 +-
+ mm/swapfile.c | 162 +++++++++++++++++++++++------------------
+ 2 files changed, 98 insertions(+), 72 deletions(-)
+
+--- a/include/linux/swap.h~mm-swap-allow-storage-of-all-mthp-orders
++++ a/include/linux/swap.h
+@@ -268,13 +268,19 @@ struct swap_cluster_info {
+ */
+ #define SWAP_NEXT_INVALID 0
+
++#ifdef CONFIG_THP_SWAP
++#define SWAP_NR_ORDERS (PMD_ORDER + 1)
++#else
++#define SWAP_NR_ORDERS 1
++#endif
++
+ /*
+ * We assign a cluster to each CPU, so each CPU can allocate swap entry from
+ * its own cluster and swapout sequentially. The purpose is to optimize swapout
+ * throughput.
+ */
+ struct percpu_cluster {
+- unsigned int next; /* Likely next allocation offset */
++ unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
+ };
+
+ struct swap_cluster_list {
+--- a/mm/swapfile.c~mm-swap-allow-storage-of-all-mthp-orders
++++ a/mm/swapfile.c
+@@ -551,10 +551,12 @@ static void free_cluster(struct swap_inf
+
+ /*
+ * The cluster corresponding to page_nr will be used. The cluster will be
+- * removed from free cluster list and its usage counter will be increased.
++ * removed from free cluster list and its usage counter will be increased by
++ * count.
+ */
+-static void inc_cluster_info_page(struct swap_info_struct *p,
+- struct swap_cluster_info *cluster_info, unsigned long page_nr)
++static void add_cluster_info_page(struct swap_info_struct *p,
++ struct swap_cluster_info *cluster_info, unsigned long page_nr,
++ unsigned long count)
+ {
+ unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+
+@@ -563,9 +565,19 @@ static void inc_cluster_info_page(struct
+ if (cluster_is_free(&cluster_info[idx]))
+ alloc_cluster(p, idx);
+
+- VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
++ VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER);
+ cluster_set_count(&cluster_info[idx],
+- cluster_count(&cluster_info[idx]) + 1);
++ cluster_count(&cluster_info[idx]) + count);
++}
++
++/*
++ * The cluster corresponding to page_nr will be used. The cluster will be
++ * removed from free cluster list and its usage counter will be increased by 1.
++ */
++static void inc_cluster_info_page(struct swap_info_struct *p,
++ struct swap_cluster_info *cluster_info, unsigned long page_nr)
++{
++ add_cluster_info_page(p, cluster_info, page_nr, 1);
+ }
+
+ /*
+@@ -595,7 +607,7 @@ static void dec_cluster_info_page(struct
+ */
+ static bool
+ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
+- unsigned long offset)
++ unsigned long offset, int order)
+ {
+ struct percpu_cluster *percpu_cluster;
+ bool conflict;
+@@ -609,24 +621,39 @@ scan_swap_map_ssd_cluster_conflict(struc
+ return false;
+
+ percpu_cluster = this_cpu_ptr(si->percpu_cluster);
+- percpu_cluster->next = SWAP_NEXT_INVALID;
++ percpu_cluster->next[order] = SWAP_NEXT_INVALID;
++ return true;
++}
++
++static inline bool swap_range_empty(char *swap_map, unsigned int start,
++ unsigned int nr_pages)
++{
++ unsigned int i;
++
++ for (i = 0; i < nr_pages; i++) {
++ if (swap_map[start + i])
++ return false;
++ }
++
+ return true;
+ }
+
+ /*
+- * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
+- * might involve allocating a new cluster for current CPU too.
++ * Try to get swap entries with specified order from current cpu's swap entry
++ * pool (a cluster). This might involve allocating a new cluster for current CPU
++ * too.
+ */
+ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
+- unsigned long *offset, unsigned long *scan_base)
++ unsigned long *offset, unsigned long *scan_base, int order)
+ {
++ unsigned int nr_pages = 1 << order;
+ struct percpu_cluster *cluster;
+ struct swap_cluster_info *ci;
+ unsigned int tmp, max;
+
+ new_cluster:
+ cluster = this_cpu_ptr(si->percpu_cluster);
+- tmp = cluster->next;
++ tmp = cluster->next[order];
+ if (tmp == SWAP_NEXT_INVALID) {
+ if (!cluster_list_empty(&si->free_clusters)) {
+ tmp = cluster_next(&si->free_clusters.head) *
+@@ -647,26 +674,27 @@ new_cluster:
+
+ /*
+ * Other CPUs can use our cluster if they can't find a free cluster,
+- * check if there is still free entry in the cluster
++ * check if there is still free entry in the cluster, maintaining
++ * natural alignment.
+ */
+ max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER));
+ if (tmp < max) {
+ ci = lock_cluster(si, tmp);
+ while (tmp < max) {
+- if (!si->swap_map[tmp])
++ if (swap_range_empty(si->swap_map, tmp, nr_pages))
+ break;
+- tmp++;
++ tmp += nr_pages;
+ }
+ unlock_cluster(ci);
+ }
+ if (tmp >= max) {
+- cluster->next = SWAP_NEXT_INVALID;
++ cluster->next[order] = SWAP_NEXT_INVALID;
+ goto new_cluster;
+ }
+ *offset = tmp;
+ *scan_base = tmp;
+- tmp += 1;
+- cluster->next = tmp < max ? tmp : SWAP_NEXT_INVALID;
++ tmp += nr_pages;
++ cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID;
+ return true;
+ }
+
+@@ -796,13 +824,14 @@ static bool swap_offset_available_and_lo
+
+ static int scan_swap_map_slots(struct swap_info_struct *si,
+ unsigned char usage, int nr,
+- swp_entry_t slots[])
++ swp_entry_t slots[], int order)
+ {
+ struct swap_cluster_info *ci;
+ unsigned long offset;
+ unsigned long scan_base;
+ unsigned long last_in_cluster = 0;
+ int latency_ration = LATENCY_LIMIT;
++ unsigned int nr_pages = 1 << order;
+ int n_ret = 0;
+ bool scanned_many = false;
+
+@@ -817,6 +846,25 @@ static int scan_swap_map_slots(struct sw
+ * And we let swap pages go all over an SSD partition. Hugh
+ */
+
++ if (order > 0) {
++ /*
++ * Should not even be attempting large allocations when huge
++ * page swap is disabled. Warn and fail the allocation.
++ */
++ if (!IS_ENABLED(CONFIG_THP_SWAP) ||
++ nr_pages > SWAPFILE_CLUSTER) {
++ VM_WARN_ON_ONCE(1);
++ return 0;
++ }
++
++ /*
++ * Swapfile is not block device or not using clusters so unable
++ * to allocate large entries.
++ */
++ if (!(si->flags & SWP_BLKDEV) || !si->cluster_info)
++ return 0;
++ }
++
+ si->flags += SWP_SCANNING;
+ /*
+ * Use percpu scan base for SSD to reduce lock contention on
+@@ -831,8 +879,11 @@ static int scan_swap_map_slots(struct sw
+
+ /* SSD algorithm */
+ if (si->cluster_info) {
+- if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
++ if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) {
++ if (order > 0)
++ goto no_page;
+ goto scan;
++ }
+ } else if (unlikely(!si->cluster_nr--)) {
+ if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
+ si->cluster_nr = SWAPFILE_CLUSTER - 1;
+@@ -874,13 +925,16 @@ static int scan_swap_map_slots(struct sw
+
+ checks:
+ if (si->cluster_info) {
+- while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
++ while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) {
+ /* take a break if we already got some slots */
+ if (n_ret)
+ goto done;
+ if (!scan_swap_map_try_ssd_cluster(si, &offset,
+- &scan_base))
++ &scan_base, order)) {
++ if (order > 0)
++ goto no_page;
+ goto scan;
++ }
+ }
+ }
+ if (!(si->flags & SWP_WRITEOK))
+@@ -911,11 +965,11 @@ checks:
+ else
+ goto done;
+ }
+- WRITE_ONCE(si->swap_map[offset], usage);
+- inc_cluster_info_page(si, si->cluster_info, offset);
++ memset(si->swap_map + offset, usage, nr_pages);
++ add_cluster_info_page(si, si->cluster_info, offset, nr_pages);
+ unlock_cluster(ci);
+
+- swap_range_alloc(si, offset, 1);
++ swap_range_alloc(si, offset, nr_pages);
+ slots[n_ret++] = swp_entry(si->type, offset);
+
+ /* got enough slots or reach max slots? */
+@@ -936,8 +990,10 @@ checks:
+
+ /* try to get more slots in cluster */
+ if (si->cluster_info) {
+- if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
++ if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order))
+ goto checks;
++ if (order > 0)
++ goto done;
+ } else if (si->cluster_nr && !si->swap_map[++offset]) {
+ /* non-ssd case, still more slots in cluster? */
+ --si->cluster_nr;
+@@ -964,11 +1020,13 @@ checks:
+ }
+
+ done:
+- set_cluster_next(si, offset + 1);
++ if (order == 0)
++ set_cluster_next(si, offset + 1);
+ si->flags -= SWP_SCANNING;
+ return n_ret;
+
+ scan:
++ VM_WARN_ON(order > 0);
+ spin_unlock(&si->lock);
+ while (++offset <= READ_ONCE(si->highest_bit)) {
+ if (unlikely(--latency_ration < 0)) {
+@@ -997,38 +1055,6 @@ no_page:
+ return n_ret;
+ }
+
+-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
+-{
+- unsigned long idx;
+- struct swap_cluster_info *ci;
+- unsigned long offset;
+-
+- /*
+- * Should not even be attempting cluster allocations when huge
+- * page swap is disabled. Warn and fail the allocation.
+- */
+- if (!IS_ENABLED(CONFIG_THP_SWAP)) {
+- VM_WARN_ON_ONCE(1);
+- return 0;
+- }
+-
+- if (cluster_list_empty(&si->free_clusters))
+- return 0;
+-
+- idx = cluster_list_first(&si->free_clusters);
+- offset = idx * SWAPFILE_CLUSTER;
+- ci = lock_cluster(si, offset);
+- alloc_cluster(si, idx);
+- cluster_set_count(ci, SWAPFILE_CLUSTER);
+-
+- memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER);
+- unlock_cluster(ci);
+- swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
+- *slot = swp_entry(si->type, offset);
+-
+- return 1;
+-}
+-
+ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
+ {
+ unsigned long offset = idx * SWAPFILE_CLUSTER;
+@@ -1051,9 +1077,6 @@ int get_swap_pages(int n_goal, swp_entry
+ int n_ret = 0;
+ int node;
+
+- /* Only single cluster request supported */
+- WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
+-
+ spin_lock(&swap_avail_lock);
+
+ avail_pgs = atomic_long_read(&nr_swap_pages) / size;
+@@ -1089,14 +1112,10 @@ start_over:
+ spin_unlock(&si->lock);
+ goto nextsi;
+ }
+- if (size == SWAPFILE_CLUSTER) {
+- if (si->flags & SWP_BLKDEV)
+- n_ret = swap_alloc_cluster(si, swp_entries);
+- } else
+- n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+- n_goal, swp_entries);
++ n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
++ n_goal, swp_entries, order);
+ spin_unlock(&si->lock);
+- if (n_ret || size == SWAPFILE_CLUSTER)
++ if (n_ret || size > 1)
+ goto check_out;
+ cond_resched();
+
+@@ -1673,7 +1692,7 @@ swp_entry_t get_swap_page_of_type(int ty
+
+ /* This is called for allocating swap entry, not cache */
+ spin_lock(&si->lock);
+- if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry))
++ if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
+ atomic_long_dec(&nr_swap_pages);
+ spin_unlock(&si->lock);
+ fail:
+@@ -3127,7 +3146,7 @@ SYSCALL_DEFINE2(swapon, const char __use
+ p->flags |= SWP_SYNCHRONOUS_IO;
+
+ if (p->bdev && bdev_nonrot(p->bdev)) {
+- int cpu;
++ int cpu, i;
+ unsigned long ci, nr_cluster;
+
+ p->flags |= SWP_SOLIDSTATE;
+@@ -3165,7 +3184,8 @@ SYSCALL_DEFINE2(swapon, const char __use
+ struct percpu_cluster *cluster;
+
+ cluster = per_cpu_ptr(p->percpu_cluster, cpu);
+- cluster->next = SWAP_NEXT_INVALID;
++ for (i = 0; i < SWAP_NR_ORDERS; i++)
++ cluster->next[i] = SWAP_NEXT_INVALID;
+ }
+ } else {
+ atomic_inc(&nr_rotate_swap);
+_
diff --git a/patches/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.patch b/patches/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.patch
new file mode 100644
index 000000000..655016bb0
--- /dev/null
+++ b/patches/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.patch
@@ -0,0 +1,408 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: free_swap_and_cache_nr() as batched free_swap_and_cache()
+Date: Mon, 8 Apr 2024 19:39:41 +0100
+
+Now that we no longer have a convenient flag in the cluster to determine
+if a folio is large, free_swap_and_cache() will take a reference and lock
+a large folio much more often, which could lead to contention and (e.g.)
+failure to split large folios, etc.
+
+Let's solve that problem by batch freeing swap and cache with a new
+function, free_swap_and_cache_nr(), to free a contiguous range of swap
+entries together. This allows us to first drop a reference to each swap
+slot before we try to release the cache folio. This means we only try to
+release the folio once, only taking the reference and lock once - much
+better than the previous 512 times for the 2M THP case.
+
+Contiguous swap entries are gathered in zap_pte_range() and
+madvise_free_pte_range() in a similar way to how present ptes are already
+gathered in zap_pte_range().
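+
+A minimal user-space sketch of the gathering step is below; it is
+illustrative only (not part of this patch) and reduces a swap pte to just
+its offset, whereas the real swap_pte_batch() also requires matching swap
+type and swp pte bits:
+
+	#include <stdio.h>
+
+	/*
+	 * Count how many consecutive slots hold swap entries with
+	 * consecutive offsets, starting at slot 0 and looking at no
+	 * more than max_nr slots.
+	 */
+	static int swap_batch(const unsigned long *offsets, int max_nr)
+	{
+		unsigned long expected = offsets[0] + 1;
+		int i;
+
+		for (i = 1; i < max_nr; i++) {
+			if (offsets[i] != expected)
+				break;
+			expected++;
+		}
+		return i;
+	}
+
+	int main(void)
+	{
+		unsigned long slots[] = { 100, 101, 102, 200 };
+
+		printf("%d\n", swap_batch(slots, 4));	/* 3 */
+		return 0;
+	}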
+
+While we are at it, let's simplify by converting the return type of both
+functions to void. The return value was used only by zap_pte_range() to
+print a bad pte, and was ignored by everyone else, so the extra reporting
+wasn't exactly guaranteed. We will still get the warning with most of the
+information from get_swap_device(). With the batch version, we wouldn't
+know which pte was bad anyway so could print the wrong one.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-3-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ include/linux/pgtable.h | 29 +++++++++++
+ include/linux/swap.h | 12 +++-
+ mm/internal.h | 63 ++++++++++++++++++++++++
+ mm/madvise.c | 12 +++-
+ mm/memory.c | 13 ++---
+ mm/swapfile.c | 97 ++++++++++++++++++++++++++++++--------
+ 6 files changed, 195 insertions(+), 31 deletions(-)
+
+--- a/include/linux/pgtable.h~mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache
++++ a/include/linux/pgtable.h
+@@ -708,6 +708,35 @@ static inline void pte_clear_not_present
+ }
+ #endif
+
++#ifndef clear_not_present_full_ptes
++/**
++ * clear_not_present_full_ptes - Clear multiple not present PTEs which are
++ * consecutive in the pgtable.
++ * @mm: Address space the ptes represent.
++ * @addr: Address of the first pte.
++ * @ptep: Page table pointer for the first entry.
++ * @nr: Number of entries to clear.
++ * @full: Whether we are clearing a full mm.
++ *
++ * May be overridden by the architecture; otherwise, implemented as a simple
++ * loop over pte_clear_not_present_full().
++ *
++ * Context: The caller holds the page table lock. The PTEs are all not present.
++ * The PTEs are all in the same PMD.
++ */
++static inline void clear_not_present_full_ptes(struct mm_struct *mm,
++ unsigned long addr, pte_t *ptep, unsigned int nr, int full)
++{
++ for (;;) {
++ pte_clear_not_present_full(mm, addr, ptep, full);
++ if (--nr == 0)
++ break;
++ ptep++;
++ addr += PAGE_SIZE;
++ }
++}
++#endif
++
+ #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
+ extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
+ unsigned long address,
+--- a/include/linux/swap.h~mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache
++++ a/include/linux/swap.h
+@@ -468,7 +468,7 @@ extern int swap_duplicate(swp_entry_t);
+ extern int swapcache_prepare(swp_entry_t);
+ extern void swap_free(swp_entry_t);
+ extern void swapcache_free_entries(swp_entry_t *entries, int n);
+-extern int free_swap_and_cache(swp_entry_t);
++extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
+ int swap_type_of(dev_t device, sector_t offset);
+ int find_first_swap(dev_t *device);
+ extern unsigned int count_swap_pages(int, int);
+@@ -517,8 +517,9 @@ static inline void put_swap_device(struc
+ #define free_pages_and_swap_cache(pages, nr) \
+ release_pages((pages), (nr));
+
+-/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
+-#define free_swap_and_cache(e) is_pfn_swap_entry(e)
++static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
++{
++}
+
+ static inline void free_swap_cache(struct folio *folio)
+ {
+@@ -586,6 +587,11 @@ static inline int add_swap_extent(struct
+ }
+ #endif /* CONFIG_SWAP */
+
++static inline void free_swap_and_cache(swp_entry_t entry)
++{
++ free_swap_and_cache_nr(entry, 1);
++}
++
+ #ifdef CONFIG_MEMCG
+ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
+ {
+--- a/mm/internal.h~mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache
++++ a/mm/internal.h
+@@ -11,6 +11,8 @@
+ #include <linux/mm.h>
+ #include <linux/pagemap.h>
+ #include <linux/rmap.h>
++#include <linux/swap.h>
++#include <linux/swapops.h>
+ #include <linux/tracepoint-defs.h>
+
+ struct folio_batch;
+@@ -189,6 +191,67 @@ static inline int folio_pte_batch(struct
+
+ return min(ptep - start_ptep, max_nr);
+ }
++
++/**
++ * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
++ * @pte: The initial pte state; is_swap_pte(pte) must be true.
++ *
++ * Increments the swap offset, while maintaining all other fields, including
++ * swap type, and any swp pte bits. The resulting pte is returned.
++ */
++static inline pte_t pte_next_swp_offset(pte_t pte)
++{
++ swp_entry_t entry = pte_to_swp_entry(pte);
++ pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
++ swp_offset(entry) + 1));
++
++ if (pte_swp_soft_dirty(pte))
++ new = pte_swp_mksoft_dirty(new);
++ if (pte_swp_exclusive(pte))
++ new = pte_swp_mkexclusive(new);
++ if (pte_swp_uffd_wp(pte))
++ new = pte_swp_mkuffd_wp(new);
++
++ return new;
++}
++
++/**
++ * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
++ * @start_ptep: Page table pointer for the first entry.
++ * @max_nr: The maximum number of table entries to consider.
++ * @pte: Page table entry for the first entry.
++ *
++ * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs
++ * containing swap entries all with consecutive offsets and targeting the same
++ * swap type, all with matching swp pte bits.
++ *
++ * max_nr must be at least one and must be limited by the caller so scanning
++ * cannot exceed a single page table.
++ *
++ * Return: the number of table entries in the batch.
++ */
++static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
++{
++ pte_t expected_pte = pte_next_swp_offset(pte);
++ const pte_t *end_ptep = start_ptep + max_nr;
++ pte_t *ptep = start_ptep + 1;
++
++ VM_WARN_ON(max_nr < 1);
++ VM_WARN_ON(!is_swap_pte(pte));
++ VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
++
++ while (ptep < end_ptep) {
++ pte = ptep_get(ptep);
++
++ if (!pte_same(pte, expected_pte))
++ break;
++
++ expected_pte = pte_next_swp_offset(expected_pte);
++ ptep++;
++ }
++
++ return ptep - start_ptep;
++}
+ #endif /* CONFIG_MMU */
+
+ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
+--- a/mm/madvise.c~mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache
++++ a/mm/madvise.c
+@@ -628,6 +628,7 @@ static int madvise_free_pte_range(pmd_t
+ struct folio *folio;
+ int nr_swap = 0;
+ unsigned long next;
++ int nr, max_nr;
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd))
+@@ -640,7 +641,8 @@ static int madvise_free_pte_range(pmd_t
+ return 0;
+ flush_tlb_batched_pending(mm);
+ arch_enter_lazy_mmu_mode();
+- for (; addr != end; pte++, addr += PAGE_SIZE) {
++ for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
++ nr = 1;
+ ptent = ptep_get(pte);
+
+ if (pte_none(ptent))
+@@ -655,9 +657,11 @@ static int madvise_free_pte_range(pmd_t
+
+ entry = pte_to_swp_entry(ptent);
+ if (!non_swap_entry(entry)) {
+- nr_swap--;
+- free_swap_and_cache(entry);
+- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
++ max_nr = (end - addr) / PAGE_SIZE;
++ nr = swap_pte_batch(pte, max_nr, ptent);
++ nr_swap -= nr;
++ free_swap_and_cache_nr(entry, nr);
++ clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+ } else if (is_hwpoison_entry(entry) ||
+ is_poisoned_swp_entry(entry)) {
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+--- a/mm/memory.c~mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache
++++ a/mm/memory.c
+@@ -1637,12 +1637,13 @@ static unsigned long zap_pte_range(struc
+ folio_remove_rmap_pte(folio, page, vma);
+ folio_put(folio);
+ } else if (!non_swap_entry(entry)) {
+- /* Genuine swap entry, hence a private anon page */
++ max_nr = (end - addr) / PAGE_SIZE;
++ nr = swap_pte_batch(pte, max_nr, ptent);
++ /* Genuine swap entries, hence a private anon pages */
+ if (!should_zap_cows(details))
+ continue;
+- rss[MM_SWAPENTS]--;
+- if (unlikely(!free_swap_and_cache(entry)))
+- print_bad_pte(vma, addr, ptent, NULL);
++ rss[MM_SWAPENTS] -= nr;
++ free_swap_and_cache_nr(entry, nr);
+ } else if (is_migration_entry(entry)) {
+ folio = pfn_swap_entry_folio(entry);
+ if (!should_zap_folio(details, folio))
+@@ -1665,8 +1666,8 @@ static unsigned long zap_pte_range(struc
+ pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
+ WARN_ON_ONCE(1);
+ }
+- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+- zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
++ clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
++ zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
+ } while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
+
+ add_mm_rss_vec(mm, rss);
+--- a/mm/swapfile.c~mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache
++++ a/mm/swapfile.c
+@@ -130,7 +130,11 @@ static inline unsigned char swap_count(u
+ /* Reclaim the swap entry if swap is getting full*/
+ #define TTRS_FULL 0x4
+
+-/* returns 1 if swap entry is freed */
++/*
++ * returns number of pages in the folio that backs the swap entry. If positive,
++ * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
++ * folio was associated with the swap entry.
++ */
+ static int __try_to_reclaim_swap(struct swap_info_struct *si,
+ unsigned long offset, unsigned long flags)
+ {
+@@ -155,6 +159,7 @@ static int __try_to_reclaim_swap(struct
+ ret = folio_free_swap(folio);
+ folio_unlock(folio);
+ }
++ ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio);
+ folio_put(folio);
+ return ret;
+ }
+@@ -895,7 +900,7 @@ checks:
+ swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
+ spin_lock(&si->lock);
+ /* entry was freed successfully, try to use this again */
+- if (swap_was_freed)
++ if (swap_was_freed > 0)
+ goto checks;
+ goto scan; /* check next one */
+ }
+@@ -1572,32 +1577,88 @@ bool folio_free_swap(struct folio *folio
+ return true;
+ }
+
+-/*
+- * Free the swap entry like above, but also try to
+- * free the page cache entry if it is the last user.
++/**
++ * free_swap_and_cache_nr() - Release reference on range of swap entries and
++ * reclaim their cache if no more references remain.
++ * @entry: First entry of range.
++ * @nr: Number of entries in range.
++ *
++ * For each swap entry in the contiguous range, release a reference. If any swap
++ * entries become free, try to reclaim their underlying folios, if present. The
++ * offset range is defined by [entry.offset, entry.offset + nr).
+ */
+-int free_swap_and_cache(swp_entry_t entry)
++void free_swap_and_cache_nr(swp_entry_t entry, int nr)
+ {
+- struct swap_info_struct *p;
++ const unsigned long start_offset = swp_offset(entry);
++ const unsigned long end_offset = start_offset + nr;
++ unsigned int type = swp_type(entry);
++ struct swap_info_struct *si;
++ bool any_only_cache = false;
++ unsigned long offset;
+ unsigned char count;
+
+ if (non_swap_entry(entry))
+- return 1;
++ return;
++
++ si = get_swap_device(entry);
++ if (!si)
++ return;
++
++ if (WARN_ON(end_offset > si->max))
++ goto out;
+
+- p = get_swap_device(entry);
+- if (p) {
+- if (WARN_ON(data_race(!p->swap_map[swp_offset(entry)]))) {
+- put_swap_device(p);
+- return 0;
++ /*
++ * First free all entries in the range.
++ */
++ for (offset = start_offset; offset < end_offset; offset++) {
++ if (data_race(si->swap_map[offset])) {
++ count = __swap_entry_free(si, swp_entry(type, offset));
++ if (count == SWAP_HAS_CACHE)
++ any_only_cache = true;
++ } else {
++ WARN_ON_ONCE(1);
+ }
++ }
++
++ /*
++ * Short-circuit the below loop if none of the entries had their
++ * reference drop to zero.
++ */
++ if (!any_only_cache)
++ goto out;
+
+- count = __swap_entry_free(p, entry);
+- if (count == SWAP_HAS_CACHE)
+- __try_to_reclaim_swap(p, swp_offset(entry),
++ /*
++ * Now go back over the range trying to reclaim the swap cache. This is
++ * more efficient for large folios because we will only try to reclaim
++ * the swap once per folio in the common case. If we do
++ * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
++ * latter will get a reference and lock the folio for every individual
++ * page but will only succeed once the swap slot for every subpage is
++ * zero.
++ */
++ for (offset = start_offset; offset < end_offset; offset += nr) {
++ nr = 1;
++ if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
++ /*
++ * Folios are always naturally aligned in swap so
++ * advance forward to the next boundary. Zero means no
++ * folio was found for the swap entry, so advance by 1
++ * in this case. Negative value means folio was found
++ * but could not be reclaimed. Here we can still advance
++ * to the next boundary.
++ */
++ nr = __try_to_reclaim_swap(si, offset,
+ TTRS_UNMAPPED | TTRS_FULL);
+- put_swap_device(p);
++ if (nr == 0)
++ nr = 1;
++ else if (nr < 0)
++ nr = -nr;
++ nr = ALIGN(offset + 1, nr) - offset;
++ }
+ }
+- return p != NULL;
++
++out:
++ put_swap_device(si);
+ }
+
+ #ifdef CONFIG_HIBERNATION
+_
diff --git a/patches/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.patch b/patches/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.patch
new file mode 100644
index 000000000..aeb4e608e
--- /dev/null
+++ b/patches/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.patch
@@ -0,0 +1,271 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: remove CLUSTER_FLAG_HUGE from swap_cluster_info:flags
+Date: Mon, 8 Apr 2024 19:39:40 +0100
+
+Patch series "Swap-out mTHP without splitting", v7.
+
+This series adds support for swapping out multi-size THP (mTHP) without
+needing to first split the large folio via
+split_huge_page_to_list_to_order(). It closely follows the approach
+already used to swap-out PMD-sized THP.
+
+There are a couple of reasons for swapping out mTHP without splitting:
+
+ - Performance: It is expensive to split a large folio and under
+ extreme memory pressure some workloads regressed performance when
+ using 64K mTHP vs 4K small folios because of this extra cost in the
+ swap-out path. This series not only eliminates the regression but
+ makes it faster to swap out 64K mTHP vs 4K small folios.
+
+ - Memory fragmentation avoidance: If we can avoid splitting a large
+ folio memory is less likely to become fragmented, making it easier to
+ re-allocate a large folio in future.
+
+ - Performance: Enables a separate series [7] to swap-in whole mTHPs,
+ which means we won't lose the TLB-efficiency benefits of mTHP once the
+ memory has been through a swap cycle.
+
+I've done what I thought was the smallest change possible, and as a
+result, this approach is only employed when the swap is backed by a
+non-rotating block device (just as PMD-sized THP is supported today).
+Discussion against the RFC concluded that this is sufficient.
+
+
+Performance Testing
+===================
+
+I've run some swap performance tests on Ampere Altra VM (arm64) with 8
+CPUs. The VM is set up with a 35G block ram device as the swap device and
+the test is run from inside a memcg limited to 40G memory. I've then run
+`usemem` from vm-scalability with 70 processes, each allocating and
+writing 1G of memory. I've repeated everything 6 times and taken the mean
+performance improvement relative to 4K page baseline:
+
+| alloc size | baseline | + this series |
+| | mm-unstable (~v6.9-rc1) | |
+|:-----------|------------------------:|------------------------:|
+| 4K Page | 0.0% | 1.3% |
+| 64K THP | -13.6% | 46.3% |
+| 2M THP | 91.4% | 89.6% |
+
+So with this change, the 64K swap performance goes from a 14% regression to a
+46% improvement. While 2M shows a small regression, I'm confident that this is
+just noise.
+
+[1] https://lore.kernel.org/linux-mm/20231010142111.3997780-1-ryan.roberts@arm.com/
+[2] https://lore.kernel.org/linux-mm/20231017161302.2518826-1-ryan.roberts@arm.com/
+[3] https://lore.kernel.org/linux-mm/20231025144546.577640-1-ryan.roberts@arm.com/
+[4] https://lore.kernel.org/linux-mm/20240311150058.1122862-1-ryan.roberts@arm.com/
+[5] https://lore.kernel.org/linux-mm/20240327144537.4165578-1-ryan.roberts@arm.com/
+[6] https://lore.kernel.org/linux-mm/20240403114032.1162100-1-ryan.roberts@arm.com/
+[7] https://lore.kernel.org/linux-mm/20240304081348.197341-1-21cnbao@gmail.com/
+[8] https://lore.kernel.org/linux-mm/CAGsJ_4yMOow27WDvN2q=E4HAtDd2PJ=OQ5Pj9DG+6FLWwNuXUw@mail.gmail.com/
+[9] https://lore.kernel.org/linux-mm/579d5127-c763-4001-9625-4563a9316ac3@redhat.com/
+
+
+This patch (of 7):
+
+As preparation for supporting small-sized THP in the swap-out path,
+without first needing to split to order-0, remove CLUSTER_FLAG_HUGE,
+which, when present, always implies PMD-sized THP, which is the same as
+the cluster size.
+
+The only use of the flag was to determine whether a swap entry refers to a
+single page or a PMD-sized THP in swap_page_trans_huge_swapped(). Instead
+of relying on the flag, we now pass in order, which originates from the
+folio's order. This allows the logic to work for folios of any order.
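+
+A minimal user-space sketch of the order-based check is below; it is
+illustrative only (not part of this patch), reduces swap counts to a plain
+byte map, and relies on folios being naturally aligned in swap:
+
+	#include <stdio.h>
+
+	#define round_down(x, y)	((x) & ~((y) - 1))	/* y is a power of 2 */
+
+	/* Is any entry of the folio containing 'roffset' still in use? */
+	static int any_entry_in_use(const unsigned char *map,
+				    unsigned long roffset, int order)
+	{
+		unsigned long nr_pages = 1UL << order;
+		unsigned long offset = round_down(roffset, nr_pages);
+		unsigned long i;
+
+		for (i = 0; i < nr_pages; i++)
+			if (map[offset + i])
+				return 1;
+		return 0;
+	}
+
+	int main(void)
+	{
+		unsigned char map[16] = { 0 };
+
+		map[5] = 1;
+		printf("%d %d\n",
+		       any_entry_in_use(map, 7, 2),	/* 1: checks 4..7 */
+		       any_entry_in_use(map, 11, 2));	/* 0: checks 8..11 */
+		return 0;
+	}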
+
+The one snag is that one of the swap_page_trans_huge_swapped() call sites
+does not have the folio. But it was only being called there to shortcut a
+call to __try_to_reclaim_swap() in some cases. __try_to_reclaim_swap() gets
+the folio and (via some other functions) calls
+swap_page_trans_huge_swapped(). So I've removed the problematic call site
+and believe the new logic should be functionally equivalent.
+
+That said, removing the fast path means that we will take a reference and
+trylock a large folio much more often, which we would like to avoid. The
+next patch will solve this.
+
+Removing CLUSTER_FLAG_HUGE also means we can remove split_swap_cluster()
+which used to be called during folio splitting, since
+split_swap_cluster()'s only job was to remove the flag.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-1-ryan.roberts@arm.com
+Link: https://lkml.kernel.org/r/20240408183946.2991168-2-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Acked-by: Chris Li <chrisl@kernel.org>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ include/linux/swap.h | 10 --------
+ mm/huge_memory.c | 3 --
+ mm/swapfile.c | 47 ++++++-----------------------------------
+ 3 files changed, 8 insertions(+), 52 deletions(-)
+
+--- a/include/linux/swap.h~mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags
++++ a/include/linux/swap.h
+@@ -259,7 +259,6 @@ struct swap_cluster_info {
+ };
+ #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
+ #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
+-#define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */
+
+ /*
+ * We assign a cluster to each CPU, so each CPU can allocate swap entry from
+@@ -587,15 +586,6 @@ static inline int add_swap_extent(struct
+ }
+ #endif /* CONFIG_SWAP */
+
+-#ifdef CONFIG_THP_SWAP
+-extern int split_swap_cluster(swp_entry_t entry);
+-#else
+-static inline int split_swap_cluster(swp_entry_t entry)
+-{
+- return 0;
+-}
+-#endif
+-
+ #ifdef CONFIG_MEMCG
+ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
+ {
+--- a/mm/huge_memory.c~mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags
++++ a/mm/huge_memory.c
+@@ -2844,9 +2844,6 @@ static void __split_huge_page(struct pag
+ shmem_uncharge(folio->mapping->host, nr_dropped);
+ remap_page(folio, nr);
+
+- if (folio_test_swapcache(folio))
+- split_swap_cluster(folio->swap);
+-
+ /*
+ * set page to its compound_head when split to non order-0 pages, so
+ * we can skip unlocking it below, since PG_locked is transferred to
+--- a/mm/swapfile.c~mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags
++++ a/mm/swapfile.c
+@@ -343,18 +343,6 @@ static inline void cluster_set_null(stru
+ info->data = 0;
+ }
+
+-static inline bool cluster_is_huge(struct swap_cluster_info *info)
+-{
+- if (IS_ENABLED(CONFIG_THP_SWAP))
+- return info->flags & CLUSTER_FLAG_HUGE;
+- return false;
+-}
+-
+-static inline void cluster_clear_huge(struct swap_cluster_info *info)
+-{
+- info->flags &= ~CLUSTER_FLAG_HUGE;
+-}
+-
+ static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
+ unsigned long offset)
+ {
+@@ -1027,7 +1015,7 @@ static int swap_alloc_cluster(struct swa
+ offset = idx * SWAPFILE_CLUSTER;
+ ci = lock_cluster(si, offset);
+ alloc_cluster(si, idx);
+- cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
++ cluster_set_count(ci, SWAPFILE_CLUSTER);
+
+ memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER);
+ unlock_cluster(ci);
+@@ -1365,7 +1353,6 @@ void put_swap_folio(struct folio *folio,
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ if (size == SWAPFILE_CLUSTER) {
+- VM_BUG_ON(!cluster_is_huge(ci));
+ map = si->swap_map + offset;
+ for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+ val = map[i];
+@@ -1373,7 +1360,6 @@ void put_swap_folio(struct folio *folio,
+ if (val == SWAP_HAS_CACHE)
+ free_entries++;
+ }
+- cluster_clear_huge(ci);
+ if (free_entries == SWAPFILE_CLUSTER) {
+ unlock_cluster_or_swap_info(si, ci);
+ spin_lock(&si->lock);
+@@ -1395,23 +1381,6 @@ void put_swap_folio(struct folio *folio,
+ unlock_cluster_or_swap_info(si, ci);
+ }
+
+-#ifdef CONFIG_THP_SWAP
+-int split_swap_cluster(swp_entry_t entry)
+-{
+- struct swap_info_struct *si;
+- struct swap_cluster_info *ci;
+- unsigned long offset = swp_offset(entry);
+-
+- si = _swap_info_get(entry);
+- if (!si)
+- return -EBUSY;
+- ci = lock_cluster(si, offset);
+- cluster_clear_huge(ci);
+- unlock_cluster(ci);
+- return 0;
+-}
+-#endif
+-
+ static int swp_entry_cmp(const void *ent1, const void *ent2)
+ {
+ const swp_entry_t *e1 = ent1, *e2 = ent2;
+@@ -1519,22 +1488,23 @@ out:
+ }
+
+ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
+- swp_entry_t entry)
++ swp_entry_t entry, int order)
+ {
+ struct swap_cluster_info *ci;
+ unsigned char *map = si->swap_map;
++ unsigned int nr_pages = 1 << order;
+ unsigned long roffset = swp_offset(entry);
+- unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
++ unsigned long offset = round_down(roffset, nr_pages);
+ int i;
+ bool ret = false;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+- if (!ci || !cluster_is_huge(ci)) {
++ if (!ci || nr_pages == 1) {
+ if (swap_count(map[roffset]))
+ ret = true;
+ goto unlock_out;
+ }
+- for (i = 0; i < SWAPFILE_CLUSTER; i++) {
++ for (i = 0; i < nr_pages; i++) {
+ if (swap_count(map[offset + i])) {
+ ret = true;
+ break;
+@@ -1556,7 +1526,7 @@ static bool folio_swapped(struct folio *
+ if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
+ return swap_swapcount(si, entry) != 0;
+
+- return swap_page_trans_huge_swapped(si, entry);
++ return swap_page_trans_huge_swapped(si, entry, folio_order(folio));
+ }
+
+ /**
+@@ -1622,8 +1592,7 @@ int free_swap_and_cache(swp_entry_t entr
+ }
+
+ count = __swap_entry_free(p, entry);
+- if (count == SWAP_HAS_CACHE &&
+- !swap_page_trans_huge_swapped(p, entry))
++ if (count == SWAP_HAS_CACHE)
+ __try_to_reclaim_swap(p, swp_offset(entry),
+ TTRS_UNMAPPED | TTRS_FULL);
+ put_swap_device(p);
+_
diff --git a/patches/mm-swap-simplify-struct-percpu_cluster.patch b/patches/mm-swap-simplify-struct-percpu_cluster.patch
new file mode 100644
index 000000000..f53ce5982
--- /dev/null
+++ b/patches/mm-swap-simplify-struct-percpu_cluster.patch
@@ -0,0 +1,140 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: simplify struct percpu_cluster
+Date: Mon, 8 Apr 2024 19:39:42 +0100
+
+struct percpu_cluster stores the index of the cpu's current cluster and the
+offset of the next entry that will be allocated for the cpu. These two
+pieces of information are redundant because the cluster index is just
+(offset / SWAPFILE_CLUSTER). The only reason for explicitly keeping the
+cluster index is because the structure used for it also has a flag to
+indicate "no cluster". However, this data structure also contains a spin
+lock, which is never used in this context; as a side effect, the code
+copies the spinlock_t structure, which is questionable coding practice in
+my view.
+
+So let's clean this up and store only the next offset, and use a sentinel
+value (SWAP_NEXT_INVALID) to indicate "no cluster". SWAP_NEXT_INVALID is
+chosen to be 0, because 0 will never be seen legitimately; the first page
+in the swap file is the swap header, which is always marked bad to prevent
+it from being allocated as an entry. This also prevents the cluster to
+which it belongs being marked free, so it will never appear on the free
+list.
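+
+A tiny user-space sketch of the redundancy being removed is below; it is
+illustrative only (not part of this patch). The cluster index is fully
+determined by the next-allocation offset, and offset 0 can double as the
+"no cluster" sentinel:
+
+	#include <assert.h>
+	#include <stdio.h>
+
+	#define SWAPFILE_CLUSTER	512	/* 4K pages, 2M PMD */
+	#define SWAP_NEXT_INVALID	0	/* swap header, never allocated */
+
+	static unsigned int cluster_idx(unsigned int next)
+	{
+		assert(next != SWAP_NEXT_INVALID);
+		return next / SWAPFILE_CLUSTER;
+	}
+
+	int main(void)
+	{
+		printf("%u %u\n", cluster_idx(513), cluster_idx(1024));	/* 1 2 */
+		return 0;
+	}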
+
+This change saves 16 bytes per cpu. And given we are shortly going to
+extend this mechanism to be per-cpu-AND-per-order, we will end up saving
+16 * 9 = 144 bytes per cpu, which adds up if you have 256 cpus in the
+system.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-4-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ include/linux/swap.h | 9 ++++++++-
+ mm/swapfile.c | 22 +++++++++++-----------
+ 2 files changed, 19 insertions(+), 12 deletions(-)
+
+--- a/include/linux/swap.h~mm-swap-simplify-struct-percpu_cluster
++++ a/include/linux/swap.h
+@@ -261,12 +261,19 @@ struct swap_cluster_info {
+ #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
+
+ /*
++ * The first page in the swap file is the swap header, which is always marked
++ * bad to prevent it from being allocated as an entry. This also prevents the
++ * cluster to which it belongs being marked free. Therefore 0 is safe to use as
++ * a sentinel to indicate next is not valid in percpu_cluster.
++ */
++#define SWAP_NEXT_INVALID 0
++
++/*
+ * We assign a cluster to each CPU, so each CPU can allocate swap entry from
+ * its own cluster and swapout sequentially. The purpose is to optimize swapout
+ * throughput.
+ */
+ struct percpu_cluster {
+- struct swap_cluster_info index; /* Current cluster index */
+ unsigned int next; /* Likely next allocation offset */
+ };
+
+--- a/mm/swapfile.c~mm-swap-simplify-struct-percpu_cluster
++++ a/mm/swapfile.c
+@@ -609,7 +609,7 @@ scan_swap_map_ssd_cluster_conflict(struc
+ return false;
+
+ percpu_cluster = this_cpu_ptr(si->percpu_cluster);
+- cluster_set_null(&percpu_cluster->index);
++ percpu_cluster->next = SWAP_NEXT_INVALID;
+ return true;
+ }
+
+@@ -622,14 +622,14 @@ static bool scan_swap_map_try_ssd_cluste
+ {
+ struct percpu_cluster *cluster;
+ struct swap_cluster_info *ci;
+- unsigned long tmp, max;
++ unsigned int tmp, max;
+
+ new_cluster:
+ cluster = this_cpu_ptr(si->percpu_cluster);
+- if (cluster_is_null(&cluster->index)) {
++ tmp = cluster->next;
++ if (tmp == SWAP_NEXT_INVALID) {
+ if (!cluster_list_empty(&si->free_clusters)) {
+- cluster->index = si->free_clusters.head;
+- cluster->next = cluster_next(&cluster->index) *
++ tmp = cluster_next(&si->free_clusters.head) *
+ SWAPFILE_CLUSTER;
+ } else if (!cluster_list_empty(&si->discard_clusters)) {
+ /*
+@@ -649,9 +649,7 @@ new_cluster:
+ * Other CPUs can use our cluster if they can't find a free cluster,
+ * check if there is still free entry in the cluster
+ */
+- tmp = cluster->next;
+- max = min_t(unsigned long, si->max,
+- (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
++ max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER));
+ if (tmp < max) {
+ ci = lock_cluster(si, tmp);
+ while (tmp < max) {
+@@ -662,12 +660,13 @@ new_cluster:
+ unlock_cluster(ci);
+ }
+ if (tmp >= max) {
+- cluster_set_null(&cluster->index);
++ cluster->next = SWAP_NEXT_INVALID;
+ goto new_cluster;
+ }
+- cluster->next = tmp + 1;
+ *offset = tmp;
+ *scan_base = tmp;
++ tmp += 1;
++ cluster->next = tmp < max ? tmp : SWAP_NEXT_INVALID;
+ return true;
+ }
+
+@@ -3163,8 +3162,9 @@ SYSCALL_DEFINE2(swapon, const char __use
+ }
+ for_each_possible_cpu(cpu) {
+ struct percpu_cluster *cluster;
++
+ cluster = per_cpu_ptr(p->percpu_cluster, cpu);
+- cluster_set_null(&cluster->index);
++ cluster->next = SWAP_NEXT_INVALID;
+ }
+ } else {
+ atomic_inc(&nr_rotate_swap);
+_
diff --git a/patches/mm-swap-update-get_swap_pages-to-take-folio-order.patch b/patches/mm-swap-update-get_swap_pages-to-take-folio-order.patch
new file mode 100644
index 000000000..75ef345ae
--- /dev/null
+++ b/patches/mm-swap-update-get_swap_pages-to-take-folio-order.patch
@@ -0,0 +1,116 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: update get_swap_pages() to take folio order
+Date: Mon, 8 Apr 2024 19:39:43 +0100
+
+We are about to allow swap storage of any mTHP size. To prepare for that,
+let's change get_swap_pages() to take a folio order parameter instead of
+nr_pages. This makes the interface self-documenting; a power-of-2 number
+of pages must be provided. We will also need the order internally so this
+simplifies accessing it.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-5-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ include/linux/swap.h | 2 +-
+ mm/swap_slots.c | 6 +++---
+ mm/swapfile.c | 13 +++++++------
+ 3 files changed, 11 insertions(+), 10 deletions(-)
+
+--- a/include/linux/swap.h~mm-swap-update-get_swap_pages-to-take-folio-order
++++ a/include/linux/swap.h
+@@ -468,7 +468,7 @@ swp_entry_t folio_alloc_swap(struct foli
+ bool folio_free_swap(struct folio *folio);
+ void put_swap_folio(struct folio *folio, swp_entry_t entry);
+ extern swp_entry_t get_swap_page_of_type(int);
+-extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
++extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order);
+ extern int add_swap_count_continuation(swp_entry_t, gfp_t);
+ extern void swap_shmem_alloc(swp_entry_t);
+ extern int swap_duplicate(swp_entry_t);
+--- a/mm/swapfile.c~mm-swap-update-get_swap_pages-to-take-folio-order
++++ a/mm/swapfile.c
+@@ -278,15 +278,15 @@ static void discard_swap_cluster(struct
+ #ifdef CONFIG_THP_SWAP
+ #define SWAPFILE_CLUSTER HPAGE_PMD_NR
+
+-#define swap_entry_size(size) (size)
++#define swap_entry_order(order) (order)
+ #else
+ #define SWAPFILE_CLUSTER 256
+
+ /*
+- * Define swap_entry_size() as constant to let compiler to optimize
++ * Define swap_entry_order() as constant to let compiler to optimize
+ * out some code if !CONFIG_THP_SWAP
+ */
+-#define swap_entry_size(size) 1
++#define swap_entry_order(order) 0
+ #endif
+ #define LATENCY_LIMIT 256
+
+@@ -1042,9 +1042,10 @@ static void swap_free_cluster(struct swa
+ swap_range_free(si, offset, SWAPFILE_CLUSTER);
+ }
+
+-int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
++int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
+ {
+- unsigned long size = swap_entry_size(entry_size);
++ int order = swap_entry_order(entry_order);
++ unsigned long size = 1 << order;
+ struct swap_info_struct *si, *next;
+ long avail_pgs;
+ int n_ret = 0;
+@@ -1349,7 +1350,7 @@ void put_swap_folio(struct folio *folio,
+ unsigned char *map;
+ unsigned int i, free_entries = 0;
+ unsigned char val;
+- int size = swap_entry_size(folio_nr_pages(folio));
++ int size = 1 << swap_entry_order(folio_order(folio));
+
+ si = _swap_info_get(entry);
+ if (!si)
+--- a/mm/swap_slots.c~mm-swap-update-get_swap_pages-to-take-folio-order
++++ a/mm/swap_slots.c
+@@ -264,7 +264,7 @@ static int refill_swap_slots_cache(struc
+ cache->cur = 0;
+ if (swap_slot_cache_active)
+ cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
+- cache->slots, 1);
++ cache->slots, 0);
+
+ return cache->nr;
+ }
+@@ -311,7 +311,7 @@ swp_entry_t folio_alloc_swap(struct foli
+
+ if (folio_test_large(folio)) {
+ if (IS_ENABLED(CONFIG_THP_SWAP))
+- get_swap_pages(1, &entry, folio_nr_pages(folio));
++ get_swap_pages(1, &entry, folio_order(folio));
+ goto out;
+ }
+
+@@ -343,7 +343,7 @@ repeat:
+ goto out;
+ }
+
+- get_swap_pages(1, &entry, 1);
++ get_swap_pages(1, &entry, 0);
+ out:
+ if (mem_cgroup_try_charge_swap(folio, entry)) {
+ put_swap_folio(folio, entry);
+_
diff --git a/patches/mm-vmscan-avoid-split-during-shrink_folio_list.patch b/patches/mm-vmscan-avoid-split-during-shrink_folio_list.patch
new file mode 100644
index 000000000..544090bfc
--- /dev/null
+++ b/patches/mm-vmscan-avoid-split-during-shrink_folio_list.patch
@@ -0,0 +1,77 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: vmscan: avoid split during shrink_folio_list()
+Date: Mon, 8 Apr 2024 19:39:45 +0100
+
+Now that swap supports storing all mTHP sizes, avoid splitting large
+folios before swap-out. This benefits performance of the swap-out path by
+eliding split_folio_to_list(), which is expensive, and also sets us up for
+swapping in large folios in a future series.
+
+If the folio is partially mapped, we continue to split it since we want to
+avoid the extra IO overhead and storage of writing out pages
+unnecessarily.
+
+THP_SWPOUT and THP_SWPOUT_FALLBACK counters should continue to count
+events only for PMD-mappable folios to avoid user confusion. THP_SWPOUT
+already has the appropriate guard. Add a guard for THP_SWPOUT_FALLBACK.
+It may be appropriate to add per-size counters in future.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-7-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Barry Song <v-songbaohua@oppo.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ mm/vmscan.c | 20 ++++++++++----------
+ 1 file changed, 10 insertions(+), 10 deletions(-)
+
+--- a/mm/vmscan.c~mm-vmscan-avoid-split-during-shrink_folio_list
++++ a/mm/vmscan.c
+@@ -1206,25 +1206,25 @@ retry:
+ if (!can_split_folio(folio, NULL))
+ goto activate_locked;
+ /*
+- * Split folios without a PMD map right
+- * away. Chances are some or all of the
+- * tail pages can be freed without IO.
++ * Split partially mapped folios right away.
++ * We can free the unmapped pages without IO.
+ */
+- if (!folio_entire_mapcount(folio) &&
+- split_folio_to_list(folio,
+- folio_list))
++ if (data_race(!list_empty(&folio->_deferred_list)) &&
++ split_folio_to_list(folio, folio_list))
+ goto activate_locked;
+ }
+ if (!add_to_swap(folio)) {
+ if (!folio_test_large(folio))
+ goto activate_locked_split;
+ /* Fallback to swap normal pages */
+- if (split_folio_to_list(folio,
+- folio_list))
++ if (split_folio_to_list(folio, folio_list))
+ goto activate_locked;
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+- count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1);
+- count_vm_event(THP_SWPOUT_FALLBACK);
++ if (nr_pages >= HPAGE_PMD_NR) {
++ count_memcg_folio_events(folio,
++ THP_SWPOUT_FALLBACK, 1);
++ count_vm_event(THP_SWPOUT_FALLBACK);
++ }
+ #endif
+ if (!add_to_swap(folio))
+ goto activate_locked_split;
+_
diff --git a/pc/devel-series b/pc/devel-series
index 1ac0257b5..5cfaa541e 100644
--- a/pc/devel-series
+++ b/pc/devel-series
@@ -455,6 +455,13 @@ proc-convert-smaps_pmd_entry-to-use-a-folio.patch
#
mm-page_alloc-use-the-correct-thp-order-for-thp-pcp.patch
#
+mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.patch
+mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.patch
+mm-swap-simplify-struct-percpu_cluster.patch
+mm-swap-update-get_swap_pages-to-take-folio-order.patch
+mm-swap-allow-storage-of-all-mthp-orders.patch
+mm-vmscan-avoid-split-during-shrink_folio_list.patch
+mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.patch
#
#arm64-mm-cleanup-__do_page_fault.patch: https://lkml.kernel.org/r/20240407171902.5958-A-hca@linux.ibm.com
arm64-mm-cleanup-__do_page_fault.patch
diff --git a/pc/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.pc b/pc/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.pc
new file mode 100644
index 000000000..ac995ae95
--- /dev/null
+++ b/pc/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.pc
@@ -0,0 +1,4 @@
+include/linux/pgtable.h
+mm/internal.h
+mm/madvise.c
+mm/memory.c
diff --git a/pc/mm-swap-allow-storage-of-all-mthp-orders.pc b/pc/mm-swap-allow-storage-of-all-mthp-orders.pc
new file mode 100644
index 000000000..3bce48932
--- /dev/null
+++ b/pc/mm-swap-allow-storage-of-all-mthp-orders.pc
@@ -0,0 +1,2 @@
+include/linux/swap.h
+mm/swapfile.c
diff --git a/pc/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.pc b/pc/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.pc
new file mode 100644
index 000000000..f89ab82c9
--- /dev/null
+++ b/pc/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.pc
@@ -0,0 +1,6 @@
+include/linux/pgtable.h
+include/linux/swap.h
+mm/internal.h
+mm/madvise.c
+mm/memory.c
+mm/swapfile.c
diff --git a/pc/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.pc b/pc/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.pc
new file mode 100644
index 000000000..d45dac10c
--- /dev/null
+++ b/pc/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.pc
@@ -0,0 +1,3 @@
+include/linux/swap.h
+mm/huge_memory.c
+mm/swapfile.c
diff --git a/pc/mm-swap-simplify-struct-percpu_cluster.pc b/pc/mm-swap-simplify-struct-percpu_cluster.pc
new file mode 100644
index 000000000..3bce48932
--- /dev/null
+++ b/pc/mm-swap-simplify-struct-percpu_cluster.pc
@@ -0,0 +1,2 @@
+include/linux/swap.h
+mm/swapfile.c
diff --git a/pc/mm-swap-update-get_swap_pages-to-take-folio-order.pc b/pc/mm-swap-update-get_swap_pages-to-take-folio-order.pc
new file mode 100644
index 000000000..f2bd0b484
--- /dev/null
+++ b/pc/mm-swap-update-get_swap_pages-to-take-folio-order.pc
@@ -0,0 +1,3 @@
+include/linux/swap.h
+mm/swapfile.c
+mm/swap_slots.c
diff --git a/pc/mm-vmscan-avoid-split-during-shrink_folio_list.pc b/pc/mm-vmscan-avoid-split-during-shrink_folio_list.pc
new file mode 100644
index 000000000..40d089036
--- /dev/null
+++ b/pc/mm-vmscan-avoid-split-during-shrink_folio_list.pc
@@ -0,0 +1 @@
+mm/vmscan.c
diff --git a/txt/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.txt b/txt/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.txt
new file mode 100644
index 000000000..cac0e7b9b
--- /dev/null
+++ b/txt/mm-madvise-avoid-split-during-madv_pageout-and-madv_cold.txt
@@ -0,0 +1,41 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: madvise: avoid split during MADV_PAGEOUT and MADV_COLD
+Date: Mon, 8 Apr 2024 19:39:46 +0100
+
+Rework madvise_cold_or_pageout_pte_range() to avoid splitting any large
+folio that is fully and contiguously mapped in the pageout/cold vm range.
+This change means that large folios will be maintained all the way to swap
+storage. This both improves performance during swap-out, by eliding the
+cost of splitting the folio, and sets us up nicely for maintaining the
+large folio when it is swapped back in (to be covered in a separate
+series).
+
+Folios that are not fully mapped in the target range are still split, but
+note that behavior is changed so that if the split fails for any reason
+(folio locked, shared, etc.) we now leave it as is, move to the next pte
+in the range, and continue work on the remaining folios. Previously, any
+failure of this sort would cause the entire operation to give up, and no
+folios mapped at higher addresses were paged out or made cold. Given that
+large folios are becoming more common, this old behavior would likely
+have led to wasted opportunities.
+
+While we are at it, change the code that clears young from the ptes to use
+ptep_test_and_clear_young(), via the new mkold_ptes() batch helper
+function. This is more efficient than get_and_clear/modify/set, especially
+for contpte mappings on arm64, where the old approach would require
+unfolding/refolding and the new approach can be done in place.
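+
+Purely as an illustrative sketch of the idea (the actual mkold_ptes()
+added by this patch may differ in signature and detail), a generic batch
+helper built on ptep_test_and_clear_young() could look roughly like:
+
+	static inline void mkold_ptes(struct vm_area_struct *vma,
+				      unsigned long addr, pte_t *ptep,
+				      unsigned int nr)
+	{
+		/* Clear the young bit on nr contiguous ptes, in place. */
+		for (; nr-- > 0; ptep++, addr += PAGE_SIZE)
+			ptep_test_and_clear_young(vma, addr, ptep);
+	}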
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-8-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: Barry Song <v-songbaohua@oppo.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
diff --git a/txt/mm-set-pageblock_order-to-hpage_pmd_order-in-case-with-config_hugetlb_page-but-thp-enabled.txt b/txt/mm-set-pageblock_order-to-hpage_pmd_order-in-case-with-config_hugetlb_page-but-thp-enabled.txt
index eac7c7335..7d6935c1e 100644
--- a/txt/mm-set-pageblock_order-to-hpage_pmd_order-in-case-with-config_hugetlb_page-but-thp-enabled.txt
+++ b/txt/mm-set-pageblock_order-to-hpage_pmd_order-in-case-with-config_hugetlb_page-but-thp-enabled.txt
@@ -12,4 +12,5 @@ Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Zi Yan <ziy@nvidia.com>
+Acked-by: David Hildenbrand <david@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
diff --git a/txt/mm-swap-allow-storage-of-all-mthp-orders.txt b/txt/mm-swap-allow-storage-of-all-mthp-orders.txt
new file mode 100644
index 000000000..71e43fef0
--- /dev/null
+++ b/txt/mm-swap-allow-storage-of-all-mthp-orders.txt
@@ -0,0 +1,60 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: allow storage of all mTHP orders
+Date: Mon, 8 Apr 2024 19:39:44 +0100
+
+Multi-size THP enables performance improvements by allocating large,
+pte-mapped folios for anonymous memory. However I've observed that on an
+arm64 system running a parallel workload (e.g. kernel compilation) across
+many cores, under high memory pressure, the speed regresses. This is due
+to bottlenecking on the increased number of TLBIs added due to all the
+extra folio splitting when the large folios are swapped out.
+
+Therefore, solve this regression by adding support for swapping out mTHP
+without needing to split the folio, just like is already done for
+PMD-sized THP. This change only applies when CONFIG_THP_SWAP is enabled,
+and when the swap backing store is a non-rotating block device. These are
+the same constraints as for the existing PMD-sized THP swap-out support.
+
+Note that no attempt is made to swap-in (m)THP here - this is still done
+page-by-page, like for PMD-sized THP. But swapping-out mTHP is a
+prerequisite for swapping-in mTHP.
+
+The main change here is to improve the swap entry allocator so that it can
+allocate any power-of-2 number of contiguous entries between [1, (1 <<
+PMD_ORDER)]. This is done by allocating a cluster for each distinct order
+and allocating sequentially from it until the cluster is full. This
+ensures that we don't need to search the map and we get no fragmentation
+due to alignment padding for different orders in the cluster. If there is
+no current cluster for a given order, we attempt to allocate a free
+cluster from the list. If there are no free clusters, we fail the
+allocation and the caller can fall back to splitting the folio and
+allocating individual entries (as per the existing PMD-sized THP fallback).
+
+The per-order current clusters are maintained per-cpu using the existing
+infrastructure. This is done to avoid interleaving pages from different
+tasks, which would prevent IO being batched. This is already done for the
+order-0 allocations so we follow the same pattern.
+
+As is done for order-0 per-cpu clusters, the scanner now can steal order-0
+entries from any per-cpu-per-order reserved cluster. This ensures that
+when the swap file is getting full, space doesn't get tied up in the
+per-cpu reserves.
+
+This change only modifies swap to be able to accept any order mTHP. It
+doesn't change the callers to elide doing the actual split. That will be
+done in separate changes.
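+
+Conceptually, the per-cpu cluster tracking simply becomes per-order as
+well. A rough sketch of the shape of the data structure (the constant
+name and exact layout here are illustrative, not necessarily what the
+patch uses):
+
+	/* One "next allocation offset" per order, still one per cpu. */
+	#define SWAP_NR_ORDERS	(PMD_ORDER + 1)
+
+	struct percpu_cluster {
+		unsigned int next[SWAP_NR_ORDERS];
+	};
+
+	/* Allocation then consults the slot for the folio's order, e.g. */
+	/*   offset = this_cpu_ptr(si->percpu_cluster)->next[order];     */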
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-6-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
diff --git a/txt/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.txt b/txt/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.txt
new file mode 100644
index 000000000..c6b497aff
--- /dev/null
+++ b/txt/mm-swap-free_swap_and_cache_nr-as-batched-free_swap_and_cache.txt
@@ -0,0 +1,41 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: free_swap_and_cache_nr() as batched free_swap_and_cache()
+Date: Mon, 8 Apr 2024 19:39:41 +0100
+
+Now that we no longer have a convenient flag in the cluster to determine
+if a folio is large, free_swap_and_cache() will take a reference and lock
+a large folio much more often, which could lead to contention and (e.g.)
+failure to split large folios, etc.
+
+Let's solve that problem by batch freeing swap and cache with a new
+function, free_swap_and_cache_nr(), to free a contiguous range of swap
+entries together. This allows us to first drop a reference to each swap
+slot before we try to release the cache folio. This means we only try to
+release the folio once, only taking the reference and lock once - much
+better than the previous 512 times for the 2M THP case.
+
+Contiguous swap entries are gathered in zap_pte_range() and
+madvise_free_pte_range() in a similar way to how present ptes are already
+gathered in zap_pte_range().
+
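+A much-simplified sketch of the batching idea (locking, error handling
+and several details are omitted, so treat this as an outline rather than
+the code the patch actually adds):
+
+	void free_swap_and_cache_nr(swp_entry_t entry, int nr)
+	{
+		unsigned long offset = swp_offset(entry);
+		struct swap_info_struct *si = get_swap_device(entry);
+		int i;
+
+		if (!si)
+			return;
+
+		/* Drop one swap-count reference per entry in the range... */
+		for (i = 0; i < nr; i++)
+			__swap_entry_free(si, swp_entry(swp_type(entry),
+							offset + i));
+
+		/* ...then try to reclaim the swap cache once for the range. */
+		__try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL);
+
+		put_swap_device(si);
+	}
+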
+While we are at it, let's simplify by converting the return type of both
+functions to void. The return value was used only by zap_pte_range() to
+print a bad pte, and was ignored by everyone else, so the extra reporting
+added little value. We will still get the warning, with most of the
+information, from get_swap_device(). With the batch version, we wouldn't
+know which pte was bad anyway, so we could print the wrong one.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-3-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
diff --git a/txt/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.txt b/txt/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.txt
new file mode 100644
index 000000000..8f12a5894
--- /dev/null
+++ b/txt/mm-swap-remove-cluster_flag_huge-from-swap_cluster_info-flags.txt
@@ -0,0 +1,107 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: remove CLUSTER_FLAG_HUGE from swap_cluster_info:flags
+Date: Mon, 8 Apr 2024 19:39:40 +0100
+
+Patch series "Swap-out mTHP without splitting", v7.
+
+This series adds support for swapping out multi-size THP (mTHP) without
+needing to first split the large folio via
+split_huge_page_to_list_to_order(). It closely follows the approach
+already used to swap-out PMD-sized THP.
+
+There are a couple of reasons for swapping out mTHP without splitting:
+
+ - Performance: It is expensive to split a large folio and under
+ extreme memory pressure some workloads regressed performance when
+ using 64K mTHP vs 4K small folios because of this extra cost in the
+ swap-out path. This series not only eliminates the regression but
+ makes it faster to swap out 64K mTHP vs 4K small folios.
+
+ - Memory fragmentation avoidance: If we can avoid splitting a large
+   folio, memory is less likely to become fragmented, making it easier to
+ re-allocate a large folio in future.
+
+ - Performance: Enables a separate series [7] to swap-in whole mTHPs,
+ which means we won't lose the TLB-efficiency benefits of mTHP once the
+ memory has been through a swap cycle.
+
+I've done what I thought was the smallest change possible, and as a
+result, this approach is only employed when the swap is backed by a
+non-rotating block device (just as PMD-sized THP is supported today).
+Discussion against the RFC concluded that this is sufficient.
+
+
+Performance Testing
+===================
+
+I've run some swap performance tests on Ampere Altra VM (arm64) with 8
+CPUs. The VM is set up with a 35G block ram device as the swap device and
+the test is run from inside a memcg limited to 40G memory. I've then run
+`usemem` from vm-scalability with 70 processes, each allocating and
+writing 1G of memory. I've repeated everything 6 times and taken the mean
+performance improvement relative to 4K page baseline:
+
+| alloc size | baseline | + this series |
+| | mm-unstable (~v6.9-rc1) | |
+|:-----------|------------------------:|------------------------:|
+| 4K Page | 0.0% | 1.3% |
+| 64K THP | -13.6% | 46.3% |
+| 2M THP | 91.4% | 89.6% |
+
+So with this change, the 64K swap performance goes from a 14% regression to a
+46% improvement. While 2M shows a small regression, I'm confident that this is
+just noise.
+
+[1] https://lore.kernel.org/linux-mm/20231010142111.3997780-1-ryan.roberts@arm.com/
+[2] https://lore.kernel.org/linux-mm/20231017161302.2518826-1-ryan.roberts@arm.com/
+[3] https://lore.kernel.org/linux-mm/20231025144546.577640-1-ryan.roberts@arm.com/
+[4] https://lore.kernel.org/linux-mm/20240311150058.1122862-1-ryan.roberts@arm.com/
+[5] https://lore.kernel.org/linux-mm/20240327144537.4165578-1-ryan.roberts@arm.com/
+[6] https://lore.kernel.org/linux-mm/20240403114032.1162100-1-ryan.roberts@arm.com/
+[7] https://lore.kernel.org/linux-mm/20240304081348.197341-1-21cnbao@gmail.com/
+[8] https://lore.kernel.org/linux-mm/CAGsJ_4yMOow27WDvN2q=E4HAtDd2PJ=OQ5Pj9DG+6FLWwNuXUw@mail.gmail.com/
+[9] https://lore.kernel.org/linux-mm/579d5127-c763-4001-9625-4563a9316ac3@redhat.com/
+
+
+This patch (of 7):
+
+As preparation for supporting multi-size THP (mTHP) in the swap-out path,
+without first needing to split to order-0, remove CLUSTER_FLAG_HUGE,
+which, when present, always implies a PMD-sized THP (the same size as the
+cluster).
+
+The only use of the flag was to determine whether a swap entry refers to a
+single page or a PMD-sized THP in swap_page_trans_huge_swapped(). Instead
+of relying on the flag, we now pass in order, which originates from the
+folio's order. This allows the logic to work for folios of any order.
+
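+A hedged sketch of the shape of that change (the real function also takes
+the cluster lock and rounds the offset to the folio boundary):
+
+	static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
+						 swp_entry_t entry, int order)
+	{
+		unsigned long offset = swp_offset(entry);
+		unsigned long nr_pages = 1 << order;	/* was: cluster flag */
+		unsigned long i;
+
+		/* Any still-in-use entry in the folio's range counts. */
+		for (i = 0; i < nr_pages; i++)
+			if (swap_count(si->swap_map[offset + i]))
+				return true;
+
+		return false;
+	}
+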
+The one snag is that one of the swap_page_trans_huge_swapped() call sites
+does not have the folio. But it was only being called there to shortcut a
+call to __try_to_reclaim_swap() in some cases. __try_to_reclaim_swap() gets
+the folio and (via some other functions) calls
+swap_page_trans_huge_swapped(). So I've removed the problematic call site
+and believe the new logic should be functionally equivalent.
+
+That said, removing the fast path means that we will take a reference and
+trylock a large folio much more often, which we would like to avoid. The
+next patch will solve this.
+
+Removing CLUSTER_FLAG_HUGE also means we can remove split_swap_cluster()
+which used to be called during folio splitting, since
+split_swap_cluster()'s only job was to remove the flag.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-1-ryan.roberts@arm.com
+Link: https://lkml.kernel.org/r/20240408183946.2991168-2-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Acked-by: Chris Li <chrisl@kernel.org>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
diff --git a/txt/mm-swap-simplify-struct-percpu_cluster.txt b/txt/mm-swap-simplify-struct-percpu_cluster.txt
new file mode 100644
index 000000000..5819bb69a
--- /dev/null
+++ b/txt/mm-swap-simplify-struct-percpu_cluster.txt
@@ -0,0 +1,41 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: simplify struct percpu_cluster
+Date: Mon, 8 Apr 2024 19:39:42 +0100
+
+struct percpu_cluster stores the index of cpu's current cluster and the
+offset of the next entry that will be allocated for the cpu. These two
+pieces of information are redundant because the cluster index is just
+(offset / SWAPFILE_CLUSTER). The only reason for explicitly keeping the
+cluster index is because the structure used for it also has a flag to
+indicate "no cluster". However this data structure also contains a spin
+lock, which is never used in this context, as a side effect the code
+copies the spinlock_t structure, which is questionable coding practice in
+my view.
+
+So let's clean this up and store only the next offset, and use a sentinel
+value (SWAP_NEXT_INVALID) to indicate "no cluster". SWAP_NEXT_INVALID is
+chosen to be 0, because 0 will never be seen legitimately: the first page
+in the swap file is the swap header, which is always marked bad to prevent
+it from being allocated as an entry. This also prevents the cluster to
+which it belongs from being marked free, so it will never appear on the
+free list.
+
+This change saves 16 bytes per cpu. And given we are shortly going to
+extend this mechanism to be per-cpu-AND-per-order, we will end up saving
+16 * 9 = 144 bytes per cpu, which adds up if you have 256 cpus in the
+system.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-4-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
diff --git a/txt/mm-swap-update-get_swap_pages-to-take-folio-order.txt b/txt/mm-swap-update-get_swap_pages-to-take-folio-order.txt
new file mode 100644
index 000000000..3498d553c
--- /dev/null
+++ b/txt/mm-swap-update-get_swap_pages-to-take-folio-order.txt
@@ -0,0 +1,24 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: swap: update get_swap_pages() to take folio order
+Date: Mon, 8 Apr 2024 19:39:43 +0100
+
+We are about to allow swap storage of any mTHP size. To prepare for that,
+let's change get_swap_pages() to take a folio order parameter instead of
+nr_pages. This makes the interface self-documenting; a power-of-2 number
+of pages must be provided. We will also need the order internally so this
+simplifies accessing it.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-5-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Barry Song <v-songbaohua@oppo.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>
diff --git a/txt/mm-userfaultfd-allow-hugetlb-change-protection-upon-poison-entry.txt b/txt/mm-userfaultfd-allow-hugetlb-change-protection-upon-poison-entry.txt
index 3c9c70090..9d2bdcd03 100644
--- a/txt/mm-userfaultfd-allow-hugetlb-change-protection-upon-poison-entry.txt
+++ b/txt/mm-userfaultfd-allow-hugetlb-change-protection-upon-poison-entry.txt
@@ -23,9 +23,9 @@ so no chance of swapin errors).
Link: https://lkml.kernel.org/r/20240405231920.1772199-1-peterx@redhat.com
Link: https://lore.kernel.org/r/000000000000920d5e0615602dd1@google.com
-Reported-by: syzbot+b07c8ac8eee3d4d8440f@syzkaller.appspotmail.com
Fixes: fc71884a5f59 ("mm: userfaultfd: add new UFFDIO_POISON ioctl")
Signed-off-by: Peter Xu <peterx@redhat.com>
+Reported-by: syzbot+b07c8ac8eee3d4d8440f@syzkaller.appspotmail.com
+Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
-Cc: David Hildenbrand <david@redhat.com>
Cc: <stable@vger.kernel.org> [6.6+]
diff --git a/txt/mm-vmscan-avoid-split-during-shrink_folio_list.txt b/txt/mm-vmscan-avoid-split-during-shrink_folio_list.txt
new file mode 100644
index 000000000..3c042cc0e
--- /dev/null
+++ b/txt/mm-vmscan-avoid-split-during-shrink_folio_list.txt
@@ -0,0 +1,32 @@
+From: Ryan Roberts <ryan.roberts@arm.com>
+Subject: mm: vmscan: avoid split during shrink_folio_list()
+Date: Mon, 8 Apr 2024 19:39:45 +0100
+
+Now that swap supports storing all mTHP sizes, avoid splitting large
+folios before swap-out. This benefits performance of the swap-out path by
+eliding split_folio_to_list(), which is expensive, and also sets us up for
+swapping in large folios in a future series.
+
+If the folio is partially mapped, we continue to split it since we want to
+avoid the extra IO overhead and storage of writing out pages
+unnecessarily.
+
+THP_SWPOUT and THP_SWPOUT_FALLBACK counters should continue to count
+events only for PMD-mappable folios to avoid user confusion. THP_SWPOUT
+already has the appropriate guard. Add a guard for THP_SWPOUT_FALLBACK.
+It may be appropriate to add per-size counters in future.
+
+Link: https://lkml.kernel.org/r/20240408183946.2991168-7-ryan.roberts@arm.com
+Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Barry Song <v-songbaohua@oppo.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Chris Li <chrisl@kernel.org>
+Cc: Gao Xiang <xiang@kernel.org>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Lance Yang <ioworker0@gmail.com>
+Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yu Zhao <yuzhao@google.com>