author	Andrea Arcangeli <aarcange@redhat.com>	2021-12-22 13:32:17 -0500
committer	Andrea Arcangeli <aarcange@redhat.com>	2023-11-11 22:03:36 -0500
commit	143fc197abb13cc287ead52590557b19c520449a
tree	cfe6e7e4e4ebf001343c3add2f708e38e7341b82
parent	ba720b98c7269293d13dc5489ff30459181d39aa
mm: gup: FOLL_MM_SYNC: handle FOLL_MM_SYNC in can_read_pin_swap_page()
Since the introduction of FOLL_MM_SYNC we cannot unconditionally take pins on PageKsm anymore: whether a pin is allowed now depends on whether FOLL_MM_SYNC is set. So extract the PageKsm knowledge from can_read_pin_swap_page() and require the caller to handle the race with KSM creating a PageKsm.

KSM also needs to do its part and ensure it never converts a !PageKsm to PageKsm in place if there are any outstanding pins, and that it won't change a pgtable from a wrprotected PageAnon to a wrprotected PageKsm if there were any outstanding pins on the PageAnon.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
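To make the new caller contract concrete, here is a minimal sketch (a hypothetical helper, not part of this patch): PageKsm() has to be resolved by the caller first, under conditions that keep it stable, before can_read_pin_swap_page() is consulted for the remaining anon cases.

/*
 * Sketch only: the page lock plus an extra reference (gup-fast
 * variants) or the PT lock (gup-slow) keep PageKsm() stable here.
 */
static bool may_take_read_pin(struct page *page, unsigned int flags)
{
	bool ret;

	if (!trylock_page(page))
		return false;	/* be conservative: treat the pin as unsafe */
	if (PageKsm(page))
		/* with FOLL_MM_SYNC a KSM page must be unshared first */
		ret = !(flags & FOLL_MM_SYNC);
	else
		ret = can_read_pin_swap_page(page);
	unlock_page(page);
	return ret;
}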
-rw-r--r--	include/linux/swap.h	2
-rw-r--r--	mm/gup.c	22
-rw-r--r--	mm/ksm.c	46
-rw-r--r--	mm/memory.c	15
-rw-r--r--	mm/swapfile.c	14
5 files changed, 76 insertions, 23 deletions
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3efd5e94e322a0..c86cd15bb424c4 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -671,8 +671,6 @@ static inline int swp_swapcount(swp_entry_t entry)
static inline bool can_read_pin_swap_page(struct page *page)
{
- if (unlikely(PageKsm(page)))
- return true;
return page_trans_huge_mapcount(page, NULL) <= 1;
}
diff --git a/mm/gup.c b/mm/gup.c
index b6306420a43c1c..b1b57aee6f4025 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -41,7 +41,12 @@ static __always_inline bool is_fast_only_in_irq(bool irq_safe)
return irq_safe && unlikely(!!irq_count());
}
-static bool gup_must_unshare_slowpath(struct page *page)
+static __always_inline bool gup_must_unshare_ksm(unsigned int flags)
+{
+ return !!(flags & FOLL_MM_SYNC);
+}
+
+static bool gup_must_unshare_slowpath(struct page *page, unsigned int flags)
{
bool must_unshare;
/*
@@ -54,6 +59,15 @@ static bool gup_must_unshare_slowpath(struct page *page)
*/
if (!trylock_page(page))
return true;
+ if (PageKsm(page)) {
+ /*
+ * PageKsm() cannot change anymore while holding the
+ * page lock and either an extra refcount reference
+ * (gup-fast variants) or the PT lock (gup-slow).
+ */
+ unlock_page(page);
+ return gup_must_unshare_ksm(flags);
+ }
must_unshare = !can_read_pin_swap_page(page);
unlock_page(page);
return must_unshare;
@@ -97,7 +111,7 @@ static __always_inline bool __gup_must_unshare(unsigned int flags,
if (!PageAnon(page))
return false;
if (PageKsm(page))
- return !!(flags & FOLL_MM_SYNC);
+ return gup_must_unshare_ksm(flags);
if (PageHuge(page)) {
if (__page_mapcount(page) > 1)
return true;
@@ -108,7 +122,7 @@ static __always_inline bool __gup_must_unshare(unsigned int flags,
if (!is_fast_only_in_irq(irq_safe)) {
if (page_trans_huge_anon_shared(page))
return true;
- return gup_must_unshare_slowpath(page);
+ return gup_must_unshare_slowpath(page, flags);
}
return true;
}
@@ -117,7 +131,7 @@ static __always_inline bool __gup_must_unshare(unsigned int flags,
if (!is_fast_only_in_irq(irq_safe)) {
if (page_mapcount(page) > 1)
return true;
- return gup_must_unshare_slowpath(page);
+ return gup_must_unshare_slowpath(page, flags);
}
return true;
}
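Putting the gup.c pieces above together, a condensed sketch of the resulting must-unshare decision for a regular anon page (simplified from __gup_must_unshare(); the PageHuge and THP branches follow the same shape, and the irq-safe fast-only paths are omitted):

/*
 * Sketch only, not part of this patch.
 */
static bool must_unshare_sketch(struct page *page, unsigned int flags)
{
	if (!PageAnon(page))
		return false;		/* COR only concerns anon memory */
	if (PageKsm(page))
		/* unshare KSM pages iff FOLL_MM_SYNC is set */
		return gup_must_unshare_ksm(flags);
	if (page_mapcount(page) > 1)
		return true;		/* clearly shared anon page */
	/* mapcount 1: recheck under the page lock, KSM may have raced */
	return gup_must_unshare_slowpath(page, flags);
}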
diff --git a/mm/ksm.c b/mm/ksm.c
index d73a673bcb2409..13242ead684aa1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1032,7 +1032,6 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
.page = page,
.vma = vma,
};
- int swapped;
int err = -EFAULT;
struct mmu_notifier_range range;
@@ -1042,6 +1041,10 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
BUG_ON(PageTransCompound(page));
+ /* racy optimistic check for the fast path */
+ if (page_mapcount(page) + 1 + PageSwapCache(page) != page_count(page))
+ goto out;
+
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
pvmw.address,
pvmw.address + PAGE_SIZE);
@@ -1057,7 +1060,6 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
mm_tlb_flush_pending(mm)) {
pte_t entry;
- swapped = PageSwapCache(page);
flush_cache_page(vma, pvmw.address, page_to_pfn(page));
/*
* Ok this is tricky, when get_user_pages_fast() runs it doesn't
@@ -1077,8 +1079,21 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
/*
* Check that no O_DIRECT or similar I/O is in progress on the
* page
+ *
+ * gup-fast cannot be running thanks to the
+ * ptep_clear_flush(). If gup-slow ran on the anon
+ * page and could skip the COR, it'd mean
+ * page_mapcount is 1, which implies gup-slow is
+ * serialized against us by the PT lock. If the
+ * mapcount is > 1 on the anon page, or if there are
+ * swap references, by design there cannot be any
+ * GUP pin. So check the page_count only when a GUP
+ * pin is possible.
+ *
+ * PageSwapCache() is stable too under the page lock.
*/
- if (page_mapcount(page) + 1 + swapped != page_count(page)) {
+ if (can_read_pin_swap_page(page) &&
+ 2 + PageSwapCache(page) != page_count(page)) {
set_pte_at(mm, pvmw.address, pvmw.pte, entry);
goto out_unlock;
}
@@ -1117,7 +1132,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
struct mm_struct *mm = vma->vm_mm;
pmd_t *pmd;
pte_t *ptep;
- pte_t newpte;
+ pte_t newpte, oldpte;
spinlock_t *ptl;
unsigned long addr;
int err = -EFAULT;
@@ -1168,7 +1183,28 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
*
* See Documentation/vm/mmu_notifier.rst
*/
- ptep_clear_flush(vma, addr, ptep);
+ oldpte = ptep_clear_flush(vma, addr, ptep);
+ /*
+ * Check that no O_DIRECT or similar I/O is in progress on the
+ * page.
+ *
+ * gup-fast cannot be running thanks to the
+ * ptep_clear_flush(). If gup-slow ran on the anon page
+ * and could skip the COR, it'd mean page_mapcount is 1,
+ * which implies gup-slow is serialized against us by the
+ * PT lock. If the mapcount is > 1 on the anon page, or if
+ * there are swap references, by design there cannot be any
+ * GUP pin. So check the page_count only when a GUP pin is
+ * possible.
+ *
+ * PageSwapCache() is stable too under the page lock.
+ */
+ if (can_read_pin_swap_page(page) &&
+ 2 + PageSwapCache(page) != page_count(page)) {
+ set_pte_at(mm, addr, ptep, oldpte);
+ pte_unmap_unlock(ptep, ptl);
+ goto out_mn;
+ }
set_pte_at_notify(mm, addr, ptep, newpte);
page_remove_rmap(page, false);
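Both new page_count checks above compare against the same expected value. As a worked example (a sketch, not part of this patch, assuming the page is locked and still accounted to exactly one pte mapping, as in write_protect_page() and replace_page()):

/*
 * Sketch only: with no GUP pins the refcount of the merge candidate
 * is exactly 1 (the single pte mapping) + 1 (KSM's own reference
 * from following the page) + 1 if it sits in the swapcache. Any
 * surplus reference may be a GUP pin and the merge must back out.
 */
static bool ksm_page_maybe_pinned(struct page *page)
{
	int expected = 2 + PageSwapCache(page);

	return page_count(page) != expected;
}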
diff --git a/mm/memory.c b/mm/memory.c
index 75992f354ac66f..9c4b2ed9e5445e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3298,6 +3298,21 @@ static vm_fault_t wp_page_unshare(struct vm_fault *vmf)
*/
if (!smart_lock_page(vmf))
return 0;
+ if (PageKsm(vmf->page)) {
+ /*
+ * PageKsm() cannot change anymore while
+ * holding the page lock. In the unlikely case
+ * a PageKsm() materialized from under us, we
+ * can simply retry later. We could also jump
+ * to the PageKsm handling if FOLL_MM_SYNC is
+ * set, to handle the COR without an extra
+ * fault cycle, but this case is unlikely
+ * enough to keep simple; it's not worth
+ * optimizing for.
+ */
+ unlock_page(vmf->page);
+ goto out_unlock;
+ }
must_unshare = !can_read_pin_swap_page(vmf->page);
unlock_page(vmf->page);
if (must_unshare)
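The retry above leans on the same stability invariant used by gup_must_unshare_slowpath(): PageKsm() cannot change while the page is locked and a reference (or the PT lock) is held. A hypothetical helper (a sketch, not part of this patch) states it explicitly:

/*
 * Sketch only: a PageKsm() check is definitive under the page lock.
 */
static bool page_is_ksm_stable(struct page *page)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	return PageKsm(page);
}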
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b5bac2ed1e58e9..73f2a6b1a47502 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1688,22 +1688,12 @@ again:
* exclusive so a read GUP pin can be taken, but the page isn't going
* to be made writable.
*
- * So there is a different retval in the case of PageKsm(). In
- * addition can_read_pin_swap_page() will not alter the mapping and so
- * it should not cause any side effects to the page type. It is also a
- * readonly PIN so there's no concern for stable pages.
+ * The caller has to check PageKsm() before calling this.
*/
bool can_read_pin_swap_page(struct page *page)
{
VM_BUG_ON_PAGE(!PageLocked(page), page);
-
- /*
- * If the !PageKsm changed to PageKsm from under us before the
- * page lock was taken, always allow GUP pins on PageKsm.
- */
- if (unlikely(PageKsm(page)))
- return true;
-
+ VM_BUG_ON_PAGE(PageKsm(page), page);
return page_trans_huge_map_swapcount(page, NULL, NULL) <= 1;
}
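For intuition on the remaining check, a few worked examples (a sketch; page_trans_huge_map_swapcount() totals the page's map and swap references):

/*
 * Sketch of page_trans_huge_map_swapcount(page, ...) <= 1:
 *
 *   anon page mapped by one pte, no swap references  -> 1 -> pin OK
 *   anon page mapped by one pte + one swap reference -> 2 -> no pin
 *   anon page CoW-shared after fork (two mappings)   -> 2 -> no pin
 *
 * "No pin" under FOLL_MM_SYNC means GUP performs the COR first, so
 * any read pin it takes ends up on a page exclusive to that mm.
 */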