author	Andrea Arcangeli <aarcange@redhat.com>	2021-12-22 13:32:17 -0500
committer	Andrea Arcangeli <aarcange@redhat.com>	2023-11-11 22:03:36 -0500
commit	143fc197abb13cc287ead52590557b19c520449a
tree	cfe6e7e4e4ebf001343c3add2f708e38e7341b82
parent	ba720b98c7269293d13dc5489ff30459181d39aa
mm: gup: FOLL_MM_SYNC: handle FOLL_MM_SYNC in can_read_pin_swap_page()
Since the introduction of FOLL_MM_SYNC we cannot unconditionally take pins on PageKsm anymore: whether a pin is allowed now depends on whether FOLL_MM_SYNC is set. So extract the PageKsm knowledge from can_read_pin_swap_page() and require the caller to handle the race with KSM creating a PageKsm.

KSM also needs to do its part and ensure it never converts a !PageKsm to PageKsm in place if there are any outstanding pins, and that it won't change a pgtable from a wrprotected PageAnon to a wrprotected PageKsm if there were any outstanding pins on the PageAnon.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
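To make the new caller contract concrete, here is a minimal sketch (a hypothetical helper, not part of this patch): PageKsm() has to be resolved by the caller first, under conditions that keep it stable, before can_read_pin_swap_page() is consulted for the remaining anon cases.

/*
 * Sketch only: the page lock plus an extra reference (gup-fast
 * variants) or the PT lock (gup-slow) keep PageKsm() stable here.
 */
static bool may_take_read_pin(struct page *page, unsigned int flags)
{
	bool ret;

	if (!trylock_page(page))
		return false;	/* be conservative: treat the pin as unsafe */
	if (PageKsm(page))
		/* with FOLL_MM_SYNC a KSM page must be unshared first */
		ret = !(flags & FOLL_MM_SYNC);
	else
		ret = can_read_pin_swap_page(page);
	unlock_page(page);
	return ret;
}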
-rw-r--r--	include/linux/swap.h	2
-rw-r--r--	mm/gup.c	22
-rw-r--r--	mm/ksm.c	46
-rw-r--r--	mm/memory.c	15
-rw-r--r--	mm/swapfile.c	14
5 files changed, 76 insertions, 23 deletions
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3efd5e94e322a0..c86cd15bb424c4 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -671,8 +671,6 @@ static inline int swp_swapcount(swp_entry_t entry)
static inline bool can_read_pin_swap_page(struct page *page)
{
- if (unlikely(PageKsm(page)))
- return true;
return page_trans_huge_mapcount(page, NULL) <= 1;
}
diff --git a/mm/gup.c b/mm/gup.c
index b6306420a43c1c..b1b57aee6f4025 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -41,7 +41,12 @@ static __always_inline bool is_fast_only_in_irq(bool irq_safe)
return irq_safe && unlikely(!!irq_count());
}
-static bool gup_must_unshare_slowpath(struct page *page)
+static __always_inline bool gup_must_unshare_ksm(unsigned int flags)
+{
+ return !!(flags & FOLL_MM_SYNC);
+}
+
+static bool gup_must_unshare_slowpath(struct page *page, unsigned int flags)
{
bool must_unshare;
/*
@@ -54,6 +59,15 @@ static bool gup_must_unshare_slowpath(struct page *page)
*/
if (!trylock_page(page))
return true;
+ if (PageKsm(page)) {
+ /*
+ * PageKsm() cannot change anymore while holding the
+ * page lock and either an extra refcount reference
+ * (gup-fast variants) or the PT lock (gup-slow).
+ */
+ unlock_page(page);
+ return gup_must_unshare_ksm(flags);
+ }
must_unshare = !can_read_pin_swap_page(page);
unlock_page(page);
return must_unshare;
@@ -97,7 +111,7 @@ static __always_inline bool __gup_must_unshare(unsigned int flags,
if (!PageAnon(page))
return false;
if (PageKsm(page))
- return !!(flags & FOLL_MM_SYNC);
+ return gup_must_unshare_ksm(flags);
if (PageHuge(page)) {
if (__page_mapcount(page) > 1)
return true;
@@ -108,7 +122,7 @@ static __always_inline bool __gup_must_unshare(unsigned int flags,
if (!is_fast_only_in_irq(irq_safe)) {
if (page_trans_huge_anon_shared(page))
return true;
- return gup_must_unshare_slowpath(page);
+ return gup_must_unshare_slowpath(page, flags);
}
return true;
}
@@ -117,7 +131,7 @@ static __always_inline bool __gup_must_unshare(unsigned int flags,
if (!is_fast_only_in_irq(irq_safe)) {
if (page_mapcount(page) > 1)
return true;
- return gup_must_unshare_slowpath(page);
+ return gup_must_unshare_slowpath(page, flags);
}
return true;
}
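Putting the gup.c pieces above together, a condensed sketch of the resulting must-unshare decision for a regular anon page (simplified from __gup_must_unshare(); the PageHuge and THP branches follow the same shape, and the irq-safe fast-only paths are omitted):

/*
 * Sketch only, not part of this patch.
 */
static bool must_unshare_sketch(struct page *page, unsigned int flags)
{
	if (!PageAnon(page))
		return false;		/* COR only concerns anon memory */
	if (PageKsm(page))
		/* unshare KSM pages iff FOLL_MM_SYNC is set */
		return gup_must_unshare_ksm(flags);
	if (page_mapcount(page) > 1)
		return true;		/* clearly shared anon page */
	/* mapcount 1: recheck under the page lock, KSM may have raced */
	return gup_must_unshare_slowpath(page, flags);
}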
diff --git a/mm/ksm.c b/mm/ksm.c
index d73a673bcb2409..13242ead684aa1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1032,7 +1032,6 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
.page = page,
.vma = vma,
};
- int swapped;
int err = -EFAULT;
struct mmu_notifier_range range;
@@ -1042,6 +1041,10 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
BUG_ON(PageTransCompound(page));
+ /* racy optimistic check for the fast path */
+ if (page_mapcount(page) + 1 + PageSwapCache(page) != page_count(page))
+ goto out;
+
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
pvmw.address,
pvmw.address + PAGE_SIZE);
@@ -1057,7 +1060,6 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
mm_tlb_flush_pending(mm)) {
pte_t entry;
- swapped = PageSwapCache(page);
flush_cache_page(vma, pvmw.address, page_to_pfn(page));
/*
* Ok this is tricky, when get_user_pages_fast() runs it doesn't
@@ -1077,8 +1079,21 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
/*
* Check that no O_DIRECT or similar I/O is in progress on the
* page
+ *
+ * gup-fast cannot be running thanks to the
+ * ptep_clear_flush(). If gup-slow ran on the anon
+ * page and could skip the COR, it'd mean
+ * page_mapcount is 1, which implies gup-slow is
+ * serialized against us by the PT lock. If the
+ * mapcount is > 1 on the anon page, or if there are
+ * swap references, by design there cannot be any
+ * GUP pin. So check the page_count only when a GUP
+ * pin is possible.
+ *
+ * PageSwapCache() is stable too under the page lock.
*/
- if (page_mapcount(page) + 1 + swapped != page_count(page)) {
+ if (can_read_pin_swap_page(page) &&
+ 2 + PageSwapCache(page) != page_count(page)) {
set_pte_at(mm, pvmw.address, pvmw.pte, entry);
goto out_unlock;
}
@@ -1117,7 +1132,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
struct mm_struct *mm = vma->vm_mm;
pmd_t *pmd;
pte_t *ptep;
- pte_t newpte;
+ pte_t newpte, oldpte;
spinlock_t *ptl;
unsigned long addr;
int err = -EFAULT;
@@ -1168,7 +1183,28 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
*
* See Documentation/vm/mmu_notifier.rst
*/
- ptep_clear_flush(vma, addr, ptep);
+ oldpte = ptep_clear_flush(vma, addr, ptep);
+ /*
+ * Check that no O_DIRECT or similar I/O is in progress on the
+ * page.
+ *
+ * gup-fast cannot be running thanks to the
+ * ptep_clear_flush(). If gup-slow ran on the anon page
+ * and could skip the COR, it'd mean page_mapcount is 1,
+ * which implies gup-slow is serialized against us by the
+ * PT lock. If the mapcount is > 1 on the anon page, or if
+ * there are swap references, by design there cannot be any
+ * GUP pin. So check the page_count only when a GUP pin is
+ * possible.
+ *
+ * PageSwapCache() is stable too under the page lock.
+ */
+ if (can_read_pin_swap_page(page) &&
+ 2 + PageSwapCache(page) != page_count(page)) {
+ set_pte_at(mm, addr, ptep, oldpte);
+ pte_unmap_unlock(ptep, ptl);
+ goto out_mn;
+ }
set_pte_at_notify(mm, addr, ptep, newpte);
page_remove_rmap(page, false);
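Both new page_count checks above compare against the same expected value. As a worked example (a sketch, not part of this patch, assuming the page is locked and still accounted to exactly one pte mapping, as in write_protect_page() and replace_page()):

/*
 * Sketch only: with no GUP pins the refcount of the merge candidate
 * is exactly 1 (the single pte mapping) + 1 (KSM's own reference
 * from following the page) + 1 if it sits in the swapcache. Any
 * surplus reference may be a GUP pin and the merge must back out.
 */
static bool ksm_page_maybe_pinned(struct page *page)
{
	int expected = 2 + PageSwapCache(page);

	return page_count(page) != expected;
}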
diff --git a/mm/memory.c b/mm/memory.c
index 75992f354ac66f..9c4b2ed9e5445e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3298,6 +3298,21 @@ static vm_fault_t wp_page_unshare(struct vm_fault *vmf)
*/
if (!smart_lock_page(vmf))
return 0;
+ if (PageKsm(vmf->page)) {
+ /*
+ * PageKsm() cannot change anymore while
+ * holding the page lock. In the unlikely case
+ * a PageKsm() materialized from under us, we
+ * can simply retry later. We could also jump
+ * to the PageKsm handling if FOLL_MM_SYNC is
+ * set, to handle the COR without an extra
+ * fault cycle, but this case is unlikely
+ * enough to keep simple; it's not worth
+ * optimizing for.
+ */
+ unlock_page(vmf->page);
+ goto out_unlock;
+ }
must_unshare = !can_read_pin_swap_page(vmf->page);
unlock_page(vmf->page);
if (must_unshare)
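The retry above leans on the same stability invariant used by gup_must_unshare_slowpath(): PageKsm() cannot change while the page is locked and a reference (or the PT lock) is held. A hypothetical helper (a sketch, not part of this patch) states it explicitly:

/*
 * Sketch only: a PageKsm() check is definitive under the page lock.
 */
static bool page_is_ksm_stable(struct page *page)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	return PageKsm(page);
}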
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b5bac2ed1e58e9..73f2a6b1a47502 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1688,22 +1688,12 @@ again:
* exclusive so a read GUP pin can be taken, but the page isn't going
* to be made writable.
*
- * So there is a different retval in the case of PageKsm(). In
- * addition can_read_pin_swap_page() will not alter the mapping and so
- * it should not cause any side effects to the page type. It is also a
- * readonly PIN so there's no concern for stable pages.
+ * The caller has to check PageKsm() before calling this.
*/
bool can_read_pin_swap_page(struct page *page)
{
VM_BUG_ON_PAGE(!PageLocked(page), page);
-
- /*
- * If the !PageKsm changed to PageKsm from under us before the
- * page lock was taken, always allow GUP pins on PageKsm.
- */
- if (unlikely(PageKsm(page)))
- return true;
-
+ VM_BUG_ON_PAGE(PageKsm(page), page);
return page_trans_huge_map_swapcount(page, NULL, NULL) <= 1;
}
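For intuition on the remaining check, a few worked examples (a sketch; page_trans_huge_map_swapcount() totals the page's map and swap references):

/*
 * Sketch of page_trans_huge_map_swapcount(page, ...) <= 1:
 *
 *   anon page mapped by one pte, no swap references  -> 1 -> pin OK
 *   anon page mapped by one pte + one swap reference -> 2 -> no pin
 *   anon page CoW-shared after fork (two mappings)   -> 2 -> no pin
 *
 * "No pin" under FOLL_MM_SYNC means GUP performs the COR first, so
 * any read pin it takes ends up on a page exclusive to that mm.
 */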