diff options
author | Andrea Arcangeli <aarcange@redhat.com> | 2021-12-22 13:32:17 -0500 |
---|---|---|
committer | Andrea Arcangeli <aarcange@redhat.com> | 2023-11-11 22:03:36 -0500 |
commit | 143fc197abb13cc287ead52590557b19c520449a (patch) | |
tree | cfe6e7e4e4ebf001343c3add2f708e38e7341b82 | |
parent | ba720b98c7269293d13dc5489ff30459181d39aa (diff) | |
download | aa-143fc197abb13cc287ead52590557b19c520449a.tar.gz |
mm: gup: FOLL_MM_SYNC: handle FOLL_MM_SYNC in can_read_pin_swap_page()
Since the introduction of FOLL_MM_SYNC we cannot unconditionally take
pins on PageKsm anymore, it depends if FOLL_MM_SYNC is set.
So this extracts the PageKsm knowledge from can_read_pin_swap_page()
and it requires the caller to handle the race with KSM creating a
PageKsm.
KSM also needs to do its part and ensure it never converts in place a
!PageKsm to PageKsm if there are any outstanding pins and that it won't
change a pgtable from a wrprotected PageAnon to a wrprotected PageKsm
if there were any outstanding pins on the PageAnon.
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
-rw-r--r-- | include/linux/swap.h | 2 | ||||
-rw-r--r-- | mm/gup.c | 22 | ||||
-rw-r--r-- | mm/ksm.c | 46 | ||||
-rw-r--r-- | mm/memory.c | 15 | ||||
-rw-r--r-- | mm/swapfile.c | 14 |
5 files changed, 76 insertions, 23 deletions
diff --git a/include/linux/swap.h b/include/linux/swap.h index 3efd5e94e322a0..c86cd15bb424c4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -671,8 +671,6 @@ static inline int swp_swapcount(swp_entry_t entry) static inline bool can_read_pin_swap_page(struct page *page) { - if (unlikely(PageKsm(page))) - return true; return page_trans_huge_mapcount(page, NULL) <= 1; } diff --git a/mm/gup.c b/mm/gup.c index b6306420a43c1c..b1b57aee6f4025 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -41,7 +41,12 @@ static __always_inline bool is_fast_only_in_irq(bool irq_safe) return irq_safe && unlikely(!!irq_count()); } -static bool gup_must_unshare_slowpath(struct page *page) +static __always_inline bool gup_must_unshare_ksm(unsigned int flags) +{ + return !!(flags & FOLL_MM_SYNC); +} + +static bool gup_must_unshare_slowpath(struct page *page, unsigned int flags) { bool must_unshare; /* @@ -54,6 +59,15 @@ static bool gup_must_unshare_slowpath(struct page *page) */ if (!trylock_page(page)) return true; + if (PageKsm(page)) { + /* + * PageKsm() cannot change anymore while holding the + * page lock and either an extra refcount reference + * (gup-fast varaints) or the PT lock (gup-slow). 
+ */ + unlock_page(page); + return gup_must_unshare_ksm(flags); + } must_unshare = !can_read_pin_swap_page(page); unlock_page(page); return must_unshare; @@ -97,7 +111,7 @@ static __always_inline bool __gup_must_unshare(unsigned int flags, if (!PageAnon(page)) return false; if (PageKsm(page)) - return !!(flags & FOLL_MM_SYNC); + return gup_must_unshare_ksm(flags); if (PageHuge(page)) { if (__page_mapcount(page) > 1) return true; @@ -108,7 +122,7 @@ static __always_inline bool __gup_must_unshare(unsigned int flags, if (!is_fast_only_in_irq(irq_safe)) { if (page_trans_huge_anon_shared(page)) return true; - return gup_must_unshare_slowpath(page); + return gup_must_unshare_slowpath(page, flags); } return true; } @@ -117,7 +131,7 @@ static __always_inline bool __gup_must_unshare(unsigned int flags, if (!is_fast_only_in_irq(irq_safe)) { if (page_mapcount(page) > 1) return true; - return gup_must_unshare_slowpath(page); + return gup_must_unshare_slowpath(page, flags); } return true; } diff --git a/mm/ksm.c b/mm/ksm.c index d73a673bcb2409..13242ead684aa1 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1032,7 +1032,6 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, .page = page, .vma = vma, }; - int swapped; int err = -EFAULT; struct mmu_notifier_range range; @@ -1042,6 +1041,10 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); + /* racy optimistic check for the fast path */ + if (page_mapcount(page) + 1 + PageSwapCache(page) != page_count(page)) + goto out; + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, pvmw.address, pvmw.address + PAGE_SIZE); @@ -1057,7 +1060,6 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, mm_tlb_flush_pending(mm)) { pte_t entry; - swapped = PageSwapCache(page); flush_cache_page(vma, pvmw.address, page_to_pfn(page)); /* * Ok this is tricky, when get_user_pages_fast() run it doesn't @@ -1077,8 +1079,21 @@ static int 
write_protect_page(struct vm_area_struct *vma, struct page *page, /* * Check that no O_DIRECT or similar I/O is in progress on the * page + * + * gup-fast cannot be running thanks to the + * ptep_clear_flush(). gup-slow if it runs on the anon + * page and could skip the COR, it'd mean + * page_mapcount is 1 which implies gup-slow is + * serialized against us with the PT lock. If the + * mapcount is > 1 on the anon page or if there were + * swap references by design there cannot be any GUP + * pin. So only if there can be a GUP pin check the + * page_count. + * + * PageSwapCache() is stable too under page lock. */ - if (page_mapcount(page) + 1 + swapped != page_count(page)) { + if (can_read_pin_swap_page(page) && + 2 + PageSwapCache(page) != page_count(page)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } @@ -1117,7 +1132,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, struct mm_struct *mm = vma->vm_mm; pmd_t *pmd; pte_t *ptep; - pte_t newpte; + pte_t newpte, oldpte; spinlock_t *ptl; unsigned long addr; int err = -EFAULT; @@ -1168,7 +1183,28 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * * See Documentation/vm/mmu_notifier.rst */ - ptep_clear_flush(vma, addr, ptep); + oldpte = ptep_clear_flush(vma, addr, ptep); + /* + * Check that no O_DIRECT or similar I/O is in progress on the + * page. + * + * gup-fast cannot be running thanks to the + * ptep_clear_flush(). gup-slow if it runs on the anon page + * and could skip the COR, it'd mean page_mapcount is 1 which + * implies gup-slow is serialized against us with the PT + * lock. If the mapcount is > 1 on the anon page or if there + * were swap references by design there cannot be any GUP + * pin. So only if there can be a GUP pin check the + * page_count. + * + * PageSwapCache() is stable too under page lock. 
+ */ + if (can_read_pin_swap_page(page) && + 2 + PageSwapCache(page) != page_count(page)) { + set_pte_at(mm, addr, ptep, oldpte); + pte_unmap_unlock(ptep, ptl); + goto out_mn; + } set_pte_at_notify(mm, addr, ptep, newpte); page_remove_rmap(page, false); diff --git a/mm/memory.c b/mm/memory.c index 75992f354ac66f..9c4b2ed9e5445e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3298,6 +3298,21 @@ static vm_fault_t wp_page_unshare(struct vm_fault *vmf) */ if (!smart_lock_page(vmf)) return 0; + if (PageKsm(vmf->page)) { + /* + * PageKsm() cannot change anymore while + * holding the page lock. In the unlikely case + * a PageKsm() materialized from under us we + * can retry later. We could also jump into + * the PageKsm locations if FOLL_MM_SYNC is + * set to handle the COR without an extra + * cycle, but this is an unlikely case that + * can stay simple and it's not worth + * optimizing for. + */ + unlock_page(vmf->page); + goto out_unlock; + } must_unshare = !can_read_pin_swap_page(vmf->page); unlock_page(vmf->page); if (must_unshare) diff --git a/mm/swapfile.c b/mm/swapfile.c index b5bac2ed1e58e9..73f2a6b1a47502 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1688,22 +1688,12 @@ again: * exclusive so a read GUP pin can be taken, but the page isn't going * to be made writable. * - * So there is a different retval in the case of PageKsm(). In - * addition can_read_pin_swap_page() will not alter the mapping and so - * it should not cause any side effects to the page type. It is also a - * readonly PIN so there's no concern for stable pages. + * The caller has to check PageKsm before calling this. */ bool can_read_pin_swap_page(struct page *page) { VM_BUG_ON_PAGE(!PageLocked(page), page); - - /* - * If the !PageKsm changed to PageKsm from under us before the - * page lock was taken, always allow GUP pins on PageKsm. - */ - if (unlikely(PageKsm(page))) - return true; - + VM_BUG_ON_PAGE(PageKsm(page), page); return page_trans_huge_map_swapcount(page, NULL, NULL) <= 1; } |