author	Andrea Arcangeli <aarcange@redhat.com>	2023-06-05 18:53:12 -0400
committer	Andrea Arcangeli <aarcange@redhat.com>	2023-11-11 22:03:36 -0500
commit	8cd1f8e1e8fa330299bbbae6f51ec3a1b4fd8a41 (patch)
tree	132b2205ee063ecd2df7ca5ae40c9e03928061f1
parent	26544b601846d246db8702ead4eb6d98c9a24e2c (diff)
download	aa-8cd1f8e1e8fa330299bbbae6f51ec3a1b4fd8a41.tar.gz
mm: thp: fix COW accuracy on double map THP
David reported that the do_wp_page() fault would erroneously copy a subpage mapped on a double map THP, one that was actually exclusive, because some other subpage of the same THP wasn't exclusive.

page_trans_huge_map_swapcount() was doing the right math for the whole THP, which is what is used to decide whether the THP COW can reuse the page without splitting the pmd and falling back to non-THP COW; but that math is inaccurate, and causes too much COWing, when used to take the non-THP COW decision.

Since the day write and longterm GUP pins have existed, COWing too much has always been a bug that causes a loss of synchronicity, so this longstanding issue also caused a loss of synchronicity of the GUP pin.

The fix is simple: do the same math, but at subpage granularity whenever the caller is only interested in reusing the subpage rather than the whole THP.

Fixes: 6d0a07edd17c ("mm: thp: calculate the mapcount correctly for THP pages during WP faults")
Reported-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
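To make the decision concrete, below is a minimal standalone C model of the math (illustrative only, not kernel code and not part of the patch): it assumes a 2 MiB THP with 512 subpages, omits the swap counts for brevity, and the model_* helpers are hypothetical names that only mirror the max()-based logic in page_trans_huge_map_swapcount() before and after the fix.

/*
 * Standalone model (not kernel code) of the reuse-vs-COW decision.
 * Assumes a PMD-sized THP of 512 subpages; swap counts are omitted.
 */
#include <stdio.h>

#define NR_SUBPAGES 512

/* Old behaviour: the maximum mapcount of any subpage decides for all. */
static int model_whole_thp_mapcount(const int *mapcount)
{
	int i, ret = 0;

	for (i = 0; i < NR_SUBPAGES; i++)
		if (mapcount[i] > ret)
			ret = mapcount[i];
	return ret;
}

/* New behaviour: only the faulting subpage's own mapcount decides. */
static int model_subpage_mapcount(const int *mapcount, int thp_idx)
{
	return mapcount[thp_idx];
}

int main(void)
{
	int mapcount[NR_SUBPAGES];
	int i;

	/* Double map THP: subpage 0 is shared, every other one is exclusive. */
	for (i = 0; i < NR_SUBPAGES; i++)
		mapcount[i] = 1;
	mapcount[0] = 2;

	/* Old math: max mapcount is 2, so even exclusive subpage 1 gets COWed. */
	printf("whole THP: reuse=%d\n", model_whole_thp_mapcount(mapcount) <= 1);
	/* Fixed math: subpage 1 is exclusive and can be reused in place. */
	printf("subpage 1: reuse=%d\n", model_subpage_mapcount(mapcount, 1) <= 1);
	return 0;
}

With the old whole-THP math the shared subpage 0 forces a copy of every subpage; with the subpage-granular math subpage 1 is correctly reused in place, which is what keeps a write or longterm GUP pin on it synchronous.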
-rw-r--r--	include/linux/huge_mm.h	10
-rw-r--r--	include/linux/mm.h	10
-rw-r--r--	include/linux/swap.h	37
-rw-r--r--	mm/gup.c	35
-rw-r--r--	mm/huge_memory.c	13
-rw-r--r--	mm/hugetlb.c	4
-rw-r--r--	mm/khugepaged.c	2
-rw-r--r--	mm/ksm.c	4
-rw-r--r--	mm/memory.c	12
-rw-r--r--	mm/swapfile.c	24
10 files changed, 110 insertions, 41 deletions
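As a quick sanity check of the new index helper before reading the diff (illustrative only, not part of the patch; assumes the x86-64 defaults PAGE_SHIFT = 12 and HPAGE_PMD_SHIFT = 21, i.e. 2 MiB THPs):

/*
 * Standalone arithmetic check for page_trans_huge_subpage_idx().
 * The constants below are the assumed x86-64 defaults.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define HPAGE_PMD_SHIFT	21
#define HPAGE_PMD_MASK	(~((1UL << HPAGE_PMD_SHIFT) - 1))

static int page_trans_huge_subpage_idx(unsigned long address)
{
	return (address & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
}

int main(void)
{
	/* An address 0x123 bytes into the third 4 KiB subpage of a THP. */
	unsigned long addr = 0x200000UL + 2 * 4096 + 0x123;

	printf("idx=%d\n", page_trans_huge_subpage_idx(addr));	/* prints idx=2 */
	return 0;
}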
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 732605ce595274..47bd59a98fbe82 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -392,6 +392,11 @@ static inline void page_trans_huge_mapcount_unlock(struct page *page)
bit_spin_unlock(PG_locked, &page[1].flags);
}
+static inline int page_trans_huge_subpage_idx(unsigned long address)
+{
+ return (address & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
+}
+
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -567,6 +572,11 @@ static inline void page_trans_huge_mapcount_unlock(struct page *page)
{
}
+static inline int page_trans_huge_subpage_idx(unsigned long address)
+{
+ return 0;
+}
+
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
extern bool page_trans_huge_anon_shared(struct page *);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 95eb99ec1d4f03..0943fc9b536e9c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -883,13 +883,14 @@ static inline int page_mapcount(struct page *page)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int total_mapcount(struct page *page);
-int page_trans_huge_mapcount(struct page *page, int *total_mapcount);
+int page_trans_huge_mapcount(struct page *page, int thp_idx,
+ int *total_mapcount);
#else
static inline int total_mapcount(struct page *page)
{
return page_mapcount(page);
}
-static inline int page_trans_huge_mapcount(struct page *page,
+static inline int page_trans_huge_mapcount(struct page *page, int thp_idx,
int *total_mapcount)
{
int mapcount = page_mapcount(page);
@@ -2956,9 +2957,10 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
}
extern bool gup_must_unshare(unsigned int flags, struct page *page,
- bool is_head, struct vm_area_struct *vma);
+ unsigned long address, bool is_head,
+ struct vm_area_struct *vma);
extern bool gup_must_unshare_irqsafe(unsigned int flags, struct page *page,
- bool is_head);
+ unsigned long address, bool is_head);
typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 42e38bbba605c7..db924e7086147b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -506,8 +506,8 @@ extern int __swp_swapcount(swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
extern struct swap_info_struct *page_swap_info(struct page *);
extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
-extern bool can_read_pin_swap_page(struct page *);
-extern bool reuse_swap_page(struct page *, int *);
+extern bool can_read_pin_swap_page(struct page *, int);
+extern bool reuse_swap_page(struct page *, int, int *);
extern int try_to_free_swap(struct page *);
struct backing_dev_info;
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
@@ -669,16 +669,19 @@ static inline int swp_swapcount(swp_entry_t entry)
return 0;
}
-static inline bool can_read_pin_swap_page(struct page *page)
+static inline bool can_read_pin_swap_page(struct page *page,
+ int thp_idx)
{
- return page_trans_huge_mapcount(page, NULL) <= 1;
+ return page_trans_huge_mapcount(page, thp_idx, NULL) <= 1;
}
-static inline bool reuse_swap_page(struct page *page, int *total_map_swapcount)
+static inline bool reuse_swap_page(struct page *page, int thp_idx,
+ int *total_map_swapcount)
{
if (unlikely(PageKsm(page)))
return false;
- return page_trans_huge_mapcount(page, total_map_swapcount) <= 1;
+ return page_trans_huge_mapcount(page, thp_idx,
+ total_map_swapcount) <= 1;
}
static inline int try_to_free_swap(struct page *page)
@@ -695,6 +698,28 @@ static inline swp_entry_t get_swap_page(struct page *page)
#endif /* CONFIG_SWAP */
+#define REUSE_SWAP_PAGE_WHOLE_THP (-1)
+#define REUSE_SWAP_PAGE_NO_THP (-2)
+
+static inline bool can_read_pin_swap_page_no_thp(struct page *page) {
+ return can_read_pin_swap_page(page, REUSE_SWAP_PAGE_NO_THP);
+}
+static inline bool can_read_pin_swap_page_addr(struct page *page,
+ unsigned long addr) {
+ return can_read_pin_swap_page(page, page_trans_huge_subpage_idx(addr));
+}
+static inline bool reuse_swap_page_whole_thp(struct page *page,
+ int *total_map_swapcount) {
+ return reuse_swap_page(page, REUSE_SWAP_PAGE_WHOLE_THP,
+ total_map_swapcount);
+}
+static inline bool reuse_swap_page_addr(struct page *page,
+ unsigned long addr,
+ int *total_map_swapcount) {
+ return reuse_swap_page(page, page_trans_huge_subpage_idx(addr),
+ total_map_swapcount);
+}
+
+
#ifdef CONFIG_THP_SWAP
extern int split_swap_cluster(swp_entry_t entry);
#else
diff --git a/mm/gup.c b/mm/gup.c
index 0450462ad1df46..591f81cd964114 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -46,7 +46,8 @@ static __always_inline bool gup_must_unshare_ksm(unsigned int flags)
return !!(flags & FOLL_MM_SYNC);
}
-static bool gup_must_unshare_slowpath(struct page *page, unsigned int flags)
+static bool __gup_must_unshare_slowpath(struct page *page, unsigned int flags,
+ int thp_idx)
{
bool must_unshare;
/*
@@ -68,11 +69,19 @@ static bool gup_must_unshare_slowpath(struct page *page, unsigned int flags)
unlock_page(page);
return gup_must_unshare_ksm(flags);
}
- must_unshare = !can_read_pin_swap_page(page);
+ must_unshare = !can_read_pin_swap_page(page, thp_idx);
unlock_page(page);
return must_unshare;
}
+static __always_inline bool gup_must_unshare_slowpath(struct page *page,
+ unsigned int flags,
+ unsigned long address)
+{
+ int thp_idx = page_trans_huge_subpage_idx(address);
+ return __gup_must_unshare_slowpath(page, flags, thp_idx);
+}
+
static bool gup_must_unshare_hugetlbfs_slowpath(struct page *page)
{
bool must_unshare;
@@ -133,6 +142,7 @@ static bool gup_must_unshare_hugetlbfs_slowpath(struct page *page)
*/
static __always_inline bool __gup_must_unshare(unsigned int flags,
struct page *page,
+ unsigned long address,
bool is_head, bool irq_safe,
struct vm_area_struct *vma)
{
@@ -159,7 +169,8 @@ static __always_inline bool __gup_must_unshare(unsigned int flags,
if (!is_fast_only_in_irq(irq_safe)) {
if (page_trans_huge_anon_shared(page))
return true;
- return gup_must_unshare_slowpath(page, flags);
+ return gup_must_unshare_slowpath(page, flags,
+ address);
}
return true;
}
@@ -168,24 +179,26 @@ static __always_inline bool __gup_must_unshare(unsigned int flags,
if (!is_fast_only_in_irq(irq_safe)) {
if (page_mapcount(page) > 1)
return true;
- return gup_must_unshare_slowpath(page, flags);
+ return gup_must_unshare_slowpath(page, flags,
+ address);
}
return true;
}
}
/* requires full accuracy */
-bool gup_must_unshare(unsigned int flags, struct page *page, bool is_head,
+bool gup_must_unshare(unsigned int flags, struct page *page,
+ unsigned long address, bool is_head,
struct vm_area_struct *vma)
{
- return __gup_must_unshare(flags, page, is_head, false, vma);
+ return __gup_must_unshare(flags, page, address, is_head, false, vma);
}
/* false positives are allowed, false negatives not allowed */
bool gup_must_unshare_irqsafe(unsigned int flags, struct page *page,
- bool is_head)
+ unsigned long address, bool is_head)
{
- return __gup_must_unshare(flags, page, is_head, true, NULL);
+ return __gup_must_unshare(flags, page, address, is_head, true, NULL);
}
static void hpage_pincount_add(struct page *page, int refs)
@@ -750,7 +763,7 @@ retry:
* exclusive.
*/
if (!pte_write(pte) &&
- gup_must_unshare(flags, page, false, vma)) {
+ gup_must_unshare(flags, page, address, false, vma)) {
page = ERR_PTR(-EMLINK);
goto out;
}
@@ -2543,7 +2556,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
}
if (!pte_write(pte) &&
- gup_must_unshare_irqsafe(flags, page, false)) {
+ gup_must_unshare_irqsafe(flags, page, addr, false)) {
put_compound_head(head, 1, flags);
goto pte_unmap;
}
@@ -2831,7 +2844,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
if (!pmd_write(orig) &&
- gup_must_unshare_irqsafe(flags, head, true)) {
+ gup_must_unshare_irqsafe(flags, head, addr, true)) {
put_compound_head(head, refs, flags);
return 0;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 777e567fff7fe9..bdc629c550cb47 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1363,7 +1363,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
* We can only reuse the page if nobody else maps the huge page or it's
* part.
*/
- if (reuse_swap_page(page, NULL)) {
+ if (reuse_swap_page_whole_thp(page, NULL)) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1418,7 +1418,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
/* see comments of the gup_must_unshare() callers in mm/gup.c */
if (!pmd_write(*pmd) &&
- gup_must_unshare(flags, page, true, vma))
+ gup_must_unshare(flags, page, addr, true, vma))
return ERR_PTR(-EMLINK);
if (!try_grab_page(page, flags))
@@ -2583,7 +2583,8 @@ again:
* need full accuracy to avoid breaking page pinning, because
* page_trans_huge_mapcount() is slower than page_mapcount().
*/
-int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
+int page_trans_huge_mapcount(struct page *page, int thp_idx,
+ int *total_mapcount)
{
int i, ret, _total_mapcount, mapcount;
unsigned int seqcount;
@@ -2592,12 +2593,15 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
VM_BUG_ON_PAGE(PageHuge(page), page);
if (likely(!PageTransCompound(page))) {
+ WARN_ON_ONCE(thp_idx == REUSE_SWAP_PAGE_WHOLE_THP);
mapcount = atomic_read(&page->_mapcount) + 1;
if (total_mapcount)
*total_mapcount = mapcount;
return mapcount;
}
+ WARN_ON_ONCE(thp_idx == REUSE_SWAP_PAGE_NO_THP);
+
page = compound_head(page);
again:
@@ -2605,7 +2609,8 @@ again:
_total_mapcount = ret = 0;
for (i = 0; i < thp_nr_pages(page); i++) {
mapcount = atomic_read(&page[i]._mapcount) + 1;
- ret = max(ret, mapcount);
+ if (thp_idx < 0 || i == thp_idx)
+ ret = max(ret, mapcount);
_total_mapcount += mapcount;
}
if (PageDoubleMap(page)) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8ee2f9b4eb2bc0..474c7a502f6d44 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5553,7 +5553,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
(!huge_pte_write(pteval) &&
((flags & FOLL_WRITE) ||
(unshare = gup_must_unshare(flags, pte_page(pteval),
- true, vma))))) {
+ vaddr, true, vma))))) {
vm_fault_t ret;
unsigned int fault_flags = 0;
@@ -6276,7 +6276,7 @@ retry:
* check here is just in case.
*/
if (!huge_pte_write(pte) &&
- gup_must_unshare(flags, head_page, true, vma)) {
+ gup_must_unshare(flags, head_page, address, true, vma)) {
page = NULL;
goto out;
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 203792e70ac1c2..5975f07d3c168d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -685,7 +685,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
if (!pte_write(pteval) && PageSwapCache(page) &&
- !reuse_swap_page(page, NULL)) {
+ !reuse_swap_page_addr(page, address, NULL)) {
/*
* Page is in the swap cache and cannot be re-used.
* It cannot be collapsed into a THP.
diff --git a/mm/ksm.c b/mm/ksm.c
index 7edb1db9742b43..d8090082acf943 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1092,7 +1092,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
*
* PageSwapCache() is stable too under page lock.
*/
- if (can_read_pin_swap_page(page) &&
+ if (can_read_pin_swap_page_no_thp(page) &&
2 + PageSwapCache(page) != page_count(page)) {
set_pte_at(mm, pvmw.address, pvmw.pte, entry);
goto out_unlock;
@@ -1199,7 +1199,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
*
* PageSwapCache() is stable too under page lock.
*/
- if (can_read_pin_swap_page(page) &&
+ if (can_read_pin_swap_page_no_thp(page) &&
2 + PageSwapCache(page) != page_count(page)) {
set_pte_at(mm, addr, ptep, oldpte);
pte_unmap_unlock(ptep, ptl);
diff --git a/mm/memory.c b/mm/memory.c
index e9c881406e783c..5d8075ca21ec8a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3075,7 +3075,8 @@ static __always_inline vm_fault_t __wp_page_copy(struct vm_fault *vmf,
* GUP R/O pin may already have been taken so
* go back to GUP and try again.
*/
- if (can_read_pin_swap_page(vmf->page)) {
+ if (can_read_pin_swap_page_addr(vmf->page,
+ vmf->address)) {
set_pte_at_notify(mm, vmf->address, vmf->pte,
vmf->orig_pte);
goto unlock_old_page;
@@ -3354,7 +3355,8 @@ static vm_fault_t wp_page_unshare(struct vm_fault *vmf)
unlock_page(vmf->page);
goto out_unlock;
}
- must_unshare = !can_read_pin_swap_page(vmf->page);
+ must_unshare = !can_read_pin_swap_page_addr(vmf->page,
+ vmf->address);
unlock_page(vmf->page);
if (must_unshare)
return __wp_page_unshare(vmf);
@@ -3558,7 +3560,8 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
wp_page_reuse(vmf);
return VM_FAULT_WRITE;
}
- if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
+ if (reuse_swap_page_addr(vmf->page, vmf->address,
+ &total_map_swapcount)) {
if (total_map_swapcount == 1) {
/*
* The page is all ours. Move it to
@@ -3923,7 +3926,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
+ if ((vmf->flags & FAULT_FLAG_WRITE) &&
+ reuse_swap_page_addr(page, vmf->address, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e031ed44434c52..3616e87c399145 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1603,7 +1603,8 @@ static bool page_swapped(struct page *page)
return false;
}
-static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
+static int page_trans_huge_map_swapcount(struct page *page, int thp_idx,
+ int *total_mapcount,
int *total_swapcount)
{
int i, map_swapcount, _total_mapcount, _total_swapcount;
@@ -1618,7 +1619,8 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
VM_BUG_ON_PAGE(PageHuge(page), page);
if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
- mapcount = page_trans_huge_mapcount(page, total_mapcount);
+ mapcount = page_trans_huge_mapcount(page, thp_idx,
+ total_mapcount);
if (PageSwapCache(page))
swapcount = page_swapcount(page);
if (total_swapcount)
@@ -1626,6 +1628,8 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
return mapcount + swapcount;
}
+ WARN_ON_ONCE(thp_idx == REUSE_SWAP_PAGE_NO_THP);
+
page = compound_head(page);
if (PageSwapCache(page)) {
@@ -1652,7 +1656,9 @@ again:
swapcount = swap_count(map[offset + i]);
_total_swapcount += swapcount;
}
- map_swapcount = max(map_swapcount, mapcount + swapcount);
+ if (thp_idx < 0 || i == thp_idx)
+ map_swapcount = max(map_swapcount,
+ mapcount + swapcount);
}
if (PageDoubleMap(page)) {
map_swapcount -= 1;
@@ -1690,7 +1696,7 @@ again:
*
* The caller has to check PageKsm before calling this.
*/
-bool can_read_pin_swap_page(struct page *page)
+bool can_read_pin_swap_page(struct page *page, int thp_idx)
{
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageKsm(page), page);
@@ -1721,7 +1727,7 @@ bool can_read_pin_swap_page(struct page *page)
*/
if (PageWriteback(page))
return false;
- return page_trans_huge_map_swapcount(page, NULL, NULL) <= 1;
+ return page_trans_huge_map_swapcount(page, thp_idx, NULL, NULL) <= 1;
}
/*
@@ -1733,15 +1739,19 @@ bool can_read_pin_swap_page(struct page *page)
* NOTE: total_map_swapcount should not be relied upon by the caller if
* reuse_swap_page() returns false, but it may be always overwritten
* (see the other implementation for CONFIG_SWAP=n).
+ *
+ * If thp_idx is negative, check whether the whole THP can be reused;
+ * otherwise check only the provided subpage index.
*/
-bool reuse_swap_page(struct page *page, int *total_map_swapcount)
+bool reuse_swap_page(struct page *page, int thp_idx,
+ int *total_map_swapcount)
{
int count, total_mapcount, total_swapcount;
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
return false;
- count = page_trans_huge_map_swapcount(page, &total_mapcount,
+ count = page_trans_huge_map_swapcount(page, thp_idx, &total_mapcount,
&total_swapcount);
if (total_map_swapcount)
*total_map_swapcount = total_mapcount + total_swapcount;