author    Andrea Arcangeli <aarcange@redhat.com>  2021-01-13 17:54:56 -0500
committer Andrea Arcangeli <aarcange@redhat.com>  2023-11-11 22:03:35 -0500
commit    f5063bcffc5b98891ca830d9b22cb2fb31fa0c1a (patch)
tree      12b33b374323514c76c3761566b42905b6826f87
parent    4ae013f06fd64d19a5c0904746c6618f8667a10a (diff)
download  aa-f5063bcffc5b98891ca830d9b22cb2fb31fa0c1a.tar.gz
mm: thp: stabilize the THP mapcount in page_remove_anon_compound_rmap
The THP mapcount needs to behave as an atomic entity that moves in one direction without jittering to be usable for GUP unsharing purposes. The main challenge in achieving that is PageDoubleMap.

If the compound_mapcount and the tail mapcounts move in the same direction there's no jittering. However, when the compound_mapcount is decreased and reaches zero, the reader initially sees a decrease in the THP mapcount, followed by the PageDoubleMap flag being cleared. The act of clearing PageDoubleMap leads the reader to overestimate the mapcount once again, until all tail mapcounts (that the PageDoubleMap flag kept artificially elevated) are finally released.

At runtime the above causes the THP mapcount seen by the reader to jitter from a) 2 b) 1 c) 2 d) 1, and this patch avoids that jittering. Otherwise, when GUP unsharing is required, GUP could take the pin in b) without COR, and another thread in c) could trigger a spurious COW immediately later, causing the readonly long term GUP pin to lose coherency.

The anon compound_mapcount, once zero, won't increase again, so the PageDoubleMap flag cannot be cleared in parallel; the seqlock therefore only needs to be taken if the PageDoubleMap flag is found set. This removes a locked op from the THP freeing fast path.

In the future it may be possible to move the PageDoubleMap flag into the MSB of the compound_mapcount counter, in order to decrease the compound_mapcount and clear the PageDoubleMap flag in a single atomic operation. That would prevent the jittering and could make the THP mapcount reading infallible in irq, but it would be more complex to verify as correct than the seqlock.

Reviewed-by: Peter Xu <peterx@redhat.com>
Reported-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
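[Editor's illustration, not part of the commit] To make the a)-d) ordering above concrete, here is a minimal userspace C sketch of the seqcount idea. It is illustrative only and not the kernel implementation: the toy counters, the initial values and the helper names (mapcount_write_begin(), stable_mapcount(), ...) are assumptions; the patch below instead wraps TestClearPageDoubleMap() in page_trans_huge_mapcount_lock()/unlock(), with the reader side corresponding to the page_mapcount_seq_retry() helper visible in the huge_mm.h hunk. The point is that the writer publishes the compound_mapcount decrement, the DoubleMap clearing and the tail release as one unit, so a reader retries instead of observing 2 -> 1 -> 2 -> 1.

/*
 * Illustrative userspace analogue only (NOT the kernel code): a tiny
 * seqcount protecting a toy "THP mapcount" so a reader never observes
 * the transient a) 2 b) 1 c) 2 d) 1 jitter described above.  All
 * atomics use the default seq_cst ordering for simplicity.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int compound_mapcount = 1;  /* PMD mapping in the unmapping process  */
static atomic_int tail_mapcount     = 2;  /* PTE mapping + 1 kept by "DoubleMap"   */
static atomic_bool double_map       = true;
static atomic_uint mapcount_seq     = 0;  /* even: stable, odd: writer in progress */

static void mapcount_write_begin(void) { atomic_fetch_add(&mapcount_seq, 1); }
static void mapcount_write_end(void)   { atomic_fetch_add(&mapcount_seq, 1); }

static unsigned mapcount_read_begin(void)
{
	unsigned seq;

	/* Wait until no writer is in the middle of an update. */
	while ((seq = atomic_load(&mapcount_seq)) & 1)
		;
	return seq;
}

static bool mapcount_read_retry(unsigned seq)
{
	return atomic_load(&mapcount_seq) != seq;
}

/* Analogue of the last compound unmap clearing the DoubleMap state. */
static void remove_compound_mapping(void)
{
	if (atomic_load(&double_map)) {
		/* Slow path: the seqcount is only taken here. */
		mapcount_write_begin();
		atomic_fetch_sub(&compound_mapcount, 1); /* b) unprotected read: 1 */
		atomic_store(&double_map, false);        /* c) unprotected read: 2 */
		atomic_fetch_sub(&tail_mapcount, 1);     /* d) unprotected read: 1 */
		mapcount_write_end();
	} else {
		/* Fast path: no extra locked op when DoubleMap was never set. */
		atomic_fetch_sub(&compound_mapcount, 1);
	}
}

/* Analogue of a GUP-side mapcount read that must not jitter. */
static int stable_mapcount(void)
{
	int count;
	unsigned seq;

	do {
		seq = mapcount_read_begin();
		count = atomic_load(&compound_mapcount) +
			atomic_load(&tail_mapcount) -
			(atomic_load(&double_map) ? 1 : 0);
	} while (mapcount_read_retry(seq));
	return count;
}

int main(void)
{
	printf("mapcount before unmap: %d\n", stable_mapcount()); /* 2 */
	remove_compound_mapping();
	printf("mapcount after unmap:  %d\n", stable_mapcount()); /* 1, never 2->1->2->1 */
	return 0;
}

In the patch itself the writer side is entered only when PageDoubleMap is found set (in page_remove_anon_compound_rmap() and __split_huge_pmd_locked()), which is what removes a locked op from the THP freeing fast path.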
-rw-r--r--  include/linux/huge_mm.h |  8
-rw-r--r--  mm/huge_memory.c        |  9
-rw-r--r--  mm/rmap.c               | 24
3 files changed, 38 insertions(+), 3 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 02e10d793084d0..bde1e277579277 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -559,6 +559,14 @@ static inline bool page_mapcount_seq_retry(struct page *page,
return false;
}
+static inline void page_trans_huge_mapcount_lock(struct page *page)
+{
+}
+
+static inline void page_trans_huge_mapcount_unlock(struct page *page)
+{
+}
+
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/**
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f7f1be739aa55f..814b45cd8013f2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2111,6 +2111,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
/* Sub-page mapcount accounting for above small mappings. */
int val = 1;
+ /*
+ * lock_page_memcg() is taken before
+ * page_trans_huge_mapcount_lock() in
+ * page_remove_anon_compound_rmap().
+ */
+ lock_page_memcg(page);
page_trans_huge_mapcount_lock(page);
/*
@@ -2126,7 +2132,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
for (i = 0; i < HPAGE_PMD_NR; i++)
atomic_add(val, &page[i]._mapcount);
- lock_page_memcg(page);
if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
/* Last compound_mapcount is gone. */
__mod_lruvec_page_state(page, NR_ANON_THPS,
@@ -2137,7 +2142,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
atomic_dec(&page[i]._mapcount);
}
}
- unlock_page_memcg(page);
/*
* Here a smp_wmb() is needed to make the pte writes visible
@@ -2145,6 +2149,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
* page_trans_huge_mapcount_unlock().
*/
page_trans_huge_mapcount_unlock(page);
+ unlock_page_memcg(page);
} else
smp_wmb(); /* make pte visible before pmd */
diff --git a/mm/rmap.c b/mm/rmap.c
index 330b361a460eae..7f7320259afbc9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1311,7 +1311,28 @@ static void page_remove_anon_compound_rmap(struct page *page)
__mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page));
- if (TestClearPageDoubleMap(page)) {
+ if (PageDoubleMap(page)) {
+ /*
+ * To avoid readonly FOLL_LONGTERM pins to lose
+ * coherency the page_mapcount() isn't allowed to
+ * jitter from a) 2 b) 1 c) 2 d) 1 if munmap() is
+ * running in a different process, in parallel with
+ * GUP on the current process.
+ *
+ * Without the page_trans_huge_mapcount_lock such
+ * jittering could happen and FOLL_LONGTERM could take
+ * the GUP pin in b) without GUP unsharing with COR,
+ * and another thread in c) could trigger a spurious
+ * COW immediately later, causing the readonly long
+ * term GUP pin to lose coherency.
+ */
+ page_trans_huge_mapcount_lock(page);
+ /*
+ * The last hugepmd mapping is gone so there cannot be
+ * extra pmd splits on this THP.
+ */
+ if (!TestClearPageDoubleMap(page))
+ BUG();
/*
* Subpages can be mapped with PTEs too. Check how many of
* them are still mapped.
@@ -1320,6 +1341,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
+ page_trans_huge_mapcount_unlock(page);
/*
* Queue the page for deferred split if at least one small