aboutsummaryrefslogtreecommitdiffstats
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
authorDavid Hildenbrand <david@redhat.com>2022-11-16 11:26:48 +0100
committerAndrew Morton <akpm@linux-foundation.org>2022-11-30 15:58:58 -0800
commit84209e87c6963f928194a890399e24e8ad299db1 (patch)
tree987c556289f568ffc17bfa211285bc5a3481ac0c /mm/hugetlb.c
parent8d6a0ac09a16c026e1e2a03a61e12e95c48a25a6 (diff)
downloadlinux-84209e87c6963f928194a890399e24e8ad299db1.tar.gz
mm/gup: reliable R/O long-term pinning in COW mappings
We already support reliable R/O pinning of anonymous memory. However, assume we end up pinning (R/O long-term) a pagecache page or the shared zeropage inside a writable private ("COW") mapping. The next write access will trigger a write-fault and replace the pinned page by an exclusive anonymous page in the process page tables to break COW: the pinned page no longer corresponds to the page mapped into the process' page table. Now that FAULT_FLAG_UNSHARE can break COW on anything mapped into a COW mapping, let's properly break COW first before R/O long-term pinning something that's not an exclusive anon page inside a COW mapping. FAULT_FLAG_UNSHARE will break COW and map an exclusive anon page instead that can get pinned safely. With this change, we can stop using FOLL_FORCE|FOLL_WRITE for reliable R/O long-term pinning in COW mappings. With this change, the new R/O long-term pinning tests for non-anonymous memory succeed: # [RUN] R/O longterm GUP pin ... with shared zeropage ok 151 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with memfd ok 152 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with tmpfile ok 153 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with huge zeropage ok 154 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with memfd hugetlb (2048 kB) ok 155 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with memfd hugetlb (1048576 kB) ok 156 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with shared zeropage ok 157 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with memfd ok 158 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with tmpfile ok 159 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with huge zeropage ok 160 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with memfd hugetlb (2048 kB) ok 161 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with memfd hugetlb (1048576 kB) ok 162 Longterm R/O pin is reliable Note 1: We don't care about short-term R/O-pinning, because they have snapshot semantics: they are not supposed to observe modifications that happen after pinning. As one example, assume we start direct I/O to read from a page and store page content into a file: modifications to page content after starting direct I/O are not guaranteed to end up in the file. So even if we'd pin the shared zeropage, the end result would be as expected -- getting zeroes stored to the file. Note 2: For shared mappings we'll now always fallback to the slow path to lookup the VMA when R/O long-term pining. While that's the necessary price we have to pay right now, it's actually not that bad in practice: most FOLL_LONGTERM users already specify FOLL_WRITE, for example, along with FOLL_FORCE because they tried dealing with COW mappings correctly ... Note 3: For users that use FOLL_LONGTERM right now without FOLL_WRITE, such as VFIO, we'd now no longer pin the shared zeropage. Instead, we'd populate exclusive anon pages that we can pin. There was a concern that this could affect the memlock limit of existing setups. For example, a VM running with VFIO could run into the memlock limit and fail to run. However, we essentially had the same behavior already in commit 17839856fd58 ("gup: document and work around "COW can break either way" issue") which got merged into some enterprise distros, and there were not any such complaints. So most probably, we're fine. Link: https://lkml.kernel.org/r/20221116102659.70287-10-david@redhat.com Signed-off-by: David Hildenbrand <david@redhat.com> Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Reviewed-by: John Hubbard <jhubbard@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--mm/hugetlb.c7
1 files changed, 4 insertions, 3 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3d381b26d5531..9d97c9a2a15df 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6197,7 +6197,8 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
}
}
-static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
+static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma,
+ unsigned int flags, pte_t *pte,
bool *unshare)
{
pte_t pteval = huge_ptep_get(pte);
@@ -6209,7 +6210,7 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
return false;
if (flags & FOLL_WRITE)
return true;
- if (gup_must_unshare(flags, pte_page(pteval))) {
+ if (gup_must_unshare(vma, flags, pte_page(pteval))) {
*unshare = true;
return true;
}
@@ -6338,7 +6339,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* directly from any kind of swap entries.
*/
if (absent ||
- __follow_hugetlb_must_fault(flags, pte, &unshare)) {
+ __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) {
vm_fault_t ret;
unsigned int fault_flags = 0;