
// SPDX-License-Identifier: GPL-2.0

// Copyright (C) 2024 Google LLC.

//! Virtual memory.
//!
//! This module deals with managing a single VMA in the address space of a userspace process. Each
//! VMA corresponds to a region of memory that the userspace process can access, and the VMA lets
//! you control what happens when userspace reads or writes to that region of memory.
//!
//! The module has several different Rust types that all correspond to the C type called
//! `vm_area_struct`. The different structs represent what kind of access you have to the VMA, e.g.
//! [`VmaRef`] is used when you hold the mmap or vma read lock. Using the appropriate struct
//! ensures that you can't, for example, accidentally call a function that requires holding the
//! write lock when you only hold the read lock.
use crate::{
    bindings,
    error::{code::EINVAL, to_result, Result},
    mm::MmWithUser,
    page::Page,
    types::Opaque,
};

use core::ops::Deref;

27/// A wrapper for the kernel's `struct vm_area_struct` with read access.
28///
29/// It represents an area of virtual memory.
30///
31/// # Invariants
32///
33/// The caller must hold the mmap read lock or the vma read lock.
34#[repr(transparent)]
35pub struct VmaRef {
36    vma: Opaque<bindings::vm_area_struct>,
37}
38
39// Methods you can call when holding the mmap or vma read lock (or stronger). They must be usable
40// no matter what the vma flags are.
41impl VmaRef {
42    /// Access a virtual memory area given a raw pointer.
43    ///
44    /// # Safety
45    ///
46    /// Callers must ensure that `vma` is valid for the duration of 'a, and that the mmap or vma
47    /// read lock (or stronger) is held for at least the duration of 'a.
48    #[inline]
49    pub unsafe fn from_raw<'a>(vma: *const bindings::vm_area_struct) -> &'a Self {
50        // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a.
51        unsafe { &*vma.cast() }
52    }
53
54    /// Returns a raw pointer to this area.
55    #[inline]
56    pub fn as_ptr(&self) -> *mut bindings::vm_area_struct {
57        self.vma.get()
58    }
59
60    /// Access the underlying `mm_struct`.
61    #[inline]
62    pub fn mm(&self) -> &MmWithUser {
63        // SAFETY: By the type invariants, this `vm_area_struct` is valid and we hold the mmap/vma
64        // read lock or stronger. This implies that the underlying mm has a non-zero value of
65        // `mm_users`.
66        unsafe { MmWithUser::from_raw((*self.as_ptr()).vm_mm) }
67    }
68
69    /// Returns the flags associated with the virtual memory area.
70    ///
71    /// The possible flags are a combination of the constants in [`flags`].
72    #[inline]
73    pub fn flags(&self) -> vm_flags_t {
74        // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this
75        // access is not a data race.
76        unsafe { (*self.as_ptr()).__bindgen_anon_2.vm_flags }
77    }
78
79    /// Returns the (inclusive) start address of the virtual memory area.
80    #[inline]
81    pub fn start(&self) -> usize {
82        // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this
83        // access is not a data race.
84        unsafe { (*self.as_ptr()).__bindgen_anon_1.__bindgen_anon_1.vm_start }
85    }
86
87    /// Returns the (exclusive) end address of the virtual memory area.
88    #[inline]
89    pub fn end(&self) -> usize {
90        // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this
91        // access is not a data race.
92        unsafe { (*self.as_ptr()).__bindgen_anon_1.__bindgen_anon_1.vm_end }
93    }
94
95    /// Zap pages in the given page range.
96    ///
97    /// This clears page table mappings for the range at the leaf level, leaving all other page
98    /// tables intact, and freeing any memory referenced by the VMA in this range. That is,
99    /// anonymous memory is completely freed, file-backed memory has its reference count on page
100    /// cache folio's dropped, any dirty data will still be written back to disk as usual.
101    ///
102    /// It may seem odd that we clear at the leaf level, this is however a product of the page
103    /// table structure used to map physical memory into a virtual address space - each virtual
104    /// address actually consists of a bitmap of array indices into page tables, which form a
105    /// hierarchical page table level structure.
106    ///
107    /// As a result, each page table level maps a multiple of page table levels below, and thus
108    /// span ever increasing ranges of pages. At the leaf or PTE level, we map the actual physical
109    /// memory.
110    ///
111    /// It is here where a zap operates, as it the only place we can be certain of clearing without
112    /// impacting any other virtual mappings. It is an implementation detail as to whether the
113    /// kernel goes further in freeing unused page tables, but for the purposes of this operation
114    /// we must only assume that the leaf level is cleared.
115    #[inline]
116    pub fn zap_vma_range(&self, address: usize, size: usize) {
117        let (end, did_overflow) = address.overflowing_add(size);
118        if did_overflow || address < self.start() || self.end() < end {
119            // TODO: call WARN_ONCE once Rust version of it is added
120            return;
121        }
122
123        // SAFETY: By the type invariants, the caller has read access to this VMA, which is
124        // sufficient for this method call. This method has no requirements on the vma flags. The
125        // address range is checked to be within the vma.
126        unsafe { bindings::zap_vma_range(self.as_ptr(), address, size) };
127    }
128
129    /// If the [`VM_MIXEDMAP`] flag is set, returns a [`VmaMixedMap`] to this VMA, otherwise
130    /// returns `None`.
131    ///
132    /// This can be used to access methods that require [`VM_MIXEDMAP`] to be set.
133    ///
134    /// [`VM_MIXEDMAP`]: flags::MIXEDMAP
135    #[inline]
136    pub fn as_mixedmap_vma(&self) -> Option<&VmaMixedMap> {
137        if self.flags() & flags::MIXEDMAP != 0 {
138            // SAFETY: We just checked that `VM_MIXEDMAP` is set. All other requirements are
139            // satisfied by the type invariants of `VmaRef`.
140            Some(unsafe { VmaMixedMap::from_raw(self.as_ptr()) })
141        } else {
142            None
143        }
144    }
145}
146
147/// A wrapper for the kernel's `struct vm_area_struct` with read access and [`VM_MIXEDMAP`] set.
148///
149/// It represents an area of virtual memory.
150///
151/// This struct is identical to [`VmaRef`] except that it must only be used when the
152/// [`VM_MIXEDMAP`] flag is set on the vma.
153///
154/// # Invariants
155///
156/// The caller must hold the mmap read lock or the vma read lock. The `VM_MIXEDMAP` flag must be
157/// set.
158///
159/// [`VM_MIXEDMAP`]: flags::MIXEDMAP
160#[repr(transparent)]
161pub struct VmaMixedMap {
162    vma: VmaRef,
163}
164
165// Make all `VmaRef` methods available on `VmaMixedMap`.
166impl Deref for VmaMixedMap {
167    type Target = VmaRef;
168
169    #[inline]
170    fn deref(&self) -> &VmaRef {
171        &self.vma
172    }
173}
174
175impl VmaMixedMap {
176    /// Access a virtual memory area given a raw pointer.
177    ///
178    /// # Safety
179    ///
180    /// Callers must ensure that `vma` is valid for the duration of 'a, and that the mmap read lock
181    /// (or stronger) is held for at least the duration of 'a. The `VM_MIXEDMAP` flag must be set.
182    #[inline]
183    pub unsafe fn from_raw<'a>(vma: *const bindings::vm_area_struct) -> &'a Self {
184        // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a.
185        unsafe { &*vma.cast() }
186    }
187
188    /// Maps a single page at the given address within the virtual memory area.
189    ///
190    /// This operation does not take ownership of the page.
191    #[inline]
192    pub fn vm_insert_page(&self, address: usize, page: &Page) -> Result {
193        // SAFETY: By the type invariant of `Self` caller has read access and has verified that
194        // `VM_MIXEDMAP` is set. By invariant on `Page` the page has order 0.
195        to_result(unsafe { bindings::vm_insert_page(self.as_ptr(), address, page.as_ptr()) })
196    }
197}
198
199/// A configuration object for setting up a VMA in an `f_ops->mmap()` hook.
200///
201/// The `f_ops->mmap()` hook is called when a new VMA is being created, and the hook is able to
202/// configure the VMA in various ways to fit the driver that owns it. Using `VmaNew` indicates that
203/// you are allowed to perform operations on the VMA that can only be performed before the VMA is
204/// fully initialized.
205///
206/// # Invariants
207///
208/// For the duration of 'a, the referenced vma must be undergoing initialization in an
209/// `f_ops->mmap()` hook.
210#[repr(transparent)]
211pub struct VmaNew {
212    vma: VmaRef,
213}
214
215// Make all `VmaRef` methods available on `VmaNew`.
216impl Deref for VmaNew {
217    type Target = VmaRef;
218
219    #[inline]
220    fn deref(&self) -> &VmaRef {
221        &self.vma
222    }
223}
224
225impl VmaNew {
226    /// Access a virtual memory area given a raw pointer.
227    ///
228    /// # Safety
229    ///
230    /// Callers must ensure that `vma` is undergoing initial vma setup for the duration of 'a.
231    #[inline]
232    pub unsafe fn from_raw<'a>(vma: *mut bindings::vm_area_struct) -> &'a Self {
233        // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a.
234        unsafe { &*vma.cast() }
235    }
236
237    /// Internal method for updating the vma flags.
238    ///
239    /// # Safety
240    ///
241    /// This must not be used to set the flags to an invalid value.
242    #[inline]
243    unsafe fn update_flags(&self, set: vm_flags_t, unset: vm_flags_t) {
244        let mut flags = self.flags();
245        flags |= set;
246        flags &= !unset;
247
248        // SAFETY: This is not a data race: the vma is undergoing initial setup, so it's not yet
249        // shared. Additionally, `VmaNew` is `!Sync`, so it cannot be used to write in parallel.
250        // The caller promises that this does not set the flags to an invalid value.
251        unsafe { (*self.as_ptr()).__bindgen_anon_2.vm_flags = flags };
252    }
253
254    /// Set the `VM_MIXEDMAP` flag on this vma.
255    ///
256    /// This enables the vma to contain both `struct page` and pure PFN pages. Returns a reference
257    /// that can be used to call `vm_insert_page` on the vma.
258    #[inline]
259    pub fn set_mixedmap(&self) -> &VmaMixedMap {
260        // SAFETY: We don't yet provide a way to set VM_PFNMAP, so this cannot put the flags in an
261        // invalid state.
262        unsafe { self.update_flags(flags::MIXEDMAP, 0) };
263
264        // SAFETY: We just set `VM_MIXEDMAP` on the vma.
265        unsafe { VmaMixedMap::from_raw(self.vma.as_ptr()) }
266    }
267
268    /// Set the `VM_IO` flag on this vma.
269    ///
270    /// This is used for memory mapped IO and similar. The flag tells other parts of the kernel to
271    /// avoid looking at the pages. For memory mapped IO this is useful as accesses to the pages
272    /// could have side effects.
273    #[inline]
274    pub fn set_io(&self) {
275        // SAFETY: Setting the VM_IO flag is always okay.
276        unsafe { self.update_flags(flags::IO, 0) };
277    }
278
279    /// Set the `VM_DONTEXPAND` flag on this vma.
280    ///
281    /// This prevents the vma from being expanded with `mremap()`.
282    #[inline]
283    pub fn set_dontexpand(&self) {
284        // SAFETY: Setting the VM_DONTEXPAND flag is always okay.
285        unsafe { self.update_flags(flags::DONTEXPAND, 0) };
286    }
287
288    /// Set the `VM_DONTCOPY` flag on this vma.
289    ///
290    /// This prevents the vma from being copied on fork. This option is only permanent if `VM_IO`
291    /// is set.
292    #[inline]
293    pub fn set_dontcopy(&self) {
294        // SAFETY: Setting the VM_DONTCOPY flag is always okay.
295        unsafe { self.update_flags(flags::DONTCOPY, 0) };
296    }
297
298    /// Set the `VM_DONTDUMP` flag on this vma.
299    ///
300    /// This prevents the vma from being included in core dumps. This option is only permanent if
301    /// `VM_IO` is set.
302    #[inline]
303    pub fn set_dontdump(&self) {
304        // SAFETY: Setting the VM_DONTDUMP flag is always okay.
305        unsafe { self.update_flags(flags::DONTDUMP, 0) };
306    }
307
308    /// Returns whether `VM_READ` is set.
309    ///
310    /// This flag indicates whether userspace is mapping this vma as readable.
311    #[inline]
312    pub fn readable(&self) -> bool {
313        (self.flags() & flags::READ) != 0
314    }
315
316    /// Try to clear the `VM_MAYREAD` flag, failing if `VM_READ` is set.
317    ///
318    /// This flag indicates whether userspace is allowed to make this vma readable with
319    /// `mprotect()`.
320    ///
321    /// Note that this operation is irreversible. Once `VM_MAYREAD` has been cleared, it can never
322    /// be set again.
323    #[inline]
324    pub fn try_clear_mayread(&self) -> Result {
325        if self.readable() {
326            return Err(EINVAL);
327        }
328        // SAFETY: Clearing `VM_MAYREAD` is okay when `VM_READ` is not set.
329        unsafe { self.update_flags(0, flags::MAYREAD) };
330        Ok(())
331    }
332
333    /// Returns whether `VM_WRITE` is set.
334    ///
335    /// This flag indicates whether userspace is mapping this vma as writable.
336    #[inline]
337    pub fn writable(&self) -> bool {
338        (self.flags() & flags::WRITE) != 0
339    }
340
341    /// Try to clear the `VM_MAYWRITE` flag, failing if `VM_WRITE` is set.
342    ///
343    /// This flag indicates whether userspace is allowed to make this vma writable with
344    /// `mprotect()`.
345    ///
346    /// Note that this operation is irreversible. Once `VM_MAYWRITE` has been cleared, it can never
347    /// be set again.
348    #[inline]
349    pub fn try_clear_maywrite(&self) -> Result {
350        if self.writable() {
351            return Err(EINVAL);
352        }
353        // SAFETY: Clearing `VM_MAYWRITE` is okay when `VM_WRITE` is not set.
354        unsafe { self.update_flags(0, flags::MAYWRITE) };
355        Ok(())
356    }
357
358    /// Returns whether `VM_EXEC` is set.
359    ///
360    /// This flag indicates whether userspace is mapping this vma as executable.
361    #[inline]
362    pub fn executable(&self) -> bool {
363        (self.flags() & flags::EXEC) != 0
364    }
365
366    /// Try to clear the `VM_MAYEXEC` flag, failing if `VM_EXEC` is set.
367    ///
368    /// This flag indicates whether userspace is allowed to make this vma executable with
369    /// `mprotect()`.
370    ///
371    /// Note that this operation is irreversible. Once `VM_MAYEXEC` has been cleared, it can never
372    /// be set again.
373    #[inline]
374    pub fn try_clear_mayexec(&self) -> Result {
375        if self.executable() {
376            return Err(EINVAL);
377        }
378        // SAFETY: Clearing `VM_MAYEXEC` is okay when `VM_EXEC` is not set.
379        unsafe { self.update_flags(0, flags::MAYEXEC) };
380        Ok(())
381    }
382}
383
384/// The integer type used for vma flags.
385#[doc(inline)]
386pub use bindings::vm_flags_t;
387
388/// All possible flags for [`VmaRef`].
389pub mod flags {
390    use super::vm_flags_t;
391    use crate::bindings;
392
393    /// No flags are set.
394    pub const NONE: vm_flags_t = bindings::VM_NONE as vm_flags_t;
395
396    /// Mapping allows reads.
397    pub const READ: vm_flags_t = bindings::VM_READ as vm_flags_t;
398
399    /// Mapping allows writes.
400    pub const WRITE: vm_flags_t = bindings::VM_WRITE as vm_flags_t;
401
402    /// Mapping allows execution.
403    pub const EXEC: vm_flags_t = bindings::VM_EXEC as vm_flags_t;
404
405    /// Mapping is shared.
406    pub const SHARED: vm_flags_t = bindings::VM_SHARED as vm_flags_t;
407
408    /// Mapping may be updated to allow reads.
409    pub const MAYREAD: vm_flags_t = bindings::VM_MAYREAD as vm_flags_t;
410
411    /// Mapping may be updated to allow writes.
412    pub const MAYWRITE: vm_flags_t = bindings::VM_MAYWRITE as vm_flags_t;
413
414    /// Mapping may be updated to allow execution.
415    pub const MAYEXEC: vm_flags_t = bindings::VM_MAYEXEC as vm_flags_t;
416
417    /// Mapping may be updated to be shared.
418    pub const MAYSHARE: vm_flags_t = bindings::VM_MAYSHARE as vm_flags_t;
419
420    /// Page-ranges managed without `struct page`, just pure PFN.
421    pub const PFNMAP: vm_flags_t = bindings::VM_PFNMAP as vm_flags_t;
422
423    /// Memory mapped I/O or similar.
424    pub const IO: vm_flags_t = bindings::VM_IO as vm_flags_t;
425
426    /// Do not copy this vma on fork.
427    pub const DONTCOPY: vm_flags_t = bindings::VM_DONTCOPY as vm_flags_t;
428
429    /// Cannot expand with mremap().
430    pub const DONTEXPAND: vm_flags_t = bindings::VM_DONTEXPAND as vm_flags_t;
431
432    /// Lock the pages covered when they are faulted in.
433    pub const LOCKONFAULT: vm_flags_t = bindings::VM_LOCKONFAULT as vm_flags_t;
434
435    /// Is a VM accounted object.
436    pub const ACCOUNT: vm_flags_t = bindings::VM_ACCOUNT as vm_flags_t;
437
438    /// Should the VM suppress accounting.
439    pub const NORESERVE: vm_flags_t = bindings::VM_NORESERVE as vm_flags_t;
440
441    /// Huge TLB Page VM.
442    pub const HUGETLB: vm_flags_t = bindings::VM_HUGETLB as vm_flags_t;
443
444    /// Synchronous page faults. (DAX-specific)
445    pub const SYNC: vm_flags_t = bindings::VM_SYNC as vm_flags_t;
446
447    /// Architecture-specific flag.
448    pub const ARCH_1: vm_flags_t = bindings::VM_ARCH_1 as vm_flags_t;
449
450    /// Wipe VMA contents in child on fork.
451    pub const WIPEONFORK: vm_flags_t = bindings::VM_WIPEONFORK as vm_flags_t;
452
453    /// Do not include in the core dump.
454    pub const DONTDUMP: vm_flags_t = bindings::VM_DONTDUMP as vm_flags_t;
455
456    /// Not soft dirty clean area.
457    pub const SOFTDIRTY: vm_flags_t = bindings::VM_SOFTDIRTY as vm_flags_t;
458
459    /// Can contain `struct page` and pure PFN pages.
460    pub const MIXEDMAP: vm_flags_t = bindings::VM_MIXEDMAP as vm_flags_t;
461
462    /// MADV_HUGEPAGE marked this vma.
463    pub const HUGEPAGE: vm_flags_t = bindings::VM_HUGEPAGE as vm_flags_t;
464
465    /// MADV_NOHUGEPAGE marked this vma.
466    pub const NOHUGEPAGE: vm_flags_t = bindings::VM_NOHUGEPAGE as vm_flags_t;
467
468    /// KSM may merge identical pages.
469    pub const MERGEABLE: vm_flags_t = bindings::VM_MERGEABLE as vm_flags_t;
470}