aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2024-05-02 08:33:11 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2024-05-02 08:33:11 +1000
commit1e60a5fbafefec6daeacf547e6e9719bbe9fbdeb (patch)
tree951bdc24be0b620e5e57134c7df2cb18e65037b4
parent609939ebb81cc404d3b90ce94838f6edb5c599ed (diff)
parent72801513b2bfb6bf571956a604d38f98ce9aacd9 (diff)
downloadlinux-next-history-1e60a5fbafefec6daeacf547e6e9719bbe9fbdeb.tar.gz
Merge branch 'mm-stable' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Notice: this object is not reachable from any branch.
Notice: this object is not reachable from any branch.
-rw-r--r--Documentation/admin-guide/blockdev/zram.rst5
-rw-r--r--Documentation/admin-guide/cgroup-v1/memory.rst4
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt6
-rw-r--r--Documentation/admin-guide/mm/hugetlbpage.rst7
-rw-r--r--Documentation/admin-guide/sysctl/vm.rst16
-rw-r--r--Documentation/filesystems/proc.rst29
-rw-r--r--Documentation/mm/allocation-profiling.rst100
-rw-r--r--Documentation/mm/index.rst1
-rw-r--r--Documentation/mm/vmemmap_dedup.rst22
-rw-r--r--Documentation/translations/zh_CN/core-api/cachetlb.rst2
-rw-r--r--MAINTAINERS18
-rw-r--r--arch/alpha/kernel/osf_sys.c5
-rw-r--r--arch/alpha/kernel/pci_iommu.c2
-rw-r--r--arch/alpha/lib/checksum.c1
-rw-r--r--arch/alpha/lib/fpreg.c1
-rw-r--r--arch/arc/include/asm/mmu-arcv2.h2
-rw-r--r--arch/arc/mm/mmap.c4
-rw-r--r--arch/arm/Kconfig2
-rw-r--r--arch/arm/include/asm/hugetlb.h6
-rw-r--r--arch/arm/include/asm/pgtable-2level.h5
-rw-r--r--arch/arm/include/asm/pgtable-3level-hwdef.h1
-rw-r--r--arch/arm/include/asm/pgtable-3level.h5
-rw-r--r--arch/arm/kernel/irq.c1
-rw-r--r--arch/arm/kernel/traps.c1
-rw-r--r--arch/arm/lib/uaccess_with_memcpy.c4
-rw-r--r--arch/arm/mm/Makefile1
-rw-r--r--arch/arm/mm/fault.c4
-rw-r--r--arch/arm/mm/hugetlbpage.c34
-rw-r--r--arch/arm/mm/mmap.c5
-rw-r--r--arch/arm64/Kconfig2
-rw-r--r--arch/arm64/include/asm/hugetlb.h6
-rw-r--r--arch/arm64/include/asm/pgtable.h25
-rw-r--r--arch/arm64/kernel/efi.c1
-rw-r--r--arch/arm64/mm/fault.c31
-rw-r--r--arch/arm64/mm/hugetlbpage.c18
-rw-r--r--arch/arm64/mm/mteswap.c45
-rw-r--r--arch/csky/abiv1/mmap.c12
-rw-r--r--arch/loongarch/Kconfig2
-rw-r--r--arch/loongarch/include/asm/kfence.h1
-rw-r--r--arch/loongarch/mm/hugetlbpage.c12
-rw-r--r--arch/loongarch/mm/mmap.c3
-rw-r--r--arch/mips/Kconfig2
-rw-r--r--arch/mips/include/asm/pgtable-32.h2
-rw-r--r--arch/mips/include/asm/pgtable-64.h2
-rw-r--r--arch/mips/jazz/jazzdma.c2
-rw-r--r--arch/mips/mm/hugetlbpage.c10
-rw-r--r--arch/mips/mm/mmap.c3
-rw-r--r--arch/mips/mm/tlb-r4k.c2
-rw-r--r--arch/parisc/kernel/sys_parisc.c6
-rw-r--r--arch/parisc/mm/hugetlbpage.c11
-rw-r--r--arch/powerpc/Kconfig2
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable-4k.h20
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable-64k.h25
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h27
-rw-r--r--arch/powerpc/include/asm/mmu.h4
-rw-r--r--arch/powerpc/include/asm/nohash/pgtable.h10
-rw-r--r--arch/powerpc/kernel/dma-iommu.c2
-rw-r--r--arch/powerpc/kernel/fadump.c5
-rw-r--r--arch/powerpc/kernel/iommu.c1
-rw-r--r--arch/powerpc/mm/book3s64/slice.c20
-rw-r--r--arch/powerpc/mm/fault.c33
-rw-r--r--arch/powerpc/mm/mem.c1
-rw-r--r--arch/powerpc/mm/pgtable_64.c6
-rw-r--r--arch/powerpc/platforms/ps3/system-bus.c4
-rw-r--r--arch/powerpc/platforms/pseries/vio.c2
-rw-r--r--arch/riscv/Kconfig2
-rw-r--r--arch/riscv/include/asm/hugetlb.h6
-rw-r--r--arch/riscv/include/asm/pgtable.h1
-rw-r--r--arch/riscv/kernel/elf_kexec.c1
-rw-r--r--arch/riscv/kernel/probes/kprobes.c1
-rw-r--r--arch/riscv/mm/fault.c5
-rw-r--r--arch/riscv/mm/hugetlbpage.c10
-rw-r--r--arch/s390/Kconfig2
-rw-r--r--arch/s390/include/asm/hugetlb.h6
-rw-r--r--arch/s390/include/asm/pgtable.h1
-rw-r--r--arch/s390/kernel/cert_store.c1
-rw-r--r--arch/s390/kernel/ipl.c1
-rw-r--r--arch/s390/mm/fault.c3
-rw-r--r--arch/s390/mm/hugetlbpage.c19
-rw-r--r--arch/s390/mm/mmap.c9
-rw-r--r--arch/sh/Kconfig2
-rw-r--r--arch/sh/include/asm/hugetlb.h6
-rw-r--r--arch/sh/mm/cache-sh4.c5
-rw-r--r--arch/sh/mm/hugetlbpage.c10
-rw-r--r--arch/sh/mm/mmap.c5
-rw-r--r--arch/sparc/include/asm/pgtable_64.h3
-rw-r--r--arch/sparc/kernel/sys_sparc_32.c3
-rw-r--r--arch/sparc/kernel/sys_sparc_64.c20
-rw-r--r--arch/sparc/mm/hugetlbpage.c21
-rw-r--r--arch/sparc/mm/tlb.c6
-rw-r--r--arch/um/include/shared/um_malloc.h3
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/include/asm/io.h1
-rw-r--r--arch/x86/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/pgtable_64.h1
-rw-r--r--arch/x86/kernel/amd_gart_64.c2
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c2
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c1
-rw-r--r--arch/x86/kernel/irq_64.c1
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/kernel/sys_x86_64.c42
-rw-r--r--arch/x86/mm/fault.c24
-rw-r--r--arch/x86/mm/hugetlbpage.c35
-rw-r--r--arch/x86/mm/init.c47
-rw-r--r--arch/x86/mm/mmap.c4
-rw-r--r--arch/x86/mm/numa_32.c1
-rw-r--r--arch/x86/mm/pat/memtype.c29
-rw-r--r--arch/x86/mm/pgtable.c4
-rw-r--r--arch/xtensa/mm/cache.c6
-rw-r--r--drivers/accel/ivpu/ivpu_mmu_context.c1
-rw-r--r--drivers/block/zram/zram_drv.c31
-rw-r--r--drivers/char/mem.c2
-rw-r--r--drivers/dax/device.c6
-rw-r--r--drivers/gpu/drm/gma500/mmu.c1
-rw-r--r--drivers/gpu/drm/i915/gem/i915_gem_pages.c1
-rw-r--r--drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c1
-rw-r--r--drivers/gpu/drm/i915/gt/shmem_utils.c1
-rw-r--r--drivers/gpu/drm/i915/gvt/firmware.c1
-rw-r--r--drivers/gpu/drm/i915/gvt/gtt.c1
-rw-r--r--drivers/gpu/drm/i915/gvt/handlers.c1
-rw-r--r--drivers/gpu/drm/i915/gvt/mmio.c1
-rw-r--r--drivers/gpu/drm/i915/gvt/vgpu.c1
-rw-r--r--drivers/gpu/drm/i915/intel_gvt.c1
-rw-r--r--drivers/gpu/drm/imagination/pvr_vm_mips.c1
-rw-r--r--drivers/gpu/drm/mediatek/mtk_drm_gem.c1
-rw-r--r--drivers/gpu/drm/omapdrm/omap_gem.c1
-rw-r--r--drivers/gpu/drm/v3d/v3d_bo.c1
-rw-r--r--drivers/gpu/drm/vmwgfx/vmwgfx_binding.c1
-rw-r--r--drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c1
-rw-r--r--drivers/gpu/drm/vmwgfx/vmwgfx_devcaps.c1
-rw-r--r--drivers/gpu/drm/vmwgfx/vmwgfx_drv.c1
-rw-r--r--drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c1
-rw-r--r--drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c1
-rw-r--r--drivers/gpu/drm/xen/xen_drm_front_gem.c1
-rw-r--r--drivers/hwtracing/coresight/coresight-trbe.c1
-rw-r--r--drivers/iommu/amd/amd_iommu.h5
-rw-r--r--drivers/iommu/dma-iommu.c2
-rw-r--r--drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c1
-rw-r--r--drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_mbox.c1
-rw-r--r--drivers/net/ethernet/microsoft/mana/hw_channel.c1
-rw-r--r--drivers/parisc/ccio-dma.c2
-rw-r--r--drivers/parisc/sba_iommu.c2
-rw-r--r--drivers/platform/x86/uv_sysfs.c1
-rw-r--r--drivers/scsi/mpi3mr/mpi3mr_transport.c2
-rw-r--r--drivers/staging/media/atomisp/pci/hmm/hmm.c2
-rw-r--r--drivers/vfio/pci/pds/dirty.c1
-rw-r--r--drivers/virt/acrn/mm.c11
-rw-r--r--drivers/virtio/virtio_mem.c1
-rw-r--r--drivers/xen/grant-dma-ops.c2
-rw-r--r--drivers/xen/swiotlb-xen.c2
-rw-r--r--fs/dax.c14
-rw-r--r--fs/exec.c11
-rw-r--r--fs/hugetlbfs/inode.c11
-rw-r--r--fs/nfs/iostat.h5
-rw-r--r--fs/proc/inode.c10
-rw-r--r--fs/proc/meminfo.c3
-rw-r--r--fs/proc/page.c69
-rw-r--r--fs/proc/task_mmu.c69
-rw-r--r--fs/ramfs/file-mmu.c2
-rw-r--r--fs/userfaultfd.c5
-rw-r--r--include/acpi/platform/aclinuxex.h19
-rw-r--r--include/asm-generic/codetag.lds.h14
-rw-r--r--include/asm-generic/io.h2
-rw-r--r--include/asm-generic/pgalloc.h35
-rw-r--r--include/asm-generic/vmlinux.lds.h3
-rw-r--r--include/crypto/hash.h7
-rw-r--r--include/crypto/internal/acompress.h5
-rw-r--r--include/crypto/skcipher.h7
-rw-r--r--include/linux/alloc_tag.h216
-rw-r--r--include/linux/bpf.h33
-rw-r--r--include/linux/bpfptr.h5
-rw-r--r--include/linux/codetag.h81
-rw-r--r--include/linux/dma-fence-chain.h6
-rw-r--r--include/linux/dma-map-ops.h2
-rw-r--r--include/linux/fortify-string.h5
-rw-r--r--include/linux/fs.h6
-rw-r--r--include/linux/gfp.h126
-rw-r--r--include/linux/gfp_types.h11
-rw-r--r--include/linux/hid_bpf.h6
-rw-r--r--include/linux/huge_mm.h77
-rw-r--r--include/linux/hugetlb.h81
-rw-r--r--include/linux/io.h1
-rw-r--r--include/linux/jbd2.h12
-rw-r--r--include/linux/ksm.h21
-rw-r--r--include/linux/memcontrol.h59
-rw-r--r--include/linux/mempolicy.h5
-rw-r--r--include/linux/mempool.h73
-rw-r--r--include/linux/mm.h128
-rw-r--r--include/linux/mm_types.h21
-rw-r--r--include/linux/mmap_lock.h10
-rw-r--r--include/linux/page-flags.h56
-rw-r--r--include/linux/page-isolation.h5
-rw-r--r--include/linux/page_ext.h5
-rw-r--r--include/linux/page_idle.h68
-rw-r--r--include/linux/pageblock-flags.h8
-rw-r--r--include/linux/pagemap.h15
-rw-r--r--include/linux/pds/pds_common.h2
-rw-r--r--include/linux/percpu.h30
-rw-r--r--include/linux/pgalloc_tag.h132
-rw-r--r--include/linux/pgtable.h105
-rw-r--r--include/linux/ptr_ring.h28
-rw-r--r--include/linux/rhashtable-types.h11
-rw-r--r--include/linux/rmap.h8
-rw-r--r--include/linux/sched.h24
-rw-r--r--include/linux/sched/coredump.h5
-rw-r--r--include/linux/sched/mm.h22
-rw-r--r--include/linux/secretmem.h21
-rw-r--r--include/linux/skb_array.h19
-rw-r--r--include/linux/skbuff.h20
-rw-r--r--include/linux/skmsg.h8
-rw-r--r--include/linux/slab.h184
-rw-r--r--include/linux/sockptr.h10
-rw-r--r--include/linux/string.h4
-rw-r--r--include/linux/swap.h35
-rw-r--r--include/linux/vmalloc.h60
-rw-r--r--include/linux/vmstat.h8
-rw-r--r--include/linux/xarray.h6
-rw-r--r--include/linux/zpool.h4
-rw-r--r--include/linux/zswap.h2
-rw-r--r--include/net/netlabel.h16
-rw-r--r--include/net/netlink.h5
-rw-r--r--include/net/request_sock.h5
-rw-r--r--include/net/tcx.h5
-rw-r--r--include/rdma/rdmavt_qp.h1
-rw-r--r--include/trace/events/fs_dax.h16
-rw-r--r--include/trace/events/huge_memory.h12
-rw-r--r--include/trace/events/mmflags.h2
-rw-r--r--init/Kconfig4
-rw-r--r--io_uring/io_uring.c2
-rw-r--r--kernel/bpf/arena.c2
-rw-r--r--kernel/bpf/memalloc.c6
-rw-r--r--kernel/bpf/syscall.c2
-rw-r--r--kernel/dma/mapping.c4
-rw-r--r--kernel/events/core.c4
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/kallsyms_selftest.c2
-rw-r--r--kernel/module/main.c29
-rw-r--r--kernel/vmcore_info.c3
-rw-r--r--lib/Kconfig.debug31
-rw-r--r--lib/Makefile3
-rw-r--r--lib/alloc_tag.c246
-rw-r--r--lib/codetag.c283
-rw-r--r--lib/rhashtable.c22
-rw-r--r--lib/test_hmm.c8
-rw-r--r--lib/test_xarray.c93
-rw-r--r--lib/xarray.c49
-rw-r--r--mm/Kconfig8
-rw-r--r--mm/Makefile1
-rw-r--r--mm/backing-dev.c23
-rw-r--r--mm/cma.c4
-rw-r--r--mm/compaction.c8
-rw-r--r--mm/debug.c25
-rw-r--r--mm/debug_page_alloc.c12
-rw-r--r--mm/debug_vm_pgtable.c1
-rw-r--r--mm/filemap.c96
-rw-r--r--mm/folio-compat.c6
-rw-r--r--mm/gup.c777
-rw-r--r--mm/hmm.c9
-rw-r--r--mm/huge_memory.c293
-rw-r--r--mm/hugetlb.c377
-rw-r--r--mm/hugetlb_vmemmap.c1
-rw-r--r--mm/internal.h191
-rw-r--r--mm/kasan/hw_tags.c1
-rw-r--r--mm/kfence/core.c14
-rw-r--r--mm/kfence/kfence.h4
-rw-r--r--mm/khugepaged.c295
-rw-r--r--mm/kmemleak.c6
-rw-r--r--mm/madvise.c131
-rw-r--r--mm/memcontrol.c169
-rw-r--r--mm/memory-failure.c13
-rw-r--r--mm/memory.c196
-rw-r--r--mm/memory_hotplug.c1
-rw-r--r--mm/mempolicy.c104
-rw-r--r--mm/mempool.c36
-rw-r--r--mm/memremap.c30
-rw-r--r--mm/migrate.c37
-rw-r--r--mm/migrate_device.c16
-rw-r--r--mm/mlock.c2
-rw-r--r--mm/mm_init.c216
-rw-r--r--mm/mmap.c217
-rw-r--r--mm/mprotect.c3
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/nommu.c77
-rw-r--r--mm/oom_kill.c1
-rw-r--r--mm/page-writeback.c1
-rw-r--r--mm/page_alloc.c824
-rw-r--r--mm/page_ext.c15
-rw-r--r--mm/page_io.c2
-rw-r--r--mm/page_isolation.c121
-rw-r--r--mm/page_owner.c2
-rw-r--r--mm/page_vma_mapped.c4
-rw-r--r--mm/percpu-internal.h26
-rw-r--r--mm/percpu-vm.c4
-rw-r--r--mm/percpu.c118
-rw-r--r--mm/readahead.c8
-rw-r--r--mm/rmap.c20
-rw-r--r--mm/shmem.c13
-rw-r--r--mm/show_mem.c26
-rw-r--r--mm/slab.h62
-rw-r--r--mm/slab_common.c6
-rw-r--r--mm/slub.c468
-rw-r--r--mm/sparse.c26
-rw-r--r--mm/swap.c45
-rw-r--r--mm/swap_slots.c8
-rw-r--r--mm/swap_state.c10
-rw-r--r--mm/swapfile.c327
-rw-r--r--mm/truncate.c36
-rw-r--r--mm/userfaultfd.c4
-rw-r--r--mm/util.c40
-rw-r--r--mm/vmalloc.c128
-rw-r--r--mm/vmscan.c37
-rw-r--r--mm/z3fold.c10
-rw-r--r--mm/zbud.c10
-rw-r--r--mm/zpool.c10
-rw-r--r--mm/zsmalloc.c6
-rw-r--r--mm/zswap.c305
-rw-r--r--net/sunrpc/auth_gss/auth_gss_internal.h6
-rw-r--r--rust/helpers.c8
-rw-r--r--scripts/kallsyms.c13
-rwxr-xr-xscripts/kernel-doc1
-rw-r--r--scripts/module.lds.S7
-rw-r--r--security/Kconfig.hardening15
-rw-r--r--sound/pci/hda/cs35l41_hda.c1
-rw-r--r--tools/cgroup/memcg_slabinfo.py5
-rw-r--r--tools/include/uapi/linux/memfd.h39
-rw-r--r--tools/include/uapi/linux/userfaultfd.h386
-rw-r--r--tools/testing/selftests/lib.mk9
-rw-r--r--tools/testing/selftests/mm/Makefile2
-rw-r--r--tools/testing/selftests/mm/ksm_functional_tests.c135
-rw-r--r--tools/testing/selftests/mm/memfd_secret.c51
-rw-r--r--tools/testing/selftests/mm/mlock2-tests.c15
-rw-r--r--tools/testing/selftests/mm/mremap_test.c204
-rwxr-xr-xtools/testing/selftests/mm/run_vmtests.sh13
-rw-r--r--tools/testing/selftests/mm/virtual_address_range.c78
-rw-r--r--tools/testing/selftests/x86/test_shadow_stack.c67
335 files changed, 6977 insertions, 4351 deletions
diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst
index ee2b0030d4168a..091e8bb388875d 100644
--- a/Documentation/admin-guide/blockdev/zram.rst
+++ b/Documentation/admin-guide/blockdev/zram.rst
@@ -466,6 +466,11 @@ of equal or greater size:::
#recompress idle pages larger than 2000 bytes
echo "type=idle threshold=2000" > /sys/block/zramX/recompress
+It is also possible to limit the number of pages zram re-compression will
+attempt to recompress:::
+
+ echo "type=huge_idle max_pages=42" > /sys/block/zramX/recompress
+
Recompression of idle pages requires memory tracking.
During re-compression for every page, that matches re-compression criteria,
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index ca7d9402f6be16..46110e6a31bbd9 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -300,14 +300,14 @@ When oom event notifier is registered, event will be delivered.
Lock order is as follows::
- Page lock (PG_locked bit of page->flags)
+ folio_lock
mm->page_table_lock or split pte_lock
folio_memcg_lock (memcg->move_lock)
mapping->i_pages lock
lruvec->lru_lock.
Per-node-per-memcgroup LRU (cgroup's private LRU) is guarded by
-lruvec->lru_lock; PG_lru bit of page->flags is cleared before
+lruvec->lru_lock; the folio LRU flag is cleared before
isolating a page from its LRU under lruvec->lru_lock.
.. _cgroup-v1-memory-kernel-extension:
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 213d0719e2b7a6..827b51450cc28d 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2148,6 +2148,12 @@
Format: 0 | 1
Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON.
+ init_mlocked_on_free= [MM] Fill freed userspace memory with zeroes if
+ it was mlock'ed and not explicitly munlock'ed
+ afterwards.
+ Format: 0 | 1
+ Default set by CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON
+
init_pkru= [X86] Specify the default memory protection keys rights
register contents for all processes. 0x55555554 by
default (disallow access to all but pkey 0). Can
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index e4d4b4a8dc9736..f34a0d798d5b53 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -376,6 +376,13 @@ Note that the number of overcommit and reserve pages remain global quantities,
as we don't know until fault time, when the faulting task's mempolicy is
applied, from which node the huge page allocation will be attempted.
+The hugetlb may be migrated between the per-node hugepages pool in the following
+scenarios: memory offline, memory failure, longterm pinning, syscalls(mbind,
+migrate_pages and move_pages), alloc_contig_range() and alloc_contig_pages().
+Now only memory offline, memory failure and syscalls allow fallbacking to allocate
+a new hugetlb on a different node if the current node is unable to allocate during
+hugetlb migration, that means these 3 cases can break the per-node hugepages pool.
+
.. _using_huge_pages:
Using Huge Pages
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index c59889de122b9f..e86c968a7a0ece 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm:
- legacy_va_layout
- lowmem_reserve_ratio
- max_map_count
+- mem_profiling (only if CONFIG_MEM_ALLOC_PROFILING=y)
- memory_failure_early_kill
- memory_failure_recovery
- min_free_kbytes
@@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation.
The default value is 65530.
+mem_profiling
+==============
+
+Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y)
+
+1: Enable memory profiling.
+
+0: Disable memory profiling.
+
+Enabling memory profiling introduces a small performance overhead for all
+memory allocations.
+
+The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT.
+
+
memory_failure_early_kill:
==========================
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index c6a6b9df210497..245269dd6e0289 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -688,6 +688,7 @@ files are there, and which are missing.
============ ===============================================================
File Content
============ ===============================================================
+ allocinfo Memory allocations profiling information
apm Advanced power management info
bootconfig Kernel command line obtained from boot config,
and, if there were kernel parameters from the
@@ -953,6 +954,34 @@ also be allocatable although a lot of filesystem metadata may have to be
reclaimed to achieve this.
+allocinfo
+~~~~~~~~~
+
+Provides information about memory allocations at all locations in the code
+base. Each allocation in the code is identified by its source file, line
+number, module (if originates from a loadable module) and the function calling
+the allocation. The number of bytes allocated and number of calls at each
+location are reported.
+
+Example output.
+
+::
+
+ > sort -rn /proc/allocinfo
+ 127664128 31168 mm/page_ext.c:270 func:alloc_page_ext
+ 56373248 4737 mm/slub.c:2259 func:alloc_slab_page
+ 14880768 3633 mm/readahead.c:247 func:page_cache_ra_unbounded
+ 14417920 3520 mm/mm_init.c:2530 func:alloc_large_system_hash
+ 13377536 234 block/blk-mq.c:3421 func:blk_mq_alloc_rqs
+ 11718656 2861 mm/filemap.c:1919 func:__filemap_get_folio
+ 9192960 2800 kernel/fork.c:307 func:alloc_thread_stack_node
+ 4206592 4 net/netfilter/nf_conntrack_core.c:2567 func:nf_ct_alloc_hashtable
+ 4136960 1010 drivers/staging/ctagmod/ctagmod.c:20 [ctagmod] func:ctagmod_start
+ 3940352 962 mm/memory.c:4214 func:alloc_anon_folio
+ 2894464 22613 fs/kernfs/dir.c:615 func:__kernfs_new_node
+ ...
+
+
meminfo
~~~~~~~
diff --git a/Documentation/mm/allocation-profiling.rst b/Documentation/mm/allocation-profiling.rst
new file mode 100644
index 00000000000000..d3b733b41ae658
--- /dev/null
+++ b/Documentation/mm/allocation-profiling.rst
@@ -0,0 +1,100 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========================
+MEMORY ALLOCATION PROFILING
+===========================
+
+Low overhead (suitable for production) accounting of all memory allocations,
+tracked by file and line number.
+
+Usage:
+kconfig options:
+- CONFIG_MEM_ALLOC_PROFILING
+
+- CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
+
+- CONFIG_MEM_ALLOC_PROFILING_DEBUG
+ adds warnings for allocations that weren't accounted because of a
+ missing annotation
+
+Boot parameter:
+ sysctl.vm.mem_profiling=0|1|never
+
+ When set to "never", memory allocation profiling overhead is minimized and it
+ cannot be enabled at runtime (sysctl becomes read-only).
+ When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=y, default value is "1".
+ When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n, default value is "never".
+
+sysctl:
+ /proc/sys/vm/mem_profiling
+
+Runtime info:
+ /proc/allocinfo
+
+Example output::
+
+ root@moria-kvm:~# sort -g /proc/allocinfo|tail|numfmt --to=iec
+ 2.8M 22648 fs/kernfs/dir.c:615 func:__kernfs_new_node
+ 3.8M 953 mm/memory.c:4214 func:alloc_anon_folio
+ 4.0M 1010 drivers/staging/ctagmod/ctagmod.c:20 [ctagmod] func:ctagmod_start
+ 4.1M 4 net/netfilter/nf_conntrack_core.c:2567 func:nf_ct_alloc_hashtable
+ 6.0M 1532 mm/filemap.c:1919 func:__filemap_get_folio
+ 8.8M 2785 kernel/fork.c:307 func:alloc_thread_stack_node
+ 13M 234 block/blk-mq.c:3421 func:blk_mq_alloc_rqs
+ 14M 3520 mm/mm_init.c:2530 func:alloc_large_system_hash
+ 15M 3656 mm/readahead.c:247 func:page_cache_ra_unbounded
+ 55M 4887 mm/slub.c:2259 func:alloc_slab_page
+ 122M 31168 mm/page_ext.c:270 func:alloc_page_ext
+
+===================
+Theory of operation
+===================
+
+Memory allocation profiling builds off of code tagging, which is a library for
+declaring static structs (that typically describe a file and line number in
+some way, hence code tagging) and then finding and operating on them at runtime,
+- i.e. iterating over them to print them in debugfs/procfs.
+
+To add accounting for an allocation call, we replace it with a macro
+invocation, alloc_hooks(), that
+- declares a code tag
+- stashes a pointer to it in task_struct
+- calls the real allocation function
+- and finally, restores the task_struct alloc tag pointer to its previous value.
+
+This allows for alloc_hooks() calls to be nested, with the most recent one
+taking effect. This is important for allocations internal to the mm/ code that
+do not properly belong to the outer allocation context and should be counted
+separately: for example, slab object extension vectors, or when the slab
+allocates pages from the page allocator.
+
+Thus, proper usage requires determining which function in an allocation call
+stack should be tagged. There are many helper functions that essentially wrap
+e.g. kmalloc() and do a little more work, then are called in multiple places;
+we'll generally want the accounting to happen in the callers of these helpers,
+not in the helpers themselves.
+
+To fix up a given helper, for example foo(), do the following:
+- switch its allocation call to the _noprof() version, e.g. kmalloc_noprof()
+
+- rename it to foo_noprof()
+
+- define a macro version of foo() like so:
+
+ #define foo(...) alloc_hooks(foo_noprof(__VA_ARGS__))
+
+It's also possible to stash a pointer to an alloc tag in your own data structures.
+
+Do this when you're implementing a generic data structure that does allocations
+"on behalf of" some other code - for example, the rhashtable code. This way,
+instead of seeing a large line in /proc/allocinfo for rhashtable.c, we can
+break it out by rhashtable type.
+
+To do so:
+- Hook your data structure's init function, like any other allocation function.
+
+- Within your init function, use the convenience macro alloc_tag_record() to
+ record alloc tag in your data structure.
+
+- Then, use the following form for your allocations:
+ alloc_hooks_tag(ht->your_saved_tag, kmalloc_noprof(...))
diff --git a/Documentation/mm/index.rst b/Documentation/mm/index.rst
index 31d2ac3064387b..48b9b559ca7b45 100644
--- a/Documentation/mm/index.rst
+++ b/Documentation/mm/index.rst
@@ -26,6 +26,7 @@ see the :doc:`admin guide <../admin-guide/mm/index>`.
page_cache
shmfs
oom
+ allocation-profiling
Legacy Documentation
====================
diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst
index 593ede6d314ba6..b4a55b6569fa54 100644
--- a/Documentation/mm/vmemmap_dedup.rst
+++ b/Documentation/mm/vmemmap_dedup.rst
@@ -180,27 +180,7 @@ this correctly. There is only **one** head ``struct page``, the tail
``struct page`` with ``PG_head`` are fake head ``struct page``. We need an
approach to distinguish between those two different types of ``struct page`` so
that ``compound_head()`` can return the real head ``struct page`` when the
-parameter is the tail ``struct page`` but with ``PG_head``. The following code
-snippet describes how to distinguish between real and fake head ``struct page``.
-
-.. code-block:: c
-
- if (test_bit(PG_head, &page->flags)) {
- unsigned long head = READ_ONCE(page[1].compound_head);
-
- if (head & 1) {
- if (head == (unsigned long)page + 1)
- /* head struct page */
- else
- /* tail struct page */
- } else {
- /* head struct page */
- }
- }
-
-We can safely access the field of the **page[1]** with ``PG_head`` because the
-page is a compound page composed with at least two contiguous pages.
-The implementation refers to ``page_fixed_fake_head()``.
+parameter is the tail ``struct page`` but with ``PG_head``.
Device DAX
==========
diff --git a/Documentation/translations/zh_CN/core-api/cachetlb.rst b/Documentation/translations/zh_CN/core-api/cachetlb.rst
index b4a76ec75daa57..64295c61d1c136 100644
--- a/Documentation/translations/zh_CN/core-api/cachetlb.rst
+++ b/Documentation/translations/zh_CN/core-api/cachetlb.rst
@@ -260,7 +260,7 @@ HyperSparc cpu就是这样一个具有这种属性的cpu。
如果D-cache别名不是一个问题,这个程序可以简单地定义为该架构上
的nop。
- 在page->flags (PG_arch_1)中有一个位是“架构私有”。内核保证,
+ 在folio->flags (PG_arch_1)中有一个位是“架构私有”。内核保证,
对于分页缓存的页面,当这样的页面第一次进入分页缓存时,它将清除
这个位。
diff --git a/MAINTAINERS b/MAINTAINERS
index 0410a18761c6a9..3b63a83dd87c6f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5254,6 +5254,14 @@ S: Supported
F: Documentation/process/code-of-conduct-interpretation.rst
F: Documentation/process/code-of-conduct.rst
+CODE TAGGING
+M: Suren Baghdasaryan <surenb@google.com>
+M: Kent Overstreet <kent.overstreet@linux.dev>
+S: Maintained
+F: include/asm-generic/codetag.lds.h
+F: include/linux/codetag.h
+F: lib/codetag.c
+
COMEDI DRIVERS
M: Ian Abbott <abbotti@mev.co.uk>
M: H Hartley Sweeten <hsweeten@visionengravers.com>
@@ -14152,6 +14160,16 @@ F: mm/memblock.c
F: mm/mm_init.c
F: tools/testing/memblock/
+MEMORY ALLOCATION PROFILING
+M: Suren Baghdasaryan <surenb@google.com>
+M: Kent Overstreet <kent.overstreet@linux.dev>
+L: linux-mm@kvack.org
+S: Maintained
+F: Documentation/mm/allocation-profiling.rst
+F: include/linux/alloc_tag.h
+F: include/linux/pgalloc_tag.h
+F: lib/alloc_tag.c
+
MEMORY CONTROLLER DRIVERS
M: Krzysztof Kozlowski <krzk@kernel.org>
L: linux-kernel@vger.kernel.org
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 5db88b6274396d..e5f881bc82881c 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -1218,14 +1218,11 @@ static unsigned long
arch_get_unmapped_area_1(unsigned long addr, unsigned long len,
unsigned long limit)
{
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
- info.flags = 0;
info.length = len;
info.low_limit = addr;
info.high_limit = limit;
- info.align_mask = 0;
- info.align_offset = 0;
return vm_unmapped_area(&info);
}
diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index c81183935e9707..7fcf3e9b710305 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -929,7 +929,7 @@ const struct dma_map_ops alpha_pci_ops = {
.dma_supported = alpha_pci_supported,
.mmap = dma_common_mmap,
.get_sgtable = dma_common_get_sgtable,
- .alloc_pages = dma_common_alloc_pages,
+ .alloc_pages_op = dma_common_alloc_pages,
.free_pages = dma_common_free_pages,
};
EXPORT_SYMBOL(alpha_pci_ops);
diff --git a/arch/alpha/lib/checksum.c b/arch/alpha/lib/checksum.c
index 3f35c3ed694886..c29b98ef9c8267 100644
--- a/arch/alpha/lib/checksum.c
+++ b/arch/alpha/lib/checksum.c
@@ -14,6 +14,7 @@
#include <linux/string.h>
#include <asm/byteorder.h>
+#include <asm/checksum.h>
static inline unsigned short from64to16(unsigned long x)
{
diff --git a/arch/alpha/lib/fpreg.c b/arch/alpha/lib/fpreg.c
index 7c08b225261c4a..3d32165043f8b1 100644
--- a/arch/alpha/lib/fpreg.c
+++ b/arch/alpha/lib/fpreg.c
@@ -8,6 +8,7 @@
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/preempt.h>
+#include <asm/fpu.h>
#include <asm/thread_info.h>
#if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67)
diff --git a/arch/arc/include/asm/mmu-arcv2.h b/arch/arc/include/asm/mmu-arcv2.h
index ed9036d4ede310..d85dc07219071f 100644
--- a/arch/arc/include/asm/mmu-arcv2.h
+++ b/arch/arc/include/asm/mmu-arcv2.h
@@ -9,6 +9,8 @@
#ifndef _ASM_ARC_MMU_ARCV2_H
#define _ASM_ARC_MMU_ARCV2_H
+#include <soc/arc/aux.h>
+
/*
* TLB Management regs
*/
diff --git a/arch/arc/mm/mmap.c b/arch/arc/mm/mmap.c
index 3c1c7ae732925c..69a91529715599 100644
--- a/arch/arc/mm/mmap.c
+++ b/arch/arc/mm/mmap.c
@@ -27,7 +27,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
/*
* We enforce the MAP_FIXED case.
@@ -51,11 +51,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
return addr;
}
- info.flags = 0;
info.length = len;
info.low_limit = mm->mmap_base;
info.high_limit = TASK_SIZE;
- info.align_mask = 0;
info.align_offset = pgoff << PAGE_SHIFT;
return vm_unmapped_area(&info);
}
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index b14aed3a17abb6..817918f6635a50 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -99,7 +99,7 @@ config ARM
select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU
select HAVE_EXIT_THREAD
- select HAVE_FAST_GUP if ARM_LPAE
+ select HAVE_GUP_FAST if ARM_LPAE
select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_FUNCTION_GRAPH_TRACER
diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h
index a3a82b7158d4cb..b766c4b373f647 100644
--- a/arch/arm/include/asm/hugetlb.h
+++ b/arch/arm/include/asm/hugetlb.h
@@ -15,10 +15,10 @@
#include <asm/hugetlb-3level.h>
#include <asm-generic/hugetlb.h>
-static inline void arch_clear_hugepage_flags(struct page *page)
+static inline void arch_clear_hugetlb_flags(struct folio *folio)
{
- clear_bit(PG_dcache_clean, &page->flags);
+ clear_bit(PG_dcache_clean, &folio->flags);
}
-#define arch_clear_hugepage_flags arch_clear_hugepage_flags
+#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
#endif /* _ASM_ARM_HUGETLB_H */
diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h
index b0a262566eb95a..6b5392e20f4113 100644
--- a/arch/arm/include/asm/pgtable-2level.h
+++ b/arch/arm/include/asm/pgtable-2level.h
@@ -213,8 +213,8 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
#define pmd_pfn(pmd) (__phys_to_pfn(pmd_val(pmd) & PHYS_MASK))
-#define pmd_leaf(pmd) (pmd_val(pmd) & 2)
-#define pmd_bad(pmd) (pmd_val(pmd) & 2)
+#define pmd_leaf(pmd) (pmd_val(pmd) & PMD_TYPE_SECT)
+#define pmd_bad(pmd) pmd_leaf(pmd)
#define pmd_present(pmd) (pmd_val(pmd))
#define copy_pmd(pmdpd,pmdps) \
@@ -241,7 +241,6 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
* define empty stubs for use by pin_page_for_write.
*/
#define pmd_hugewillfault(pmd) (0)
-#define pmd_thp_or_huge(pmd) (0)
#endif /* __ASSEMBLY__ */
diff --git a/arch/arm/include/asm/pgtable-3level-hwdef.h b/arch/arm/include/asm/pgtable-3level-hwdef.h
index 2f35b4eddaa809..e7b666cf006031 100644
--- a/arch/arm/include/asm/pgtable-3level-hwdef.h
+++ b/arch/arm/include/asm/pgtable-3level-hwdef.h
@@ -14,6 +14,7 @@
* + Level 1/2 descriptor
* - common
*/
+#define PUD_TABLE_BIT (_AT(pmdval_t, 1) << 1)
#define PMD_TYPE_MASK (_AT(pmdval_t, 3) << 0)
#define PMD_TYPE_FAULT (_AT(pmdval_t, 0) << 0)
#define PMD_TYPE_TABLE (_AT(pmdval_t, 3) << 0)
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 4b1d9eb3908a22..fa5939eb9864ef 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -112,7 +112,7 @@
#ifndef __ASSEMBLY__
#define pud_none(pud) (!pud_val(pud))
-#define pud_bad(pud) (!(pud_val(pud) & 2))
+#define pud_bad(pud) (!(pud_val(pud) & PUD_TABLE_BIT))
#define pud_present(pud) (pud_val(pud))
#define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \
PMD_TYPE_TABLE)
@@ -137,7 +137,7 @@ static inline pmd_t *pud_pgtable(pud_t pud)
return __va(pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK);
}
-#define pmd_bad(pmd) (!(pmd_val(pmd) & 2))
+#define pmd_bad(pmd) (!(pmd_val(pmd) & PMD_TABLE_BIT))
#define copy_pmd(pmdpd,pmdps) \
do { \
@@ -190,7 +190,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
#define pmd_dirty(pmd) (pmd_isset((pmd), L_PMD_SECT_DIRTY))
#define pmd_hugewillfault(pmd) (!pmd_young(pmd) || !pmd_write(pmd))
-#define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd))
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define pmd_trans_huge(pmd) (pmd_val(pmd) && !pmd_table(pmd))
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index fe28fc1f759d93..dab42d066d068b 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -32,6 +32,7 @@
#include <linux/kallsyms.h>
#include <linux/proc_fs.h>
#include <linux/export.h>
+#include <linux/vmalloc.h>
#include <asm/hardware/cache-l2x0.h>
#include <asm/hardware/cache-uniphier.h>
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 72c82a4d63ac20..480e307501bb4b 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -26,6 +26,7 @@
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
#include <linux/irq.h>
+#include <linux/vmalloc.h>
#include <linux/atomic.h>
#include <asm/cacheflush.h>
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index 2f6163f05e9357..c0ac7796d77508 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -56,10 +56,10 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
* to see that it's still huge and whether or not we will
* need to fault on write.
*/
- if (unlikely(pmd_thp_or_huge(*pmd))) {
+ if (unlikely(pmd_leaf(*pmd))) {
ptl = &current->mm->page_table_lock;
spin_lock(ptl);
- if (unlikely(!pmd_thp_or_huge(*pmd)
+ if (unlikely(!pmd_leaf(*pmd)
|| pmd_hugewillfault(*pmd))) {
spin_unlock(ptl);
return 0;
diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index 71b858c9b10c47..1779e12db0850d 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -21,7 +21,6 @@ KASAN_SANITIZE_physaddr.o := n
obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o
obj-$(CONFIG_ALIGNMENT_TRAP) += alignment.o
-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_ARM_PV_FIXUP) += pv-fixup-asm.o
obj-$(CONFIG_CPU_ABRT_NOMMU) += abort-nommu.o
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 439dc6a26bb982..5c4b417e24f9ed 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -294,7 +294,9 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
if (!(vma->vm_flags & vm_flags)) {
vma_end_read(vma);
- goto lock_mmap;
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ fault = VM_FAULT_BADACCESS;
+ goto bad_area;
}
fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs);
if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c
deleted file mode 100644
index dd7a0277c5c094..00000000000000
--- a/arch/arm/mm/hugetlbpage.c
+++ /dev/null
@@ -1,34 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * arch/arm/mm/hugetlbpage.c
- *
- * Copyright (C) 2012 ARM Ltd.
- *
- * Based on arch/x86/include/asm/hugetlb.h and Bill Carson's patches
- */
-
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/pagemap.h>
-#include <linux/err.h>
-#include <linux/sysctl.h>
-#include <asm/mman.h>
-#include <asm/tlb.h>
-#include <asm/tlbflush.h>
-
-/*
- * On ARM, huge pages are backed by pmd's rather than pte's, so we do a lot
- * of type casting from pmd_t * to pte_t *.
- */
-
-int pud_huge(pud_t pud)
-{
- return 0;
-}
-
-int pmd_huge(pmd_t pmd)
-{
- return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
-}
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index a0f8a0ca0788ad..d65d0e6ed10ab4 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -34,7 +34,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_area_struct *vma;
int do_align = 0;
int aliasing = cache_is_vipt_aliasing();
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
/*
* We only need to do colour alignment if either the I or D
@@ -68,7 +68,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
return addr;
}
- info.flags = 0;
info.length = len;
info.low_limit = mm->mmap_base;
info.high_limit = TASK_SIZE;
@@ -87,7 +86,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
unsigned long addr = addr0;
int do_align = 0;
int aliasing = cache_is_vipt_aliasing();
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
/*
* We only need to do colour alignment if either the I or D
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7b11c98b3e84bf..de076a191e9fd6 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -205,7 +205,7 @@ config ARM64
select HAVE_SAMPLE_FTRACE_DIRECT
select HAVE_SAMPLE_FTRACE_DIRECT_MULTI
select HAVE_EFFICIENT_UNALIGNED_ACCESS
- select HAVE_FAST_GUP
+ select HAVE_GUP_FAST
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_ERROR_INJECTION
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 2ddc33d93b13b2..3954cbd2ff56bc 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -18,11 +18,11 @@
extern bool arch_hugetlb_migration_supported(struct hstate *h);
#endif
-static inline void arch_clear_hugepage_flags(struct page *page)
+static inline void arch_clear_hugetlb_flags(struct folio *folio)
{
- clear_bit(PG_dcache_clean, &page->flags);
+ clear_bit(PG_dcache_clean, &folio->flags);
}
-#define arch_clear_hugepage_flags arch_clear_hugepage_flags
+#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
#define arch_make_huge_pte arch_make_huge_pte
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 7d580271a46dc1..7eeb50892319a5 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -49,12 +49,6 @@
__flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1)
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static inline bool arch_thp_swp_supported(void)
-{
- return !system_supports_mte();
-}
-#define arch_thp_swp_supported arch_thp_swp_supported
-
/*
* Outside of a few very special situations (e.g. hibernation), we always
* use broadcast TLB invalidation instructions, therefore a spurious page
@@ -525,8 +519,6 @@ static inline pmd_t pmd_mkinvalid(pmd_t pmd)
return pmd;
}
-#define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd))
-
#define pmd_write(pmd) pte_write(pmd_pte(pmd))
#define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
@@ -717,7 +709,11 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
#define pud_none(pud) (!pud_val(pud))
#define pud_bad(pud) (!pud_table(pud))
#define pud_present(pud) pte_present(pud_pte(pud))
+#ifndef __PAGETABLE_PMD_FOLDED
#define pud_leaf(pud) (pud_present(pud) && !pud_table(pud))
+#else
+#define pud_leaf(pud) false
+#endif
#define pud_valid(pud) pte_valid(pud_pte(pud))
#define pud_user(pud) pte_user(pud_pte(pud))
#define pud_user_exec(pud) pte_user_exec(pud_pte(pud))
@@ -1288,12 +1284,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
#ifdef CONFIG_ARM64_MTE
#define __HAVE_ARCH_PREPARE_TO_SWAP
-static inline int arch_prepare_to_swap(struct page *page)
-{
- if (system_supports_mte())
- return mte_save_tags(page);
- return 0;
-}
+extern int arch_prepare_to_swap(struct folio *folio);
#define __HAVE_ARCH_SWAP_INVALIDATE
static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
@@ -1309,11 +1300,7 @@ static inline void arch_swap_invalidate_area(int type)
}
#define __HAVE_ARCH_SWAP_RESTORE
-static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
-{
- if (system_supports_mte())
- mte_restore_tags(entry, &folio->page);
-}
+extern void arch_swap_restore(swp_entry_t entry, struct folio *folio);
#endif /* CONFIG_ARM64_MTE */
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 9afcc690fe73c2..4a92096db34e0b 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -10,6 +10,7 @@
#include <linux/efi.h>
#include <linux/init.h>
#include <linux/screen_info.h>
+#include <linux/vmalloc.h>
#include <asm/efi.h>
#include <asm/stacktrace.h>
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 8251e2fea9c757..405f9aa831bd20 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -486,25 +486,6 @@ static void do_bad_area(unsigned long far, unsigned long esr,
}
}
-#define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000)
-#define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000)
-
-static vm_fault_t __do_page_fault(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long addr,
- unsigned int mm_flags, unsigned long vm_flags,
- struct pt_regs *regs)
-{
- /*
- * Ok, we have a good vm_area for this memory access, so we can handle
- * it.
- * Check that the permissions on the VMA allow for the fault which
- * occurred.
- */
- if (!(vma->vm_flags & vm_flags))
- return VM_FAULT_BADACCESS;
- return handle_mm_fault(vma, addr, mm_flags, regs);
-}
-
static bool is_el0_instruction_abort(unsigned long esr)
{
return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
@@ -519,6 +500,9 @@ static bool is_write_abort(unsigned long esr)
return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
}
+#define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000)
+#define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000)
+
static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
struct pt_regs *regs)
{
@@ -588,7 +572,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
if (!(vma->vm_flags & vm_flags)) {
vma_end_read(vma);
- goto lock_mmap;
+ fault = VM_FAULT_BADACCESS;
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ goto done;
}
fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs);
if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
@@ -617,7 +603,10 @@ retry:
goto done;
}
- fault = __do_page_fault(mm, vma, addr, mm_flags, vm_flags, regs);
+ if (!(vma->vm_flags & vm_flags))
+ fault = VM_FAULT_BADACCESS;
+ else
+ fault = handle_mm_fault(vma, addr, mm_flags, regs);
/* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index b872b003a55f3d..3f09ac73cce3b2 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -79,20 +79,6 @@ bool arch_hugetlb_migration_supported(struct hstate *h)
}
#endif
-int pmd_huge(pmd_t pmd)
-{
- return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
-}
-
-int pud_huge(pud_t pud)
-{
-#ifndef __PAGETABLE_PMD_FOLDED
- return pud_val(pud) && !(pud_val(pud) & PUD_TABLE_BIT);
-#else
- return 0;
-#endif
-}
-
static int find_num_contig(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, size_t *pgsize)
{
@@ -328,7 +314,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
if (sz != PUD_SIZE && pud_none(pud))
return NULL;
/* hugepage or swap? */
- if (pud_huge(pud) || !pud_present(pud))
+ if (pud_leaf(pud) || !pud_present(pud))
return (pte_t *)pudp;
/* table; check the next level */
@@ -340,7 +326,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
if (!(sz == PMD_SIZE || sz == CONT_PMD_SIZE) &&
pmd_none(pmd))
return NULL;
- if (pmd_huge(pmd) || !pmd_present(pmd))
+ if (pmd_leaf(pmd) || !pmd_present(pmd))
return (pte_t *)pmdp;
if (sz == CONT_PTE_SIZE)
diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c
index a31833e3ddc544..63e8d72f202a3b 100644
--- a/arch/arm64/mm/mteswap.c
+++ b/arch/arm64/mm/mteswap.c
@@ -68,6 +68,13 @@ void mte_invalidate_tags(int type, pgoff_t offset)
mte_free_tag_storage(tags);
}
+static inline void __mte_invalidate_tags(struct page *page)
+{
+ swp_entry_t entry = page_swap_entry(page);
+
+ mte_invalidate_tags(swp_type(entry), swp_offset(entry));
+}
+
void mte_invalidate_tags_area(int type)
{
swp_entry_t entry = swp_entry(type, 0);
@@ -83,3 +90,41 @@ void mte_invalidate_tags_area(int type)
}
xa_unlock(&mte_pages);
}
+
+int arch_prepare_to_swap(struct folio *folio)
+{
+ long i, nr;
+ int err;
+
+ if (!system_supports_mte())
+ return 0;
+
+ nr = folio_nr_pages(folio);
+
+ for (i = 0; i < nr; i++) {
+ err = mte_save_tags(folio_page(folio, i));
+ if (err)
+ goto out;
+ }
+ return 0;
+
+out:
+ while (i--)
+ __mte_invalidate_tags(folio_page(folio, i));
+ return err;
+}
+
+void arch_swap_restore(swp_entry_t entry, struct folio *folio)
+{
+ long i, nr;
+
+ if (!system_supports_mte())
+ return;
+
+ nr = folio_nr_pages(folio);
+
+ for (i = 0; i < nr; i++) {
+ mte_restore_tags(entry, folio_page(folio, i));
+ entry.val++;
+ }
+}
diff --git a/arch/csky/abiv1/mmap.c b/arch/csky/abiv1/mmap.c
index 6792aca4999917..7f826331d409b2 100644
--- a/arch/csky/abiv1/mmap.c
+++ b/arch/csky/abiv1/mmap.c
@@ -28,7 +28,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int do_align = 0;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {
+ .length = len,
+ .low_limit = mm->mmap_base,
+ .high_limit = TASK_SIZE,
+ .align_offset = pgoff << PAGE_SHIFT
+ };
/*
* We only need to do colour alignment if either the I or D
@@ -61,11 +66,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
return addr;
}
- info.flags = 0;
- info.length = len;
- info.low_limit = mm->mmap_base;
- info.high_limit = TASK_SIZE;
info.align_mask = do_align ? (PAGE_MASK & (SHMLBA - 1)) : 0;
- info.align_offset = pgoff << PAGE_SHIFT;
return vm_unmapped_area(&info);
}
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 54ad04dacdee94..331450bd23c78c 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -119,7 +119,7 @@ config LOONGARCH
select HAVE_EBPF_JIT
select HAVE_EFFICIENT_UNALIGNED_ACCESS if !ARCH_STRICT_ALIGN
select HAVE_EXIT_THREAD
- select HAVE_FAST_GUP
+ select HAVE_GUP_FAST
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUNCTION_ERROR_INJECTION
diff --git a/arch/loongarch/include/asm/kfence.h b/arch/loongarch/include/asm/kfence.h
index a6a5760da3a332..92636e82957c7e 100644
--- a/arch/loongarch/include/asm/kfence.h
+++ b/arch/loongarch/include/asm/kfence.h
@@ -10,6 +10,7 @@
#define _ASM_LOONGARCH_KFENCE_H
#include <linux/kfence.h>
+#include <linux/vmalloc.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>
diff --git a/arch/loongarch/mm/hugetlbpage.c b/arch/loongarch/mm/hugetlbpage.c
index 1e76fcb83093dd..12222c56cb5948 100644
--- a/arch/loongarch/mm/hugetlbpage.c
+++ b/arch/loongarch/mm/hugetlbpage.c
@@ -50,21 +50,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
return (pte_t *) pmd;
}
-int pmd_huge(pmd_t pmd)
-{
- return (pmd_val(pmd) & _PAGE_HUGE) != 0;
-}
-
-int pud_huge(pud_t pud)
-{
- return (pud_val(pud) & _PAGE_HUGE) != 0;
-}
-
uint64_t pmd_to_entrylo(unsigned long pmd_val)
{
uint64_t val;
/* PMD as PTE. Must be huge page */
- if (!pmd_huge(__pmd(pmd_val)))
+ if (!pmd_leaf(__pmd(pmd_val)))
panic("%s", __func__);
val = pmd_val ^ _PAGE_HUGE;
diff --git a/arch/loongarch/mm/mmap.c b/arch/loongarch/mm/mmap.c
index 89af7c12e8c08d..8890309851351c 100644
--- a/arch/loongarch/mm/mmap.c
+++ b/arch/loongarch/mm/mmap.c
@@ -25,7 +25,7 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
struct vm_area_struct *vma;
unsigned long addr = addr0;
int do_color_align;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
if (unlikely(len > TASK_SIZE))
return -ENOMEM;
@@ -83,7 +83,6 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
*/
}
- info.flags = 0;
info.low_limit = mm->mmap_base;
info.high_limit = TASK_SIZE;
return vm_unmapped_area(&info);
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 516dc7022bd74a..f1aa1bf11166bc 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -68,7 +68,7 @@ config MIPS
select HAVE_DYNAMIC_FTRACE
select HAVE_EBPF_JIT if !CPU_MICROMIPS
select HAVE_EXIT_THREAD
- select HAVE_FAST_GUP
+ select HAVE_GUP_FAST
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h
index 0e196650f4f488..92b7591aac2acd 100644
--- a/arch/mips/include/asm/pgtable-32.h
+++ b/arch/mips/include/asm/pgtable-32.h
@@ -129,7 +129,7 @@ static inline int pmd_none(pmd_t pmd)
static inline int pmd_bad(pmd_t pmd)
{
#ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
- /* pmd_huge(pmd) but inline */
+ /* pmd_leaf(pmd) but inline */
if (unlikely(pmd_val(pmd) & _PAGE_HUGE))
return 0;
#endif
diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h
index 20ca48c1b6063e..7c28510b3768f0 100644
--- a/arch/mips/include/asm/pgtable-64.h
+++ b/arch/mips/include/asm/pgtable-64.h
@@ -245,7 +245,7 @@ static inline int pmd_none(pmd_t pmd)
static inline int pmd_bad(pmd_t pmd)
{
#ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
- /* pmd_huge(pmd) but inline */
+ /* pmd_leaf(pmd) but inline */
if (unlikely(pmd_val(pmd) & _PAGE_HUGE))
return 0;
#endif
diff --git a/arch/mips/jazz/jazzdma.c b/arch/mips/jazz/jazzdma.c
index eabddb89d221fe..c97b089b99029d 100644
--- a/arch/mips/jazz/jazzdma.c
+++ b/arch/mips/jazz/jazzdma.c
@@ -617,7 +617,7 @@ const struct dma_map_ops jazz_dma_ops = {
.sync_sg_for_device = jazz_dma_sync_sg_for_device,
.mmap = dma_common_mmap,
.get_sgtable = dma_common_get_sgtable,
- .alloc_pages = dma_common_alloc_pages,
+ .alloc_pages_op = dma_common_alloc_pages,
.free_pages = dma_common_free_pages,
};
EXPORT_SYMBOL(jazz_dma_ops);
diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c
index 7eaff5b078739b..0b9e15555b59bb 100644
--- a/arch/mips/mm/hugetlbpage.c
+++ b/arch/mips/mm/hugetlbpage.c
@@ -57,13 +57,3 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
}
return (pte_t *) pmd;
}
-
-int pmd_huge(pmd_t pmd)
-{
- return (pmd_val(pmd) & _PAGE_HUGE) != 0;
-}
-
-int pud_huge(pud_t pud)
-{
- return (pud_val(pud) & _PAGE_HUGE) != 0;
-}
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index 00fe90c6db3e80..7e11d7b5876107 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -34,7 +34,7 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
struct vm_area_struct *vma;
unsigned long addr = addr0;
int do_color_align;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
if (unlikely(len > TASK_SIZE))
return -ENOMEM;
@@ -92,7 +92,6 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
*/
}
- info.flags = 0;
info.low_limit = mm->mmap_base;
info.high_limit = TASK_SIZE;
return vm_unmapped_area(&info);
diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c
index 4106084e57d728..76f3b9c0a9f0ce 100644
--- a/arch/mips/mm/tlb-r4k.c
+++ b/arch/mips/mm/tlb-r4k.c
@@ -326,7 +326,7 @@ void __update_tlb(struct vm_area_struct * vma, unsigned long address, pte_t pte)
idx = read_c0_index();
#ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
/* this could be a huge page */
- if (pmd_huge(*pmdp)) {
+ if (pmd_leaf(*pmdp)) {
unsigned long lo;
write_c0_pagemask(PM_HUGE_MASK);
ptep = (pte_t *)pmdp;
diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c
index 98af719d5f85b2..f7722451276ef4 100644
--- a/arch/parisc/kernel/sys_parisc.c
+++ b/arch/parisc/kernel/sys_parisc.c
@@ -104,7 +104,9 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
struct vm_area_struct *vma, *prev;
unsigned long filp_pgoff;
int do_color_align;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {
+ .length = len
+ };
if (unlikely(len > TASK_SIZE))
return -ENOMEM;
@@ -139,7 +141,6 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
return addr;
}
- info.length = len;
info.align_mask = do_color_align ? (PAGE_MASK & (SHM_COLOUR - 1)) : 0;
info.align_offset = shared_align_offset(filp_pgoff, pgoff);
@@ -160,7 +161,6 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
*/
}
- info.flags = 0;
info.low_limit = mm->mmap_base;
info.high_limit = mmap_upper_limit(NULL);
return vm_unmapped_area(&info);
diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c
index a9f7e21f66567a..0356199bd9e7e7 100644
--- a/arch/parisc/mm/hugetlbpage.c
+++ b/arch/parisc/mm/hugetlbpage.c
@@ -180,14 +180,3 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
}
return changed;
}
-
-
-int pmd_huge(pmd_t pmd)
-{
- return 0;
-}
-
-int pud_huge(pud_t pud)
-{
- return 0;
-}
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1c4be337368604..e42cc8cd415ff4 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -236,7 +236,7 @@ config PPC
select HAVE_DYNAMIC_FTRACE_WITH_REGS if ARCH_USING_PATCHABLE_FUNCTION_ENTRY || MPROFILE_KERNEL || PPC32
select HAVE_EBPF_JIT
select HAVE_EFFICIENT_UNALIGNED_ACCESS
- select HAVE_FAST_GUP
+ select HAVE_GUP_FAST
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUNCTION_DESCRIPTORS if PPC64_ELF_ABI_V1
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
index 48f21820afe209..baf934578c3a5e 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
@@ -6,26 +6,6 @@
*/
#ifndef __ASSEMBLY__
#ifdef CONFIG_HUGETLB_PAGE
-static inline int pmd_huge(pmd_t pmd)
-{
- /*
- * leaf pte for huge page
- */
- if (radix_enabled())
- return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
- return 0;
-}
-
-static inline int pud_huge(pud_t pud)
-{
- /*
- * leaf pte for huge page
- */
- if (radix_enabled())
- return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
- return 0;
-}
-
/*
* With radix , we have hugepage ptes in the pud and pmd entries. We don't
* need to setup hugepage directory for them. Our pte and page directory format
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
index ced7ee8b42fc35..6ac73da7b80e3c 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
@@ -4,31 +4,6 @@
#ifndef __ASSEMBLY__
#ifdef CONFIG_HUGETLB_PAGE
-/*
- * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have
- * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD;
- *
- * Defined in such a way that we can optimize away code block at build time
- * if CONFIG_HUGETLB_PAGE=n.
- *
- * returns true for pmd migration entries, THP, devmap, hugetlb
- * But compile time dependent on CONFIG_HUGETLB_PAGE
- */
-static inline int pmd_huge(pmd_t pmd)
-{
- /*
- * leaf pte for huge page
- */
- return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
-}
-
-static inline int pud_huge(pud_t pud)
-{
- /*
- * leaf pte for huge page
- */
- return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
-}
/*
* With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index fac5615e6bc575..8f9432e3855ae5 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -262,6 +262,18 @@ extern unsigned long __kernel_io_end;
extern struct page *vmemmap;
extern unsigned long pci_io_base;
+
+#define pmd_leaf pmd_leaf
+static inline bool pmd_leaf(pmd_t pmd)
+{
+ return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
+}
+
+#define pud_leaf pud_leaf
+static inline bool pud_leaf(pud_t pud)
+{
+ return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
+}
#endif /* __ASSEMBLY__ */
#include <asm/book3s/64/hash.h>
@@ -1426,20 +1438,5 @@ static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_va
return false;
}
-/*
- * Like pmd_huge(), but works regardless of config options
- */
-#define pmd_leaf pmd_leaf
-static inline bool pmd_leaf(pmd_t pmd)
-{
- return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
-}
-
-#define pud_leaf pud_leaf
-static inline bool pud_leaf(pud_t pud)
-{
- return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
-}
-
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 3b72c7ed24cfde..aa5c0fd5edb1c9 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -406,9 +406,5 @@ extern void *abatron_pteptrs[2];
#include <asm/nohash/mmu.h>
#endif
-#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP)
-#define __HAVE_ARCH_RESERVED_KERNEL_PAGES
-#endif
-
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_MMU_H_ */
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h
index 427db14292c9ec..f5f39d4f03c821 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -351,16 +351,6 @@ static inline int hugepd_ok(hugepd_t hpd)
#endif
}
-static inline int pmd_huge(pmd_t pmd)
-{
- return 0;
-}
-
-static inline int pud_huge(pud_t pud)
-{
- return 0;
-}
-
#define is_hugepd(hpd) (hugepd_ok(hpd))
#endif
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index 8920862ffd7916..f0ae39e77e3741 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -216,6 +216,6 @@ const struct dma_map_ops dma_iommu_ops = {
.get_required_mask = dma_iommu_get_required_mask,
.mmap = dma_common_mmap,
.get_sgtable = dma_common_get_sgtable,
- .alloc_pages = dma_common_alloc_pages,
+ .alloc_pages_op = dma_common_alloc_pages,
.free_pages = dma_common_free_pages,
};
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index d14eda1e858968..ae8c7619e597d9 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -1735,8 +1735,3 @@ static void __init fadump_reserve_crash_area(u64 base)
memblock_reserve(mstart, msize);
}
}
-
-unsigned long __init arch_reserved_kernel_pages(void)
-{
- return memblock_reserved_size() / PAGE_SIZE;
-}
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 29a8c8e185851b..b70b4f93561fcc 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -26,6 +26,7 @@
#include <linux/iommu.h>
#include <linux/sched.h>
#include <linux/debugfs.h>
+#include <linux/vmalloc.h>
#include <asm/io.h>
#include <asm/iommu.h>
#include <asm/pci-bridge.h>
diff --git a/arch/powerpc/mm/book3s64/slice.c b/arch/powerpc/mm/book3s64/slice.c
index c0b58afb9a4762..ef3ce37f1bb3ec 100644
--- a/arch/powerpc/mm/book3s64/slice.c
+++ b/arch/powerpc/mm/book3s64/slice.c
@@ -282,12 +282,10 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
{
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
unsigned long found, next_end;
- struct vm_unmapped_area_info info;
-
- info.flags = 0;
- info.length = len;
- info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
- info.align_offset = 0;
+ struct vm_unmapped_area_info info = {
+ .length = len,
+ .align_mask = PAGE_MASK & ((1ul << pshift) - 1),
+ };
/*
* Check till the allow max value for this mmap request
*/
@@ -326,13 +324,13 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
{
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
unsigned long found, prev;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {
+ .flags = VM_UNMAPPED_AREA_TOPDOWN,
+ .length = len,
+ .align_mask = PAGE_MASK & ((1ul << pshift) - 1),
+ };
unsigned long min_addr = max(PAGE_SIZE, mmap_min_addr);
- info.flags = VM_UNMAPPED_AREA_TOPDOWN;
- info.length = len;
- info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
- info.align_offset = 0;
/*
* If we are trying to allocate above DEFAULT_MAP_WINDOW
* Add the different to the mmap_base.
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 53335ae21a40ae..21569045249594 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -71,23 +71,26 @@ static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long add
return __bad_area_nosemaphore(regs, address, SEGV_MAPERR);
}
-static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code)
+static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code,
+ struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct mm_struct *mm = current->mm;
/*
* Something tried to access memory that isn't in our memory map..
* Fix it, but check if it's kernel or user first..
*/
- mmap_read_unlock(mm);
+ if (mm)
+ mmap_read_unlock(mm);
+ else
+ vma_end_read(vma);
return __bad_area_nosemaphore(regs, address, si_code);
}
static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address,
+ struct mm_struct *mm,
struct vm_area_struct *vma)
{
- struct mm_struct *mm = current->mm;
int pkey;
/*
@@ -109,7 +112,10 @@ static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address,
*/
pkey = vma_pkey(vma);
- mmap_read_unlock(mm);
+ if (mm)
+ mmap_read_unlock(mm);
+ else
+ vma_end_read(vma);
/*
* If we are in kernel mode, bail out with a SEGV, this will
@@ -124,9 +130,10 @@ static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address,
return 0;
}
-static noinline int bad_access(struct pt_regs *regs, unsigned long address)
+static noinline int bad_access(struct pt_regs *regs, unsigned long address,
+ struct mm_struct *mm, struct vm_area_struct *vma)
{
- return __bad_area(regs, address, SEGV_ACCERR);
+ return __bad_area(regs, address, SEGV_ACCERR, mm, vma);
}
static int do_sigbus(struct pt_regs *regs, unsigned long address,
@@ -479,13 +486,13 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
if (unlikely(access_pkey_error(is_write, is_exec,
(error_code & DSISR_KEYFAULT), vma))) {
- vma_end_read(vma);
- goto lock_mmap;
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ return bad_access_pkey(regs, address, NULL, vma);
}
if (unlikely(access_error(is_write, is_exec, vma))) {
- vma_end_read(vma);
- goto lock_mmap;
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ return bad_access(regs, address, NULL, vma);
}
fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
@@ -521,10 +528,10 @@ retry:
if (unlikely(access_pkey_error(is_write, is_exec,
(error_code & DSISR_KEYFAULT), vma)))
- return bad_access_pkey(regs, address, vma);
+ return bad_access_pkey(regs, address, mm, vma);
if (unlikely(access_error(is_write, is_exec, vma)))
- return bad_access(regs, address);
+ return bad_access(regs, address, mm, vma);
/*
* If for any reason at all we couldn't handle the fault,
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 3a440004b97d2b..a197d4c2244b32 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -16,6 +16,7 @@
#include <linux/highmem.h>
#include <linux/suspend.h>
#include <linux/dma-direct.h>
+#include <linux/vmalloc.h>
#include <asm/swiotlb.h>
#include <asm/machdep.h>
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 9b99113cb51a8f..6621cfc3baf86c 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -102,7 +102,7 @@ struct page *p4d_page(p4d_t p4d)
{
if (p4d_leaf(p4d)) {
if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
- VM_WARN_ON(!p4d_huge(p4d));
+ VM_WARN_ON(!p4d_leaf(p4d));
return pte_page(p4d_pte(p4d));
}
return virt_to_page(p4d_pgtable(p4d));
@@ -113,7 +113,7 @@ struct page *pud_page(pud_t pud)
{
if (pud_leaf(pud)) {
if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
- VM_WARN_ON(!pud_huge(pud));
+ VM_WARN_ON(!pud_leaf(pud));
return pte_page(pud_pte(pud));
}
return virt_to_page(pud_pgtable(pud));
@@ -132,7 +132,7 @@ struct page *pmd_page(pmd_t pmd)
* enabled so these checks can't be used.
*/
if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
- VM_WARN_ON(!(pmd_leaf(pmd) || pmd_huge(pmd)));
+ VM_WARN_ON(!pmd_leaf(pmd));
return pte_page(pmd_pte(pmd));
}
return virt_to_page(pmd_page_vaddr(pmd));
diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c
index d6b5f5ecd5152b..56dc6b29a3e769 100644
--- a/arch/powerpc/platforms/ps3/system-bus.c
+++ b/arch/powerpc/platforms/ps3/system-bus.c
@@ -695,7 +695,7 @@ static const struct dma_map_ops ps3_sb_dma_ops = {
.unmap_page = ps3_unmap_page,
.mmap = dma_common_mmap,
.get_sgtable = dma_common_get_sgtable,
- .alloc_pages = dma_common_alloc_pages,
+ .alloc_pages_op = dma_common_alloc_pages,
.free_pages = dma_common_free_pages,
};
@@ -709,7 +709,7 @@ static const struct dma_map_ops ps3_ioc0_dma_ops = {
.unmap_page = ps3_unmap_page,
.mmap = dma_common_mmap,
.get_sgtable = dma_common_get_sgtable,
- .alloc_pages = dma_common_alloc_pages,
+ .alloc_pages_op = dma_common_alloc_pages,
.free_pages = dma_common_free_pages,
};
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index 90ff85c879bfe9..477c1d5e1737ba 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -611,7 +611,7 @@ static const struct dma_map_ops vio_dma_mapping_ops = {
.get_required_mask = dma_iommu_get_required_mask,
.mmap = dma_common_mmap,
.get_sgtable = dma_common_get_sgtable,
- .alloc_pages = dma_common_alloc_pages,
+ .alloc_pages_op = dma_common_alloc_pages,
.free_pages = dma_common_free_pages,
};
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index be09c8836d56be..3ee60ddef93ebf 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -132,7 +132,7 @@ config RISCV
select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER if !XIP_KERNEL && !PREEMPTION
select HAVE_EBPF_JIT if MMU
- select HAVE_FAST_GUP if MMU
+ select HAVE_GUP_FAST if MMU
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_GCC_PLUGINS
diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
index 22deb7a2a6ec4e..b1ce97a9dbfce7 100644
--- a/arch/riscv/include/asm/hugetlb.h
+++ b/arch/riscv/include/asm/hugetlb.h
@@ -5,11 +5,11 @@
#include <asm/cacheflush.h>
#include <asm/page.h>
-static inline void arch_clear_hugepage_flags(struct page *page)
+static inline void arch_clear_hugetlb_flags(struct folio *folio)
{
- clear_bit(PG_dcache_clean, &page->flags);
+ clear_bit(PG_dcache_clean, &folio->flags);
}
-#define arch_clear_hugepage_flags arch_clear_hugepage_flags
+#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
bool arch_hugetlb_migration_supported(struct hstate *h);
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 6afd6bb4882eb2..f2d5973a011bdd 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -648,6 +648,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
#define __pud_to_phys(pud) (__page_val_to_pfn(pud_val(pud)) << PAGE_SHIFT)
+#define pud_pfn pud_pfn
static inline unsigned long pud_pfn(pud_t pud)
{
return ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT);
diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c
index 54260c16f9912a..11c0d2e0becfe9 100644
--- a/arch/riscv/kernel/elf_kexec.c
+++ b/arch/riscv/kernel/elf_kexec.c
@@ -19,6 +19,7 @@
#include <linux/libfdt.h>
#include <linux/types.h>
#include <linux/memblock.h>
+#include <linux/vmalloc.h>
#include <asm/setup.h>
int arch_kimage_file_post_load_cleanup(struct kimage *image)
diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c
index 2f08c14a933d05..71a8b8945b26fd 100644
--- a/arch/riscv/kernel/probes/kprobes.c
+++ b/arch/riscv/kernel/probes/kprobes.c
@@ -6,6 +6,7 @@
#include <linux/extable.h>
#include <linux/slab.h>
#include <linux/stop_machine.h>
+#include <linux/vmalloc.h>
#include <asm/ptrace.h>
#include <linux/uaccess.h>
#include <asm/sections.h>
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index 3ba1d4dde5dd1a..5224f373380225 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -292,7 +292,10 @@ void handle_page_fault(struct pt_regs *regs)
if (unlikely(access_error(cause, vma))) {
vma_end_read(vma);
- goto lock_mmap;
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ tsk->thread.bad_cause = cause;
+ bad_area_nosemaphore(regs, SEGV_ACCERR, addr);
+ return;
}
fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs);
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 5ef2a6891158a6..0ebd968b33c997 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -399,16 +399,6 @@ static bool is_napot_size(unsigned long size)
#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
-int pud_huge(pud_t pud)
-{
- return pud_leaf(pud);
-}
-
-int pmd_huge(pmd_t pmd)
-{
- return pmd_leaf(pmd);
-}
-
static bool __hugetlb_valid_size(unsigned long size)
{
if (size == HPAGE_SIZE)
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 8f01ada6845e36..d9aed4c93ee671 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -174,7 +174,7 @@ config S390
select HAVE_DYNAMIC_FTRACE_WITH_REGS
select HAVE_EBPF_JIT if HAVE_MARCH_Z196_FEATURES
select HAVE_EFFICIENT_UNALIGNED_ACCESS
- select HAVE_FAST_GUP
+ select HAVE_GUP_FAST
select HAVE_FENTRY
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_ARG_ACCESS_API
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index deb198a610395b..ce5f4fe8be4dce 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -39,11 +39,11 @@ static inline int prepare_hugepage_range(struct file *file,
return 0;
}
-static inline void arch_clear_hugepage_flags(struct page *page)
+static inline void arch_clear_hugetlb_flags(struct folio *folio)
{
- clear_bit(PG_arch_1, &page->flags);
+ clear_bit(PG_arch_1, &folio->flags);
}
-#define arch_clear_hugepage_flags arch_clear_hugepage_flags
+#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long sz)
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 60950e7a25f585..2cb2a2e7b34b83 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1405,6 +1405,7 @@ static inline unsigned long pud_deref(pud_t pud)
return (unsigned long)__va(pud_val(pud) & origin_mask);
}
+#define pud_pfn pud_pfn
static inline unsigned long pud_pfn(pud_t pud)
{
return __pa(pud_deref(pud)) >> PAGE_SHIFT;
diff --git a/arch/s390/kernel/cert_store.c b/arch/s390/kernel/cert_store.c
index 554447768bddc7..bf983513dd333b 100644
--- a/arch/s390/kernel/cert_store.c
+++ b/arch/s390/kernel/cert_store.c
@@ -21,6 +21,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
#include <crypto/sha2.h>
#include <keys/user-type.h>
#include <asm/debug.h>
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 1486350a417751..5d6d381aa0be4a 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -20,6 +20,7 @@
#include <linux/gfp.h>
#include <linux/crash_dump.h>
#include <linux/debug_locks.h>
+#include <linux/vmalloc.h>
#include <asm/asm-extable.h>
#include <asm/diag.h>
#include <asm/ipl.h>
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 0c66b32e0f9f1b..65747f15dbec4b 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -325,7 +325,8 @@ static void do_exception(struct pt_regs *regs, int access)
goto lock_mmap;
if (!(vma->vm_flags & access)) {
vma_end_read(vma);
- goto lock_mmap;
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ return handle_fault_error_nolock(regs, SEGV_ACCERR);
}
fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index dc3db86e13ffb8..2675aab4acc700 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -233,16 +233,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
return (pte_t *) pmdp;
}
-int pmd_huge(pmd_t pmd)
-{
- return pmd_leaf(pmd);
-}
-
-int pud_huge(pud_t pud)
-{
- return pud_leaf(pud);
-}
-
bool __init arch_hugetlb_valid_size(unsigned long size)
{
if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
@@ -258,14 +248,12 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
- info.flags = 0;
info.length = len;
info.low_limit = current->mm->mmap_base;
info.high_limit = TASK_SIZE;
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
return vm_unmapped_area(&info);
}
@@ -274,7 +262,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
unsigned long addr;
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
@@ -282,7 +270,6 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
info.low_limit = PAGE_SIZE;
info.high_limit = current->mm->mmap_base;
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
addr = vm_unmapped_area(&info);
/*
@@ -328,7 +315,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
goto check_asce_limit;
}
- if (mm->get_unmapped_area == arch_get_unmapped_area)
+ if (!test_bit(MMF_TOPDOWN, &mm->flags))
addr = hugetlb_get_unmapped_area_bottomup(file, addr, len,
pgoff, flags);
else
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index b14fc0887654d2..2067569465897b 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -86,7 +86,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
@@ -102,7 +102,6 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
goto check_asce_limit;
}
- info.flags = 0;
info.length = len;
info.low_limit = mm->mmap_base;
info.high_limit = TASK_SIZE;
@@ -122,7 +121,7 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long ad
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
/* requested length too big for entire address space */
if (len > TASK_SIZE - mmap_min_addr)
@@ -185,10 +184,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
*/
if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = mmap_base_legacy(random_factor);
- mm->get_unmapped_area = arch_get_unmapped_area;
+ clear_bit(MMF_TOPDOWN, &mm->flags);
} else {
mm->mmap_base = mmap_base(random_factor, rlim_stack);
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ set_bit(MMF_TOPDOWN, &mm->flags);
}
}
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 2ad3e29f0ebec4..7292542f75e8ba 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -38,7 +38,7 @@ config SUPERH
select HAVE_DEBUG_BUGVERBOSE
select HAVE_DEBUG_KMEMLEAK
select HAVE_DYNAMIC_FTRACE
- select HAVE_FAST_GUP if MMU
+ select HAVE_GUP_FAST if MMU
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
select HAVE_FTRACE_MCOUNT_RECORD
diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h
index 4d3ba39e681c4f..75028bd568ba5e 100644
--- a/arch/sh/include/asm/hugetlb.h
+++ b/arch/sh/include/asm/hugetlb.h
@@ -27,11 +27,11 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
return *ptep;
}
-static inline void arch_clear_hugepage_flags(struct page *page)
+static inline void arch_clear_hugetlb_flags(struct folio *folio)
{
- clear_bit(PG_dcache_clean, &page->flags);
+ clear_bit(PG_dcache_clean, &folio->flags);
}
-#define arch_clear_hugepage_flags arch_clear_hugepage_flags
+#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
#include <asm-generic/hugetlb.h>
diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c
index 862046f26981b6..9a1e581cd192d1 100644
--- a/arch/sh/mm/cache-sh4.c
+++ b/arch/sh/mm/cache-sh4.c
@@ -241,13 +241,14 @@ static void sh4_flush_cache_page(void *args)
if ((vma->vm_mm == current->active_mm))
vaddr = NULL;
else {
+ struct folio *folio = page_folio(page);
/*
* Use kmap_coherent or kmap_atomic to do flushes for
* another ASID than the current one.
*/
map_coherent = (current_cpu_data.dcache.n_aliases &&
- test_bit(PG_dcache_clean, &page->flags) &&
- page_mapcount(page));
+ test_bit(PG_dcache_clean, folio_flags(folio, 0)) &&
+ page_mapped(page));
if (map_coherent)
vaddr = kmap_coherent(page, address);
else
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 6cb0ad73dbb9e3..ff209b55285ad2 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -70,13 +70,3 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
return pte;
}
-
-int pmd_huge(pmd_t pmd)
-{
- return 0;
-}
-
-int pud_huge(pud_t pud)
-{
- return 0;
-}
diff --git a/arch/sh/mm/mmap.c b/arch/sh/mm/mmap.c
index b82199878b4594..bee329d4149aa7 100644
--- a/arch/sh/mm/mmap.c
+++ b/arch/sh/mm/mmap.c
@@ -57,7 +57,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int do_colour_align;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
if (flags & MAP_FIXED) {
/* We do not accept a shared mapping if it would violate
@@ -88,7 +88,6 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
return addr;
}
- info.flags = 0;
info.length = len;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = TASK_SIZE;
@@ -106,7 +105,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
int do_colour_align;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
if (flags & MAP_FIXED) {
/* We do not accept a shared mapping if it would violate
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 4d1bafaba942d2..3fe429d73a65b0 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -875,6 +875,7 @@ static inline bool pud_leaf(pud_t pud)
return pte_val(pte) & _PAGE_PMD_HUGE;
}
+#define pud_pfn pud_pfn
static inline unsigned long pud_pfn(pud_t pud)
{
pte_t pte = __pte(pud_val(pud));
@@ -956,7 +957,7 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
#ifdef DCACHE_ALIASING_POSSIBLE
#define __HAVE_ARCH_MOVE_PTE
-#define move_pte(pte, prot, old_addr, new_addr) \
+#define move_pte(pte, old_addr, new_addr) \
({ \
pte_t newpte = (pte); \
if (tlb_type != hypervisor && pte_present(pte)) { \
diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c
index 082a551897ed8f..08a19727795ca4 100644
--- a/arch/sparc/kernel/sys_sparc_32.c
+++ b/arch/sparc/kernel/sys_sparc_32.c
@@ -41,7 +41,7 @@ SYSCALL_DEFINE0(getpagesize)
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
{
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
if (flags & MAP_FIXED) {
/* We do not accept a shared mapping if it would violate
@@ -59,7 +59,6 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
if (!addr)
addr = TASK_UNMAPPED_BASE;
- info.flags = 0;
info.length = len;
info.low_limit = addr;
info.high_limit = TASK_SIZE;
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index 1e9a9e016237bf..d9c3b34ca74470 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -93,7 +93,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
struct vm_area_struct * vma;
unsigned long task_size = TASK_SIZE;
int do_color_align;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
if (flags & MAP_FIXED) {
/* We do not accept a shared mapping if it would violate
@@ -126,7 +126,6 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
return addr;
}
- info.flags = 0;
info.length = len;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = min(task_size, VA_EXCLUDE_START);
@@ -154,7 +153,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
unsigned long task_size = STACK_TOP32;
unsigned long addr = addr0;
int do_color_align;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
/* This should only ever run for 32-bit processes. */
BUG_ON(!test_thread_flag(TIF_32BIT));
@@ -218,14 +217,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags)
{
unsigned long align_goal, addr = -ENOMEM;
- unsigned long (*get_area)(struct file *, unsigned long,
- unsigned long, unsigned long, unsigned long);
-
- get_area = current->mm->get_unmapped_area;
if (flags & MAP_FIXED) {
/* Ok, don't mess with it. */
- return get_area(NULL, orig_addr, len, pgoff, flags);
+ return mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags);
}
flags &= ~MAP_SHARED;
@@ -238,7 +233,8 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u
align_goal = (64UL * 1024);
do {
- addr = get_area(NULL, orig_addr, len + (align_goal - PAGE_SIZE), pgoff, flags);
+ addr = mm_get_unmapped_area(current->mm, NULL, orig_addr,
+ len + (align_goal - PAGE_SIZE), pgoff, flags);
if (!(addr & ~PAGE_MASK)) {
addr = (addr + (align_goal - 1UL)) & ~(align_goal - 1UL);
break;
@@ -256,7 +252,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u
* be obtained.
*/
if (addr & ~PAGE_MASK)
- addr = get_area(NULL, orig_addr, len, pgoff, flags);
+ addr = mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags);
return addr;
}
@@ -292,7 +288,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
gap == RLIM_INFINITY ||
sysctl_legacy_va_layout) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
- mm->get_unmapped_area = arch_get_unmapped_area;
+ clear_bit(MMF_TOPDOWN, &mm->flags);
} else {
/* We know it's 32-bit */
unsigned long task_size = STACK_TOP32;
@@ -303,7 +299,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
gap = (task_size / 6 * 5);
mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor);
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ set_bit(MMF_TOPDOWN, &mm->flags);
}
}
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index b432500c13a5d8..cc91ca7a1e182c 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -31,17 +31,15 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp,
{
struct hstate *h = hstate_file(filp);
unsigned long task_size = TASK_SIZE;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
if (test_thread_flag(TIF_32BIT))
task_size = STACK_TOP32;
- info.flags = 0;
info.length = len;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = min(task_size, VA_EXCLUDE_START);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
addr = vm_unmapped_area(&info);
if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) {
@@ -63,7 +61,7 @@ hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
struct hstate *h = hstate_file(filp);
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
/* This should only ever run for 32-bit processes. */
BUG_ON(!test_thread_flag(TIF_32BIT));
@@ -73,7 +71,6 @@ hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
info.low_limit = PAGE_SIZE;
info.high_limit = mm->mmap_base;
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
addr = vm_unmapped_area(&info);
/*
@@ -123,7 +120,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
(!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
- if (mm->get_unmapped_area == arch_get_unmapped_area)
+ if (!test_bit(MMF_TOPDOWN, &mm->flags))
return hugetlb_get_unmapped_area_bottomup(file, addr, len,
pgoff, flags);
else
@@ -407,18 +404,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
return entry;
}
-int pmd_huge(pmd_t pmd)
-{
- return !pmd_none(pmd) &&
- (pmd_val(pmd) & (_PAGE_VALID|_PAGE_PMD_HUGE)) != _PAGE_VALID;
-}
-
-int pud_huge(pud_t pud)
-{
- return !pud_none(pud) &&
- (pud_val(pud) & (_PAGE_VALID|_PAGE_PUD_HUGE)) != _PAGE_VALID;
-}
-
static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
unsigned long addr)
{
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index b44d79d778c718..19642f7ffb52a5 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -183,12 +183,12 @@ static void __set_pmd_acct(struct mm_struct *mm, unsigned long addr,
* hugetlb_pte_count.
*/
if (pmd_val(pmd) & _PAGE_PMD_HUGE) {
- if (is_huge_zero_page(pmd_page(pmd)))
+ if (is_huge_zero_pmd(pmd))
mm->context.hugetlb_pte_count++;
else
mm->context.thp_pte_count++;
} else {
- if (is_huge_zero_page(pmd_page(orig)))
+ if (is_huge_zero_pmd(orig))
mm->context.hugetlb_pte_count--;
else
mm->context.thp_pte_count--;
@@ -259,7 +259,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
* Sanity check pmd before doing the actual decrement.
*/
if ((pmd_val(entry) & _PAGE_PMD_HUGE) &&
- !is_huge_zero_page(pmd_page(entry)))
+ !is_huge_zero_pmd(entry))
(vma->vm_mm)->context.thp_pte_count--;
return old;
diff --git a/arch/um/include/shared/um_malloc.h b/arch/um/include/shared/um_malloc.h
index 13da93284c2c7f..bf503658f08eb3 100644
--- a/arch/um/include/shared/um_malloc.h
+++ b/arch/um/include/shared/um_malloc.h
@@ -11,7 +11,8 @@
extern void *uml_kmalloc(int size, int flags);
extern void kfree(const void *ptr);
-extern void *vmalloc(unsigned long size);
+extern void *vmalloc_noprof(unsigned long size);
+#define vmalloc(...) vmalloc_noprof(__VA_ARGS__)
extern void vfree(void *ptr);
#endif /* __UM_MALLOC_H__ */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 928820e61cb502..59df6c4180810a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -222,7 +222,7 @@ config X86
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_EISA
select HAVE_EXIT_THREAD
- select HAVE_FAST_GUP
+ select HAVE_GUP_FAST
select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 294cd2a4081812..7452fc193b4f9e 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -42,6 +42,7 @@
#include <asm/early_ioremap.h>
#include <asm/pgtable_types.h>
#include <asm/shared/io.h>
+#include <asm/special_insns.h>
#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 315535ffb2582c..273f7557218cd6 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -234,6 +234,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
}
+#define pud_pfn pud_pfn
static inline unsigned long pud_pfn(pud_t pud)
{
phys_addr_t pfn = pud_val(pud);
@@ -1200,7 +1201,6 @@ static inline int pgd_none(pgd_t pgd)
extern int direct_gbpages;
void init_mem_mapping(void);
void early_alloc_pgt_buf(void);
-extern void memblock_find_dma_reserve(void);
void __init poking_init(void);
unsigned long init_memory_mapping(unsigned long start,
unsigned long end, pgprot_t prot);
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 7e9db77231ac7f..3c4407271d0838 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -245,6 +245,7 @@ extern void cleanup_highmap(void);
#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+#define HAVE_ARCH_UNMAPPED_AREA_VMFLAGS
#define PAGE_AGP PAGE_KERNEL_NOCACHE
#define HAVE_PAGE_AGP 1
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 2ae98f754e5918..c884deca839b2e 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -676,7 +676,7 @@ static const struct dma_map_ops gart_dma_ops = {
.get_sgtable = dma_common_get_sgtable,
.dma_supported = dma_direct_supported,
.get_required_mask = dma_direct_get_required_mask,
- .alloc_pages = dma_direct_alloc_pages,
+ .alloc_pages_op = dma_direct_alloc_pages,
.free_pages = dma_direct_free_pages,
};
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 262f5fb18d74d3..22b65a5f5ec6c4 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -113,7 +113,7 @@ static unsigned long sgx_get_unmapped_area(struct file *file,
if (flags & MAP_FIXED)
return addr;
- return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+ return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
}
#ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 166692f2d50111..27892e57c4ef93 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -13,6 +13,7 @@
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index fe0c859873d17e..ade0043ce56efe 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,6 +18,7 @@
#include <linux/uaccess.h>
#include <linux/smp.h>
#include <linux/sched/task_stack.h>
+#include <linux/vmalloc.h>
#include <asm/cpu_entry_area.h>
#include <asm/softirq_stack.h>
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e125e059e2c45d..47e7fcdbdacd5b 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1107,8 +1107,6 @@ void __init setup_arch(char **cmdline_p)
*/
arch_reserve_crashkernel();
- memblock_find_dma_reserve();
-
if (!early_xdbc_setup_hardware())
early_xdbc_register_console();
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index cb9fa1d5c66f73..01d7cd85ef9789 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -112,13 +112,21 @@ static void find_start_end(unsigned long addr, unsigned long flags,
*end = task_size_64bit(addr > DEFAULT_MAP_WINDOW);
}
+static inline unsigned long stack_guard_placement(vm_flags_t vm_flags)
+{
+ if (vm_flags & VM_SHADOW_STACK)
+ return PAGE_SIZE;
+
+ return 0;
+}
+
unsigned long
-arch_get_unmapped_area(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
+arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
unsigned long begin, end;
if (flags & MAP_FIXED)
@@ -137,12 +145,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
return addr;
}
- info.flags = 0;
info.length = len;
info.low_limit = begin;
info.high_limit = end;
- info.align_mask = 0;
info.align_offset = pgoff << PAGE_SHIFT;
+ info.start_gap = stack_guard_placement(vm_flags);
if (filp) {
info.align_mask = get_align_mask();
info.align_offset += get_align_bits();
@@ -151,14 +158,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
}
unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
- const unsigned long len, const unsigned long pgoff,
- const unsigned long flags)
+arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr0,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, vm_flags_t vm_flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
/* requested length too big for entire address space */
if (len > TASK_SIZE)
@@ -192,6 +199,7 @@ get_unmapped_area:
info.low_limit = PAGE_SIZE;
info.high_limit = get_mmap_base(0);
+ info.start_gap = stack_guard_placement(vm_flags);
/*
* If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
@@ -203,7 +211,6 @@ get_unmapped_area:
if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall())
info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
- info.align_mask = 0;
info.align_offset = pgoff << PAGE_SHIFT;
if (filp) {
info.align_mask = get_align_mask();
@@ -223,3 +230,18 @@ bottomup:
*/
return arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
}
+
+unsigned long
+arch_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ return arch_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
+}
+
+unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr,
+ const unsigned long len, const unsigned long pgoff,
+ const unsigned long flags)
+{
+ return arch_get_unmapped_area_topdown_vmflags(filp, addr, len, pgoff, flags, 0);
+}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 622d12ec7f0851..67b18adc75ddc1 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,6 +20,7 @@
#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h>
#include <linux/mm.h> /* find_and_lock_vma() */
+#include <linux/vmalloc.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
@@ -865,14 +866,17 @@ bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, u32 pkey, int si_code)
+ unsigned long address, struct mm_struct *mm,
+ struct vm_area_struct *vma, u32 pkey, int si_code)
{
- struct mm_struct *mm = current->mm;
/*
* Something tried to access memory that isn't in our memory map..
* Fix it, but check if it's kernel or user first..
*/
- mmap_read_unlock(mm);
+ if (mm)
+ mmap_read_unlock(mm);
+ else
+ vma_end_read(vma);
__bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
}
@@ -896,7 +900,8 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, struct vm_area_struct *vma)
+ unsigned long address, struct mm_struct *mm,
+ struct vm_area_struct *vma)
{
/*
* This OSPKE check is not strictly necessary at runtime.
@@ -926,9 +931,9 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
*/
u32 pkey = vma_pkey(vma);
- __bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
+ __bad_area(regs, error_code, address, mm, vma, pkey, SEGV_PKUERR);
} else {
- __bad_area(regs, error_code, address, 0, SEGV_ACCERR);
+ __bad_area(regs, error_code, address, mm, vma, 0, SEGV_ACCERR);
}
}
@@ -1356,8 +1361,9 @@ void do_user_addr_fault(struct pt_regs *regs,
goto lock_mmap;
if (unlikely(access_error(error_code, vma))) {
- vma_end_read(vma);
- goto lock_mmap;
+ bad_area_access_error(regs, error_code, address, NULL, vma);
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ return;
}
fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
@@ -1393,7 +1399,7 @@ retry:
* we can handle it..
*/
if (unlikely(access_error(error_code, vma))) {
- bad_area_access_error(regs, error_code, address, vma);
+ bad_area_access_error(regs, error_code, address, mm, vma);
return;
}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 5804bbae4f0125..807a5859a3c4b3 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -19,41 +19,14 @@
#include <asm/tlbflush.h>
#include <asm/elf.h>
-/*
- * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal
- * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry.
- * Otherwise, returns 0.
- */
-int pmd_huge(pmd_t pmd)
-{
- return !pmd_none(pmd) &&
- (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
-}
-
-/*
- * pud_huge() returns 1 if @pud is hugetlb related entry, that is normal
- * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry.
- * Otherwise, returns 0.
- */
-int pud_huge(pud_t pud)
-{
-#if CONFIG_PGTABLE_LEVELS > 2
- return !pud_none(pud) &&
- (pud_val(pud) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
-#else
- return 0;
-#endif
-}
-
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
- info.flags = 0;
info.length = len;
info.low_limit = get_mmap_base(1);
@@ -65,7 +38,6 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
return vm_unmapped_area(&info);
}
@@ -74,7 +46,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
@@ -89,7 +61,6 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
addr = vm_unmapped_area(&info);
/*
@@ -141,7 +112,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
}
get_unmapped_area:
- if (mm->get_unmapped_area == arch_get_unmapped_area)
+ if (!test_bit(MMF_TOPDOWN, &mm->flags))
return hugetlb_get_unmapped_area_bottomup(file, addr, len,
pgoff, flags);
else
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 679893ea5e6873..615f0bf4bda61a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -990,53 +990,6 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
}
#endif
-/*
- * Calculate the precise size of the DMA zone (first 16 MB of RAM),
- * and pass it to the MM layer - to help it set zone watermarks more
- * accurately.
- *
- * Done on 64-bit systems only for the time being, although 32-bit systems
- * might benefit from this as well.
- */
-void __init memblock_find_dma_reserve(void)
-{
-#ifdef CONFIG_X86_64
- u64 nr_pages = 0, nr_free_pages = 0;
- unsigned long start_pfn, end_pfn;
- phys_addr_t start_addr, end_addr;
- int i;
- u64 u;
-
- /*
- * Iterate over all memory ranges (free and reserved ones alike),
- * to calculate the total number of pages in the first 16 MB of RAM:
- */
- nr_pages = 0;
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
- start_pfn = min(start_pfn, MAX_DMA_PFN);
- end_pfn = min(end_pfn, MAX_DMA_PFN);
-
- nr_pages += end_pfn - start_pfn;
- }
-
- /*
- * Iterate over free memory ranges to calculate the number of free
- * pages in the DMA zone, while not counting potential partial
- * pages at the beginning or the end of the range:
- */
- nr_free_pages = 0;
- for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
- start_pfn = min_t(unsigned long, PFN_UP(start_addr), MAX_DMA_PFN);
- end_pfn = min_t(unsigned long, PFN_DOWN(end_addr), MAX_DMA_PFN);
-
- if (start_pfn < end_pfn)
- nr_free_pages += end_pfn - start_pfn;
- }
-
- set_dma_reserve(nr_pages - nr_free_pages);
-#endif
-}
-
void __init zone_sizes_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index c90c20904a6075..a2cabb1c81e1ae 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -129,9 +129,9 @@ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
if (mmap_is_legacy())
- mm->get_unmapped_area = arch_get_unmapped_area;
+ clear_bit(MMF_TOPDOWN, &mm->flags);
else
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ set_bit(MMF_TOPDOWN, &mm->flags);
arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
arch_rnd(mmap64_rnd_bits), task_size_64bit(0),
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 025fd7ea5d69f5..65fda406e6f265 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -24,6 +24,7 @@
#include <linux/memblock.h>
#include <linux/init.h>
+#include <linux/vmalloc.h>
#include <asm/pgtable_areas.h>
#include "numa_internal.h"
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 36b603d0cddefc..d01c3b0bd6eb1c 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -39,6 +39,7 @@
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <linux/fs.h>
#include <linux/rbtree.h>
@@ -947,6 +948,32 @@ static void free_pfn_range(u64 paddr, unsigned long size)
memtype_free(paddr, paddr + size);
}
+static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
+ resource_size_t *phys)
+{
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ return -EINVAL;
+
+ if (follow_pte(vma->vm_mm, vma->vm_start, &ptep, &ptl))
+ return -EINVAL;
+
+ pte = ptep_get(ptep);
+
+ /* Never return PFNs of anon folios in COW mappings. */
+ if (vm_normal_folio(vma, vma->vm_start, pte)) {
+ pte_unmap_unlock(ptep, ptl);
+ return -EINVAL;
+ }
+
+ *prot = pgprot_val(pte_pgprot(pte));
+ *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+}
+
static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
pgprot_t *pgprot)
{
@@ -964,7 +991,7 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
* detect the PFN. If we need the cachemode as well, we're out of luck
* for now and have to fail fork().
*/
- if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) {
+ if (!follow_phys(vma, &prot, paddr)) {
if (pgprot)
*pgprot = __pgprot(prot);
return 0;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index d007591b80597a..94767c82fc0d72 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -731,7 +731,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
return 0;
/* Bail out if we are we on a populated non-leaf entry: */
- if (pud_present(*pud) && !pud_huge(*pud))
+ if (pud_present(*pud) && !pud_leaf(*pud))
return 0;
set_pte((pte_t *)pud, pfn_pte(
@@ -760,7 +760,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
}
/* Bail out if we are we on a populated non-leaf entry: */
- if (pmd_present(*pmd) && !pmd_huge(*pmd))
+ if (pmd_present(*pmd) && !pmd_leaf(*pmd))
return 0;
set_pte((pte_t *)pmd, pfn_pte(
diff --git a/arch/xtensa/mm/cache.c b/arch/xtensa/mm/cache.c
index 7ec66a79f4723c..23be0e7516cede 100644
--- a/arch/xtensa/mm/cache.c
+++ b/arch/xtensa/mm/cache.c
@@ -87,12 +87,13 @@ static inline void *coherent_kvaddr(struct page *page, unsigned long base,
void clear_user_highpage(struct page *page, unsigned long vaddr)
{
+ struct folio *folio = page_folio(page);
unsigned long paddr;
void *kvaddr = coherent_kvaddr(page, TLBTEMP_BASE_1, vaddr, &paddr);
preempt_disable();
kmap_invalidate_coherent(page, vaddr);
- set_bit(PG_arch_1, &page->flags);
+ set_bit(PG_arch_1, folio_flags(folio, 0));
clear_page_alias(kvaddr, paddr);
preempt_enable();
}
@@ -101,6 +102,7 @@ EXPORT_SYMBOL(clear_user_highpage);
void copy_user_highpage(struct page *dst, struct page *src,
unsigned long vaddr, struct vm_area_struct *vma)
{
+ struct folio *folio = page_folio(dst);
unsigned long dst_paddr, src_paddr;
void *dst_vaddr = coherent_kvaddr(dst, TLBTEMP_BASE_1, vaddr,
&dst_paddr);
@@ -109,7 +111,7 @@ void copy_user_highpage(struct page *dst, struct page *src,
preempt_disable();
kmap_invalidate_coherent(dst, vaddr);
- set_bit(PG_arch_1, &dst->flags);
+ set_bit(PG_arch_1, folio_flags(folio, 0));
copy_page_alias(dst_vaddr, src_vaddr, dst_paddr, src_paddr);
preempt_enable();
}
diff --git a/drivers/accel/ivpu/ivpu_mmu_context.c b/drivers/accel/ivpu/ivpu_mmu_context.c
index fe61612992364c..128aef8e5a1999 100644
--- a/drivers/accel/ivpu/ivpu_mmu_context.c
+++ b/drivers/accel/ivpu/ivpu_mmu_context.c
@@ -6,6 +6,7 @@
#include <linux/bitfield.h>
#include <linux/highmem.h>
#include <linux/set_memory.h>
+#include <linux/vmalloc.h>
#include <drm/drm_cache.h>
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index f0639df6cd184a..4cf38f7d3e0a3d 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1568,7 +1568,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
* Corresponding ZRAM slot should be locked.
*/
static int zram_recompress(struct zram *zram, u32 index, struct page *page,
- u32 threshold, u32 prio, u32 prio_max)
+ u64 *num_recomp_pages, u32 threshold, u32 prio,
+ u32 prio_max)
{
struct zcomp_strm *zstrm = NULL;
unsigned long handle_old;
@@ -1645,6 +1646,15 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page,
if (!zstrm)
return 0;
+ /*
+ * Decrement the limit (if set) on pages we can recompress, even
+ * when current recompression was unsuccessful or did not compress
+ * the page below the threshold, because we still spent resources
+ * on it.
+ */
+ if (*num_recomp_pages)
+ *num_recomp_pages -= 1;
+
if (class_index_new >= class_index_old) {
/*
* Secondary algorithms failed to re-compress the page
@@ -1710,6 +1720,7 @@ static ssize_t recompress_store(struct device *dev,
struct zram *zram = dev_to_zram(dev);
unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
char *args, *param, *val, *algo = NULL;
+ u64 num_recomp_pages = ULLONG_MAX;
u32 mode = 0, threshold = 0;
unsigned long index;
struct page *page;
@@ -1732,6 +1743,17 @@ static ssize_t recompress_store(struct device *dev,
continue;
}
+ if (!strcmp(param, "max_pages")) {
+ /*
+ * Limit the number of entries (pages) we attempt to
+ * recompress.
+ */
+ ret = kstrtoull(val, 10, &num_recomp_pages);
+ if (ret)
+ return ret;
+ continue;
+ }
+
if (!strcmp(param, "threshold")) {
/*
* We will re-compress only idle objects equal or
@@ -1788,6 +1810,9 @@ static ssize_t recompress_store(struct device *dev,
for (index = 0; index < nr_pages; index++) {
int err = 0;
+ if (!num_recomp_pages)
+ break;
+
zram_slot_lock(zram, index);
if (!zram_allocated(zram, index))
@@ -1807,8 +1832,8 @@ static ssize_t recompress_store(struct device *dev,
zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
goto next;
- err = zram_recompress(zram, index, page, threshold,
- prio, prio_max);
+ err = zram_recompress(zram, index, page, &num_recomp_pages,
+ threshold, prio, prio_max);
next:
zram_slot_unlock(zram, index);
if (err) {
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 3c6670cf905f11..9b80e622ae80a8 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -544,7 +544,7 @@ static unsigned long get_unmapped_area_zero(struct file *file,
}
/* Otherwise flags & MAP_PRIVATE: with no shmem object beneath it */
- return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+ return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
#else
return -ENOSYS;
#endif
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 93ebedc5ec8ca3..47c126d37b59ab 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -329,14 +329,14 @@ static unsigned long dax_get_unmapped_area(struct file *filp,
if ((off + len_align) < off)
goto out;
- addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
- pgoff, flags);
+ addr_align = mm_get_unmapped_area(current->mm, filp, addr, len_align,
+ pgoff, flags);
if (!IS_ERR_VALUE(addr_align)) {
addr_align += (off - addr_align) & (align - 1);
return addr_align;
}
out:
- return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+ return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
}
static const struct address_space_operations dev_dax_aops = {
diff --git a/drivers/gpu/drm/gma500/mmu.c b/drivers/gpu/drm/gma500/mmu.c
index a70b01ccdf7022..4d78b33eaa8230 100644
--- a/drivers/gpu/drm/gma500/mmu.c
+++ b/drivers/gpu/drm/gma500/mmu.c
@@ -5,6 +5,7 @@
**************************************************************************/
#include <linux/highmem.h>
+#include <linux/vmalloc.h>
#include "mmu.h"
#include "psb_drv.h"
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
index 0ba955611dfb54..8780aa24310535 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
@@ -5,6 +5,7 @@
*/
#include <drm/drm_cache.h>
+#include <linux/vmalloc.h>
#include "gt/intel_gt.h"
#include "gt/intel_tlb.h"
diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c
index b2a5882b8f81a0..07565701873922 100644
--- a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c
@@ -4,6 +4,7 @@
* Copyright © 2016 Intel Corporation
*/
+#include <linux/vmalloc.h>
#include "mock_dmabuf.h"
static struct sg_table *mock_map_dma_buf(struct dma_buf_attachment *attachment,
diff --git a/drivers/gpu/drm/i915/gt/shmem_utils.c b/drivers/gpu/drm/i915/gt/shmem_utils.c
index bccc3a1200bc6e..1fb6ff77fd8991 100644
--- a/drivers/gpu/drm/i915/gt/shmem_utils.c
+++ b/drivers/gpu/drm/i915/gt/shmem_utils.c
@@ -7,6 +7,7 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/shmem_fs.h>
+#include <linux/vmalloc.h>
#include "i915_drv.h"
#include "gem/i915_gem_object.h"
diff --git a/drivers/gpu/drm/i915/gvt/firmware.c b/drivers/gpu/drm/i915/gvt/firmware.c
index 4dd52ac2043e7a..d800d267f0e9bd 100644
--- a/drivers/gpu/drm/i915/gvt/firmware.c
+++ b/drivers/gpu/drm/i915/gvt/firmware.c
@@ -30,6 +30,7 @@
#include <linux/firmware.h>
#include <linux/crc32.h>
+#include <linux/vmalloc.h>
#include "i915_drv.h"
#include "gvt.h"
diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
index 094fca9b0e73d1..58cca4906f4135 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.c
+++ b/drivers/gpu/drm/i915/gvt/gtt.c
@@ -39,6 +39,7 @@
#include "trace.h"
#include "gt/intel_gt_regs.h"
+#include <linux/vmalloc.h>
#if defined(VERBOSE_DEBUG)
#define gvt_vdbg_mm(fmt, args...) gvt_dbg_mm(fmt, ##args)
diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c
index efcb00472be247..ea9c300927671f 100644
--- a/drivers/gpu/drm/i915/gvt/handlers.c
+++ b/drivers/gpu/drm/i915/gvt/handlers.c
@@ -52,6 +52,7 @@
#include "display/skl_watermark_regs.h"
#include "display/vlv_dsi_pll_regs.h"
#include "gt/intel_gt_regs.h"
+#include <linux/vmalloc.h>
/* XXX FIXME i915 has changed PP_XXX definition */
#define PCH_PP_STATUS _MMIO(0xc7200)
diff --git a/drivers/gpu/drm/i915/gvt/mmio.c b/drivers/gpu/drm/i915/gvt/mmio.c
index 5b5def6ddef7ae..780762f28aa4fb 100644
--- a/drivers/gpu/drm/i915/gvt/mmio.c
+++ b/drivers/gpu/drm/i915/gvt/mmio.c
@@ -33,6 +33,7 @@
*
*/
+#include <linux/vmalloc.h>
#include "i915_drv.h"
#include "i915_reg.h"
#include "gvt.h"
diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c
index 08ad1bd651f103..63c751ca411964 100644
--- a/drivers/gpu/drm/i915/gvt/vgpu.c
+++ b/drivers/gpu/drm/i915/gvt/vgpu.c
@@ -34,6 +34,7 @@
#include "i915_drv.h"
#include "gvt.h"
#include "i915_pvinfo.h"
+#include <linux/vmalloc.h>
void populate_pvinfo_page(struct intel_vgpu *vgpu)
{
diff --git a/drivers/gpu/drm/i915/intel_gvt.c b/drivers/gpu/drm/i915/intel_gvt.c
index 9b6d87c8b5831c..5a01d60e51861e 100644
--- a/drivers/gpu/drm/i915/intel_gvt.c
+++ b/drivers/gpu/drm/i915/intel_gvt.c
@@ -28,6 +28,7 @@
#include "gt/intel_context.h"
#include "gt/intel_ring.h"
#include "gt/shmem_utils.h"
+#include <linux/vmalloc.h>
/**
* DOC: Intel GVT-g host support
diff --git a/drivers/gpu/drm/imagination/pvr_vm_mips.c b/drivers/gpu/drm/imagination/pvr_vm_mips.c
index b7fef3c797e6ca..6563dcde109cda 100644
--- a/drivers/gpu/drm/imagination/pvr_vm_mips.c
+++ b/drivers/gpu/drm/imagination/pvr_vm_mips.c
@@ -14,6 +14,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/types.h>
+#include <linux/vmalloc.h>
/**
* pvr_vm_mips_init() - Initialise MIPS FW pagetable
diff --git a/drivers/gpu/drm/mediatek/mtk_drm_gem.c b/drivers/gpu/drm/mediatek/mtk_drm_gem.c
index 4f2e3feabc0f8a..3e519869b632c6 100644
--- a/drivers/gpu/drm/mediatek/mtk_drm_gem.c
+++ b/drivers/gpu/drm/mediatek/mtk_drm_gem.c
@@ -4,6 +4,7 @@
*/
#include <linux/dma-buf.h>
+#include <linux/vmalloc.h>
#include <drm/drm.h>
#include <drm/drm_device.h>
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
index 3421e8389222a4..9ea0c64c26b5c8 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -9,6 +9,7 @@
#include <linux/shmem_fs.h>
#include <linux/spinlock.h>
#include <linux/pfn_t.h>
+#include <linux/vmalloc.h>
#include <drm/drm_prime.h>
#include <drm/drm_vma_manager.h>
diff --git a/drivers/gpu/drm/v3d/v3d_bo.c b/drivers/gpu/drm/v3d/v3d_bo.c
index a07ede668cc161..a165cbcdd27b94 100644
--- a/drivers/gpu/drm/v3d/v3d_bo.c
+++ b/drivers/gpu/drm/v3d/v3d_bo.c
@@ -21,6 +21,7 @@
#include <linux/dma-buf.h>
#include <linux/pfn_t.h>
+#include <linux/vmalloc.h>
#include "v3d_drv.h"
#include "uapi/drm/v3d_drm.h"
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_binding.c b/drivers/gpu/drm/vmwgfx/vmwgfx_binding.c
index ae2de914eb890e..2731f6ded1c28c 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_binding.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_binding.c
@@ -54,6 +54,7 @@
#include "vmwgfx_drv.h"
#include "vmwgfx_binding.h"
#include "device_include/svga3d_reg.h"
+#include <linux/vmalloc.h>
#define VMW_BINDING_RT_BIT 0
#define VMW_BINDING_PS_BIT 1
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c
index 195ff8792e5aba..dd4ca6a9c690bd 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c
@@ -31,6 +31,7 @@
#include <drm/ttm/ttm_placement.h>
#include <linux/sched/signal.h>
+#include <linux/vmalloc.h>
bool vmw_supports_3d(struct vmw_private *dev_priv)
{
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_devcaps.c b/drivers/gpu/drm/vmwgfx/vmwgfx_devcaps.c
index 829df395c2ed7f..6e6beff9e2621b 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_devcaps.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_devcaps.c
@@ -25,6 +25,7 @@
*
**************************************************************************/
+#include <linux/vmalloc.h>
#include "vmwgfx_devcaps.h"
#include "vmwgfx_drv.h"
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 58fb40c93100a8..bd6cebadf37ee6 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -53,6 +53,7 @@
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/version.h>
+#include <linux/vmalloc.h>
#define VMWGFX_DRIVER_DESC "Linux drm driver for VMware graphics devices"
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
index cc3086e649eb5c..2e52d73eba4840 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
@@ -35,6 +35,7 @@
#include <linux/sync_file.h>
#include <linux/hashtable.h>
+#include <linux/vmalloc.h>
/*
* Helper macro to get dx_ctx_node if available otherwise print an error
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c
index a1da5678c73140..835d1eed8dd931 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c
@@ -31,6 +31,7 @@
#include <drm/vmwgfx_drm.h>
#include <linux/pci.h>
+#include <linux/vmalloc.h>
int vmw_getparam_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv)
diff --git a/drivers/gpu/drm/xen/xen_drm_front_gem.c b/drivers/gpu/drm/xen/xen_drm_front_gem.c
index 3ad2b4cfd1f0b2..63112ed975c472 100644
--- a/drivers/gpu/drm/xen/xen_drm_front_gem.c
+++ b/drivers/gpu/drm/xen/xen_drm_front_gem.c
@@ -11,6 +11,7 @@
#include <linux/dma-buf.h>
#include <linux/scatterlist.h>
#include <linux/shmem_fs.h>
+#include <linux/vmalloc.h>
#include <drm/drm_gem.h>
#include <drm/drm_prime.h>
diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c
index 6136776482e6d8..96a32b21366994 100644
--- a/drivers/hwtracing/coresight/coresight-trbe.c
+++ b/drivers/hwtracing/coresight/coresight-trbe.c
@@ -17,6 +17,7 @@
#include <asm/barrier.h>
#include <asm/cpufeature.h>
+#include <linux/vmalloc.h>
#include "coresight-self-hosted-trace.h"
#include "coresight-trbe.h"
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index f482aab420f787..52575ba9a141fe 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -134,13 +134,14 @@ static inline int get_pci_sbdf_id(struct pci_dev *pdev)
return PCI_SEG_DEVID_TO_SBDF(seg, devid);
}
-static inline void *alloc_pgtable_page(int nid, gfp_t gfp)
+static inline void *alloc_pgtable_page_noprof(int nid, gfp_t gfp)
{
struct page *page;
- page = alloc_pages_node(nid, gfp | __GFP_ZERO, 0);
+ page = alloc_pages_node_noprof(nid, gfp | __GFP_ZERO, 0);
return page ? page_address(page) : NULL;
}
+#define alloc_pgtable_page(...) alloc_hooks(alloc_pgtable_page_noprof(__VA_ARGS__))
/*
* This must be called after device probe completes. During probe
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index e4cb26f6a9434e..07d087eecc17c3 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1723,7 +1723,7 @@ static const struct dma_map_ops iommu_dma_ops = {
.flags = DMA_F_PCI_P2PDMA_SUPPORTED,
.alloc = iommu_dma_alloc,
.free = iommu_dma_free,
- .alloc_pages = dma_common_alloc_pages,
+ .alloc_pages_op = dma_common_alloc_pages,
.free_pages = dma_common_free_pages,
.alloc_noncontiguous = iommu_dma_alloc_noncontiguous,
.free_noncontiguous = iommu_dma_free_noncontiguous,
diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c
index 2e2c3be8a0b429..e6eb98d70f3c42 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c
@@ -15,6 +15,7 @@
#include <linux/io.h>
#include <linux/pci.h>
#include <linux/etherdevice.h>
+#include <linux/vmalloc.h>
#include "octep_config.h"
#include "octep_main.h"
diff --git a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_mbox.c b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_mbox.c
index 2eab21e43048a1..445b626efe11d7 100644
--- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_mbox.c
+++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_mbox.c
@@ -7,6 +7,7 @@
#include <linux/types.h>
#include <linux/pci.h>
#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
#include "octep_vf_config.h"
#include "octep_vf_main.h"
diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c
index 2729a2c5acf9cc..11021c34e47e7c 100644
--- a/drivers/net/ethernet/microsoft/mana/hw_channel.c
+++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c
@@ -3,6 +3,7 @@
#include <net/mana/gdma.h>
#include <net/mana/hw_channel.h>
+#include <linux/vmalloc.h>
static int mana_hwc_get_msg_index(struct hw_channel_context *hwc, u16 *msg_id)
{
diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index 9ce0d20a6c5812..feef537257d057 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -1022,7 +1022,7 @@ static const struct dma_map_ops ccio_ops = {
.map_sg = ccio_map_sg,
.unmap_sg = ccio_unmap_sg,
.get_sgtable = dma_common_get_sgtable,
- .alloc_pages = dma_common_alloc_pages,
+ .alloc_pages_op = dma_common_alloc_pages,
.free_pages = dma_common_free_pages,
};
diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c
index 784037837f65e0..fc3863c09f83d3 100644
--- a/drivers/parisc/sba_iommu.c
+++ b/drivers/parisc/sba_iommu.c
@@ -1090,7 +1090,7 @@ static const struct dma_map_ops sba_ops = {
.map_sg = sba_map_sg,
.unmap_sg = sba_unmap_sg,
.get_sgtable = dma_common_get_sgtable,
- .alloc_pages = dma_common_alloc_pages,
+ .alloc_pages_op = dma_common_alloc_pages,
.free_pages = dma_common_free_pages,
};
diff --git a/drivers/platform/x86/uv_sysfs.c b/drivers/platform/x86/uv_sysfs.c
index 38d1b692d3c0aa..40e0108771893b 100644
--- a/drivers/platform/x86/uv_sysfs.c
+++ b/drivers/platform/x86/uv_sysfs.c
@@ -11,6 +11,7 @@
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/kobject.h>
+#include <linux/vmalloc.h>
#include <asm/uv/bios.h>
#include <asm/uv/uv.h>
#include <asm/uv/uv_hub.h>
diff --git a/drivers/scsi/mpi3mr/mpi3mr_transport.c b/drivers/scsi/mpi3mr/mpi3mr_transport.c
index d32ad46318cb09..dabb91f0f75df9 100644
--- a/drivers/scsi/mpi3mr/mpi3mr_transport.c
+++ b/drivers/scsi/mpi3mr/mpi3mr_transport.c
@@ -7,6 +7,8 @@
*
*/
+#include <linux/vmalloc.h>
+
#include "mpi3mr.h"
/**
diff --git a/drivers/staging/media/atomisp/pci/hmm/hmm.c b/drivers/staging/media/atomisp/pci/hmm/hmm.c
index bb12644fd033b5..3e2899ad85174f 100644
--- a/drivers/staging/media/atomisp/pci/hmm/hmm.c
+++ b/drivers/staging/media/atomisp/pci/hmm/hmm.c
@@ -205,7 +205,7 @@ static ia_css_ptr __hmm_alloc(size_t bytes, enum hmm_bo_type type,
}
dev_dbg(atomisp_dev, "pages: 0x%08x (%zu bytes), type: %d, vmalloc %p\n",
- bo->start, bytes, type, vmalloc);
+ bo->start, bytes, type, vmalloc_noprof);
return bo->start;
diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c
index 68e8f006dfdbf7..c51f5e4c3dd6d2 100644
--- a/drivers/vfio/pci/pds/dirty.c
+++ b/drivers/vfio/pci/pds/dirty.c
@@ -3,6 +3,7 @@
#include <linux/interval_tree.h>
#include <linux/vfio.h>
+#include <linux/vmalloc.h>
#include <linux/pds/pds_common.h>
#include <linux/pds/pds_core_if.h>
diff --git a/drivers/virt/acrn/mm.c b/drivers/virt/acrn/mm.c
index fa5d9ca6be5706..b30077baf352b4 100644
--- a/drivers/virt/acrn/mm.c
+++ b/drivers/virt/acrn/mm.c
@@ -12,6 +12,7 @@
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include "acrn_drv.h"
@@ -171,18 +172,24 @@ int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
mmap_read_lock(current->mm);
vma = vma_lookup(current->mm, memmap->vma_base);
if (vma && ((vma->vm_flags & VM_PFNMAP) != 0)) {
+ spinlock_t *ptl;
+ pte_t *ptep;
+
if ((memmap->vma_base + memmap->len) > vma->vm_end) {
mmap_read_unlock(current->mm);
return -EINVAL;
}
- ret = follow_pfn(vma, memmap->vma_base, &pfn);
- mmap_read_unlock(current->mm);
+ ret = follow_pte(vma->vm_mm, memmap->vma_base, &ptep, &ptl);
if (ret < 0) {
+ mmap_read_unlock(current->mm);
dev_dbg(acrn_dev.this_device,
"Failed to lookup PFN at VMA:%pK.\n", (void *)memmap->vma_base);
return ret;
}
+ pfn = pte_pfn(ptep_get(ptep));
+ pte_unmap_unlock(ptep, ptl);
+ mmap_read_unlock(current->mm);
return acrn_mm_region_add(vm, memmap->user_vm_pa,
PFN_PHYS(pfn), memmap->len,
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 8e322329444237..e8355f55a8f7e8 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -21,6 +21,7 @@
#include <linux/bitmap.h>
#include <linux/lockdep.h>
#include <linux/log2.h>
+#include <linux/vmalloc.h>
#include <acpi/acpi_numa.h>
diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c
index 76f6f26265a3b8..29257d2639dbf0 100644
--- a/drivers/xen/grant-dma-ops.c
+++ b/drivers/xen/grant-dma-ops.c
@@ -282,7 +282,7 @@ static int xen_grant_dma_supported(struct device *dev, u64 mask)
static const struct dma_map_ops xen_grant_dma_ops = {
.alloc = xen_grant_dma_alloc,
.free = xen_grant_dma_free,
- .alloc_pages = xen_grant_dma_alloc_pages,
+ .alloc_pages_op = xen_grant_dma_alloc_pages,
.free_pages = xen_grant_dma_free_pages,
.mmap = dma_common_mmap,
.get_sgtable = dma_common_get_sgtable,
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 0e6c6c25d154f5..1c4ef5111651d6 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -403,7 +403,7 @@ const struct dma_map_ops xen_swiotlb_dma_ops = {
.dma_supported = xen_swiotlb_dma_supported,
.mmap = dma_common_mmap,
.get_sgtable = dma_common_get_sgtable,
- .alloc_pages = dma_common_alloc_pages,
+ .alloc_pages_op = dma_common_alloc_pages,
.free_pages = dma_common_free_pages,
.max_mapping_size = swiotlb_max_mapping_size,
};
diff --git a/fs/dax.c b/fs/dax.c
index 423fc1607dfae5..becb4a6920c6aa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1207,17 +1207,17 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = mapping->host;
pgtable_t pgtable = NULL;
- struct page *zero_page;
+ struct folio *zero_folio;
spinlock_t *ptl;
pmd_t pmd_entry;
pfn_t pfn;
- zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
+ zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);
- if (unlikely(!zero_page))
+ if (unlikely(!zero_folio))
goto fallback;
- pfn = page_to_pfn_t(zero_page);
+ pfn = page_to_pfn_t(&zero_folio->page);
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
DAX_PMD | DAX_ZERO_PAGE);
@@ -1237,17 +1237,17 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
mm_inc_nr_ptes(vma->vm_mm);
}
- pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
+ pmd_entry = mk_pmd(&zero_folio->page, vmf->vma->vm_page_prot);
pmd_entry = pmd_mkhuge(pmd_entry);
set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
spin_unlock(ptl);
- trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
+ trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
return VM_FAULT_NOPAGE;
fallback:
if (pgtable)
pte_free(vma->vm_mm, pgtable);
- trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
+ trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
return VM_FAULT_FALLBACK;
}
#else
diff --git a/fs/exec.c b/fs/exec.c
index cf1df7f16e55cc..0c5f06d08c355a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -67,6 +67,7 @@
#include <linux/time_namespace.h>
#include <linux/user_events.h>
#include <linux/rseq.h>
+#include <linux/ksm.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -268,6 +269,14 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
}
/*
+ * Need to be called with mmap write lock
+ * held, to avoid race with ksmd.
+ */
+ err = ksm_execve(mm);
+ if (err)
+ goto err_ksm;
+
+ /*
* Place the stack at the largest stack address the architecture
* supports. Later, we'll move this to an appropriate place. We don't
* use STACK_TOP because that can depend on attributes which aren't
@@ -288,6 +297,8 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
bprm->p = vma->vm_end - sizeof(void *);
return 0;
err:
+ ksm_exit(mm);
+err_ksm:
mmap_write_unlock(mm);
err_free:
bprm->vma = NULL;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6502c7e776d195..2f4e88552d3f28 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -176,14 +176,12 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
- info.flags = 0;
info.length = len;
info.low_limit = current->mm->mmap_base;
info.high_limit = arch_get_mmap_end(addr, len, flags);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
return vm_unmapped_area(&info);
}
@@ -192,14 +190,13 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
addr = vm_unmapped_area(&info);
/*
@@ -249,11 +246,11 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
}
/*
- * Use mm->get_unmapped_area value as a hint to use topdown routine.
+ * Use MMF_TOPDOWN flag as a hint to use topdown routine.
* If architectures have special needs, they should define their own
* version of hugetlb_get_unmapped_area.
*/
- if (mm->get_unmapped_area == arch_get_unmapped_area_topdown)
+ if (test_bit(MMF_TOPDOWN, &mm->flags))
return hugetlb_get_unmapped_area_topdown(file, addr, len,
pgoff, flags);
return hugetlb_get_unmapped_area_bottomup(file, addr, len,
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 5aa776b5a3e7f8..b17a9eb9b1488b 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -46,10 +46,7 @@ static inline void nfs_add_stats(const struct inode *inode,
nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
}
-static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
-{
- return alloc_percpu(struct nfs_iostats);
-}
+#define nfs_alloc_iostats() alloc_percpu(struct nfs_iostats)
static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats)
{
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index dcd513dccf55cb..d19434e2a58e71 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -451,15 +451,13 @@ pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned lo
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
- typeof_member(struct proc_ops, proc_get_unmapped_area) get_area;
+ if (pde->proc_ops->proc_get_unmapped_area)
+ return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags);
- get_area = pde->proc_ops->proc_get_unmapped_area;
#ifdef CONFIG_MMU
- if (!get_area)
- get_area = current->mm->get_unmapped_area;
+ return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags);
#endif
- if (get_area)
- return get_area(file, orig_addr, len, pgoff, flags);
+
return orig_addr;
}
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 45af9a989d4040..245171d9164be3 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -89,8 +89,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "SwapTotal: ", i.totalswap);
show_val_kb(m, "SwapFree: ", i.freeswap);
#ifdef CONFIG_ZSWAP
- seq_printf(m, "Zswap: %8lu kB\n",
- (unsigned long)(zswap_pool_total_size >> 10));
+ show_val_kb(m, "Zswap: ", zswap_total_pages());
seq_printf(m, "Zswapped: %8lu kB\n",
(unsigned long)atomic_read(&zswap_stored_pages) <<
(PAGE_SHIFT - 10));
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 9223856c934b40..2fb64bdb64eb1d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -107,10 +107,13 @@ static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
return ((kflags >> kbit) & 1) << ubit;
}
-u64 stable_page_flags(struct page *page)
+u64 stable_page_flags(const struct page *page)
{
- u64 k;
- u64 u;
+ const struct folio *folio;
+ unsigned long k;
+ unsigned long mapping;
+ bool is_anon;
+ u64 u = 0;
/*
* pseudo flag: KPF_NOPAGE
@@ -118,49 +121,47 @@ u64 stable_page_flags(struct page *page)
*/
if (!page)
return 1 << KPF_NOPAGE;
+ folio = page_folio(page);
- k = page->flags;
- u = 0;
+ k = folio->flags;
+ mapping = (unsigned long)folio->mapping;
+ is_anon = mapping & PAGE_MAPPING_ANON;
/*
* pseudo flags for the well known (anonymous) memory mapped pages
*/
if (page_mapped(page))
u |= 1 << KPF_MMAP;
- if (PageAnon(page))
+ if (is_anon) {
u |= 1 << KPF_ANON;
- if (PageKsm(page))
- u |= 1 << KPF_KSM;
+ if (mapping & PAGE_MAPPING_KSM)
+ u |= 1 << KPF_KSM;
+ }
/*
* compound pages: export both head/tail info
* they together define a compound page's start/end pos and order
*/
- if (PageHead(page))
- u |= 1 << KPF_COMPOUND_HEAD;
- if (PageTail(page))
+ if (page == &folio->page)
+ u |= kpf_copy_bit(k, KPF_COMPOUND_HEAD, PG_head);
+ else
u |= 1 << KPF_COMPOUND_TAIL;
- if (PageHuge(page))
+ if (folio_test_hugetlb(folio))
u |= 1 << KPF_HUGE;
/*
- * PageTransCompound can be true for non-huge compound pages (slab
- * pages or pages allocated by drivers with __GFP_COMP) because it
- * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
+ * We need to check PageLRU/PageAnon
* to make sure a given page is a thp, not a non-huge compound page.
*/
- else if (PageTransCompound(page)) {
- struct page *head = compound_head(page);
-
- if (PageLRU(head) || PageAnon(head))
+ else if (folio_test_large(folio)) {
+ if ((k & (1 << PG_lru)) || is_anon)
u |= 1 << KPF_THP;
- else if (is_huge_zero_page(head)) {
+ else if (is_huge_zero_folio(folio)) {
u |= 1 << KPF_ZERO_PAGE;
u |= 1 << KPF_THP;
}
} else if (is_zero_pfn(page_to_pfn(page)))
u |= 1 << KPF_ZERO_PAGE;
-
/*
* Caveats on high order pages: PG_buddy and PG_slab will only be set
* on the head page.
@@ -174,16 +175,17 @@ u64 stable_page_flags(struct page *page)
u |= 1 << KPF_OFFLINE;
if (PageTable(page))
u |= 1 << KPF_PGTABLE;
+ if (folio_test_slab(folio))
+ u |= 1 << KPF_SLAB;
- if (page_is_idle(page))
+#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
+ u |= kpf_copy_bit(k, KPF_IDLE, PG_idle);
+#else
+ if (folio_test_idle(folio))
u |= 1 << KPF_IDLE;
+#endif
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
-
- u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
- if (PageTail(page) && PageSlab(page))
- u |= 1 << KPF_SLAB;
-
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate);
@@ -194,7 +196,8 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active);
u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim);
- if (PageSwapCache(page))
+#define SWAPCACHE ((1 << PG_swapbacked) | (1 << PG_swapcache))
+ if ((k & SWAPCACHE) == SWAPCACHE)
u |= 1 << KPF_SWAPCACHE;
u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked);
@@ -202,7 +205,10 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked);
#ifdef CONFIG_MEMORY_FAILURE
- u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
+ if (u & (1 << KPF_HUGE))
+ u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
+ else
+ u |= kpf_copy_bit(page->flags, KPF_HWPOISON, PG_hwpoison);
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
@@ -228,7 +234,6 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
{
const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
- struct page *ppage;
unsigned long src = *ppos;
unsigned long pfn;
ssize_t ret = 0;
@@ -245,9 +250,9 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
* TODO: ZONE_DEVICE support requires to identify
* memmaps that were actually initialized.
*/
- ppage = pfn_to_online_page(pfn);
+ struct page *page = pfn_to_online_page(pfn);
- if (put_user(stable_page_flags(ppage), out)) {
+ if (put_user(stable_page_flags(page), out)) {
ret = -EFAULT;
break;
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 102f48668c352a..9211c53d5a6b1e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -411,14 +411,14 @@ struct mem_size_stats {
};
static void smaps_page_accumulate(struct mem_size_stats *mss,
- struct page *page, unsigned long size, unsigned long pss,
+ struct folio *folio, unsigned long size, unsigned long pss,
bool dirty, bool locked, bool private)
{
mss->pss += pss;
- if (PageAnon(page))
+ if (folio_test_anon(folio))
mss->pss_anon += pss;
- else if (PageSwapBacked(page))
+ else if (folio_test_swapbacked(folio))
mss->pss_shmem += pss;
else
mss->pss_file += pss;
@@ -426,7 +426,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
if (locked)
mss->pss_locked += pss;
- if (dirty || PageDirty(page)) {
+ if (dirty || folio_test_dirty(folio)) {
mss->pss_dirty += pss;
if (private)
mss->private_dirty += size;
@@ -444,6 +444,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
bool compound, bool young, bool dirty, bool locked,
bool migration)
{
+ struct folio *folio = page_folio(page);
int i, nr = compound ? compound_nr(page) : 1;
unsigned long size = nr * PAGE_SIZE;
@@ -451,27 +452,28 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
* First accumulate quantities that depend only on |size| and the type
* of the compound page.
*/
- if (PageAnon(page)) {
+ if (folio_test_anon(folio)) {
mss->anonymous += size;
- if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
+ if (!folio_test_swapbacked(folio) && !dirty &&
+ !folio_test_dirty(folio))
mss->lazyfree += size;
}
- if (PageKsm(page))
+ if (folio_test_ksm(folio))
mss->ksm += size;
mss->resident += size;
/* Accumulate the size in pages that have been accessed. */
- if (young || page_is_young(page) || PageReferenced(page))
+ if (young || folio_test_young(folio) || folio_test_referenced(folio))
mss->referenced += size;
/*
* Then accumulate quantities that may depend on sharing, or that may
* differ page-by-page.
*
- * page_count(page) == 1 guarantees the page is mapped exactly once.
+ * refcount == 1 guarantees the page is mapped exactly once.
* If any subpage of the compound page mapped with PTE it would elevate
- * page_count().
+ * the refcount.
*
* The page_mapcount() is called to get a snapshot of the mapcount.
* Without holding the page lock this snapshot can be slightly wrong as
@@ -480,9 +482,9 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
* especially for migration entries. Treat regular migration entries
* as mapcount == 1.
*/
- if ((page_count(page) == 1) || migration) {
- smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
- locked, true);
+ if ((folio_ref_count(folio) == 1) || migration) {
+ smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT,
+ dirty, locked, true);
return;
}
for (i = 0; i < nr; i++, page++) {
@@ -490,8 +492,8 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
unsigned long pss = PAGE_SIZE << PSS_SHIFT;
if (mapcount >= 2)
pss /= mapcount;
- smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked,
- mapcount < 2);
+ smaps_page_accumulate(mss, folio, PAGE_SIZE, pss,
+ dirty, locked, mapcount < 2);
}
}
@@ -576,6 +578,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
+ struct folio *folio;
bool migration = false;
if (pmd_present(*pmd)) {
@@ -590,11 +593,12 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
}
if (IS_ERR_OR_NULL(page))
return;
- if (PageAnon(page))
+ folio = page_folio(page);
+ if (folio_test_anon(folio))
mss->anonymous_thp += HPAGE_PMD_SIZE;
- else if (PageSwapBacked(page))
+ else if (folio_test_swapbacked(folio))
mss->shmem_thp += HPAGE_PMD_SIZE;
- else if (is_zone_device_page(page))
+ else if (folio_is_zone_device(folio))
/* pass */;
else
mss->file_thp += HPAGE_PMD_SIZE;
@@ -1161,7 +1165,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
- struct page *page;
+ struct folio *folio;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -1173,12 +1177,12 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
if (!pmd_present(*pmd))
goto out;
- page = pmd_page(*pmd);
+ folio = pmd_folio(*pmd);
/* Clear accessed and referenced bits. */
pmdp_test_and_clear_young(vma, addr, pmd);
- test_and_clear_page_young(page);
- ClearPageReferenced(page);
+ folio_test_clear_young(folio);
+ folio_clear_referenced(folio);
out:
spin_unlock(ptl);
return 0;
@@ -1200,14 +1204,14 @@ out:
if (!pte_present(ptent))
continue;
- page = vm_normal_page(vma, addr, ptent);
- if (!page)
+ folio = vm_normal_folio(vma, addr, ptent);
+ if (!folio)
continue;
/* Clear accessed and referenced bits. */
ptep_test_and_clear_young(vma, addr, pte);
- test_and_clear_page_young(page);
- ClearPageReferenced(page);
+ folio_test_clear_young(folio);
+ folio_clear_referenced(folio);
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
@@ -2551,28 +2555,29 @@ struct numa_maps_private {
static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
unsigned long nr_pages)
{
+ struct folio *folio = page_folio(page);
int count = page_mapcount(page);
md->pages += nr_pages;
- if (pte_dirty || PageDirty(page))
+ if (pte_dirty || folio_test_dirty(folio))
md->dirty += nr_pages;
- if (PageSwapCache(page))
+ if (folio_test_swapcache(folio))
md->swapcache += nr_pages;
- if (PageActive(page) || PageUnevictable(page))
+ if (folio_test_active(folio) || folio_test_unevictable(folio))
md->active += nr_pages;
- if (PageWriteback(page))
+ if (folio_test_writeback(folio))
md->writeback += nr_pages;
- if (PageAnon(page))
+ if (folio_test_anon(folio))
md->anon += nr_pages;
if (count > md->mapcount_max)
md->mapcount_max = count;
- md->node[page_to_nid(page)] += nr_pages;
+ md->node[folio_nid(folio)] += nr_pages;
}
static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index c7a1aa3c882b02..b45c7edc3225e4 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -35,7 +35,7 @@ static unsigned long ramfs_mmu_get_unmapped_area(struct file *file,
unsigned long addr, unsigned long len, unsigned long pgoff,
unsigned long flags)
{
- return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+ return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
}
const struct file_operations ramfs_file_operations = {
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 292f5fd501047f..d2c3879745e535 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -657,7 +657,10 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
struct userfaultfd_fork_ctx *fctx;
octx = vma->vm_userfaultfd_ctx.ctx;
- if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+ if (!octx)
+ return 0;
+
+ if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) {
vma_start_write(vma);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
diff --git a/include/acpi/platform/aclinuxex.h b/include/acpi/platform/aclinuxex.h
index 600d4e2641dae1..62cac266a1c89e 100644
--- a/include/acpi/platform/aclinuxex.h
+++ b/include/acpi/platform/aclinuxex.h
@@ -47,26 +47,19 @@ acpi_status acpi_os_terminate(void);
* However, boot has (system_state != SYSTEM_RUNNING)
* to quiet __might_sleep() in kmalloc() and resume does not.
*/
-static inline void *acpi_os_allocate(acpi_size size)
-{
- return kmalloc(size, irqs_disabled()? GFP_ATOMIC : GFP_KERNEL);
-}
+#define acpi_os_allocate(_size) \
+ kmalloc(_size, irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL)
-static inline void *acpi_os_allocate_zeroed(acpi_size size)
-{
- return kzalloc(size, irqs_disabled()? GFP_ATOMIC : GFP_KERNEL);
-}
+#define acpi_os_allocate_zeroed(_size) \
+ kzalloc(_size, irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL)
static inline void acpi_os_free(void *memory)
{
kfree(memory);
}
-static inline void *acpi_os_acquire_object(acpi_cache_t * cache)
-{
- return kmem_cache_zalloc(cache,
- irqs_disabled()? GFP_ATOMIC : GFP_KERNEL);
-}
+#define acpi_os_acquire_object(_cache) \
+ kmem_cache_zalloc(_cache, irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL)
static inline acpi_thread_id acpi_os_get_thread_id(void)
{
diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h
new file mode 100644
index 00000000000000..64f536b803802e
--- /dev/null
+++ b/include/asm-generic/codetag.lds.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ASM_GENERIC_CODETAG_LDS_H
+#define __ASM_GENERIC_CODETAG_LDS_H
+
+#define SECTION_WITH_BOUNDARIES(_name) \
+ . = ALIGN(8); \
+ __start_##_name = .; \
+ KEEP(*(_name)) \
+ __stop_##_name = .;
+
+#define CODETAG_SECTIONS() \
+ SECTION_WITH_BOUNDARIES(alloc_tags)
+
+#endif /* __ASM_GENERIC_CODETAG_LDS_H */
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index bac63e874c7bf9..80de699bf6af4b 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -9,6 +9,7 @@
#include <asm/page.h> /* I/O is all done through memory accesses */
#include <linux/string.h> /* for memset() and memcpy() */
+#include <linux/sizes.h>
#include <linux/types.h>
#include <linux/instruction_pointer.h>
@@ -991,7 +992,6 @@ static inline void iowrite64_rep(volatile void __iomem *addr,
#ifdef __KERNEL__
-#include <linux/vmalloc.h>
#define __io_virt(x) ((void __force *)(x))
/*
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 879e5f8aa5e92a..7c48f5fbf8aa79 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -16,15 +16,16 @@
*
* Return: pointer to the allocated memory or %NULL on error
*/
-static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm)
+static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm)
{
- struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL &
+ struct ptdesc *ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL &
~__GFP_HIGHMEM, 0);
if (!ptdesc)
return NULL;
return ptdesc_address(ptdesc);
}
+#define __pte_alloc_one_kernel(...) alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__))
#ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
/**
@@ -33,10 +34,11 @@ static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm)
*
* Return: pointer to the allocated memory or %NULL on error
*/
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
+static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
{
- return __pte_alloc_one_kernel(mm);
+ return __pte_alloc_one_kernel_noprof(mm);
}
+#define pte_alloc_one_kernel(...) alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
#endif
/**
@@ -61,11 +63,11 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
*
* Return: `struct page` referencing the ptdesc or %NULL on error
*/
-static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
+static inline pgtable_t __pte_alloc_one_noprof(struct mm_struct *mm, gfp_t gfp)
{
struct ptdesc *ptdesc;
- ptdesc = pagetable_alloc(gfp, 0);
+ ptdesc = pagetable_alloc_noprof(gfp, 0);
if (!ptdesc)
return NULL;
if (!pagetable_pte_ctor(ptdesc)) {
@@ -75,6 +77,7 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
return ptdesc_page(ptdesc);
}
+#define __pte_alloc_one(...) alloc_hooks(__pte_alloc_one_noprof(__VA_ARGS__))
#ifndef __HAVE_ARCH_PTE_ALLOC_ONE
/**
@@ -85,10 +88,11 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
*
* Return: `struct page` referencing the ptdesc or %NULL on error
*/
-static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
+static inline pgtable_t pte_alloc_one_noprof(struct mm_struct *mm)
{
- return __pte_alloc_one(mm, GFP_PGTABLE_USER);
+ return __pte_alloc_one_noprof(mm, GFP_PGTABLE_USER);
}
+#define pte_alloc_one(...) alloc_hooks(pte_alloc_one_noprof(__VA_ARGS__))
#endif
/*
@@ -124,14 +128,14 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
*
* Return: pointer to the allocated memory or %NULL on error
*/
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
struct ptdesc *ptdesc;
gfp_t gfp = GFP_PGTABLE_USER;
if (mm == &init_mm)
gfp = GFP_PGTABLE_KERNEL;
- ptdesc = pagetable_alloc(gfp, 0);
+ ptdesc = pagetable_alloc_noprof(gfp, 0);
if (!ptdesc)
return NULL;
if (!pagetable_pmd_ctor(ptdesc)) {
@@ -140,6 +144,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
}
return ptdesc_address(ptdesc);
}
+#define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__))
#endif
#ifndef __HAVE_ARCH_PMD_FREE
@@ -157,7 +162,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
#if CONFIG_PGTABLE_LEVELS > 3
-static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
gfp_t gfp = GFP_PGTABLE_USER;
struct ptdesc *ptdesc;
@@ -166,13 +171,14 @@ static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr)
gfp = GFP_PGTABLE_KERNEL;
gfp &= ~__GFP_HIGHMEM;
- ptdesc = pagetable_alloc(gfp, 0);
+ ptdesc = pagetable_alloc_noprof(gfp, 0);
if (!ptdesc)
return NULL;
pagetable_pud_ctor(ptdesc);
return ptdesc_address(ptdesc);
}
+#define __pud_alloc_one(...) alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__))
#ifndef __HAVE_ARCH_PUD_ALLOC_ONE
/**
@@ -184,10 +190,11 @@ static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr)
*
* Return: pointer to the allocated memory or %NULL on error
*/
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+static inline pud_t *pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
- return __pud_alloc_one(mm, addr);
+ return __pud_alloc_one_noprof(mm, addr);
}
+#define pud_alloc_one(...) alloc_hooks(pud_alloc_one_noprof(__VA_ARGS__))
#endif
static inline void __pud_free(struct mm_struct *mm, pud_t *pud)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index f7749d0f2562f1..3e4497b5135a16 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -50,6 +50,8 @@
* [__nosave_begin, __nosave_end] for the nosave data
*/
+#include <asm-generic/codetag.lds.h>
+
#ifndef LOAD_OFFSET
#define LOAD_OFFSET 0
#endif
@@ -366,6 +368,7 @@
. = ALIGN(8); \
BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes) \
BOUNDED_SECTION_BY(__dyndbg, ___dyndbg) \
+ CODETAG_SECTIONS() \
LIKELY_PROFILE() \
BRANCH_PROFILE() \
TRACE_PRINTKS() \
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index 5d61f576cfc860..e5181cc9b7c563 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -578,19 +578,20 @@ static inline void ahash_request_set_tfm(struct ahash_request *req,
*
* Return: allocated request handle in case of success, or NULL if out of memory
*/
-static inline struct ahash_request *ahash_request_alloc(
+static inline struct ahash_request *ahash_request_alloc_noprof(
struct crypto_ahash *tfm, gfp_t gfp)
{
struct ahash_request *req;
- req = kmalloc(sizeof(struct ahash_request) +
- crypto_ahash_reqsize(tfm), gfp);
+ req = kmalloc_noprof(sizeof(struct ahash_request) +
+ crypto_ahash_reqsize(tfm), gfp);
if (likely(req))
ahash_request_set_tfm(req, tfm);
return req;
}
+#define ahash_request_alloc(...) alloc_hooks(ahash_request_alloc_noprof(__VA_ARGS__))
/**
* ahash_request_free() - zeroize and free the request data structure
diff --git a/include/crypto/internal/acompress.h b/include/crypto/internal/acompress.h
index 4ac46bafba9d7c..2a67793f52ad87 100644
--- a/include/crypto/internal/acompress.h
+++ b/include/crypto/internal/acompress.h
@@ -69,15 +69,16 @@ static inline void acomp_request_complete(struct acomp_req *req,
crypto_request_complete(&req->base, err);
}
-static inline struct acomp_req *__acomp_request_alloc(struct crypto_acomp *tfm)
+static inline struct acomp_req *__acomp_request_alloc_noprof(struct crypto_acomp *tfm)
{
struct acomp_req *req;
- req = kzalloc(sizeof(*req) + crypto_acomp_reqsize(tfm), GFP_KERNEL);
+ req = kzalloc_noprof(sizeof(*req) + crypto_acomp_reqsize(tfm), GFP_KERNEL);
if (likely(req))
acomp_request_set_tfm(req, tfm);
return req;
}
+#define __acomp_request_alloc(...) alloc_hooks(__acomp_request_alloc_noprof(__VA_ARGS__))
static inline void __acomp_request_free(struct acomp_req *req)
{
diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h
index c8857d7bdb37f1..6c5330e316b0e2 100644
--- a/include/crypto/skcipher.h
+++ b/include/crypto/skcipher.h
@@ -861,19 +861,20 @@ static inline struct skcipher_request *skcipher_request_cast(
*
* Return: allocated request handle in case of success, or NULL if out of memory
*/
-static inline struct skcipher_request *skcipher_request_alloc(
+static inline struct skcipher_request *skcipher_request_alloc_noprof(
struct crypto_skcipher *tfm, gfp_t gfp)
{
struct skcipher_request *req;
- req = kmalloc(sizeof(struct skcipher_request) +
- crypto_skcipher_reqsize(tfm), gfp);
+ req = kmalloc_noprof(sizeof(struct skcipher_request) +
+ crypto_skcipher_reqsize(tfm), gfp);
if (likely(req))
skcipher_request_set_tfm(req, tfm);
return req;
}
+#define skcipher_request_alloc(...) alloc_hooks(skcipher_request_alloc_noprof(__VA_ARGS__))
/**
* skcipher_request_free() - zeroize and free request data structure
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
new file mode 100644
index 00000000000000..afc9e259a2d354
--- /dev/null
+++ b/include/linux/alloc_tag.h
@@ -0,0 +1,216 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * allocation tagging
+ */
+#ifndef _LINUX_ALLOC_TAG_H
+#define _LINUX_ALLOC_TAG_H
+
+#include <linux/bug.h>
+#include <linux/codetag.h>
+#include <linux/container_of.h>
+#include <linux/preempt.h>
+#include <asm/percpu.h>
+#include <linux/cpumask.h>
+#include <linux/static_key.h>
+#include <linux/irqflags.h>
+
+struct alloc_tag_counters {
+ u64 bytes;
+ u64 calls;
+};
+
+/*
+ * An instance of this structure is created in a special ELF section at every
+ * allocation callsite. At runtime, the special section is treated as
+ * an array of these. Embedded codetag utilizes codetag framework.
+ */
+struct alloc_tag {
+ struct codetag ct;
+ struct alloc_tag_counters __percpu *counters;
+} __aligned(8);
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+
+#define CODETAG_EMPTY ((void *)1)
+
+static inline bool is_codetag_empty(union codetag_ref *ref)
+{
+ return ref->ct == CODETAG_EMPTY;
+}
+
+static inline void set_codetag_empty(union codetag_ref *ref)
+{
+ if (ref)
+ ref->ct = CODETAG_EMPTY;
+}
+
+#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
+
+static inline bool is_codetag_empty(union codetag_ref *ref) { return false; }
+static inline void set_codetag_empty(union codetag_ref *ref) {}
+
+#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+
+struct codetag_bytes {
+ struct codetag *ct;
+ s64 bytes;
+};
+
+size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sleep);
+
+static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
+{
+ return container_of(ct, struct alloc_tag, ct);
+}
+
+#ifdef ARCH_NEEDS_WEAK_PER_CPU
+/*
+ * When percpu variables are required to be defined as weak, static percpu
+ * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION).
+ * Instead we will accound all module allocations to a single counter.
+ */
+DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
+
+#define DEFINE_ALLOC_TAG(_alloc_tag) \
+ static struct alloc_tag _alloc_tag __used __aligned(8) \
+ __section("alloc_tags") = { \
+ .ct = CODE_TAG_INIT, \
+ .counters = &_shared_alloc_tag };
+
+#else /* ARCH_NEEDS_WEAK_PER_CPU */
+
+#define DEFINE_ALLOC_TAG(_alloc_tag) \
+ static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \
+ static struct alloc_tag _alloc_tag __used __aligned(8) \
+ __section("alloc_tags") = { \
+ .ct = CODE_TAG_INIT, \
+ .counters = &_alloc_tag_cntr };
+
+#endif /* ARCH_NEEDS_WEAK_PER_CPU */
+
+DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
+ mem_alloc_profiling_key);
+
+static inline bool mem_alloc_profiling_enabled(void)
+{
+ return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
+ &mem_alloc_profiling_key);
+}
+
+static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
+{
+ struct alloc_tag_counters v = { 0, 0 };
+ struct alloc_tag_counters *counter;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ counter = per_cpu_ptr(tag->counters, cpu);
+ v.bytes += counter->bytes;
+ v.calls += counter->calls;
+ }
+
+ return v;
+}
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag)
+{
+ WARN_ONCE(ref && ref->ct,
+ "alloc_tag was not cleared (got tag for %s:%u)\n",
+ ref->ct->filename, ref->ct->lineno);
+
+ WARN_ONCE(!tag, "current->alloc_tag not set");
+}
+
+static inline void alloc_tag_sub_check(union codetag_ref *ref)
+{
+ WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
+}
+#else
+static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
+static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
+#endif
+
+/* Caller should verify both ref and tag to be valid */
+static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
+{
+ ref->ct = &tag->ct;
+ /*
+ * We need in increment the call counter every time we have a new
+ * allocation or when we split a large allocation into smaller ones.
+ * Each new reference for every sub-allocation needs to increment call
+ * counter because when we free each part the counter will be decremented.
+ */
+ this_cpu_inc(tag->counters->calls);
+}
+
+static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
+{
+ alloc_tag_add_check(ref, tag);
+ if (!ref || !tag)
+ return;
+
+ __alloc_tag_ref_set(ref, tag);
+}
+
+static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
+{
+ alloc_tag_add_check(ref, tag);
+ if (!ref || !tag)
+ return;
+
+ __alloc_tag_ref_set(ref, tag);
+ this_cpu_add(tag->counters->bytes, bytes);
+}
+
+static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
+{
+ struct alloc_tag *tag;
+
+ alloc_tag_sub_check(ref);
+ if (!ref || !ref->ct)
+ return;
+
+ if (is_codetag_empty(ref)) {
+ ref->ct = NULL;
+ return;
+ }
+
+ tag = ct_to_alloc_tag(ref->ct);
+
+ this_cpu_sub(tag->counters->bytes, bytes);
+ this_cpu_dec(tag->counters->calls);
+
+ ref->ct = NULL;
+}
+
+#define alloc_tag_record(p) ((p) = current->alloc_tag)
+
+#else /* CONFIG_MEM_ALLOC_PROFILING */
+
+#define DEFINE_ALLOC_TAG(_alloc_tag)
+static inline bool mem_alloc_profiling_enabled(void) { return false; }
+static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag,
+ size_t bytes) {}
+static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
+#define alloc_tag_record(p) do {} while (0)
+
+#endif /* CONFIG_MEM_ALLOC_PROFILING */
+
+#define alloc_hooks_tag(_tag, _do_alloc) \
+({ \
+ struct alloc_tag * __maybe_unused _old = alloc_tag_save(_tag); \
+ typeof(_do_alloc) _res = _do_alloc; \
+ alloc_tag_restore(_tag, _old); \
+ _res; \
+})
+
+#define alloc_hooks(_do_alloc) \
+({ \
+ DEFINE_ALLOC_TAG(_alloc_tag); \
+ alloc_hooks_tag(&_alloc_tag, _do_alloc); \
+})
+
+#endif /* _LINUX_ALLOC_TAG_H */
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 890e152d553ea3..a86da1f38b3c34 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2244,31 +2244,14 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
size_t align, gfp_t flags);
#else
-static inline void *
-bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
- int node)
-{
- return kmalloc_node(size, flags, node);
-}
-
-static inline void *
-bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
-{
- return kzalloc(size, flags);
-}
-
-static inline void *
-bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, gfp_t flags)
-{
- return kvcalloc(n, size, flags);
-}
-
-static inline void __percpu *
-bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align,
- gfp_t flags)
-{
- return __alloc_percpu_gfp(size, align, flags);
-}
+#define bpf_map_kmalloc_node(_map, _size, _flags, _node) \
+ kmalloc_node(_size, _flags, _node)
+#define bpf_map_kzalloc(_map, _size, _flags) \
+ kzalloc(_size, _flags)
+#define bpf_map_kvcalloc(_map, _n, _size, _flags) \
+ kvcalloc(_n, _size, _flags)
+#define bpf_map_alloc_percpu(_map, _size, _align, _flags) \
+ __alloc_percpu_gfp(_size, _align, _flags)
#endif
static inline int
diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h
index 79b2f78eec1a06..1af241525a17cf 100644
--- a/include/linux/bpfptr.h
+++ b/include/linux/bpfptr.h
@@ -65,9 +65,9 @@ static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset,
return copy_to_sockptr_offset((sockptr_t) dst, offset, src, size);
}
-static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len)
+static inline void *kvmemdup_bpfptr_noprof(bpfptr_t src, size_t len)
{
- void *p = kvmalloc(len, GFP_USER | __GFP_NOWARN);
+ void *p = kvmalloc_noprof(len, GFP_USER | __GFP_NOWARN);
if (!p)
return ERR_PTR(-ENOMEM);
@@ -77,6 +77,7 @@ static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len)
}
return p;
}
+#define kvmemdup_bpfptr(...) alloc_hooks(kvmemdup_bpfptr_noprof(__VA_ARGS__))
static inline long strncpy_from_bpfptr(char *dst, bpfptr_t src, size_t count)
{
diff --git a/include/linux/codetag.h b/include/linux/codetag.h
new file mode 100644
index 00000000000000..c2a579ccd45588
--- /dev/null
+++ b/include/linux/codetag.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * code tagging framework
+ */
+#ifndef _LINUX_CODETAG_H
+#define _LINUX_CODETAG_H
+
+#include <linux/types.h>
+
+struct codetag_iterator;
+struct codetag_type;
+struct codetag_module;
+struct seq_buf;
+struct module;
+
+/*
+ * An instance of this structure is created in a special ELF section at every
+ * code location being tagged. At runtime, the special section is treated as
+ * an array of these.
+ */
+struct codetag {
+ unsigned int flags; /* used in later patches */
+ unsigned int lineno;
+ const char *modname;
+ const char *function;
+ const char *filename;
+} __aligned(8);
+
+union codetag_ref {
+ struct codetag *ct;
+};
+
+struct codetag_type_desc {
+ const char *section;
+ size_t tag_size;
+ void (*module_load)(struct codetag_type *cttype,
+ struct codetag_module *cmod);
+ bool (*module_unload)(struct codetag_type *cttype,
+ struct codetag_module *cmod);
+};
+
+struct codetag_iterator {
+ struct codetag_type *cttype;
+ struct codetag_module *cmod;
+ unsigned long mod_id;
+ struct codetag *ct;
+};
+
+#ifdef MODULE
+#define CT_MODULE_NAME KBUILD_MODNAME
+#else
+#define CT_MODULE_NAME NULL
+#endif
+
+#define CODE_TAG_INIT { \
+ .modname = CT_MODULE_NAME, \
+ .function = __func__, \
+ .filename = __FILE__, \
+ .lineno = __LINE__, \
+ .flags = 0, \
+}
+
+void codetag_lock_module_list(struct codetag_type *cttype, bool lock);
+bool codetag_trylock_module_list(struct codetag_type *cttype);
+struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype);
+struct codetag *codetag_next_ct(struct codetag_iterator *iter);
+
+void codetag_to_text(struct seq_buf *out, struct codetag *ct);
+
+struct codetag_type *
+codetag_register_type(const struct codetag_type_desc *desc);
+
+#if defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES)
+void codetag_load_module(struct module *mod);
+bool codetag_unload_module(struct module *mod);
+#else
+static inline void codetag_load_module(struct module *mod) {}
+static inline bool codetag_unload_module(struct module *mod) { return true; }
+#endif
+
+#endif /* _LINUX_CODETAG_H */
diff --git a/include/linux/dma-fence-chain.h b/include/linux/dma-fence-chain.h
index 4bdf0b96da2831..ad9e2506c2f401 100644
--- a/include/linux/dma-fence-chain.h
+++ b/include/linux/dma-fence-chain.h
@@ -86,10 +86,8 @@ dma_fence_chain_contained(struct dma_fence *fence)
*
* Returns a new struct dma_fence_chain object or NULL on failure.
*/
-static inline struct dma_fence_chain *dma_fence_chain_alloc(void)
-{
- return kmalloc(sizeof(struct dma_fence_chain), GFP_KERNEL);
-};
+#define dma_fence_chain_alloc() \
+ ((struct dma_fence_chain *)kmalloc(sizeof(struct dma_fence_chain), GFP_KERNEL))
/**
* dma_fence_chain_free
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 4abc60f0420928..9ee319851b5f63 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -29,7 +29,7 @@ struct dma_map_ops {
unsigned long attrs);
void (*free)(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_handle, unsigned long attrs);
- struct page *(*alloc_pages)(struct device *dev, size_t size,
+ struct page *(*alloc_pages_op)(struct device *dev, size_t size,
dma_addr_t *dma_handle, enum dma_data_direction dir,
gfp_t gfp);
void (*free_pages)(struct device *dev, size_t size, struct page *vaddr,
diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index 6aeebe0a677704..b24d62bad0b329 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -725,9 +725,9 @@ __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size)
return __real_memchr_inv(p, c, size);
}
-extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup)
+extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup_noprof)
__realloc_size(2);
-__FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp)
+__FORTIFY_INLINE void *kmemdup_noprof(const void * const POS0 p, size_t size, gfp_t gfp)
{
const size_t p_size = __struct_size(p);
@@ -737,6 +737,7 @@ __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp
fortify_panic(FORTIFY_FUNC_kmemdup, FORTIFY_READ, p_size, size, NULL);
return __real_kmemdup(p, size, gfp);
}
+#define kmemdup(...) alloc_hooks(kmemdup_noprof(__VA_ARGS__))
/**
* strcpy - Copy a string into another string buffer
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8dfd53b52744a4..4f2e2e5b6ab114 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3085,11 +3085,7 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap,
* This must be used for allocating filesystems specific inodes to set
* up the inode reclaim context correctly.
*/
-static inline void *
-alloc_inode_sb(struct super_block *sb, struct kmem_cache *cache, gfp_t gfp)
-{
- return kmem_cache_alloc_lru(cache, &sb->s_inode_lru, gfp);
-}
+#define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp)
extern void __insert_inode_hash(struct inode *, unsigned long hashval);
static inline void insert_inode_hash(struct inode *inode)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c775ea3c601558..450c2cbcf04b8d 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -6,6 +6,8 @@
#include <linux/mmzone.h>
#include <linux/topology.h>
+#include <linux/alloc_tag.h>
+#include <linux/sched.h>
struct vm_area_struct;
struct mempolicy;
@@ -175,42 +177,46 @@ static inline void arch_free_page(struct page *page, int order) { }
static inline void arch_alloc_page(struct page *page, int order) { }
#endif
-struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
+struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
nodemask_t *nodemask);
-struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
+#define __alloc_pages(...) alloc_hooks(__alloc_pages_noprof(__VA_ARGS__))
+
+struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
nodemask_t *nodemask);
+#define __folio_alloc(...) alloc_hooks(__folio_alloc_noprof(__VA_ARGS__))
-unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
+unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
nodemask_t *nodemask, int nr_pages,
struct list_head *page_list,
struct page **page_array);
+#define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__))
-unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
+unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
unsigned long nr_pages,
struct page **page_array);
+#define alloc_pages_bulk_array_mempolicy(...) \
+ alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__))
/* Bulk allocate order-0 pages */
-static inline unsigned long
-alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
-{
- return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL);
-}
+#define alloc_pages_bulk_list(_gfp, _nr_pages, _list) \
+ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _list, NULL)
-static inline unsigned long
-alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array)
-{
- return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array);
-}
+#define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array) \
+ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, NULL, _page_array)
static inline unsigned long
-alloc_pages_bulk_array_node(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array)
+alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages,
+ struct page **page_array)
{
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
- return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array);
+ return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, NULL, page_array);
}
+#define alloc_pages_bulk_array_node(...) \
+ alloc_hooks(alloc_pages_bulk_array_node_noprof(__VA_ARGS__))
+
static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask)
{
gfp_t warn_gfp = gfp_mask & (__GFP_THISNODE|__GFP_NOWARN);
@@ -230,82 +236,104 @@ static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask)
* online. For more general interface, see alloc_pages_node().
*/
static inline struct page *
-__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
+__alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order)
{
VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
warn_if_node_offline(nid, gfp_mask);
- return __alloc_pages(gfp_mask, order, nid, NULL);
+ return __alloc_pages_noprof(gfp_mask, order, nid, NULL);
}
+#define __alloc_pages_node(...) alloc_hooks(__alloc_pages_node_noprof(__VA_ARGS__))
+
static inline
-struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid)
+struct folio *__folio_alloc_node_noprof(gfp_t gfp, unsigned int order, int nid)
{
VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
warn_if_node_offline(nid, gfp);
- return __folio_alloc(gfp, order, nid, NULL);
+ return __folio_alloc_noprof(gfp, order, nid, NULL);
}
+#define __folio_alloc_node(...) alloc_hooks(__folio_alloc_node_noprof(__VA_ARGS__))
+
/*
* Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
* prefer the current CPU's closest node. Otherwise node must be valid and
* online.
*/
-static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
- unsigned int order)
+static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask,
+ unsigned int order)
{
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
- return __alloc_pages_node(nid, gfp_mask, order);
+ return __alloc_pages_node_noprof(nid, gfp_mask, order);
}
+#define alloc_pages_node(...) alloc_hooks(alloc_pages_node_noprof(__VA_ARGS__))
+
#ifdef CONFIG_NUMA
-struct page *alloc_pages(gfp_t gfp, unsigned int order);
-struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order);
+struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
struct mempolicy *mpol, pgoff_t ilx, int nid);
-struct folio *folio_alloc(gfp_t gfp, unsigned int order);
-struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
+struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order);
+struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, bool hugepage);
#else
-static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
+static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order)
{
- return alloc_pages_node(numa_node_id(), gfp_mask, order);
+ return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order);
}
-static inline struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+static inline struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
struct mempolicy *mpol, pgoff_t ilx, int nid)
{
- return alloc_pages(gfp, order);
+ return alloc_pages_noprof(gfp, order);
}
-static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
+static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
return __folio_alloc_node(gfp, order, numa_node_id());
}
-#define vma_alloc_folio(gfp, order, vma, addr, hugepage) \
- folio_alloc(gfp, order)
+#define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage) \
+ folio_alloc_noprof(gfp, order)
#endif
+
+#define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__))
+#define alloc_pages_mpol(...) alloc_hooks(alloc_pages_mpol_noprof(__VA_ARGS__))
+#define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__))
+#define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__))
+
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
-static inline struct page *alloc_page_vma(gfp_t gfp,
+
+static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
struct vm_area_struct *vma, unsigned long addr)
{
- struct folio *folio = vma_alloc_folio(gfp, 0, vma, addr, false);
+ struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr, false);
return &folio->page;
}
+#define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))
+
+extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
+#define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__))
-extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
-extern unsigned long get_zeroed_page(gfp_t gfp_mask);
+extern unsigned long get_zeroed_page_noprof(gfp_t gfp_mask);
+#define get_zeroed_page(...) alloc_hooks(get_zeroed_page_noprof(__VA_ARGS__))
+
+void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) __alloc_size(1);
+#define alloc_pages_exact(...) alloc_hooks(alloc_pages_exact_noprof(__VA_ARGS__))
-void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1);
void free_pages_exact(void *virt, size_t size);
-__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2);
-#define __get_free_page(gfp_mask) \
- __get_free_pages((gfp_mask), 0)
+__meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2);
+#define alloc_pages_exact_nid(...) \
+ alloc_hooks(alloc_pages_exact_nid_noprof(__VA_ARGS__))
+
+#define __get_free_page(gfp_mask) \
+ __get_free_pages((gfp_mask), 0)
-#define __get_dma_pages(gfp_mask, order) \
- __get_free_pages((gfp_mask) | GFP_DMA, (order))
+#define __get_dma_pages(gfp_mask, order) \
+ __get_free_pages((gfp_mask) | GFP_DMA, (order))
extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
@@ -374,10 +402,14 @@ extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);
#ifdef CONFIG_CONTIG_ALLOC
/* The below functions must be run on a range from a single zone. */
-extern int alloc_contig_range(unsigned long start, unsigned long end,
+extern int alloc_contig_range_noprof(unsigned long start, unsigned long end,
unsigned migratetype, gfp_t gfp_mask);
-extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
- int nid, nodemask_t *nodemask);
+#define alloc_contig_range(...) alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__))
+
+extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask);
+#define alloc_contig_pages(...) alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__))
+
#endif
void free_contig_range(unsigned long pfn, unsigned long nr_pages);
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 13becafe41df00..313be4ad79fd97 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -55,6 +55,9 @@ enum {
#ifdef CONFIG_LOCKDEP
___GFP_NOLOCKDEP_BIT,
#endif
+#ifdef CONFIG_SLAB_OBJ_EXT
+ ___GFP_NO_OBJ_EXT_BIT,
+#endif
___GFP_LAST_BIT
};
@@ -95,6 +98,11 @@ enum {
#else
#define ___GFP_NOLOCKDEP 0
#endif
+#ifdef CONFIG_SLAB_OBJ_EXT
+#define ___GFP_NO_OBJ_EXT BIT(___GFP_NO_OBJ_EXT_BIT)
+#else
+#define ___GFP_NO_OBJ_EXT 0
+#endif
/*
* Physical address zone modifiers (see linux/mmzone.h - low four bits)
@@ -135,12 +143,15 @@ enum {
* node with no fallbacks or placement policy enforcements.
*
* %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
+ *
+ * %__GFP_NO_OBJ_EXT causes slab allocation to have no object extension.
*/
#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE)
#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL)
#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)
#define __GFP_ACCOUNT ((__force gfp_t)___GFP_ACCOUNT)
+#define __GFP_NO_OBJ_EXT ((__force gfp_t)___GFP_NO_OBJ_EXT)
/**
* DOC: Watermark modifiers
diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h
index 7118ac28d46879..ca70eeb6393e7f 100644
--- a/include/linux/hid_bpf.h
+++ b/include/linux/hid_bpf.h
@@ -149,10 +149,8 @@ static inline int hid_bpf_connect_device(struct hid_device *hdev) { return 0; }
static inline void hid_bpf_disconnect_device(struct hid_device *hdev) {}
static inline void hid_bpf_destroy_device(struct hid_device *hid) {}
static inline void hid_bpf_device_init(struct hid_device *hid) {}
-static inline u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, u8 *rdesc, unsigned int *size)
-{
- return kmemdup(rdesc, *size, GFP_KERNEL);
-}
+#define call_hid_bpf_rdesc_fixup(_hdev, _rdesc, _size) \
+ ((u8 *)kmemdup(_rdesc, *(_size), GFP_KERNEL))
#endif /* CONFIG_HID_BPF */
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index de0c8910507690..e896ca4760f673 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -64,9 +64,6 @@ ssize_t single_hugepage_flag_show(struct kobject *kobj,
enum transparent_hugepage_flag flag);
extern struct kobj_attribute shmem_enabled_attr;
-#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
-#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
-
/*
* Mask of all large folio orders supported for anonymous THP; all orders up to
* and including PMD_ORDER, except order-0 (which is not "huge") and order-1
@@ -87,14 +84,25 @@ extern struct kobj_attribute shmem_enabled_attr;
#define thp_vma_allowable_order(vma, vm_flags, smaps, in_pf, enforce_sysfs, order) \
(!!thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, enforce_sysfs, BIT(order)))
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
#define HPAGE_PMD_SHIFT PMD_SHIFT
-#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT)
+#define HPAGE_PUD_SHIFT PUD_SHIFT
+#else
+#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
+#define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; })
+#endif
+
+#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
+#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))
+#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT)
-#define HPAGE_PUD_SHIFT PUD_SHIFT
-#define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT)
+#define HPAGE_PUD_ORDER (HPAGE_PUD_SHIFT-PAGE_SHIFT)
+#define HPAGE_PUD_NR (1<<HPAGE_PUD_ORDER)
#define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1))
+#define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT)
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern unsigned long transparent_hugepage_flags;
extern unsigned long huge_anon_orders_always;
@@ -262,8 +270,10 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags);
+unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags,
+ vm_flags_t vm_flags);
-void folio_prep_large_rmappable(struct folio *folio);
bool can_split_folio(struct folio *folio, int *pextra_pins);
int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
unsigned int new_order);
@@ -344,17 +354,15 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, int flags, struct dev_pagemap **pgmap);
-struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, int flags, struct dev_pagemap **pgmap);
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
-extern struct page *huge_zero_page;
+extern struct folio *huge_zero_folio;
extern unsigned long huge_zero_pfn;
-static inline bool is_huge_zero_page(struct page *page)
+static inline bool is_huge_zero_folio(const struct folio *folio)
{
- return READ_ONCE(huge_zero_page) == page;
+ return READ_ONCE(huge_zero_folio) == folio;
}
static inline bool is_huge_zero_pmd(pmd_t pmd)
@@ -367,8 +375,8 @@ static inline bool is_huge_zero_pud(pud_t pud)
return false;
}
-struct page *mm_get_huge_zero_page(struct mm_struct *mm);
-void mm_put_huge_zero_page(struct mm_struct *mm);
+struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
+void mm_put_huge_zero_folio(struct mm_struct *mm);
#define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot))
@@ -378,13 +386,6 @@ static inline bool thp_migration_supported(void)
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
-#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
-#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
-#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })
-
-#define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; })
-#define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; })
-#define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; })
static inline bool folio_test_pmd_mappable(struct folio *folio)
{
@@ -411,12 +412,18 @@ static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
return 0;
}
-static inline void folio_prep_large_rmappable(struct folio *folio) {}
-
#define transparent_hugepage_flags 0UL
#define thp_get_unmapped_area NULL
+static inline unsigned long
+thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, vm_flags_t vm_flags)
+{
+ return 0;
+}
+
static inline bool
can_split_folio(struct folio *folio, int *pextra_pins)
{
@@ -483,7 +490,7 @@ static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
return 0;
}
-static inline bool is_huge_zero_page(struct page *page)
+static inline bool is_huge_zero_folio(const struct folio *folio)
{
return false;
}
@@ -498,7 +505,7 @@ static inline bool is_huge_zero_pud(pud_t pud)
return false;
}
-static inline void mm_put_huge_zero_page(struct mm_struct *mm)
+static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
{
return;
}
@@ -509,12 +516,6 @@ static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
return NULL;
}
-static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
- unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap)
-{
- return NULL;
-}
-
static inline bool thp_migration_supported(void)
{
return false;
@@ -535,16 +536,4 @@ static inline int split_folio_to_order(struct folio *folio, int new_order)
#define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0)
#define split_folio(f) split_folio_to_order(f, 0)
-/*
- * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to
- * limitations in the implementation like arm64 MTE can override this to
- * false
- */
-#ifndef arch_thp_swp_supported
-static inline bool arch_thp_swp_supported(void)
-{
- return true;
-}
-#endif
-
#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 77b30a8c6076b6..3f3e628802792a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -174,6 +174,9 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud);
+bool hugetlbfs_pagecache_present(struct hstate *h,
+ struct vm_area_struct *vma,
+ unsigned long address);
struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);
@@ -272,13 +275,9 @@ void hugetlb_vma_unlock_write(struct vm_area_struct *vma);
int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
void hugetlb_vma_lock_release(struct kref *kref);
-
-int pmd_huge(pmd_t pmd);
-int pud_huge(pud_t pud);
long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot,
unsigned long cp_flags);
-
bool is_hugetlb_entry_migration(pte_t pte);
bool is_hugetlb_entry_hwpoisoned(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
@@ -329,13 +328,6 @@ static inline void hugetlb_zap_end(
{
}
-static inline struct page *hugetlb_follow_page_mask(
- struct vm_area_struct *vma, unsigned long address, unsigned int flags,
- unsigned int *page_mask)
-{
- BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/
-}
-
static inline int copy_hugetlb_page_range(struct mm_struct *dst,
struct mm_struct *src,
struct vm_area_struct *dst_vma,
@@ -399,16 +391,6 @@ static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
{
}
-static inline int pmd_huge(pmd_t pmd)
-{
- return 0;
-}
-
-static inline int pud_huge(pud_t pud)
-{
- return 0;
-}
-
static inline int is_hugepage_only_range(struct mm_struct *mm,
unsigned long addr, unsigned long len)
{
@@ -493,16 +475,6 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
#endif /* !CONFIG_HUGETLB_PAGE */
-/*
- * hugepages at page global directory. If arch support
- * hugepages at pgd level, they need to define this.
- */
-#ifndef pgd_huge
-#define pgd_huge(x) 0
-#endif
-#ifndef p4d_huge
-#define p4d_huge(x) 0
-#endif
#ifndef pgd_write
static inline int pgd_write(pgd_t pgd)
@@ -747,7 +719,8 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve);
struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
- nodemask_t *nmask, gfp_t gfp_mask);
+ nodemask_t *nmask, gfp_t gfp_mask,
+ bool allow_alloc_fallback);
int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
pgoff_t idx);
void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
@@ -859,9 +832,9 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
#define is_hugepage_only_range is_hugepage_only_range
#endif
-#ifndef arch_clear_hugepage_flags
-static inline void arch_clear_hugepage_flags(struct page *page) { }
-#define arch_clear_hugepage_flags arch_clear_hugepage_flags
+#ifndef arch_clear_hugetlb_flags
+static inline void arch_clear_hugetlb_flags(struct folio *folio) { }
+#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
#endif
#ifndef arch_make_huge_pte
@@ -970,6 +943,30 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
return modified_mask;
}
+static inline bool htlb_allow_alloc_fallback(int reason)
+{
+ bool allowed_fallback = false;
+
+ /*
+ * Note: the memory offline, memory failure and migration syscalls will
+ * be allowed to fallback to other nodes due to lack of a better chioce,
+ * that might break the per-node hugetlb pool. While other cases will
+ * set the __GFP_THISNODE to avoid breaking the per-node hugetlb pool.
+ */
+ switch (reason) {
+ case MR_MEMORY_HOTPLUG:
+ case MR_MEMORY_FAILURE:
+ case MR_SYSCALL:
+ case MR_MEMPOLICY_MBIND:
+ allowed_fallback = true;
+ break;
+ default:
+ break;
+ }
+
+ return allowed_fallback;
+}
+
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
struct mm_struct *mm, pte_t *pte)
{
@@ -1065,7 +1062,8 @@ static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
static inline struct folio *
alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
- nodemask_t *nmask, gfp_t gfp_mask)
+ nodemask_t *nmask, gfp_t gfp_mask,
+ bool allow_alloc_fallback)
{
return NULL;
}
@@ -1181,6 +1179,11 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
return 0;
}
+static inline bool htlb_allow_alloc_fallback(int reason)
+{
+ return false;
+}
+
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
struct mm_struct *mm, pte_t *pte)
{
@@ -1221,6 +1224,12 @@ static inline void hugetlb_register_node(struct node *node)
static inline void hugetlb_unregister_node(struct node *node)
{
}
+
+static inline bool hugetlbfs_pagecache_present(
+ struct hstate *h, struct vm_area_struct *vma, unsigned long address)
+{
+ return false;
+}
#endif /* CONFIG_HUGETLB_PAGE */
static inline spinlock_t *huge_pte_lock(struct hstate *h,
diff --git a/include/linux/io.h b/include/linux/io.h
index 235ba7d80a8f0d..e8ee40ee1843b2 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -6,6 +6,7 @@
#ifndef _LINUX_IO_H
#define _LINUX_IO_H
+#include <linux/sizes.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/bug.h>
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 971f3e826e1528..9512fe33266841 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1586,10 +1586,8 @@ void jbd2_journal_put_journal_head(struct journal_head *jh);
*/
extern struct kmem_cache *jbd2_handle_cache;
-static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags)
-{
- return kmem_cache_zalloc(jbd2_handle_cache, gfp_flags);
-}
+#define jbd2_alloc_handle(_gfp_flags) \
+ ((handle_t *)kmem_cache_zalloc(jbd2_handle_cache, _gfp_flags))
static inline void jbd2_free_handle(handle_t *handle)
{
@@ -1602,10 +1600,8 @@ static inline void jbd2_free_handle(handle_t *handle)
*/
extern struct kmem_cache *jbd2_inode_cache;
-static inline struct jbd2_inode *jbd2_alloc_inode(gfp_t gfp_flags)
-{
- return kmem_cache_alloc(jbd2_inode_cache, gfp_flags);
-}
+#define jbd2_alloc_inode(_gfp_flags) \
+ ((struct jbd2_inode *)kmem_cache_alloc(jbd2_inode_cache, _gfp_flags))
static inline void jbd2_free_inode(struct jbd2_inode *jinode)
{
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 401348e9f92b4e..358803cfd4d5eb 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -45,16 +45,16 @@ static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
- int ret;
+ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
+ return __ksm_enter(mm);
- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) {
- ret = __ksm_enter(mm);
- if (ret)
- return ret;
- }
+ return 0;
+}
- if (test_bit(MMF_VM_MERGE_ANY, &oldmm->flags))
- set_bit(MMF_VM_MERGE_ANY, &mm->flags);
+static inline int ksm_execve(struct mm_struct *mm)
+{
+ if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ return __ksm_enter(mm);
return 0;
}
@@ -107,6 +107,11 @@ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
return 0;
}
+static inline int ksm_execve(struct mm_struct *mm)
+{
+ return 0;
+}
+
static inline void ksm_exit(struct mm_struct *mm)
{
}
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 394fd0a887ae75..8f332b4ae84ca8 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -349,15 +349,32 @@ struct mem_cgroup {
extern struct mem_cgroup *root_mem_cgroup;
enum page_memcg_data_flags {
- /* page->memcg_data is a pointer to an objcgs vector */
- MEMCG_DATA_OBJCGS = (1UL << 0),
+ /* page->memcg_data is a pointer to an slabobj_ext vector */
+ MEMCG_DATA_OBJEXTS = (1UL << 0),
/* page has been accounted as a non-slab kernel page */
MEMCG_DATA_KMEM = (1UL << 1),
/* the next bit after the last actual flag */
__NR_MEMCG_DATA_FLAGS = (1UL << 2),
};
-#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)
+#define __FIRST_OBJEXT_FLAG __NR_MEMCG_DATA_FLAGS
+
+#else /* CONFIG_MEMCG */
+
+#define __FIRST_OBJEXT_FLAG (1UL << 0)
+
+#endif /* CONFIG_MEMCG */
+
+enum objext_flags {
+ /* slabobj_ext vector failed to allocate */
+ OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG,
+ /* the next bit after the last actual flag */
+ __NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1),
+};
+
+#define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1)
+
+#ifdef CONFIG_MEMCG
static inline bool folio_memcg_kmem(struct folio *folio);
@@ -388,10 +405,10 @@ static inline struct mem_cgroup *__folio_memcg(struct folio *folio)
unsigned long memcg_data = folio->memcg_data;
VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
- VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
+ VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio);
- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
}
/*
@@ -409,10 +426,10 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
unsigned long memcg_data = folio->memcg_data;
VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
- VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
+ VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio);
- return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
}
/*
@@ -469,11 +486,11 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
if (memcg_data & MEMCG_DATA_KMEM) {
struct obj_cgroup *objcg;
- objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
return obj_cgroup_memcg(objcg);
}
- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
}
/*
@@ -506,17 +523,17 @@ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
*/
unsigned long memcg_data = READ_ONCE(folio->memcg_data);
- if (memcg_data & MEMCG_DATA_OBJCGS)
+ if (memcg_data & MEMCG_DATA_OBJEXTS)
return NULL;
if (memcg_data & MEMCG_DATA_KMEM) {
struct obj_cgroup *objcg;
- objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
return obj_cgroup_memcg(objcg);
}
- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
}
static inline struct mem_cgroup *page_memcg_check(struct page *page)
@@ -552,7 +569,7 @@ retry:
static inline bool folio_memcg_kmem(struct folio *folio)
{
VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page);
- VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJCGS, folio);
+ VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio);
return folio->memcg_data & MEMCG_DATA_KMEM;
}
@@ -818,7 +835,8 @@ static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
- percpu_ref_put(&objcg->refcnt);
+ if (objcg)
+ percpu_ref_put(&objcg->refcnt);
}
static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
@@ -1632,6 +1650,19 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
}
#endif /* CONFIG_MEMCG */
+/*
+ * Extended information for slab objects stored as an array in page->memcg_data
+ * if MEMCG_DATA_OBJEXTS is set.
+ */
+struct slabobj_ext {
+#ifdef CONFIG_MEMCG_KMEM
+ struct obj_cgroup *objcg;
+#endif
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+ union codetag_ref ref;
+#endif
+} __aligned(8);
+
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
{
__mod_lruvec_kmem_state(p, idx, 1);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 931b118336f48c..1add16f216124d 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -167,7 +167,8 @@ extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
/* Check if a vma is migratable */
extern bool vma_migratable(struct vm_area_struct *vma);
-int mpol_misplaced(struct folio *, struct vm_area_struct *, unsigned long);
+int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
+ unsigned long addr);
extern void mpol_put_task_policy(struct task_struct *);
static inline bool mpol_is_preferred_many(struct mempolicy *pol)
@@ -282,7 +283,7 @@ static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
#endif
static inline int mpol_misplaced(struct folio *folio,
- struct vm_area_struct *vma,
+ struct vm_fault *vmf,
unsigned long address)
{
return -1; /* no node preference */
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 16c5cc807ff6b4..7b151441341b4c 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -5,6 +5,8 @@
#ifndef _LINUX_MEMPOOL_H
#define _LINUX_MEMPOOL_H
+#include <linux/sched.h>
+#include <linux/alloc_tag.h>
#include <linux/wait.h>
#include <linux/compiler.h>
@@ -39,18 +41,32 @@ void mempool_exit(mempool_t *pool);
int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data,
gfp_t gfp_mask, int node_id);
-int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+
+int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data);
+#define mempool_init(...) \
+ alloc_hooks(mempool_init_noprof(__VA_ARGS__))
extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data);
-extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
+
+extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data,
gfp_t gfp_mask, int nid);
+#define mempool_create_node(...) \
+ alloc_hooks(mempool_create_node_noprof(__VA_ARGS__))
+
+#define mempool_create(_min_nr, _alloc_fn, _free_fn, _pool_data) \
+ mempool_create_node(_min_nr, _alloc_fn, _free_fn, _pool_data, \
+ GFP_KERNEL, NUMA_NO_NODE)
extern int mempool_resize(mempool_t *pool, int new_min_nr);
extern void mempool_destroy(mempool_t *pool);
-extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc;
+
+extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc;
+#define mempool_alloc(...) \
+ alloc_hooks(mempool_alloc_noprof(__VA_ARGS__))
+
extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc;
extern void mempool_free(void *element, mempool_t *pool);
@@ -62,19 +78,10 @@ extern void mempool_free(void *element, mempool_t *pool);
void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data);
void mempool_free_slab(void *element, void *pool_data);
-static inline int
-mempool_init_slab_pool(mempool_t *pool, int min_nr, struct kmem_cache *kc)
-{
- return mempool_init(pool, min_nr, mempool_alloc_slab,
- mempool_free_slab, (void *) kc);
-}
-
-static inline mempool_t *
-mempool_create_slab_pool(int min_nr, struct kmem_cache *kc)
-{
- return mempool_create(min_nr, mempool_alloc_slab, mempool_free_slab,
- (void *) kc);
-}
+#define mempool_init_slab_pool(_pool, _min_nr, _kc) \
+ mempool_init(_pool, (_min_nr), mempool_alloc_slab, mempool_free_slab, (void *)(_kc))
+#define mempool_create_slab_pool(_min_nr, _kc) \
+ mempool_create((_min_nr), mempool_alloc_slab, mempool_free_slab, (void *)(_kc))
/*
* a mempool_alloc_t and a mempool_free_t to kmalloc and kfree the
@@ -83,17 +90,12 @@ mempool_create_slab_pool(int min_nr, struct kmem_cache *kc)
void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data);
void mempool_kfree(void *element, void *pool_data);
-static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t size)
-{
- return mempool_init(pool, min_nr, mempool_kmalloc,
- mempool_kfree, (void *) size);
-}
-
-static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
-{
- return mempool_create(min_nr, mempool_kmalloc, mempool_kfree,
- (void *) size);
-}
+#define mempool_init_kmalloc_pool(_pool, _min_nr, _size) \
+ mempool_init(_pool, (_min_nr), mempool_kmalloc, mempool_kfree, \
+ (void *)(unsigned long)(_size))
+#define mempool_create_kmalloc_pool(_min_nr, _size) \
+ mempool_create((_min_nr), mempool_kmalloc, mempool_kfree, \
+ (void *)(unsigned long)(_size))
void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data);
void mempool_kvfree(void *element, void *pool_data);
@@ -115,16 +117,11 @@ static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size)
void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data);
void mempool_free_pages(void *element, void *pool_data);
-static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order)
-{
- return mempool_init(pool, min_nr, mempool_alloc_pages,
- mempool_free_pages, (void *)(long)order);
-}
-
-static inline mempool_t *mempool_create_page_pool(int min_nr, int order)
-{
- return mempool_create(min_nr, mempool_alloc_pages, mempool_free_pages,
- (void *)(long)order);
-}
+#define mempool_init_page_pool(_pool, _min_nr, _order) \
+ mempool_init(_pool, (_min_nr), mempool_alloc_pages, \
+ mempool_free_pages, (void *)(long)(_order))
+#define mempool_create_page_pool(_min_nr, _order) \
+ mempool_create((_min_nr), mempool_alloc_pages, \
+ mempool_free_pages, (void *)(long)(_order))
#endif /* _LINUX_MEMPOOL_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b6bdaa18b9e9d4..cc9410d772ddf0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -5,6 +5,7 @@
#include <linux/errno.h>
#include <linux/mmdebug.h>
#include <linux/gfp.h>
+#include <linux/pgalloc_tag.h>
#include <linux/bug.h>
#include <linux/list.h>
#include <linux/mmzone.h>
@@ -1199,7 +1200,7 @@ static inline int is_vmalloc_or_module_addr(const void *x)
* debugging purposes - it does not include PTE-mapped sub-pages; look
* at folio_mapcount() or page_mapcount() instead.
*/
-static inline int folio_entire_mapcount(struct folio *folio)
+static inline int folio_entire_mapcount(const struct folio *folio)
{
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
return atomic_read(&folio->_entire_mapcount) + 1;
@@ -1239,7 +1240,7 @@ static inline int page_mapcount(struct page *page)
return mapcount;
}
-int folio_total_mapcount(struct folio *folio);
+int folio_total_mapcount(const struct folio *folio);
/**
* folio_mapcount() - Calculate the number of mappings of this folio.
@@ -1252,14 +1253,14 @@ int folio_total_mapcount(struct folio *folio);
*
* Return: The number of times this folio is mapped.
*/
-static inline int folio_mapcount(struct folio *folio)
+static inline int folio_mapcount(const struct folio *folio)
{
if (likely(!folio_test_large(folio)))
return atomic_read(&folio->_mapcount) + 1;
return folio_total_mapcount(folio);
}
-static inline bool folio_large_is_mapped(struct folio *folio)
+static inline bool folio_large_is_mapped(const struct folio *folio)
{
/*
* Reading _entire_mapcount below could be omitted if hugetlb
@@ -1287,7 +1288,7 @@ static inline bool folio_mapped(struct folio *folio)
* For compound page it returns true if any sub-page of compound page is mapped,
* even if this particular sub-page is not itself mapped by any PTE or PMD.
*/
-static inline bool page_mapped(struct page *page)
+static inline bool page_mapped(const struct page *page)
{
if (likely(!PageCompound(page)))
return atomic_read(&page->_mapcount) >= 0;
@@ -1317,8 +1318,6 @@ void folio_copy(struct folio *dst, struct folio *src);
unsigned long nr_free_buffer_pages(void);
-void destroy_large_folio(struct folio *folio);
-
/* Returns the number of bytes in this potentially compound page. */
static inline unsigned long page_size(struct page *page)
{
@@ -2069,7 +2068,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
*
* Return: A positive power of two.
*/
-static inline long folio_nr_pages(struct folio *folio)
+static inline long folio_nr_pages(const struct folio *folio)
{
if (!folio_test_large(folio))
return 1;
@@ -2164,21 +2163,49 @@ static inline size_t folio_size(struct folio *folio)
}
/**
- * folio_estimated_sharers - Estimate the number of sharers of a folio.
+ * folio_likely_mapped_shared - Estimate if the folio is mapped into the page
+ * tables of more than one MM
* @folio: The folio.
*
- * folio_estimated_sharers() aims to serve as a function to efficiently
- * estimate the number of processes sharing a folio. This is done by
- * looking at the precise mapcount of the first subpage in the folio, and
- * assuming the other subpages are the same. This may not be true for large
- * folios. If you want exact mapcounts for exact calculations, look at
- * page_mapcount() or folio_total_mapcount().
+ * This function checks if the folio is currently mapped into more than one
+ * MM ("mapped shared"), or if the folio is only mapped into a single MM
+ * ("mapped exclusively").
+ *
+ * As precise information is not easily available for all folios, this function
+ * estimates the number of MMs ("sharers") that are currently mapping a folio
+ * using the number of times the first page of the folio is currently mapped
+ * into page tables.
+ *
+ * For small anonymous folios (except KSM folios) and anonymous hugetlb folios,
+ * the return value will be exactly correct, because they can only be mapped
+ * at most once into an MM, and they cannot be partially mapped.
+ *
+ * For other folios, the result can be fuzzy:
+ * #. For partially-mappable large folios (THP), the return value can wrongly
+ * indicate "mapped exclusively" (false negative) when the folio is
+ * only partially mapped into at least one MM.
+ * #. For pagecache folios (including hugetlb), the return value can wrongly
+ * indicate "mapped shared" (false positive) when two VMAs in the same MM
+ * cover the same file range.
+ * #. For (small) KSM folios, the return value can wrongly indicate "mapped
+ * shared" (false negative), when the folio is mapped multiple times into
+ * the same MM.
+ *
+ * Further, this function only considers current page table mappings that
+ * are tracked using the folio mapcount(s).
*
- * Return: The estimated number of processes sharing a folio.
+ * This function does not consider:
+ * #. If the folio might get mapped in the (near) future (e.g., swapcache,
+ * pagecache, temporary unmapping for migration).
+ * #. If the folio is mapped differently (VM_PFNMAP).
+ * #. If hugetlb page table sharing applies. Callers might want to check
+ * hugetlb_pmd_shared().
+ *
+ * Return: Whether the folio is estimated to be mapped into more than one MM.
*/
-static inline int folio_estimated_sharers(struct folio *folio)
+static inline bool folio_likely_mapped_shared(struct folio *folio)
{
- return page_mapcount(folio_page(folio, 0));
+ return page_mapcount(folio_page(folio, 0)) > 1;
}
#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
@@ -2395,10 +2422,6 @@ int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
int follow_pte(struct mm_struct *mm, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp);
-int follow_pfn(struct vm_area_struct *vma, unsigned long address,
- unsigned long *pfn);
-int follow_phys(struct vm_area_struct *vma, unsigned long address,
- unsigned int flags, unsigned long *prot, resource_size_t *phys);
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write);
@@ -2859,12 +2882,13 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt)
*
* Return: The ptdesc describing the allocated page tables.
*/
-static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order)
+static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
{
- struct page *page = alloc_pages(gfp | __GFP_COMP, order);
+ struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order);
return page_ptdesc(page);
}
+#define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))
/**
* pagetable_free - Free pagetables
@@ -3134,6 +3158,14 @@ extern void reserve_bootmem_region(phys_addr_t start,
/* Free the reserved page into the buddy system, so it gets managed. */
static inline void free_reserved_page(struct page *page)
{
+ if (mem_alloc_profiling_enabled()) {
+ union codetag_ref *ref = get_page_tag_ref(page);
+
+ if (ref) {
+ set_codetag_empty(ref);
+ put_page_tag_ref(ref);
+ }
+ }
ClearPageReserved(page);
init_page_count(page);
__free_page(page);
@@ -3195,8 +3227,6 @@ static inline unsigned long get_num_physpages(void)
*/
void free_area_init(unsigned long *max_zone_pfn);
unsigned long node_map_pfn_alignment(void);
-unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
- unsigned long end_pfn);
extern unsigned long absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn);
extern void get_pfn_range_for_nid(unsigned int nid,
@@ -3212,7 +3242,6 @@ static inline int early_pfn_to_nid(unsigned long pfn)
extern int __meminit early_pfn_to_nid(unsigned long pfn);
#endif
-extern void set_dma_reserve(unsigned long new_dma_reserve);
extern void mem_init(void);
extern void __init mmap_init(void);
@@ -3224,9 +3253,6 @@ static inline void show_mem(void)
extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);
-#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES
-extern unsigned long arch_reserved_kernel_pages(void);
-#endif
extern __printf(3, 4)
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);
@@ -3385,7 +3411,16 @@ extern int install_special_mapping(struct mm_struct *mm,
unsigned long randomize_stack_top(unsigned long stack_top);
unsigned long randomize_page(unsigned long start, unsigned long range);
-extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+unsigned long
+__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags);
+
+static inline unsigned long
+get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ return __get_unmapped_area(file, addr, len, pgoff, flags, 0);
+}
extern unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
@@ -3431,6 +3466,7 @@ struct vm_unmapped_area_info {
unsigned long high_limit;
unsigned long align_mask;
unsigned long align_offset;
+ unsigned long start_gap;
};
extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
@@ -3724,7 +3760,14 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
static inline bool want_init_on_free(void)
{
return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON,
- &init_on_free);
+ &init_on_free);
+}
+
+DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free);
+static inline bool want_init_mlocked_on_free(void)
+{
+ return static_branch_maybe(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON,
+ &init_mlocked_on_free);
}
extern bool _debug_pagealloc_enabled_early;
@@ -3787,24 +3830,22 @@ static inline bool page_is_guard(struct page *page)
return PageGuard(page);
}
-bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
- int migratetype);
+bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order);
static inline bool set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype)
+ unsigned int order)
{
if (!debug_guardpage_enabled())
return false;
- return __set_page_guard(zone, page, order, migratetype);
+ return __set_page_guard(zone, page, order);
}
-void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order,
- int migratetype);
+void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order);
static inline void clear_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype)
+ unsigned int order)
{
if (!debug_guardpage_enabled())
return;
- __clear_page_guard(zone, page, order, migratetype);
+ __clear_page_guard(zone, page, order);
}
#else /* CONFIG_DEBUG_PAGEALLOC */
@@ -3814,9 +3855,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; }
static inline bool debug_guardpage_enabled(void) { return false; }
static inline bool page_is_guard(struct page *page) { return false; }
static inline bool set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) { return false; }
+ unsigned int order) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) {}
+ unsigned int order) {}
#endif /* CONFIG_DEBUG_PAGEALLOC */
#ifdef __HAVE_ARCH_GATE_AREA
@@ -4204,4 +4245,7 @@ static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE);
}
+void vma_pgtable_walk_begin(struct vm_area_struct *vma);
+void vma_pgtable_walk_end(struct vm_area_struct *vma);
+
#endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5240bd7bca338c..fa0d6995706f4f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -169,7 +169,7 @@ struct page {
/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
atomic_t _refcount;
-#ifdef CONFIG_MEMCG
+#ifdef CONFIG_SLAB_OBJ_EXT
unsigned long memcg_data;
#endif
@@ -331,7 +331,7 @@ struct folio {
};
atomic_t _mapcount;
atomic_t _refcount;
-#ifdef CONFIG_MEMCG
+#ifdef CONFIG_SLAB_OBJ_EXT
unsigned long memcg_data;
#endif
#if defined(WANT_PAGE_VIRTUAL)
@@ -671,6 +671,9 @@ struct vm_area_struct {
};
#ifdef CONFIG_PER_VMA_LOCK
+ /* Flag to indicate areas detached from the mm->mm_mt tree */
+ bool detached;
+
/*
* Can only be written (using WRITE_ONCE()) while holding both:
* - mmap_lock (in write mode)
@@ -687,9 +690,6 @@ struct vm_area_struct {
*/
int vm_lock_seq;
struct vma_lock *vm_lock;
-
- /* Flag to indicate areas detached from the mm->mm_mt tree */
- bool detached;
#endif
/*
@@ -777,11 +777,7 @@ struct mm_struct {
} ____cacheline_aligned_in_smp;
struct maple_tree mm_mt;
-#ifdef CONFIG_MMU
- unsigned long (*get_unmapped_area) (struct file *filp,
- unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags);
-#endif
+
unsigned long mmap_base; /* base of mmap area */
unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
@@ -1170,14 +1166,15 @@ static inline void mm_init_cid(struct mm_struct *mm)
cpumask_clear(mm_cidmask(mm));
}
-static inline int mm_alloc_cid(struct mm_struct *mm)
+static inline int mm_alloc_cid_noprof(struct mm_struct *mm)
{
- mm->pcpu_cid = alloc_percpu(struct mm_cid);
+ mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid);
if (!mm->pcpu_cid)
return -ENOMEM;
mm_init_cid(mm);
return 0;
}
+#define mm_alloc_cid(...) alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__))
static inline void mm_destroy_cid(struct mm_struct *mm)
{
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 8d38dcb6d044cb..de9dc20b01ba74 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -60,16 +60,14 @@ static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
#endif /* CONFIG_TRACING */
-static inline void mmap_assert_locked(struct mm_struct *mm)
+static inline void mmap_assert_locked(const struct mm_struct *mm)
{
- lockdep_assert_held(&mm->mmap_lock);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+ rwsem_assert_held(&mm->mmap_lock);
}
-static inline void mmap_assert_write_locked(struct mm_struct *mm)
+static inline void mmap_assert_write_locked(const struct mm_struct *mm)
{
- lockdep_assert_held_write(&mm->mmap_lock);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+ rwsem_assert_held_write(&mm->mmap_lock);
}
#ifdef CONFIG_PER_VMA_LOCK
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 4bf1c25fd1dc56..f83331684e470d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -109,7 +109,6 @@ enum pageflags {
PG_active,
PG_workingset,
PG_error,
- PG_slab,
PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/
PG_arch_1,
PG_reserved,
@@ -524,7 +523,6 @@ PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
TESTCLEARFLAG(Active, active, PF_HEAD)
PAGEFLAG(Workingset, workingset, PF_HEAD)
TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
-__PAGEFLAG(Slab, slab, PF_NO_TAIL)
PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
/* Xen */
@@ -628,12 +626,19 @@ PAGEFLAG_FALSE(HWPoison, hwpoison)
#define __PG_HWPOISON 0
#endif
-#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
+#ifdef CONFIG_PAGE_IDLE_FLAG
+#ifdef CONFIG_64BIT
FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE)
FOLIO_SET_FLAG(young, FOLIO_HEAD_PAGE)
FOLIO_TEST_CLEAR_FLAG(young, FOLIO_HEAD_PAGE)
FOLIO_FLAG(idle, FOLIO_HEAD_PAGE)
#endif
+/* See page_idle.h for !64BIT workaround */
+#else /* !CONFIG_PAGE_IDLE_FLAG */
+FOLIO_FLAG_FALSE(young)
+FOLIO_TEST_CLEAR_FLAG_FALSE(young)
+FOLIO_FLAG_FALSE(idle)
+#endif
/*
* PageReported() is used to track reported free pages within the Buddy
@@ -688,7 +693,7 @@ static __always_inline bool folio_mapping_flags(const struct folio *folio)
return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0;
}
-static __always_inline int PageMappingFlags(const struct page *page)
+static __always_inline bool PageMappingFlags(const struct page *page)
{
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0;
}
@@ -709,7 +714,7 @@ static __always_inline bool __folio_test_movable(const struct folio *folio)
PAGE_MAPPING_MOVABLE;
}
-static __always_inline int __PageMovable(const struct page *page)
+static __always_inline bool __PageMovable(const struct page *page)
{
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
PAGE_MAPPING_MOVABLE;
@@ -736,7 +741,7 @@ static __always_inline bool PageKsm(const struct page *page)
TESTPAGEFLAG_FALSE(Ksm, ksm)
#endif
-u64 stable_page_flags(struct page *page);
+u64 stable_page_flags(const struct page *page);
/**
* folio_xor_flags_has_waiters - Change some folio flags.
@@ -868,9 +873,9 @@ static inline void ClearPageCompound(struct page *page)
BUG_ON(!PageHead(page));
ClearPageHead(page);
}
-PAGEFLAG(LargeRmappable, large_rmappable, PF_SECOND)
+FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
#else
-TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable)
+FOLIO_FLAG_FALSE(large_rmappable)
#endif
#define PG_head_mask ((1UL << PG_head))
@@ -931,7 +936,7 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
#endif
/*
- * For pages that are never mapped to userspace (and aren't PageSlab),
+ * For pages that are never mapped to userspace,
* page_type may be used. Because it is initialised to -1, we invert the
* sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and
* __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and
@@ -947,6 +952,7 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
#define PG_table 0x00000200
#define PG_guard 0x00000400
#define PG_hugetlb 0x00000800
+#define PG_slab 0x00001000
#define PageType(page, flag) \
((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
@@ -1041,6 +1047,20 @@ PAGE_TYPE_OPS(Table, table, pgtable)
*/
PAGE_TYPE_OPS(Guard, guard, guard)
+FOLIO_TYPE_OPS(slab, slab)
+
+/**
+ * PageSlab - Determine if the page belongs to the slab allocator
+ * @page: The page to test.
+ *
+ * Context: Any context.
+ * Return: True for slab pages, false for any other kind of page.
+ */
+static inline bool PageSlab(const struct page *page)
+{
+ return folio_test_slab(page_folio(page));
+}
+
#ifdef CONFIG_HUGETLB_PAGE
FOLIO_TYPE_OPS(hugetlb, hugetlb)
#else
@@ -1065,21 +1085,29 @@ static inline bool PageHuge(const struct page *page)
* best effort only and inherently racy: there is no way to synchronize with
* failing hardware.
*/
-static inline bool is_page_hwpoison(struct page *page)
+static inline bool is_page_hwpoison(const struct page *page)
{
+ const struct folio *folio;
+
if (PageHWPoison(page))
return true;
- return PageHuge(page) && PageHWPoison(compound_head(page));
+ folio = page_folio(page);
+ return folio_test_hugetlb(folio) && PageHWPoison(&folio->page);
}
-extern bool is_free_buddy_page(struct page *page);
+bool is_free_buddy_page(const struct page *page);
PAGEFLAG(Isolated, isolated, PF_ANY);
static __always_inline int PageAnonExclusive(const struct page *page)
{
VM_BUG_ON_PGFLAGS(!PageAnon(page), page);
- VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
+ /*
+ * HugeTLB stores this information on the head page; THP keeps it per
+ * page
+ */
+ if (PageHuge(page))
+ page = compound_head(page);
return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
}
@@ -1118,7 +1146,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
(1UL << PG_lru | 1UL << PG_locked | \
1UL << PG_private | 1UL << PG_private_2 | \
1UL << PG_writeback | 1UL << PG_reserved | \
- 1UL << PG_slab | 1UL << PG_active | \
+ 1UL << PG_active | \
1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
/*
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 4ac34392823a9c..c16db006709073 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -34,8 +34,9 @@ static inline bool is_migrate_isolate(int migratetype)
#define REPORT_FAILURE 0x2
void set_pageblock_migratetype(struct page *page, int migratetype);
-int move_freepages_block(struct zone *zone, struct page *page,
- int migratetype, int *num_movable);
+
+bool move_freepages_block_isolate(struct zone *zone, struct page *page,
+ int migratetype);
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
int migratetype, int flags, gfp_t gfp_flags);
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index be98564191e676..e4b48a0dda2446 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -4,7 +4,6 @@
#include <linux/types.h>
#include <linux/stacktrace.h>
-#include <linux/stackdepot.h>
struct pglist_data;
@@ -78,7 +77,7 @@ static inline void page_ext_init(void)
}
#endif
-extern struct page_ext *page_ext_get(struct page *page);
+extern struct page_ext *page_ext_get(const struct page *page);
extern void page_ext_put(struct page_ext *page_ext);
static inline void *page_ext_data(struct page_ext *page_ext,
@@ -118,7 +117,7 @@ static inline void page_ext_init_flatmem(void)
{
}
-static inline struct page_ext *page_ext_get(struct page *page)
+static inline struct page_ext *page_ext_get(const struct page *page)
{
return NULL;
}
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index d8f34484064302..89ca0d5dc1e71d 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -6,14 +6,12 @@
#include <linux/page-flags.h>
#include <linux/page_ext.h>
-#ifdef CONFIG_PAGE_IDLE_FLAG
-
-#ifndef CONFIG_64BIT
+#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
/*
* If there is not enough space to store Idle and Young bits in page flags, use
* page ext flags instead.
*/
-static inline bool folio_test_young(struct folio *folio)
+static inline bool folio_test_young(const struct folio *folio)
{
struct page_ext *page_ext = page_ext_get(&folio->page);
bool page_young;
@@ -52,7 +50,7 @@ static inline bool folio_test_clear_young(struct folio *folio)
return page_young;
}
-static inline bool folio_test_idle(struct folio *folio)
+static inline bool folio_test_idle(const struct folio *folio)
{
struct page_ext *page_ext = page_ext_get(&folio->page);
bool page_idle;
@@ -60,7 +58,7 @@ static inline bool folio_test_idle(struct folio *folio)
if (unlikely(!page_ext))
return false;
- page_idle = test_bit(PAGE_EXT_IDLE, &page_ext->flags);
+ page_idle = test_bit(PAGE_EXT_IDLE, &page_ext->flags);
page_ext_put(page_ext);
return page_idle;
@@ -87,61 +85,5 @@ static inline void folio_clear_idle(struct folio *folio)
clear_bit(PAGE_EXT_IDLE, &page_ext->flags);
page_ext_put(page_ext);
}
-#endif /* !CONFIG_64BIT */
-
-#else /* !CONFIG_PAGE_IDLE_FLAG */
-
-static inline bool folio_test_young(struct folio *folio)
-{
- return false;
-}
-
-static inline void folio_set_young(struct folio *folio)
-{
-}
-
-static inline bool folio_test_clear_young(struct folio *folio)
-{
- return false;
-}
-
-static inline bool folio_test_idle(struct folio *folio)
-{
- return false;
-}
-
-static inline void folio_set_idle(struct folio *folio)
-{
-}
-
-static inline void folio_clear_idle(struct folio *folio)
-{
-}
-
-#endif /* CONFIG_PAGE_IDLE_FLAG */
-
-static inline bool page_is_young(struct page *page)
-{
- return folio_test_young(page_folio(page));
-}
-
-static inline void set_page_young(struct page *page)
-{
- folio_set_young(page_folio(page));
-}
-
-static inline bool test_and_clear_page_young(struct page *page)
-{
- return folio_test_clear_young(page_folio(page));
-}
-
-static inline bool page_is_idle(struct page *page)
-{
- return folio_test_idle(page_folio(page));
-}
-
-static inline void set_page_idle(struct page *page)
-{
- folio_set_idle(page_folio(page));
-}
+#endif /* CONFIG_PAGE_IDLE_FLAG && !64BIT */
#endif /* _LINUX_MM_PAGE_IDLE_H */
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 3f2409b968ec65..547e82cdc89a44 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -28,7 +28,7 @@ enum pageblock_bits {
NR_PAGEBLOCK_BITS
};
-#ifdef CONFIG_HUGETLB_PAGE
+#if defined(CONFIG_HUGETLB_PAGE)
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -45,7 +45,11 @@ extern unsigned int pageblock_order;
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
-#else /* CONFIG_HUGETLB_PAGE */
+#elif defined(CONFIG_TRANSPARENT_HUGEPAGE)
+
+#define pageblock_order min_t(unsigned int, HPAGE_PMD_ORDER, MAX_PAGE_ORDER)
+
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
#define pageblock_order MAX_PAGE_ORDER
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 2df35e65557d27..65e332df4d7696 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -542,14 +542,17 @@ static inline void *detach_page_private(struct page *page)
#endif
#ifdef CONFIG_NUMA
-struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order);
+struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
#else
-static inline struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
+static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
{
- return folio_alloc(gfp, order);
+ return folio_alloc_noprof(gfp, order);
}
#endif
+#define filemap_alloc_folio(...) \
+ alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__))
+
static inline struct page *__page_cache_alloc(gfp_t gfp)
{
return &filemap_alloc_folio(gfp, 0)->page;
@@ -1144,11 +1147,6 @@ void folio_end_writeback(struct folio *folio);
void wait_for_stable_page(struct page *page);
void folio_wait_stable(struct folio *folio);
void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn);
-static inline void __set_page_dirty(struct page *page,
- struct address_space *mapping, int warn)
-{
- __folio_mark_dirty(page_folio(page), mapping, warn);
-}
void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb);
void __folio_cancel_dirty(struct folio *folio);
static inline void folio_cancel_dirty(struct folio *folio)
@@ -1160,7 +1158,6 @@ static inline void folio_cancel_dirty(struct folio *folio)
bool folio_clear_dirty_for_io(struct folio *folio);
bool clear_page_dirty_for_io(struct page *page);
void folio_invalidate(struct folio *folio, size_t offset, size_t length);
-int __set_page_dirty_nobuffers(struct page *page);
bool noop_dirty_folio(struct address_space *mapping, struct folio *folio);
#ifdef CONFIG_MIGRATION
diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h
index 30581e2e04cc1f..5802e1deef24b5 100644
--- a/include/linux/pds/pds_common.h
+++ b/include/linux/pds/pds_common.h
@@ -4,6 +4,8 @@
#ifndef _PDS_COMMON_H_
#define _PDS_COMMON_H_
+#include <linux/notifier.h>
+
#define PDS_CORE_DRV_NAME "pds_core"
/* the device's internal addressing uses up to 52 bits */
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 8c677f185901bc..03053de557cf60 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -2,6 +2,7 @@
#ifndef __LINUX_PERCPU_H
#define __LINUX_PERCPU_H
+#include <linux/alloc_tag.h>
#include <linux/mmdebug.h>
#include <linux/preempt.h>
#include <linux/smp.h>
@@ -9,12 +10,17 @@
#include <linux/pfn.h>
#include <linux/init.h>
#include <linux/cleanup.h>
+#include <linux/sched.h>
#include <asm/percpu.h>
/* enough to cover all DEFINE_PER_CPUs in modules */
#ifdef CONFIG_MODULES
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+#define PERCPU_MODULE_RESERVE (8 << 13)
+#else
#define PERCPU_MODULE_RESERVE (8 << 10)
+#endif
#else
#define PERCPU_MODULE_RESERVE 0
#endif
@@ -121,7 +127,6 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size,
pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn);
#endif
-extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __alloc_size(1);
extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr);
extern bool is_kernel_percpu_address(unsigned long addr);
@@ -129,14 +134,16 @@ extern bool is_kernel_percpu_address(unsigned long addr);
extern void __init setup_per_cpu_areas(void);
#endif
-extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1);
-extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1);
-extern void free_percpu(void __percpu *__pdata);
+extern void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
+ gfp_t gfp) __alloc_size(1);
extern size_t pcpu_alloc_size(void __percpu *__pdata);
-DEFINE_FREE(free_percpu, void __percpu *, free_percpu(_T))
-
-extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
+#define __alloc_percpu_gfp(_size, _align, _gfp) \
+ alloc_hooks(pcpu_alloc_noprof(_size, _align, false, _gfp))
+#define __alloc_percpu(_size, _align) \
+ alloc_hooks(pcpu_alloc_noprof(_size, _align, false, GFP_KERNEL))
+#define __alloc_reserved_percpu(_size, _align) \
+ alloc_hooks(pcpu_alloc_noprof(_size, _align, true, GFP_KERNEL))
#define alloc_percpu_gfp(type, gfp) \
(typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type), \
@@ -144,6 +151,15 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
#define alloc_percpu(type) \
(typeof(type) __percpu *)__alloc_percpu(sizeof(type), \
__alignof__(type))
+#define alloc_percpu_noprof(type) \
+ ((typeof(type) __percpu *)pcpu_alloc_noprof(sizeof(type), \
+ __alignof__(type), false, GFP_KERNEL))
+
+extern void free_percpu(void __percpu *__pdata);
+
+DEFINE_FREE(free_percpu, void __percpu *, free_percpu(_T))
+
+extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
extern unsigned long pcpu_nr_pages(void);
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
new file mode 100644
index 00000000000000..86ba5d33e43bdf
--- /dev/null
+++ b/include/linux/pgalloc_tag.h
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * page allocation tagging
+ */
+#ifndef _LINUX_PGALLOC_TAG_H
+#define _LINUX_PGALLOC_TAG_H
+
+#include <linux/alloc_tag.h>
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+
+#include <linux/page_ext.h>
+
+extern struct page_ext_operations page_alloc_tagging_ops;
+
+static inline union codetag_ref *codetag_ref_from_page_ext(struct page_ext *page_ext)
+{
+ return (void *)page_ext + page_alloc_tagging_ops.offset;
+}
+
+static inline struct page_ext *page_ext_from_codetag_ref(union codetag_ref *ref)
+{
+ return (void *)ref - page_alloc_tagging_ops.offset;
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static inline union codetag_ref *get_page_tag_ref(struct page *page)
+{
+ if (page) {
+ struct page_ext *page_ext = page_ext_get(page);
+
+ if (page_ext)
+ return codetag_ref_from_page_ext(page_ext);
+ }
+ return NULL;
+}
+
+static inline void put_page_tag_ref(union codetag_ref *ref)
+{
+ page_ext_put(page_ext_from_codetag_ref(ref));
+}
+
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr)
+{
+ if (mem_alloc_profiling_enabled()) {
+ union codetag_ref *ref = get_page_tag_ref(page);
+
+ if (ref) {
+ alloc_tag_add(ref, task->alloc_tag, PAGE_SIZE * nr);
+ put_page_tag_ref(ref);
+ }
+ }
+}
+
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
+{
+ if (mem_alloc_profiling_enabled()) {
+ union codetag_ref *ref = get_page_tag_ref(page);
+
+ if (ref) {
+ alloc_tag_sub(ref, PAGE_SIZE * nr);
+ put_page_tag_ref(ref);
+ }
+ }
+}
+
+static inline void pgalloc_tag_split(struct page *page, unsigned int nr)
+{
+ int i;
+ struct page_ext *page_ext;
+ union codetag_ref *ref;
+ struct alloc_tag *tag;
+
+ if (!mem_alloc_profiling_enabled())
+ return;
+
+ page_ext = page_ext_get(page);
+ if (unlikely(!page_ext))
+ return;
+
+ ref = codetag_ref_from_page_ext(page_ext);
+ if (!ref->ct)
+ goto out;
+
+ tag = ct_to_alloc_tag(ref->ct);
+ page_ext = page_ext_next(page_ext);
+ for (i = 1; i < nr; i++) {
+ /* Set new reference to point to the original tag */
+ alloc_tag_ref_set(codetag_ref_from_page_ext(page_ext), tag);
+ page_ext = page_ext_next(page_ext);
+ }
+out:
+ page_ext_put(page_ext);
+}
+
+static inline struct alloc_tag *pgalloc_tag_get(struct page *page)
+{
+ struct alloc_tag *tag = NULL;
+
+ if (mem_alloc_profiling_enabled()) {
+ union codetag_ref *ref = get_page_tag_ref(page);
+
+ alloc_tag_sub_check(ref);
+ if (ref && ref->ct)
+ tag = ct_to_alloc_tag(ref->ct);
+ put_page_tag_ref(ref);
+ }
+
+ return tag;
+}
+
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
+{
+ if (mem_alloc_profiling_enabled() && tag)
+ this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
+}
+
+#else /* CONFIG_MEM_ALLOC_PROFILING */
+
+static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; }
+static inline void put_page_tag_ref(union codetag_ref *ref) {}
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr) {}
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
+static inline void pgalloc_tag_split(struct page *page, unsigned int nr) {}
+static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; }
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
+
+#endif /* CONFIG_MEM_ALLOC_PROFILING */
+
+#endif /* _LINUX_PGALLOC_TAG_H */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 85fc7554cd52bc..e2f45e22a6d138 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -50,6 +50,8 @@
#define pmd_pgtable(pmd) pmd_page(pmd)
#endif
+#define pmd_folio(pmd) page_folio(pmd_page(pmd))
+
/*
* A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD]
*
@@ -149,9 +151,7 @@ static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address)
* a shortcut which implies the use of the kernel's pgd, instead
* of a process's
*/
-#ifndef pgd_offset_k
#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
-#endif
/*
* In many cases it is known that a virtual address is mapped at PMD or PTE
@@ -361,6 +361,36 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
}
#endif
+#ifndef mkold_ptes
+/**
+ * mkold_ptes - Mark PTEs that map consecutive pages of the same folio as old.
+ * @vma: VMA the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to mark old.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_test_and_clear_young().
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ */
+static inline void mkold_ptes(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ for (;;) {
+ ptep_test_and_clear_young(vma, addr, ptep);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+}
+#endif
+
#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
@@ -708,6 +738,35 @@ static inline void pte_clear_not_present_full(struct mm_struct *mm,
}
#endif
+#ifndef clear_not_present_full_ptes
+/**
+ * clear_not_present_full_ptes - Clear multiple not present PTEs which are
+ * consecutive in the pgtable.
+ * @mm: Address space the ptes represent.
+ * @addr: Address of the first pte.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ * @full: Whether we are clearing a full mm.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over pte_clear_not_present_full().
+ *
+ * Context: The caller holds the page table lock. The PTEs are all not present.
+ * The PTEs are all in the same PMD.
+ */
+static inline void clear_not_present_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, unsigned int nr, int full)
+{
+ for (;;) {
+ pte_clear_not_present_full(mm, addr, ptep, full);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+}
+#endif
+
#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
unsigned long address,
@@ -1052,7 +1111,7 @@ static inline int arch_unmap_one(struct mm_struct *mm,
* prototypes must be defined in the arch-specific asm/pgtable.h file.
*/
#ifndef __HAVE_ARCH_PREPARE_TO_SWAP
-static inline int arch_prepare_to_swap(struct page *page)
+static inline int arch_prepare_to_swap(struct folio *folio)
{
return 0;
}
@@ -1079,7 +1138,7 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
#endif
#ifndef __HAVE_ARCH_MOVE_PTE
-#define move_pte(pte, prot, old_addr, new_addr) (pte)
+#define move_pte(pte, old_addr, new_addr) (pte)
#endif
#ifndef pte_accessible
@@ -1770,11 +1829,25 @@ typedef unsigned int pgtbl_mod_mask;
#endif
/*
- * p?d_leaf() - true if this entry is a final mapping to a physical address.
- * This differs from p?d_huge() by the fact that they are always available (if
- * the architecture supports large pages at the appropriate level) even
- * if CONFIG_HUGETLB_PAGE is not defined.
- * Only meaningful when called on a valid entry.
+ * pXd_leaf() is the API to check whether a pgtable entry is a huge page
+ * mapping. It should work globally across all archs, without any
+ * dependency on CONFIG_* options. For architectures that do not support
+ * huge mappings on specific levels, below fallbacks will be used.
+ *
+ * A leaf pgtable entry should always imply the following:
+ *
+ * - It is a "present" entry. IOW, before using this API, please check it
+ * with pXd_present() first. NOTE: it may not always mean the "present
+ * bit" is set. For example, PROT_NONE entries are always "present".
+ *
+ * - It should _never_ be a swap entry of any type. Above "present" check
+ * should have guarded this, but let's be crystal clear on this.
+ *
+ * - It should contain a huge PFN, which points to a huge page larger than
+ * PAGE_SIZE of the platform. The PFN format isn't important here.
+ *
+ * - It should cover all kinds of huge mappings (e.g., pXd_trans_huge(),
+ * pXd_devmap(), or hugetlb mappings).
*/
#ifndef pgd_leaf
#define pgd_leaf(x) false
@@ -1806,6 +1879,20 @@ typedef unsigned int pgtbl_mod_mask;
#endif
/*
+ * We always define pmd_pfn for all archs as it's used in lots of generic
+ * code. Now it happens too for pud_pfn (and can happen for larger
+ * mappings too in the future; we're not there yet). Instead of defining
+ * it for all archs (like pmd_pfn), provide a fallback.
+ *
+ * Note that returning 0 here means any arch that didn't define this can
+ * get severely wrong when it hits a real pud leaf. It's arch's
+ * responsibility to properly define it when a huge pud is possible.
+ */
+#ifndef pud_pfn
+#define pud_pfn(x) 0
+#endif
+
+/*
* Some architectures have MMUs that are configurable or selectable at boot
* time. These lead to variable PTRS_PER_x. For statically allocated arrays it
* helps to have a static maximum value.
diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 808f9d3ee54654..fd037c127bb071 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -464,11 +464,11 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
/* Not all gfp_t flags (besides GFP_KERNEL) are allowed. See
* documentation for vmalloc for which of them are legal.
*/
-static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp)
+static inline void **__ptr_ring_init_queue_alloc_noprof(unsigned int size, gfp_t gfp)
{
if (size > KMALLOC_MAX_SIZE / sizeof(void *))
return NULL;
- return kvmalloc_array(size, sizeof(void *), gfp | __GFP_ZERO);
+ return kvmalloc_array_noprof(size, sizeof(void *), gfp | __GFP_ZERO);
}
static inline void __ptr_ring_set_size(struct ptr_ring *r, int size)
@@ -484,9 +484,9 @@ static inline void __ptr_ring_set_size(struct ptr_ring *r, int size)
r->batch = 1;
}
-static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp)
+static inline int ptr_ring_init_noprof(struct ptr_ring *r, int size, gfp_t gfp)
{
- r->queue = __ptr_ring_init_queue_alloc(size, gfp);
+ r->queue = __ptr_ring_init_queue_alloc_noprof(size, gfp);
if (!r->queue)
return -ENOMEM;
@@ -497,6 +497,7 @@ static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp)
return 0;
}
+#define ptr_ring_init(...) alloc_hooks(ptr_ring_init_noprof(__VA_ARGS__))
/*
* Return entries into ring. Destroy entries that don't fit.
@@ -587,11 +588,11 @@ static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue,
* In particular if you consume ring in interrupt or BH context, you must
* disable interrupts/BH when doing so.
*/
-static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
+static inline int ptr_ring_resize_noprof(struct ptr_ring *r, int size, gfp_t gfp,
void (*destroy)(void *))
{
unsigned long flags;
- void **queue = __ptr_ring_init_queue_alloc(size, gfp);
+ void **queue = __ptr_ring_init_queue_alloc_noprof(size, gfp);
void **old;
if (!queue)
@@ -609,6 +610,7 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
return 0;
}
+#define ptr_ring_resize(...) alloc_hooks(ptr_ring_resize_noprof(__VA_ARGS__))
/*
* Note: producer lock is nested within consumer lock, so if you
@@ -616,21 +618,21 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
* In particular if you consume ring in interrupt or BH context, you must
* disable interrupts/BH when doing so.
*/
-static inline int ptr_ring_resize_multiple(struct ptr_ring **rings,
- unsigned int nrings,
- int size,
- gfp_t gfp, void (*destroy)(void *))
+static inline int ptr_ring_resize_multiple_noprof(struct ptr_ring **rings,
+ unsigned int nrings,
+ int size,
+ gfp_t gfp, void (*destroy)(void *))
{
unsigned long flags;
void ***queues;
int i;
- queues = kmalloc_array(nrings, sizeof(*queues), gfp);
+ queues = kmalloc_array_noprof(nrings, sizeof(*queues), gfp);
if (!queues)
goto noqueues;
for (i = 0; i < nrings; ++i) {
- queues[i] = __ptr_ring_init_queue_alloc(size, gfp);
+ queues[i] = __ptr_ring_init_queue_alloc_noprof(size, gfp);
if (!queues[i])
goto nomem;
}
@@ -660,6 +662,8 @@ nomem:
noqueues:
return -ENOMEM;
}
+#define ptr_ring_resize_multiple(...) \
+ alloc_hooks(ptr_ring_resize_multiple_noprof(__VA_ARGS__))
static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *))
{
diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index b6f3797277ff83..015c8298bebc45 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -9,6 +9,7 @@
#ifndef _LINUX_RHASHTABLE_TYPES_H
#define _LINUX_RHASHTABLE_TYPES_H
+#include <linux/alloc_tag.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/mutex.h>
@@ -88,6 +89,9 @@ struct rhashtable {
struct mutex mutex;
spinlock_t lock;
atomic_t nelems;
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+ struct alloc_tag *alloc_tag;
+#endif
};
/**
@@ -127,9 +131,12 @@ struct rhashtable_iter {
bool end_of_table;
};
-int rhashtable_init(struct rhashtable *ht,
+int rhashtable_init_noprof(struct rhashtable *ht,
const struct rhashtable_params *params);
-int rhltable_init(struct rhltable *hlt,
+#define rhashtable_init(...) alloc_hooks(rhashtable_init_noprof(__VA_ARGS__))
+
+int rhltable_init_noprof(struct rhltable *hlt,
const struct rhashtable_params *params);
+#define rhltable_init(...) alloc_hooks(rhltable_init_noprof(__VA_ARGS__))
#endif /* _LINUX_RHASHTABLE_TYPES_H */
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b7944a833668a7..9bf9324214fc3f 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -284,7 +284,7 @@ static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);
/* Paired with the memory barrier in try_grab_folio(). */
- if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
smp_mb();
if (unlikely(folio_maybe_dma_pinned(folio)))
@@ -295,7 +295,7 @@ static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
* This is conceptually a smp_wmb() paired with the smp_rmb() in
* gup_must_unshare().
*/
- if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
smp_mb__after_atomic();
return 0;
}
@@ -541,7 +541,7 @@ static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
*/
/* Paired with the memory barrier in try_grab_folio(). */
- if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
smp_mb();
if (unlikely(folio_maybe_dma_pinned(folio)))
@@ -552,7 +552,7 @@ static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
* This is conceptually a smp_wmb() paired with the smp_rmb() in
* gup_must_unshare().
*/
- if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
smp_mb__after_atomic();
return 0;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3c2abbc587b49c..4118b3f959c324 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -770,6 +770,10 @@ struct task_struct {
unsigned int flags;
unsigned int ptrace;
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+ struct alloc_tag *alloc_tag;
+#endif
+
#ifdef CONFIG_SMP
int on_cpu;
struct __call_single_node wake_entry;
@@ -810,6 +814,7 @@ struct task_struct {
struct task_group *sched_task_group;
#endif
+
#ifdef CONFIG_UCLAMP_TASK
/*
* Clamp values requested for a scheduling entity.
@@ -2187,4 +2192,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
extern void sched_set_stop_task(int cpu, struct task_struct *stop);
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag)
+{
+ swap(current->alloc_tag, tag);
+ return tag;
+}
+
+static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old)
+{
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+ WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n");
+#endif
+ current->alloc_tag = old;
+}
+#else
+#define alloc_tag_save(_tag) NULL
+#define alloc_tag_restore(_tag, _old) do {} while (0)
+#endif
+
#endif
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 02f5090ffea29c..e62ff805cfc950 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -92,9 +92,12 @@ static inline int get_dumpable(struct mm_struct *mm)
#define MMF_VM_MERGE_ANY 30
#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY)
+#define MMF_TOPDOWN 31 /* mm searches top down by default */
+#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN)
+
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
- MMF_VM_MERGE_ANY_MASK)
+ MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)
static inline unsigned long mmf_init_flags(unsigned long flags)
{
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index b6543f9d78d6b8..91546493c43da2 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -8,6 +8,7 @@
#include <linux/mm_types.h>
#include <linux/gfp.h>
#include <linux/sync_core.h>
+#include <linux/sched/coredump.h>
/*
* Routines for handling mm_structs
@@ -186,6 +187,27 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags);
+unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags);
+
+unsigned long
+arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, vm_flags_t vm_flags);
+unsigned long
+arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, vm_flags_t);
+
+unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm,
+ struct file *filp,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags,
+ vm_flags_t vm_flags);
+
unsigned long
generic_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
index acf7e1a3f3def9..e918f96881f569 100644
--- a/include/linux/secretmem.h
+++ b/include/linux/secretmem.h
@@ -6,25 +6,8 @@
extern const struct address_space_operations secretmem_aops;
-static inline bool folio_is_secretmem(struct folio *folio)
+static inline bool secretmem_mapping(struct address_space *mapping)
{
- struct address_space *mapping;
-
- /*
- * Using folio_mapping() is quite slow because of the actual call
- * instruction.
- * We know that secretmem pages are not compound, so we can
- * save a couple of cycles here.
- */
- if (folio_test_large(folio))
- return false;
-
- mapping = (struct address_space *)
- ((unsigned long)folio->mapping & ~PAGE_MAPPING_FLAGS);
-
- if (!mapping || mapping != folio->mapping)
- return false;
-
return mapping->a_ops == &secretmem_aops;
}
@@ -38,7 +21,7 @@ static inline bool vma_is_secretmem(struct vm_area_struct *vma)
return false;
}
-static inline bool folio_is_secretmem(struct folio *folio)
+static inline bool secretmem_mapping(struct address_space *mapping)
{
return false;
}
diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index e2d45b7cb61980..926496c9cc9c3b 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -177,10 +177,11 @@ static inline int skb_array_peek_len_any(struct skb_array *a)
return PTR_RING_PEEK_CALL_ANY(&a->ring, __skb_array_len_with_tag);
}
-static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp)
+static inline int skb_array_init_noprof(struct skb_array *a, int size, gfp_t gfp)
{
- return ptr_ring_init(&a->ring, size, gfp);
+ return ptr_ring_init_noprof(&a->ring, size, gfp);
}
+#define skb_array_init(...) alloc_hooks(skb_array_init_noprof(__VA_ARGS__))
static void __skb_array_destroy_skb(void *ptr)
{
@@ -198,15 +199,17 @@ static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb);
}
-static inline int skb_array_resize_multiple(struct skb_array **rings,
- int nrings, unsigned int size,
- gfp_t gfp)
+static inline int skb_array_resize_multiple_noprof(struct skb_array **rings,
+ int nrings, unsigned int size,
+ gfp_t gfp)
{
BUILD_BUG_ON(offsetof(struct skb_array, ring));
- return ptr_ring_resize_multiple((struct ptr_ring **)rings,
- nrings, size, gfp,
- __skb_array_destroy_skb);
+ return ptr_ring_resize_multiple_noprof((struct ptr_ring **)rings,
+ nrings, size, gfp,
+ __skb_array_destroy_skb);
}
+#define skb_array_resize_multiple(...) \
+ alloc_hooks(skb_array_resize_multiple_noprof(__VA_ARGS__))
static inline void skb_array_cleanup(struct skb_array *a)
{
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4ff48eda3f642d..18523f39f4a4cb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3386,7 +3386,7 @@ void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason);
*
* %NULL is returned if there is no free memory.
*/
-static inline struct page *__dev_alloc_pages(gfp_t gfp_mask,
+static inline struct page *__dev_alloc_pages_noprof(gfp_t gfp_mask,
unsigned int order)
{
/* This piece of code contains several assumptions.
@@ -3399,13 +3399,11 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask,
*/
gfp_mask |= __GFP_COMP | __GFP_MEMALLOC;
- return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
+ return alloc_pages_node_noprof(NUMA_NO_NODE, gfp_mask, order);
}
+#define __dev_alloc_pages(...) alloc_hooks(__dev_alloc_pages_noprof(__VA_ARGS__))
-static inline struct page *dev_alloc_pages(unsigned int order)
-{
- return __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, order);
-}
+#define dev_alloc_pages(_order) __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, _order)
/**
* __dev_alloc_page - allocate a page for network Rx
@@ -3415,15 +3413,13 @@ static inline struct page *dev_alloc_pages(unsigned int order)
*
* %NULL is returned if there is no free memory.
*/
-static inline struct page *__dev_alloc_page(gfp_t gfp_mask)
+static inline struct page *__dev_alloc_page_noprof(gfp_t gfp_mask)
{
- return __dev_alloc_pages(gfp_mask, 0);
+ return __dev_alloc_pages_noprof(gfp_mask, 0);
}
+#define __dev_alloc_page(...) alloc_hooks(__dev_alloc_page_noprof(__VA_ARGS__))
-static inline struct page *dev_alloc_page(void)
-{
- return dev_alloc_pages(0);
-}
+#define dev_alloc_page() dev_alloc_pages(0)
/**
* dev_page_is_reusable - check whether a page can be reused for network Rx
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index a509caf823d618..df848425e05414 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -410,11 +410,9 @@ void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock);
int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg);
-static inline struct sk_psock_link *sk_psock_init_link(void)
-{
- return kzalloc(sizeof(struct sk_psock_link),
- GFP_ATOMIC | __GFP_NOWARN);
-}
+#define sk_psock_init_link() \
+ ((struct sk_psock_link *)kzalloc(sizeof(struct sk_psock_link), \
+ GFP_ATOMIC | __GFP_NOWARN))
static inline void sk_psock_free_link(struct sk_psock_link *link)
{
diff --git a/include/linux/slab.h b/include/linux/slab.h
index e53cbfa1832558..4cc37ef22aaed1 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -56,6 +56,9 @@ enum _slab_flag_bits {
#endif
_SLAB_OBJECT_POISON,
_SLAB_CMPXCHG_DOUBLE,
+#ifdef CONFIG_SLAB_OBJ_EXT
+ _SLAB_NO_OBJ_EXT,
+#endif
_SLAB_FLAGS_LAST_BIT
};
@@ -202,6 +205,13 @@ enum _slab_flag_bits {
#endif
#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */
+/* Slab created using create_boot_cache */
+#ifdef CONFIG_SLAB_OBJ_EXT
+#define SLAB_NO_OBJ_EXT __SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT)
+#else
+#define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED
+#endif
+
/*
* ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
*
@@ -261,7 +271,10 @@ int kmem_cache_shrink(struct kmem_cache *s);
/*
* Common kmalloc functions provided by all allocators
*/
-void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __realloc_size(2);
+void * __must_check krealloc_noprof(const void *objp, size_t new_size,
+ gfp_t flags) __realloc_size(2);
+#define krealloc(...) alloc_hooks(krealloc_noprof(__VA_ARGS__))
+
void kfree(const void *objp);
void kfree_sensitive(const void *objp);
size_t __ksize(const void *objp);
@@ -513,7 +526,10 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
static_assert(PAGE_SHIFT <= 20);
#define kmalloc_index(s) __kmalloc_index(s, true)
-void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1);
+#include <linux/alloc_tag.h>
+
+void *__kmalloc_noprof(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1);
+#define __kmalloc(...) alloc_hooks(__kmalloc_noprof(__VA_ARGS__))
/**
* kmem_cache_alloc - Allocate an object
@@ -525,9 +541,14 @@ void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_siz
*
* Return: pointer to the new object or %NULL in case of error
*/
-void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) __assume_slab_alignment __malloc;
-void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
- gfp_t gfpflags) __assume_slab_alignment __malloc;
+void *kmem_cache_alloc_noprof(struct kmem_cache *cachep,
+ gfp_t flags) __assume_slab_alignment __malloc;
+#define kmem_cache_alloc(...) alloc_hooks(kmem_cache_alloc_noprof(__VA_ARGS__))
+
+void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
+ gfp_t gfpflags) __assume_slab_alignment __malloc;
+#define kmem_cache_alloc_lru(...) alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__))
+
void kmem_cache_free(struct kmem_cache *s, void *objp);
/*
@@ -538,29 +559,40 @@ void kmem_cache_free(struct kmem_cache *s, void *objp);
* Note that interrupts must be enabled when calling these functions.
*/
void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
-int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p);
+
+int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p);
+#define kmem_cache_alloc_bulk(...) alloc_hooks(kmem_cache_alloc_bulk_noprof(__VA_ARGS__))
static __always_inline void kfree_bulk(size_t size, void **p)
{
kmem_cache_free_bulk(NULL, size, p);
}
-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment
+void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment
__alloc_size(1);
-void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment
- __malloc;
+#define __kmalloc_node(...) alloc_hooks(__kmalloc_node_noprof(__VA_ARGS__))
-void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size)
+void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags,
+ int node) __assume_slab_alignment __malloc;
+#define kmem_cache_alloc_node(...) alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__))
+
+void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t flags, size_t size)
__assume_kmalloc_alignment __alloc_size(3);
-void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
- int node, size_t size) __assume_kmalloc_alignment
+void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags,
+ int node, size_t size) __assume_kmalloc_alignment
__alloc_size(4);
-void *kmalloc_large(size_t size, gfp_t flags) __assume_page_alignment
+#define kmalloc_trace(...) alloc_hooks(kmalloc_trace_noprof(__VA_ARGS__))
+
+#define kmalloc_node_trace(...) alloc_hooks(kmalloc_node_trace_noprof(__VA_ARGS__))
+
+void *kmalloc_large_noprof(size_t size, gfp_t flags) __assume_page_alignment
__alloc_size(1);
+#define kmalloc_large(...) alloc_hooks(kmalloc_large_noprof(__VA_ARGS__))
-void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_alignment
+void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) __assume_page_alignment
__alloc_size(1);
+#define kmalloc_large_node(...) alloc_hooks(kmalloc_large_node_noprof(__VA_ARGS__))
/**
* kmalloc - allocate kernel memory
@@ -616,37 +648,39 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_align
* Try really hard to succeed the allocation but fail
* eventually.
*/
-static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags)
+static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t flags)
{
if (__builtin_constant_p(size) && size) {
unsigned int index;
if (size > KMALLOC_MAX_CACHE_SIZE)
- return kmalloc_large(size, flags);
+ return kmalloc_large_noprof(size, flags);
index = kmalloc_index(size);
- return kmalloc_trace(
+ return kmalloc_trace_noprof(
kmalloc_caches[kmalloc_type(flags, _RET_IP_)][index],
flags, size);
}
- return __kmalloc(size, flags);
+ return __kmalloc_noprof(size, flags);
}
+#define kmalloc(...) alloc_hooks(kmalloc_noprof(__VA_ARGS__))
-static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node)
+static __always_inline __alloc_size(1) void *kmalloc_node_noprof(size_t size, gfp_t flags, int node)
{
if (__builtin_constant_p(size) && size) {
unsigned int index;
if (size > KMALLOC_MAX_CACHE_SIZE)
- return kmalloc_large_node(size, flags, node);
+ return kmalloc_large_node_noprof(size, flags, node);
index = kmalloc_index(size);
- return kmalloc_node_trace(
+ return kmalloc_node_trace_noprof(
kmalloc_caches[kmalloc_type(flags, _RET_IP_)][index],
flags, node, size);
}
- return __kmalloc_node(size, flags, node);
+ return __kmalloc_node_noprof(size, flags, node);
}
+#define kmalloc_node(...) alloc_hooks(kmalloc_node_noprof(__VA_ARGS__))
/**
* kmalloc_array - allocate memory for an array.
@@ -654,16 +688,17 @@ static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t fla
* @size: element size.
* @flags: the type of memory to allocate (see kmalloc).
*/
-static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_t flags)
+static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
{
size_t bytes;
if (unlikely(check_mul_overflow(n, size, &bytes)))
return NULL;
if (__builtin_constant_p(n) && __builtin_constant_p(size))
- return kmalloc(bytes, flags);
- return __kmalloc(bytes, flags);
+ return kmalloc_noprof(bytes, flags);
+ return kmalloc_noprof(bytes, flags);
}
+#define kmalloc_array(...) alloc_hooks(kmalloc_array_noprof(__VA_ARGS__))
/**
* krealloc_array - reallocate memory for an array.
@@ -672,18 +707,19 @@ static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_
* @new_size: new size of a single member of the array
* @flags: the type of memory to allocate (see kmalloc)
*/
-static inline __realloc_size(2, 3) void * __must_check krealloc_array(void *p,
- size_t new_n,
- size_t new_size,
- gfp_t flags)
+static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(void *p,
+ size_t new_n,
+ size_t new_size,
+ gfp_t flags)
{
size_t bytes;
if (unlikely(check_mul_overflow(new_n, new_size, &bytes)))
return NULL;
- return krealloc(p, bytes, flags);
+ return krealloc_noprof(p, bytes, flags);
}
+#define krealloc_array(...) alloc_hooks(krealloc_array_noprof(__VA_ARGS__))
/**
* kcalloc - allocate memory for an array. The memory is set to zero.
@@ -691,16 +727,12 @@ static inline __realloc_size(2, 3) void * __must_check krealloc_array(void *p,
* @size: element size.
* @flags: the type of memory to allocate (see kmalloc).
*/
-static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flags)
-{
- return kmalloc_array(n, size, flags | __GFP_ZERO);
-}
+#define kcalloc(n, size, flags) kmalloc_array(n, size, (flags) | __GFP_ZERO)
-void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
+void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, int node,
unsigned long caller) __alloc_size(1);
-#define kmalloc_node_track_caller(size, flags, node) \
- __kmalloc_node_track_caller(size, flags, node, \
- _RET_IP_)
+#define kmalloc_node_track_caller(...) \
+ alloc_hooks(kmalloc_node_track_caller_noprof(__VA_ARGS__, _RET_IP_))
/*
* kmalloc_track_caller is a special version of kmalloc that records the
@@ -710,11 +742,12 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
* allocator where we care about the real place the memory allocation
* request comes from.
*/
-#define kmalloc_track_caller(size, flags) \
- __kmalloc_node_track_caller(size, flags, \
- NUMA_NO_NODE, _RET_IP_)
+#define kmalloc_track_caller(...) kmalloc_node_track_caller(__VA_ARGS__, NUMA_NO_NODE)
-static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags,
+#define kmalloc_track_caller_noprof(...) \
+ kmalloc_node_track_caller_noprof(__VA_ARGS__, NUMA_NO_NODE, _RET_IP_)
+
+static inline __alloc_size(1, 2) void *kmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags,
int node)
{
size_t bytes;
@@ -722,75 +755,58 @@ static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size,
if (unlikely(check_mul_overflow(n, size, &bytes)))
return NULL;
if (__builtin_constant_p(n) && __builtin_constant_p(size))
- return kmalloc_node(bytes, flags, node);
- return __kmalloc_node(bytes, flags, node);
+ return kmalloc_node_noprof(bytes, flags, node);
+ return __kmalloc_node_noprof(bytes, flags, node);
}
+#define kmalloc_array_node(...) alloc_hooks(kmalloc_array_node_noprof(__VA_ARGS__))
-static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node)
-{
- return kmalloc_array_node(n, size, flags | __GFP_ZERO, node);
-}
+#define kcalloc_node(_n, _size, _flags, _node) \
+ kmalloc_array_node(_n, _size, (_flags) | __GFP_ZERO, _node)
/*
* Shortcuts
*/
-static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
-{
- return kmem_cache_alloc(k, flags | __GFP_ZERO);
-}
+#define kmem_cache_zalloc(_k, _flags) kmem_cache_alloc(_k, (_flags)|__GFP_ZERO)
/**
* kzalloc - allocate memory. The memory is set to zero.
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate (see kmalloc).
*/
-static inline __alloc_size(1) void *kzalloc(size_t size, gfp_t flags)
+static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags)
{
- return kmalloc(size, flags | __GFP_ZERO);
+ return kmalloc_noprof(size, flags | __GFP_ZERO);
}
+#define kzalloc(...) alloc_hooks(kzalloc_noprof(__VA_ARGS__))
+#define kzalloc_node(_size, _flags, _node) kmalloc_node(_size, (_flags)|__GFP_ZERO, _node)
-/**
- * kzalloc_node - allocate zeroed memory from a particular memory node.
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate (see kmalloc).
- * @node: memory node from which to allocate
- */
-static inline __alloc_size(1) void *kzalloc_node(size_t size, gfp_t flags, int node)
-{
- return kmalloc_node(size, flags | __GFP_ZERO, node);
-}
+extern void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) __alloc_size(1);
+#define kvmalloc_node(...) alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__))
-extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __alloc_size(1);
-static inline __alloc_size(1) void *kvmalloc(size_t size, gfp_t flags)
-{
- return kvmalloc_node(size, flags, NUMA_NO_NODE);
-}
-static inline __alloc_size(1) void *kvzalloc_node(size_t size, gfp_t flags, int node)
-{
- return kvmalloc_node(size, flags | __GFP_ZERO, node);
-}
-static inline __alloc_size(1) void *kvzalloc(size_t size, gfp_t flags)
-{
- return kvmalloc(size, flags | __GFP_ZERO);
-}
+#define kvmalloc(_size, _flags) kvmalloc_node(_size, _flags, NUMA_NO_NODE)
+#define kvmalloc_noprof(_size, _flags) kvmalloc_node_noprof(_size, _flags, NUMA_NO_NODE)
+#define kvzalloc(_size, _flags) kvmalloc(_size, _flags|__GFP_ZERO)
-static inline __alloc_size(1, 2) void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
+#define kvzalloc_node(_size, _flags, _node) kvmalloc_node(_size, _flags|__GFP_ZERO, _node)
+
+static inline __alloc_size(1, 2) void *kvmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
{
size_t bytes;
if (unlikely(check_mul_overflow(n, size, &bytes)))
return NULL;
- return kvmalloc(bytes, flags);
+ return kvmalloc_node_noprof(bytes, flags, NUMA_NO_NODE);
}
-static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t flags)
-{
- return kvmalloc_array(n, size, flags | __GFP_ZERO);
-}
+#define kvmalloc_array(...) alloc_hooks(kvmalloc_array_noprof(__VA_ARGS__))
+#define kvcalloc(_n, _size, _flags) kvmalloc_array(_n, _size, _flags|__GFP_ZERO)
+#define kvcalloc_noprof(_n, _size, _flags) kvmalloc_array_noprof(_n, _size, _flags|__GFP_ZERO)
-extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
+extern void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
__realloc_size(3);
+#define kvrealloc(...) alloc_hooks(kvrealloc_noprof(__VA_ARGS__))
+
extern void kvfree(const void *addr);
DEFINE_FREE(kvfree, void *, if (_T) kvfree(_T))
diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h
index 317200cd3a603e..fc5a206c40435f 100644
--- a/include/linux/sockptr.h
+++ b/include/linux/sockptr.h
@@ -117,9 +117,9 @@ static inline int copy_to_sockptr(sockptr_t dst, const void *src, size_t size)
return copy_to_sockptr_offset(dst, 0, src, size);
}
-static inline void *memdup_sockptr(sockptr_t src, size_t len)
+static inline void *memdup_sockptr_noprof(sockptr_t src, size_t len)
{
- void *p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
+ void *p = kmalloc_track_caller_noprof(len, GFP_USER | __GFP_NOWARN);
if (!p)
return ERR_PTR(-ENOMEM);
@@ -129,10 +129,11 @@ static inline void *memdup_sockptr(sockptr_t src, size_t len)
}
return p;
}
+#define memdup_sockptr(...) alloc_hooks(memdup_sockptr_noprof(__VA_ARGS__))
-static inline void *memdup_sockptr_nul(sockptr_t src, size_t len)
+static inline void *memdup_sockptr_nul_noprof(sockptr_t src, size_t len)
{
- char *p = kmalloc_track_caller(len + 1, GFP_KERNEL);
+ char *p = kmalloc_track_caller_noprof(len + 1, GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
@@ -143,6 +144,7 @@ static inline void *memdup_sockptr_nul(sockptr_t src, size_t len)
p[len] = '\0';
return p;
}
+#define memdup_sockptr_nul(...) alloc_hooks(memdup_sockptr_nul_noprof(__VA_ARGS__))
static inline long strncpy_from_sockptr(char *dst, sockptr_t src, size_t count)
{
diff --git a/include/linux/string.h b/include/linux/string.h
index 9ba8b4597009c9..793c27ad7c0d21 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -282,7 +282,9 @@ extern void kfree_const(const void *x);
extern char *kstrdup(const char *s, gfp_t gfp) __malloc;
extern const char *kstrdup_const(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
-extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2);
+extern void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) __realloc_size(2);
+#define kmemdup(...) alloc_hooks(kmemdup_noprof(__VA_ARGS__))
+
extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2);
extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp);
extern void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f53d608daa0131..6b672a67ae5d9d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -259,7 +259,20 @@ struct swap_cluster_info {
};
#define CLUSTER_FLAG_FREE 1 /* This cluster is free */
#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
-#define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */
+
+/*
+ * The first page in the swap file is the swap header, which is always marked
+ * bad to prevent it from being allocated as an entry. This also prevents the
+ * cluster to which it belongs being marked free. Therefore 0 is safe to use as
+ * a sentinel to indicate next is not valid in percpu_cluster.
+ */
+#define SWAP_NEXT_INVALID 0
+
+#ifdef CONFIG_THP_SWAP
+#define SWAP_NR_ORDERS (PMD_ORDER + 1)
+#else
+#define SWAP_NR_ORDERS 1
+#endif
/*
* We assign a cluster to each CPU, so each CPU can allocate swap entry from
@@ -267,8 +280,7 @@ struct swap_cluster_info {
* throughput.
*/
struct percpu_cluster {
- struct swap_cluster_info index; /* Current cluster index */
- unsigned int next; /* Likely next allocation offset */
+ unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
};
struct swap_cluster_list {
@@ -462,14 +474,14 @@ swp_entry_t folio_alloc_swap(struct folio *folio);
bool folio_free_swap(struct folio *folio);
void put_swap_folio(struct folio *folio, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
-extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
+extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern void swap_shmem_alloc(swp_entry_t);
extern int swap_duplicate(swp_entry_t);
extern int swapcache_prepare(swp_entry_t);
extern void swap_free(swp_entry_t);
extern void swapcache_free_entries(swp_entry_t *entries, int n);
-extern int free_swap_and_cache(swp_entry_t);
+extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
int swap_type_of(dev_t device, sector_t offset);
int find_first_swap(dev_t *device);
extern unsigned int count_swap_pages(int, int);
@@ -518,8 +530,9 @@ static inline void put_swap_device(struct swap_info_struct *si)
#define free_pages_and_swap_cache(pages, nr) \
release_pages((pages), (nr));
-/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
-#define free_swap_and_cache(e) is_pfn_swap_entry(e)
+static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
+{
+}
static inline void free_swap_cache(struct folio *folio)
{
@@ -587,14 +600,10 @@ static inline int add_swap_extent(struct swap_info_struct *sis,
}
#endif /* CONFIG_SWAP */
-#ifdef CONFIG_THP_SWAP
-extern int split_swap_cluster(swp_entry_t entry);
-#else
-static inline int split_swap_cluster(swp_entry_t entry)
+static inline void free_swap_and_cache(swp_entry_t entry)
{
- return 0;
+ free_swap_and_cache_nr(entry, 1);
}
-#endif
#ifdef CONFIG_MEMCG
static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 98ea90e90439d8..e4a631ec430bb7 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -2,6 +2,8 @@
#ifndef _LINUX_VMALLOC_H
#define _LINUX_VMALLOC_H
+#include <linux/alloc_tag.h>
+#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/list.h>
@@ -138,26 +140,54 @@ extern unsigned long vmalloc_nr_pages(void);
static inline unsigned long vmalloc_nr_pages(void) { return 0; }
#endif
-extern void *vmalloc(unsigned long size) __alloc_size(1);
-extern void *vzalloc(unsigned long size) __alloc_size(1);
-extern void *vmalloc_user(unsigned long size) __alloc_size(1);
-extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1);
-extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1);
-extern void *vmalloc_32(unsigned long size) __alloc_size(1);
-extern void *vmalloc_32_user(unsigned long size) __alloc_size(1);
-extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
-extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
+extern void *vmalloc_noprof(unsigned long size) __alloc_size(1);
+#define vmalloc(...) alloc_hooks(vmalloc_noprof(__VA_ARGS__))
+
+extern void *vzalloc_noprof(unsigned long size) __alloc_size(1);
+#define vzalloc(...) alloc_hooks(vzalloc_noprof(__VA_ARGS__))
+
+extern void *vmalloc_user_noprof(unsigned long size) __alloc_size(1);
+#define vmalloc_user(...) alloc_hooks(vmalloc_user_noprof(__VA_ARGS__))
+
+extern void *vmalloc_node_noprof(unsigned long size, int node) __alloc_size(1);
+#define vmalloc_node(...) alloc_hooks(vmalloc_node_noprof(__VA_ARGS__))
+
+extern void *vzalloc_node_noprof(unsigned long size, int node) __alloc_size(1);
+#define vzalloc_node(...) alloc_hooks(vzalloc_node_noprof(__VA_ARGS__))
+
+extern void *vmalloc_32_noprof(unsigned long size) __alloc_size(1);
+#define vmalloc_32(...) alloc_hooks(vmalloc_32_noprof(__VA_ARGS__))
+
+extern void *vmalloc_32_user_noprof(unsigned long size) __alloc_size(1);
+#define vmalloc_32_user(...) alloc_hooks(vmalloc_32_user_noprof(__VA_ARGS__))
+
+extern void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+#define __vmalloc(...) alloc_hooks(__vmalloc_noprof(__VA_ARGS__))
+
+extern void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
const void *caller) __alloc_size(1);
-void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
+#define __vmalloc_node_range(...) alloc_hooks(__vmalloc_node_range_noprof(__VA_ARGS__))
+
+void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask,
int node, const void *caller) __alloc_size(1);
-void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+#define __vmalloc_node(...) alloc_hooks(__vmalloc_node_noprof(__VA_ARGS__))
+
+void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+#define vmalloc_huge(...) alloc_hooks(vmalloc_huge_noprof(__VA_ARGS__))
+
+extern void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
+#define __vmalloc_array(...) alloc_hooks(__vmalloc_array_noprof(__VA_ARGS__))
+
+extern void *vmalloc_array_noprof(size_t n, size_t size) __alloc_size(1, 2);
+#define vmalloc_array(...) alloc_hooks(vmalloc_array_noprof(__VA_ARGS__))
+
+extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
+#define __vcalloc(...) alloc_hooks(__vcalloc_noprof(__VA_ARGS__))
-extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
-extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2);
-extern void *__vcalloc(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
-extern void *vcalloc(size_t n, size_t size) __alloc_size(1, 2);
+extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2);
+#define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__))
extern void vfree(const void *addr);
extern void vfree_atomic(const void *addr);
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 343906a98d6eed..735eae6e272cba 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -487,14 +487,6 @@ static inline void node_stat_sub_folio(struct folio *folio,
mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio));
}
-static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
- int migratetype)
-{
- __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
- if (is_migrate_cma(migratetype))
- __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
-}
-
extern const char * const vmstat_text[];
static inline const char *zone_stat_name(enum zone_stat_item item)
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index cb571dfcf4b167..d9d479334c9e65 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1548,6 +1548,7 @@ void xas_create_range(struct xa_state *);
#ifdef CONFIG_XARRAY_MULTI
int xa_get_order(struct xarray *, unsigned long index);
+int xas_get_order(struct xa_state *xas);
void xas_split(struct xa_state *, void *entry, unsigned int order);
void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t);
#else
@@ -1556,6 +1557,11 @@ static inline int xa_get_order(struct xarray *xa, unsigned long index)
return 0;
}
+static inline int xas_get_order(struct xa_state *xas)
+{
+ return 0;
+}
+
static inline void xas_split(struct xa_state *xas, void *entry,
unsigned int order)
{
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index 3296438eec0620..a67d62b796980a 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -53,7 +53,7 @@ void *zpool_map_handle(struct zpool *pool, unsigned long handle,
void zpool_unmap_handle(struct zpool *pool, unsigned long handle);
-u64 zpool_get_total_size(struct zpool *pool);
+u64 zpool_get_total_pages(struct zpool *pool);
/**
@@ -91,7 +91,7 @@ struct zpool_driver {
enum zpool_mapmode mm);
void (*unmap)(void *pool, unsigned long handle);
- u64 (*total_size)(void *pool);
+ u64 (*total_pages)(void *pool);
};
void zpool_register_driver(struct zpool_driver *driver);
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 341aea4900704c..2a85b941db975d 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -7,7 +7,6 @@
struct lruvec;
-extern u64 zswap_pool_total_size;
extern atomic_t zswap_stored_pages;
#ifdef CONFIG_ZSWAP
@@ -27,6 +26,7 @@ struct zswap_lruvec_state {
atomic_long_t nr_zswap_protected;
};
+unsigned long zswap_total_pages(void);
bool zswap_store(struct folio *folio);
bool zswap_load(struct folio *folio);
void zswap_invalidate(swp_entry_t swp);
diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index f3ab0b8a4b18ad..c31bd96dafdbac 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -274,15 +274,17 @@ struct netlbl_calipso_ops {
* on success, NULL on failure.
*
*/
-static inline struct netlbl_lsm_cache *netlbl_secattr_cache_alloc(gfp_t flags)
+static inline struct netlbl_lsm_cache *netlbl_secattr_cache_alloc_noprof(gfp_t flags)
{
struct netlbl_lsm_cache *cache;
- cache = kzalloc(sizeof(*cache), flags);
+ cache = kzalloc_noprof(sizeof(*cache), flags);
if (cache)
refcount_set(&cache->refcount, 1);
return cache;
}
+#define netlbl_secattr_cache_alloc(...) \
+ alloc_hooks(netlbl_secattr_cache_alloc_noprof(__VA_ARGS__))
/**
* netlbl_secattr_cache_free - Frees a netlbl_lsm_cache struct
@@ -311,10 +313,11 @@ static inline void netlbl_secattr_cache_free(struct netlbl_lsm_cache *cache)
* on failure.
*
*/
-static inline struct netlbl_lsm_catmap *netlbl_catmap_alloc(gfp_t flags)
+static inline struct netlbl_lsm_catmap *netlbl_catmap_alloc_noprof(gfp_t flags)
{
- return kzalloc(sizeof(struct netlbl_lsm_catmap), flags);
+ return kzalloc_noprof(sizeof(struct netlbl_lsm_catmap), flags);
}
+#define netlbl_catmap_alloc(...) alloc_hooks(netlbl_catmap_alloc_noprof(__VA_ARGS__))
/**
* netlbl_catmap_free - Free a LSM secattr catmap
@@ -376,10 +379,11 @@ static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr)
* pointer on success, or NULL on failure.
*
*/
-static inline struct netlbl_lsm_secattr *netlbl_secattr_alloc(gfp_t flags)
+static inline struct netlbl_lsm_secattr *netlbl_secattr_alloc_noprof(gfp_t flags)
{
- return kzalloc(sizeof(struct netlbl_lsm_secattr), flags);
+ return kzalloc_noprof(sizeof(struct netlbl_lsm_secattr), flags);
}
+#define netlbl_secattr_alloc(...) alloc_hooks(netlbl_secattr_alloc_noprof(__VA_ARGS__))
/**
* netlbl_secattr_free - Frees a netlbl_lsm_secattr struct
diff --git a/include/net/netlink.h b/include/net/netlink.h
index c19ff921b661ad..972b5484fa6fc4 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -1891,10 +1891,11 @@ static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla)
* @src: netlink attribute to duplicate from
* @gfp: GFP mask
*/
-static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp)
+static inline void *nla_memdup_noprof(const struct nlattr *src, gfp_t gfp)
{
- return kmemdup(nla_data(src), nla_len(src), gfp);
+ return kmemdup_noprof(nla_data(src), nla_len(src), gfp);
}
+#define nla_memdup(...) alloc_hooks(nla_memdup_noprof(__VA_ARGS__))
/**
* nla_nest_start_noflag - Start a new level of nested attributes
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 004e651e6067e7..29495c331d20e9 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -127,12 +127,12 @@ static inline struct sock *skb_steal_sock(struct sk_buff *skb,
}
static inline struct request_sock *
-reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener,
+reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener,
bool attach_listener)
{
struct request_sock *req;
- req = kmem_cache_alloc(ops->slab, GFP_ATOMIC | __GFP_NOWARN);
+ req = kmem_cache_alloc_noprof(ops->slab, GFP_ATOMIC | __GFP_NOWARN);
if (!req)
return NULL;
req->rsk_listener = NULL;
@@ -157,6 +157,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener,
return req;
}
+#define reqsk_alloc(...) alloc_hooks(reqsk_alloc_noprof(__VA_ARGS__))
static inline void __reqsk_free(struct request_sock *req)
{
diff --git a/include/net/tcx.h b/include/net/tcx.h
index 04be9377785d77..72a3e75e539fb0 100644
--- a/include/net/tcx.h
+++ b/include/net/tcx.h
@@ -75,9 +75,9 @@ tcx_entry_fetch(struct net_device *dev, bool ingress)
return rcu_dereference_rtnl(dev->tcx_egress);
}
-static inline struct bpf_mprog_entry *tcx_entry_create(void)
+static inline struct bpf_mprog_entry *tcx_entry_create_noprof(void)
{
- struct tcx_entry *tcx = kzalloc(sizeof(*tcx), GFP_KERNEL);
+ struct tcx_entry *tcx = kzalloc_noprof(sizeof(*tcx), GFP_KERNEL);
if (tcx) {
bpf_mprog_bundle_init(&tcx->bundle);
@@ -85,6 +85,7 @@ static inline struct bpf_mprog_entry *tcx_entry_create(void)
}
return NULL;
}
+#define tcx_entry_create(...) alloc_hooks(tcx_entry_create_noprof(__VA_ARGS__))
static inline void tcx_entry_free(struct bpf_mprog_entry *entry)
{
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index 2e58d5e6ac0e46..d678929441933b 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -11,6 +11,7 @@
#include <rdma/ib_verbs.h>
#include <rdma/rdmavt_cq.h>
#include <rdma/rvt-abi.h>
+#include <linux/vmalloc.h>
/*
* Atomic bit definitions for r_aflags.
*/
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index 97b09fcf7e5287..86fe6aecff1e79 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -62,14 +62,14 @@ DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done);
DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
TP_PROTO(struct inode *inode, struct vm_fault *vmf,
- struct page *zero_page,
+ struct folio *zero_folio,
void *radix_entry),
- TP_ARGS(inode, vmf, zero_page, radix_entry),
+ TP_ARGS(inode, vmf, zero_folio, radix_entry),
TP_STRUCT__entry(
__field(unsigned long, ino)
__field(unsigned long, vm_flags)
__field(unsigned long, address)
- __field(struct page *, zero_page)
+ __field(struct folio *, zero_folio)
__field(void *, radix_entry)
__field(dev_t, dev)
),
@@ -78,17 +78,17 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
__entry->ino = inode->i_ino;
__entry->vm_flags = vmf->vma->vm_flags;
__entry->address = vmf->address;
- __entry->zero_page = zero_page;
+ __entry->zero_folio = zero_folio;
__entry->radix_entry = radix_entry;
),
- TP_printk("dev %d:%d ino %#lx %s address %#lx zero_page %p "
+ TP_printk("dev %d:%d ino %#lx %s address %#lx zero_folio %p "
"radix_entry %#lx",
MAJOR(__entry->dev),
MINOR(__entry->dev),
__entry->ino,
__entry->vm_flags & VM_SHARED ? "shared" : "private",
__entry->address,
- __entry->zero_page,
+ __entry->zero_folio,
(unsigned long)__entry->radix_entry
)
)
@@ -96,8 +96,8 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
#define DEFINE_PMD_LOAD_HOLE_EVENT(name) \
DEFINE_EVENT(dax_pmd_load_hole_class, name, \
TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
- struct page *zero_page, void *radix_entry), \
- TP_ARGS(inode, vmf, zero_page, radix_entry))
+ struct folio *zero_folio, void *radix_entry), \
+ TP_ARGS(inode, vmf, zero_folio, radix_entry))
DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole);
DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback);
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 6e2ef1d4b0028d..ab576898a1264d 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -174,10 +174,10 @@ TRACE_EVENT(mm_collapse_huge_page_swapin,
TRACE_EVENT(mm_khugepaged_scan_file,
- TP_PROTO(struct mm_struct *mm, struct page *page, struct file *file,
+ TP_PROTO(struct mm_struct *mm, struct folio *folio, struct file *file,
int present, int swap, int result),
- TP_ARGS(mm, page, file, present, swap, result),
+ TP_ARGS(mm, folio, file, present, swap, result),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
@@ -190,7 +190,7 @@ TRACE_EVENT(mm_khugepaged_scan_file,
TP_fast_assign(
__entry->mm = mm;
- __entry->pfn = page ? page_to_pfn(page) : -1;
+ __entry->pfn = folio ? folio_pfn(folio) : -1;
__assign_str(filename, file->f_path.dentry->d_iname);
__entry->present = present;
__entry->swap = swap;
@@ -207,10 +207,10 @@ TRACE_EVENT(mm_khugepaged_scan_file,
);
TRACE_EVENT(mm_khugepaged_collapse_file,
- TP_PROTO(struct mm_struct *mm, struct page *hpage, pgoff_t index,
+ TP_PROTO(struct mm_struct *mm, struct folio *new_folio, pgoff_t index,
bool is_shmem, unsigned long addr, struct file *file,
int nr, int result),
- TP_ARGS(mm, hpage, index, addr, is_shmem, file, nr, result),
+ TP_ARGS(mm, new_folio, index, addr, is_shmem, file, nr, result),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
__field(unsigned long, hpfn)
@@ -224,7 +224,7 @@ TRACE_EVENT(mm_khugepaged_collapse_file,
TP_fast_assign(
__entry->mm = mm;
- __entry->hpfn = hpage ? page_to_pfn(hpage) : -1;
+ __entry->hpfn = new_folio ? folio_pfn(new_folio) : -1;
__entry->index = index;
__entry->addr = addr;
__entry->is_shmem = is_shmem;
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index d55e53ac91bd2c..e46d6e82765e71 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -107,7 +107,6 @@
DEF_PAGEFLAG_NAME(lru), \
DEF_PAGEFLAG_NAME(active), \
DEF_PAGEFLAG_NAME(workingset), \
- DEF_PAGEFLAG_NAME(slab), \
DEF_PAGEFLAG_NAME(owner_priv_1), \
DEF_PAGEFLAG_NAME(arch_1), \
DEF_PAGEFLAG_NAME(reserved), \
@@ -135,6 +134,7 @@ IF_HAVE_PG_ARCH_X(arch_3)
#define DEF_PAGETYPE_NAME(_name) { PG_##_name, __stringify(_name) }
#define __def_pagetype_names \
+ DEF_PAGETYPE_NAME(slab), \
DEF_PAGETYPE_NAME(hugetlb), \
DEF_PAGETYPE_NAME(offline), \
DEF_PAGETYPE_NAME(guard), \
diff --git a/init/Kconfig b/init/Kconfig
index 664bedb9a71fbe..0a021d6b4939ae 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -929,6 +929,9 @@ config NUMA_BALANCING_DEFAULT_ENABLED
If set, automatic NUMA balancing will be enabled if running on a NUMA
machine.
+config SLAB_OBJ_EXT
+ bool
+
menuconfig CGROUPS
bool "Control Group support"
select KERNFS
@@ -962,6 +965,7 @@ config MEMCG
bool "Memory controller"
select PAGE_COUNTER
select EVENTFD
+ select SLAB_OBJ_EXT
help
Provides control over the memory footprint of tasks in a cgroup.
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c170a2b8d2cf21..d5edfb8444d78f 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3525,7 +3525,7 @@ static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
#else
addr = 0UL;
#endif
- return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+ return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
}
#else /* !CONFIG_MMU */
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 343c3456c8ddf0..16dbf4f6b77fa5 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -314,7 +314,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad
return -EINVAL;
}
- ret = current->mm->get_unmapped_area(filp, addr, len * 2, 0, flags);
+ ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags);
if (IS_ERR_VALUE(ret))
return ret;
if ((ret >> 32) == ((ret + len - 1) >> 32))
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 550f02e2cb1327..a546aba46d5da9 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -759,8 +759,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
- if (ma->objcg)
- obj_cgroup_put(ma->objcg);
+ obj_cgroup_put(ma->objcg);
destroy_mem_alloc(ma, rcu_in_progress);
}
if (ma->caches) {
@@ -776,8 +775,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
}
- if (ma->objcg)
- obj_cgroup_put(ma->objcg);
+ obj_cgroup_put(ma->objcg);
destroy_mem_alloc(ma, rcu_in_progress);
}
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cb61d8880dbe03..c403f772599965 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -980,7 +980,7 @@ static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr
if (map->ops->map_get_unmapped_area)
return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags);
#ifdef CONFIG_MMU
- return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+ return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
#else
return addr;
#endif
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 58db8fd70471a1..5e2d51e1cdf694 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -570,9 +570,9 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size,
size = PAGE_ALIGN(size);
if (dma_alloc_direct(dev, ops))
return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp);
- if (!ops->alloc_pages)
+ if (!ops->alloc_pages_op)
return NULL;
- return ops->alloc_pages(dev, size, dma_handle, dir, gfp);
+ return ops->alloc_pages_op(dev, size, dma_handle, dir, gfp);
}
struct page *dma_alloc_pages(struct device *dev, size_t size,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 724e6d7e128f37..c5a0dc1f135f02 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7539,7 +7539,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
{
u64 size = 0;
-#ifdef CONFIG_HAVE_FAST_GUP
+#ifdef CONFIG_HAVE_GUP_FAST
pgd_t *pgdp, pgd;
p4d_t *p4dp, p4d;
pud_t *pudp, pud;
@@ -7587,7 +7587,7 @@ again:
if (pte_present(pte))
size = pte_leaf_size(pte);
pte_unmap(ptep);
-#endif /* CONFIG_HAVE_FAST_GUP */
+#endif /* CONFIG_HAVE_GUP_FAST */
return size;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index aebb3e6c96dc62..99076dbe27d83f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1344,7 +1344,7 @@ static inline void __mmput(struct mm_struct *mm)
ksm_exit(mm);
khugepaged_exit(mm); /* must run before exit_mmap */
exit_mmap(mm);
- mm_put_huge_zero_page(mm);
+ mm_put_huge_zero_folio(mm);
set_mm_exe_file(mm, NULL);
if (!list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index 8a689b4ff4f982..2f84896a7bcbdf 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -82,7 +82,7 @@ static struct test_item test_items[] = {
ITEM_FUNC(kallsyms_test_func_static),
ITEM_FUNC(kallsyms_test_func),
ITEM_FUNC(kallsyms_test_func_weak),
- ITEM_FUNC(vmalloc),
+ ITEM_FUNC(vmalloc_noprof),
ITEM_FUNC(vfree),
#ifdef CONFIG_KALLSYMS_ALL
ITEM_DATA(kallsyms_test_var_bss_static),
diff --git a/kernel/module/main.c b/kernel/module/main.c
index e1e8a7a9d6c193..2d25eebc549dbe 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -56,6 +56,7 @@
#include <linux/dynamic_debug.h>
#include <linux/audit.h>
#include <linux/cfi.h>
+#include <linux/codetag.h>
#include <linux/debugfs.h>
#include <uapi/linux/module.h>
#include "internal.h"
@@ -1210,15 +1211,19 @@ static void *module_memory_alloc(unsigned int size, enum mod_mem_type type)
return module_alloc(size);
}
-static void module_memory_free(void *ptr, enum mod_mem_type type)
+static void module_memory_free(void *ptr, enum mod_mem_type type,
+ bool unload_codetags)
{
+ if (!unload_codetags && mod_mem_type_is_core_data(type))
+ return;
+
if (mod_mem_use_vmalloc(type))
vfree(ptr);
else
module_memfree(ptr);
}
-static void free_mod_mem(struct module *mod)
+static void free_mod_mem(struct module *mod, bool unload_codetags)
{
for_each_mod_mem_type(type) {
struct module_memory *mod_mem = &mod->mem[type];
@@ -1229,19 +1234,27 @@ static void free_mod_mem(struct module *mod)
/* Free lock-classes; relies on the preceding sync_rcu(). */
lockdep_free_key_range(mod_mem->base, mod_mem->size);
if (mod_mem->size)
- module_memory_free(mod_mem->base, type);
+ module_memory_free(mod_mem->base, type,
+ unload_codetags);
}
/* MOD_DATA hosts mod, so free it at last */
lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size);
- module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA);
+ module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA, unload_codetags);
}
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
+ bool unload_codetags;
+
trace_module_free(mod);
+ unload_codetags = codetag_unload_module(mod);
+ if (!unload_codetags)
+ pr_warn("%s: memory allocation(s) from the module still alive, cannot unload cleanly\n",
+ mod->name);
+
mod_sysfs_teardown(mod);
/*
@@ -1283,7 +1296,7 @@ static void free_module(struct module *mod)
kfree(mod->args);
percpu_modfree(mod);
- free_mod_mem(mod);
+ free_mod_mem(mod, unload_codetags);
}
void *__symbol_get(const char *symbol)
@@ -2296,7 +2309,7 @@ static int move_module(struct module *mod, struct load_info *info)
return 0;
out_enomem:
for (t--; t >= 0; t--)
- module_memory_free(mod->mem[t].base, t);
+ module_memory_free(mod->mem[t].base, t, true);
return ret;
}
@@ -2426,7 +2439,7 @@ static void module_deallocate(struct module *mod, struct load_info *info)
percpu_modfree(mod);
module_arch_freeing_init(mod);
- free_mod_mem(mod);
+ free_mod_mem(mod, true);
}
int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -2995,6 +3008,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
/* Get rid of temporary copy. */
free_copy(info, flags);
+ codetag_load_module(mod);
+
/* Done! */
trace_module_load(mod);
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index 23c125c2e2436c..1d5eadd9dd61cd 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -198,7 +198,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PG_private);
VMCOREINFO_NUMBER(PG_swapcache);
VMCOREINFO_NUMBER(PG_swapbacked);
- VMCOREINFO_NUMBER(PG_slab);
+#define PAGE_SLAB_MAPCOUNT_VALUE (~PG_slab)
+ VMCOREINFO_NUMBER(PAGE_SLAB_MAPCOUNT_VALUE);
#ifdef CONFIG_MEMORY_FAILURE
VMCOREINFO_NUMBER(PG_hwpoison);
#endif
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 291185f54ee4c2..ca33eaf04398b8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -969,6 +969,37 @@ config DEBUG_STACKOVERFLOW
If in doubt, say "N".
+config CODE_TAGGING
+ bool
+ select KALLSYMS
+
+config MEM_ALLOC_PROFILING
+ bool "Enable memory allocation profiling"
+ default n
+ depends on PROC_FS
+ depends on !DEBUG_FORCE_WEAK_PER_CPU
+ select CODE_TAGGING
+ select PAGE_EXTENSION
+ select SLAB_OBJ_EXT
+ help
+ Track allocation source code and record total allocation size
+ initiated at that code location. The mechanism can be used to track
+ memory leaks with a low performance and memory impact.
+
+config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
+ bool "Enable memory allocation profiling by default"
+ default y
+ depends on MEM_ALLOC_PROFILING
+
+config MEM_ALLOC_PROFILING_DEBUG
+ bool "Memory allocation profiler debugging"
+ default n
+ depends on MEM_ALLOC_PROFILING
+ select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
+ help
+ Adds warnings with helpful error messages for memory allocation
+ profiling.
+
source "lib/Kconfig.kasan"
source "lib/Kconfig.kfence"
source "lib/Kconfig.kmsan"
diff --git a/lib/Makefile b/lib/Makefile
index ffc6b2341b45a5..2f4e17bfb2990d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -233,6 +233,9 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \
of-reconfig-notifier-error-inject.o
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
+obj-$(CONFIG_CODE_TAGGING) += codetag.o
+obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o
+
lib-$(CONFIG_GENERIC_BUG) += bug.o
obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
new file mode 100644
index 00000000000000..531dbe2f545635
--- /dev/null
+++ b/lib/alloc_tag.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/alloc_tag.h>
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/page_ext.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_buf.h>
+#include <linux/seq_file.h>
+
+static struct codetag_type *alloc_tag_cttype;
+
+DEFINE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
+EXPORT_SYMBOL(_shared_alloc_tag);
+
+DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
+ mem_alloc_profiling_key);
+
+static void *allocinfo_start(struct seq_file *m, loff_t *pos)
+{
+ struct codetag_iterator *iter;
+ struct codetag *ct;
+ loff_t node = *pos;
+
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ m->private = iter;
+ if (!iter)
+ return NULL;
+
+ codetag_lock_module_list(alloc_tag_cttype, true);
+ *iter = codetag_get_ct_iter(alloc_tag_cttype);
+ while ((ct = codetag_next_ct(iter)) != NULL && node)
+ node--;
+
+ return ct ? iter : NULL;
+}
+
+static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+ struct codetag_iterator *iter = (struct codetag_iterator *)arg;
+ struct codetag *ct = codetag_next_ct(iter);
+
+ (*pos)++;
+ if (!ct)
+ return NULL;
+
+ return iter;
+}
+
+static void allocinfo_stop(struct seq_file *m, void *arg)
+{
+ struct codetag_iterator *iter = (struct codetag_iterator *)m->private;
+
+ if (iter) {
+ codetag_lock_module_list(alloc_tag_cttype, false);
+ kfree(iter);
+ }
+}
+
+static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
+{
+ struct alloc_tag *tag = ct_to_alloc_tag(ct);
+ struct alloc_tag_counters counter = alloc_tag_read(tag);
+ s64 bytes = counter.bytes;
+
+ seq_buf_printf(out, "%12lli %8llu ", bytes, counter.calls);
+ codetag_to_text(out, ct);
+ seq_buf_putc(out, ' ');
+ seq_buf_putc(out, '\n');
+}
+
+static int allocinfo_show(struct seq_file *m, void *arg)
+{
+ struct codetag_iterator *iter = (struct codetag_iterator *)arg;
+ char *bufp;
+ size_t n = seq_get_buf(m, &bufp);
+ struct seq_buf buf;
+
+ seq_buf_init(&buf, bufp, n);
+ alloc_tag_to_text(&buf, iter->ct);
+ seq_commit(m, seq_buf_used(&buf));
+ return 0;
+}
+
+static const struct seq_operations allocinfo_seq_op = {
+ .start = allocinfo_start,
+ .next = allocinfo_next,
+ .stop = allocinfo_stop,
+ .show = allocinfo_show,
+};
+
+size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sleep)
+{
+ struct codetag_iterator iter;
+ struct codetag *ct;
+ struct codetag_bytes n;
+ unsigned int i, nr = 0;
+
+ if (can_sleep)
+ codetag_lock_module_list(alloc_tag_cttype, true);
+ else if (!codetag_trylock_module_list(alloc_tag_cttype))
+ return 0;
+
+ iter = codetag_get_ct_iter(alloc_tag_cttype);
+ while ((ct = codetag_next_ct(&iter))) {
+ struct alloc_tag_counters counter = alloc_tag_read(ct_to_alloc_tag(ct));
+
+ n.ct = ct;
+ n.bytes = counter.bytes;
+
+ for (i = 0; i < nr; i++)
+ if (n.bytes > tags[i].bytes)
+ break;
+
+ if (i < count) {
+ nr -= nr == count;
+ memmove(&tags[i + 1],
+ &tags[i],
+ sizeof(tags[0]) * (nr - i));
+ nr++;
+ tags[i] = n;
+ }
+ }
+
+ codetag_lock_module_list(alloc_tag_cttype, false);
+
+ return nr;
+}
+
+static void __init procfs_init(void)
+{
+ proc_create_seq("allocinfo", 0400, NULL, &allocinfo_seq_op);
+}
+
+static bool alloc_tag_module_unload(struct codetag_type *cttype,
+ struct codetag_module *cmod)
+{
+ struct codetag_iterator iter = codetag_get_ct_iter(cttype);
+ struct alloc_tag_counters counter;
+ bool module_unused = true;
+ struct alloc_tag *tag;
+ struct codetag *ct;
+
+ for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) {
+ if (iter.cmod != cmod)
+ continue;
+
+ tag = ct_to_alloc_tag(ct);
+ counter = alloc_tag_read(tag);
+
+ if (WARN(counter.bytes,
+ "%s:%u module %s func:%s has %llu allocated at module unload",
+ ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes))
+ module_unused = false;
+ }
+
+ return module_unused;
+}
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
+static bool mem_profiling_support __meminitdata = true;
+#else
+static bool mem_profiling_support __meminitdata;
+#endif
+
+static int __init setup_early_mem_profiling(char *str)
+{
+ bool enable;
+
+ if (!str || !str[0])
+ return -EINVAL;
+
+ if (!strncmp(str, "never", 5)) {
+ enable = false;
+ mem_profiling_support = false;
+ } else {
+ int res;
+
+ res = kstrtobool(str, &enable);
+ if (res)
+ return res;
+
+ mem_profiling_support = true;
+ }
+
+ if (enable != static_key_enabled(&mem_alloc_profiling_key)) {
+ if (enable)
+ static_branch_enable(&mem_alloc_profiling_key);
+ else
+ static_branch_disable(&mem_alloc_profiling_key);
+ }
+
+ return 0;
+}
+early_param("sysctl.vm.mem_profiling", setup_early_mem_profiling);
+
+static __init bool need_page_alloc_tagging(void)
+{
+ return mem_profiling_support;
+}
+
+static __init void init_page_alloc_tagging(void)
+{
+}
+
+struct page_ext_operations page_alloc_tagging_ops = {
+ .size = sizeof(union codetag_ref),
+ .need = need_page_alloc_tagging,
+ .init = init_page_alloc_tagging,
+};
+EXPORT_SYMBOL(page_alloc_tagging_ops);
+
+static struct ctl_table memory_allocation_profiling_sysctls[] = {
+ {
+ .procname = "mem_profiling",
+ .data = &mem_alloc_profiling_key,
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+ .mode = 0444,
+#else
+ .mode = 0644,
+#endif
+ .proc_handler = proc_do_static_key,
+ },
+ { }
+};
+
+static int __init alloc_tag_init(void)
+{
+ const struct codetag_type_desc desc = {
+ .section = "alloc_tags",
+ .tag_size = sizeof(struct alloc_tag),
+ .module_unload = alloc_tag_module_unload,
+ };
+
+ alloc_tag_cttype = codetag_register_type(&desc);
+ if (IS_ERR(alloc_tag_cttype))
+ return PTR_ERR(alloc_tag_cttype);
+
+ if (!mem_profiling_support)
+ memory_allocation_profiling_sysctls[0].mode = 0444;
+ register_sysctl_init("vm", memory_allocation_profiling_sysctls);
+ procfs_init();
+
+ return 0;
+}
+module_init(alloc_tag_init);
diff --git a/lib/codetag.c b/lib/codetag.c
new file mode 100644
index 00000000000000..5ace625f2328f2
--- /dev/null
+++ b/lib/codetag.c
@@ -0,0 +1,283 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/codetag.h>
+#include <linux/idr.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/seq_buf.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+struct codetag_type {
+ struct list_head link;
+ unsigned int count;
+ struct idr mod_idr;
+ struct rw_semaphore mod_lock; /* protects mod_idr */
+ struct codetag_type_desc desc;
+};
+
+struct codetag_range {
+ struct codetag *start;
+ struct codetag *stop;
+};
+
+struct codetag_module {
+ struct module *mod;
+ struct codetag_range range;
+};
+
+static DEFINE_MUTEX(codetag_lock);
+static LIST_HEAD(codetag_types);
+
+void codetag_lock_module_list(struct codetag_type *cttype, bool lock)
+{
+ if (lock)
+ down_read(&cttype->mod_lock);
+ else
+ up_read(&cttype->mod_lock);
+}
+
+bool codetag_trylock_module_list(struct codetag_type *cttype)
+{
+ return down_read_trylock(&cttype->mod_lock) != 0;
+}
+
+struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype)
+{
+ struct codetag_iterator iter = {
+ .cttype = cttype,
+ .cmod = NULL,
+ .mod_id = 0,
+ .ct = NULL,
+ };
+
+ return iter;
+}
+
+static inline struct codetag *get_first_module_ct(struct codetag_module *cmod)
+{
+ return cmod->range.start < cmod->range.stop ? cmod->range.start : NULL;
+}
+
+static inline
+struct codetag *get_next_module_ct(struct codetag_iterator *iter)
+{
+ struct codetag *res = (struct codetag *)
+ ((char *)iter->ct + iter->cttype->desc.tag_size);
+
+ return res < iter->cmod->range.stop ? res : NULL;
+}
+
+struct codetag *codetag_next_ct(struct codetag_iterator *iter)
+{
+ struct codetag_type *cttype = iter->cttype;
+ struct codetag_module *cmod;
+ struct codetag *ct;
+
+ lockdep_assert_held(&cttype->mod_lock);
+
+ if (unlikely(idr_is_empty(&cttype->mod_idr)))
+ return NULL;
+
+ ct = NULL;
+ while (true) {
+ cmod = idr_find(&cttype->mod_idr, iter->mod_id);
+
+ /* If module was removed move to the next one */
+ if (!cmod)
+ cmod = idr_get_next_ul(&cttype->mod_idr,
+ &iter->mod_id);
+
+ /* Exit if no more modules */
+ if (!cmod)
+ break;
+
+ if (cmod != iter->cmod) {
+ iter->cmod = cmod;
+ ct = get_first_module_ct(cmod);
+ } else
+ ct = get_next_module_ct(iter);
+
+ if (ct)
+ break;
+
+ iter->mod_id++;
+ }
+
+ iter->ct = ct;
+ return ct;
+}
+
+void codetag_to_text(struct seq_buf *out, struct codetag *ct)
+{
+ if (ct->modname)
+ seq_buf_printf(out, "%s:%u [%s] func:%s",
+ ct->filename, ct->lineno,
+ ct->modname, ct->function);
+ else
+ seq_buf_printf(out, "%s:%u func:%s",
+ ct->filename, ct->lineno, ct->function);
+}
+
+static inline size_t range_size(const struct codetag_type *cttype,
+ const struct codetag_range *range)
+{
+ return ((char *)range->stop - (char *)range->start) /
+ cttype->desc.tag_size;
+}
+
+#ifdef CONFIG_MODULES
+static void *get_symbol(struct module *mod, const char *prefix, const char *name)
+{
+ DECLARE_SEQ_BUF(sb, KSYM_NAME_LEN);
+ const char *buf;
+ void *ret;
+
+ seq_buf_printf(&sb, "%s%s", prefix, name);
+ if (seq_buf_has_overflowed(&sb))
+ return NULL;
+
+ buf = seq_buf_str(&sb);
+ preempt_disable();
+ ret = mod ?
+ (void *)find_kallsyms_symbol_value(mod, buf) :
+ (void *)kallsyms_lookup_name(buf);
+ preempt_enable();
+
+ return ret;
+}
+
+static struct codetag_range get_section_range(struct module *mod,
+ const char *section)
+{
+ return (struct codetag_range) {
+ get_symbol(mod, "__start_", section),
+ get_symbol(mod, "__stop_", section),
+ };
+}
+
+static int codetag_module_init(struct codetag_type *cttype, struct module *mod)
+{
+ struct codetag_range range;
+ struct codetag_module *cmod;
+ int err;
+
+ range = get_section_range(mod, cttype->desc.section);
+ if (!range.start || !range.stop) {
+ pr_warn("Failed to load code tags of type %s from the module %s\n",
+ cttype->desc.section,
+ mod ? mod->name : "(built-in)");
+ return -EINVAL;
+ }
+
+ /* Ignore empty ranges */
+ if (range.start == range.stop)
+ return 0;
+
+ BUG_ON(range.start > range.stop);
+
+ cmod = kmalloc(sizeof(*cmod), GFP_KERNEL);
+ if (unlikely(!cmod))
+ return -ENOMEM;
+
+ cmod->mod = mod;
+ cmod->range = range;
+
+ down_write(&cttype->mod_lock);
+ err = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
+ if (err >= 0) {
+ cttype->count += range_size(cttype, &range);
+ if (cttype->desc.module_load)
+ cttype->desc.module_load(cttype, cmod);
+ }
+ up_write(&cttype->mod_lock);
+
+ if (err < 0) {
+ kfree(cmod);
+ return err;
+ }
+
+ return 0;
+}
+
+void codetag_load_module(struct module *mod)
+{
+ struct codetag_type *cttype;
+
+ if (!mod)
+ return;
+
+ mutex_lock(&codetag_lock);
+ list_for_each_entry(cttype, &codetag_types, link)
+ codetag_module_init(cttype, mod);
+ mutex_unlock(&codetag_lock);
+}
+
+bool codetag_unload_module(struct module *mod)
+{
+ struct codetag_type *cttype;
+ bool unload_ok = true;
+
+ if (!mod)
+ return true;
+
+ mutex_lock(&codetag_lock);
+ list_for_each_entry(cttype, &codetag_types, link) {
+ struct codetag_module *found = NULL;
+ struct codetag_module *cmod;
+ unsigned long mod_id, tmp;
+
+ down_write(&cttype->mod_lock);
+ idr_for_each_entry_ul(&cttype->mod_idr, cmod, tmp, mod_id) {
+ if (cmod->mod && cmod->mod == mod) {
+ found = cmod;
+ break;
+ }
+ }
+ if (found) {
+ if (cttype->desc.module_unload)
+ if (!cttype->desc.module_unload(cttype, cmod))
+ unload_ok = false;
+
+ cttype->count -= range_size(cttype, &cmod->range);
+ idr_remove(&cttype->mod_idr, mod_id);
+ kfree(cmod);
+ }
+ up_write(&cttype->mod_lock);
+ }
+ mutex_unlock(&codetag_lock);
+
+ return unload_ok;
+}
+
+#else /* CONFIG_MODULES */
+static int codetag_module_init(struct codetag_type *cttype, struct module *mod) { return 0; }
+#endif /* CONFIG_MODULES */
+
+struct codetag_type *
+codetag_register_type(const struct codetag_type_desc *desc)
+{
+ struct codetag_type *cttype;
+ int err;
+
+ BUG_ON(desc->tag_size <= 0);
+
+ cttype = kzalloc(sizeof(*cttype), GFP_KERNEL);
+ if (unlikely(!cttype))
+ return ERR_PTR(-ENOMEM);
+
+ cttype->desc = *desc;
+ idr_init(&cttype->mod_idr);
+ init_rwsem(&cttype->mod_lock);
+
+ err = codetag_module_init(cttype, NULL);
+ if (unlikely(err)) {
+ kfree(cttype);
+ return ERR_PTR(err);
+ }
+
+ mutex_lock(&codetag_lock);
+ list_add_tail(&cttype->link, &codetag_types);
+ mutex_unlock(&codetag_lock);
+
+ return cttype;
+}
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 6ae2ba8e06a213..dbbed19f8fff99 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -130,7 +130,8 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht,
if (ntbl)
return ntbl;
- ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);
+ ntbl = alloc_hooks_tag(ht->alloc_tag,
+ kmalloc_noprof(PAGE_SIZE, GFP_ATOMIC|__GFP_ZERO));
if (ntbl && leaf) {
for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++)
@@ -157,7 +158,8 @@ static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
size = sizeof(*tbl) + sizeof(tbl->buckets[0]);
- tbl = kzalloc(size, gfp);
+ tbl = alloc_hooks_tag(ht->alloc_tag,
+ kmalloc_noprof(size, gfp|__GFP_ZERO));
if (!tbl)
return NULL;
@@ -181,7 +183,9 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
int i;
static struct lock_class_key __key;
- tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp);
+ tbl = alloc_hooks_tag(ht->alloc_tag,
+ kvmalloc_node_noprof(struct_size(tbl, buckets, nbuckets),
+ gfp|__GFP_ZERO, NUMA_NO_NODE));
size = nbuckets;
@@ -1016,7 +1020,7 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
* .obj_hashfn = my_hash_fn,
* };
*/
-int rhashtable_init(struct rhashtable *ht,
+int rhashtable_init_noprof(struct rhashtable *ht,
const struct rhashtable_params *params)
{
struct bucket_table *tbl;
@@ -1031,6 +1035,8 @@ int rhashtable_init(struct rhashtable *ht,
spin_lock_init(&ht->lock);
memcpy(&ht->p, params, sizeof(*params));
+ alloc_tag_record(ht->alloc_tag);
+
if (params->min_size)
ht->p.min_size = roundup_pow_of_two(params->min_size);
@@ -1076,7 +1082,7 @@ int rhashtable_init(struct rhashtable *ht,
return 0;
}
-EXPORT_SYMBOL_GPL(rhashtable_init);
+EXPORT_SYMBOL_GPL(rhashtable_init_noprof);
/**
* rhltable_init - initialize a new hash list table
@@ -1087,15 +1093,15 @@ EXPORT_SYMBOL_GPL(rhashtable_init);
*
* See documentation for rhashtable_init.
*/
-int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
+int rhltable_init_noprof(struct rhltable *hlt, const struct rhashtable_params *params)
{
int err;
- err = rhashtable_init(&hlt->ht, params);
+ err = rhashtable_init_noprof(&hlt->ht, params);
hlt->ht.rhlist = true;
return err;
}
-EXPORT_SYMBOL_GPL(rhltable_init);
+EXPORT_SYMBOL_GPL(rhltable_init_noprof);
static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
void (*free_fn)(void *ptr, void *arg),
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 717dcb83012733..b823ba7cb6a156 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -1226,8 +1226,8 @@ static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk)
unsigned long *src_pfns;
unsigned long *dst_pfns;
- src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
- dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
+ src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL);
+ dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL);
migrate_device_range(src_pfns, start_pfn, npages);
for (i = 0; i < npages; i++) {
@@ -1250,8 +1250,8 @@ static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk)
}
migrate_device_pages(src_pfns, dst_pfns, npages);
migrate_device_finalize(src_pfns, dst_pfns, npages);
- kfree(src_pfns);
- kfree(dst_pfns);
+ kvfree(src_pfns);
+ kvfree(dst_pfns);
}
/* Removes free pages from the free list so they can't be re-allocated */
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index 5ab35190aae37d..0aea6a85099d53 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -1989,6 +1989,97 @@ static noinline void check_get_order(struct xarray *xa)
}
}
+static noinline void check_xas_get_order(struct xarray *xa)
+{
+ XA_STATE(xas, xa, 0);
+
+ unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1;
+ unsigned int order;
+ unsigned long i, j;
+
+ for (order = 0; order < max_order; order++) {
+ for (i = 0; i < 10; i++) {
+ xas_set_order(&xas, i << order, order);
+ do {
+ xas_lock(&xas);
+ xas_store(&xas, xa_mk_value(i));
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+
+ for (j = i << order; j < (i + 1) << order; j++) {
+ xas_set_order(&xas, j, 0);
+ rcu_read_lock();
+ xas_load(&xas);
+ XA_BUG_ON(xa, xas_get_order(&xas) != order);
+ rcu_read_unlock();
+ }
+
+ xas_lock(&xas);
+ xas_set_order(&xas, i << order, order);
+ xas_store(&xas, NULL);
+ xas_unlock(&xas);
+ }
+ }
+}
+
+static noinline void check_xas_conflict_get_order(struct xarray *xa)
+{
+ XA_STATE(xas, xa, 0);
+
+ void *entry;
+ int only_once;
+ unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1;
+ unsigned int order;
+ unsigned long i, j, k;
+
+ for (order = 0; order < max_order; order++) {
+ for (i = 0; i < 10; i++) {
+ xas_set_order(&xas, i << order, order);
+ do {
+ xas_lock(&xas);
+ xas_store(&xas, xa_mk_value(i));
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+
+ /*
+ * Ensure xas_get_order works with xas_for_each_conflict.
+ */
+ j = i << order;
+ for (k = 0; k < order; k++) {
+ only_once = 0;
+ xas_set_order(&xas, j + (1 << k), k);
+ xas_lock(&xas);
+ xas_for_each_conflict(&xas, entry) {
+ XA_BUG_ON(xa, entry != xa_mk_value(i));
+ XA_BUG_ON(xa, xas_get_order(&xas) != order);
+ only_once++;
+ }
+ XA_BUG_ON(xa, only_once != 1);
+ xas_unlock(&xas);
+ }
+
+ if (order < max_order - 1) {
+ only_once = 0;
+ xas_set_order(&xas, (i & ~1UL) << order, order + 1);
+ xas_lock(&xas);
+ xas_for_each_conflict(&xas, entry) {
+ XA_BUG_ON(xa, entry != xa_mk_value(i));
+ XA_BUG_ON(xa, xas_get_order(&xas) != order);
+ only_once++;
+ }
+ XA_BUG_ON(xa, only_once != 1);
+ xas_unlock(&xas);
+ }
+
+ xas_set_order(&xas, i << order, order);
+ xas_lock(&xas);
+ xas_store(&xas, NULL);
+ xas_unlock(&xas);
+ }
+ }
+}
+
+
static noinline void check_destroy(struct xarray *xa)
{
unsigned long index;
@@ -2040,6 +2131,8 @@ static int xarray_checks(void)
check_multi_store(&array);
check_multi_store_advanced(&array);
check_get_order(&array);
+ check_xas_get_order(&array);
+ check_xas_conflict_get_order(&array);
check_xa_alloc();
check_find(&array);
check_find_entry(&array);
diff --git a/lib/xarray.c b/lib/xarray.c
index 39f07bfc4dccac..da79128ad754fc 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1750,39 +1750,52 @@ unlock:
EXPORT_SYMBOL(xa_store_range);
/**
- * xa_get_order() - Get the order of an entry.
- * @xa: XArray.
- * @index: Index of the entry.
+ * xas_get_order() - Get the order of an entry.
+ * @xas: XArray operation state.
+ *
+ * Called after xas_load, the xas should not be in an error state.
*
* Return: A number between 0 and 63 indicating the order of the entry.
*/
-int xa_get_order(struct xarray *xa, unsigned long index)
+int xas_get_order(struct xa_state *xas)
{
- XA_STATE(xas, xa, index);
- void *entry;
int order = 0;
- rcu_read_lock();
- entry = xas_load(&xas);
-
- if (!entry)
- goto unlock;
-
- if (!xas.xa_node)
- goto unlock;
+ if (!xas->xa_node)
+ return 0;
for (;;) {
- unsigned int slot = xas.xa_offset + (1 << order);
+ unsigned int slot = xas->xa_offset + (1 << order);
if (slot >= XA_CHUNK_SIZE)
break;
- if (!xa_is_sibling(xas.xa_node->slots[slot]))
+ if (!xa_is_sibling(xa_entry(xas->xa, xas->xa_node, slot)))
break;
order++;
}
- order += xas.xa_node->shift;
-unlock:
+ order += xas->xa_node->shift;
+ return order;
+}
+EXPORT_SYMBOL_GPL(xas_get_order);
+
+/**
+ * xa_get_order() - Get the order of an entry.
+ * @xa: XArray.
+ * @index: Index of the entry.
+ *
+ * Return: A number between 0 and 63 indicating the order of the entry.
+ */
+int xa_get_order(struct xarray *xa, unsigned long index)
+{
+ XA_STATE(xas, xa, index);
+ int order = 0;
+ void *entry;
+
+ rcu_read_lock();
+ entry = xas_load(&xas);
+ if (entry)
+ order = xas_get_order(&xas);
rcu_read_unlock();
return order;
diff --git a/mm/Kconfig b/mm/Kconfig
index b1448aa81e15f4..50df323eaecef0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -473,7 +473,7 @@ config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
config HAVE_MEMBLOCK_PHYS_MAP
bool
-config HAVE_FAST_GUP
+config HAVE_GUP_FAST
depends on MMU
bool
@@ -850,6 +850,12 @@ config READ_ONLY_THP_FOR_FS
endif # TRANSPARENT_HUGEPAGE
#
+# The architecture supports pgtable leaves that is larger than PAGE_SIZE
+#
+config PGTABLE_HAS_HUGE_LEAVES
+ def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE
+
+#
# UP and nommu archs use km based percpu allocator
#
config NEED_PER_CPU_KM
diff --git a/mm/Makefile b/mm/Makefile
index 4abb40b911ec43..e164eed35d467d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,6 +5,7 @@
KASAN_SANITIZE_slab_common.o := n
KASAN_SANITIZE_slub.o := n
+KASAN_SANITIZE_kmemleak.o := n
KCSAN_SANITIZE_kmemleak.o := n
# These produce frequent data race reports: most of them are due to races on
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 5f2be8c8df11f1..5fa3666356f957 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -388,7 +388,7 @@ static void wb_update_bandwidth_workfn(struct work_struct *work)
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
gfp_t gfp)
{
- int i, err;
+ int err;
memset(wb, 0, sizeof(*wb));
@@ -416,18 +416,10 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
if (err)
return err;
- for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
- err = percpu_counter_init(&wb->stat[i], 0, gfp);
- if (err)
- goto out_destroy_stat;
- }
-
- return 0;
+ err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS);
+ if (err)
+ fprop_local_destroy_percpu(&wb->completions);
-out_destroy_stat:
- while (i--)
- percpu_counter_destroy(&wb->stat[i]);
- fprop_local_destroy_percpu(&wb->completions);
return err;
}
@@ -460,13 +452,8 @@ static void wb_shutdown(struct bdi_writeback *wb)
static void wb_exit(struct bdi_writeback *wb)
{
- int i;
-
WARN_ON(delayed_work_pending(&wb->dwork));
-
- for (i = 0; i < NR_WB_STAT_ITEMS; i++)
- percpu_counter_destroy(&wb->stat[i]);
-
+ percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS);
fprop_local_destroy_percpu(&wb->completions);
}
diff --git a/mm/cma.c b/mm/cma.c
index 01f5a8f71ddfa7..3e9724716bad87 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -182,10 +182,6 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
if (!size || !memblock_is_region_reserved(base, size))
return -EINVAL;
- /* alignment should be aligned with order_per_bit */
- if (!IS_ALIGNED(CMA_MIN_ALIGNMENT_PAGES, 1 << order_per_bit))
- return -EINVAL;
-
/* ensure minimal alignment required by mm core */
if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES))
return -EINVAL;
diff --git a/mm/compaction.c b/mm/compaction.c
index 807b58e6eb68b3..e731d45befc780 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1851,7 +1851,7 @@ static void isolate_freepages(struct compact_control *cc)
* This is a migrate-callback that "allocates" freepages by taking pages
* from the isolated freelists in the block we are migrating to.
*/
-static struct folio *compaction_alloc(struct folio *src, unsigned long data)
+static struct folio *compaction_alloc_noprof(struct folio *src, unsigned long data)
{
struct compact_control *cc = (struct compact_control *)data;
struct folio *dst;
@@ -1898,6 +1898,11 @@ again:
return page_rmappable_folio(&dst->page);
}
+static struct folio *compaction_alloc(struct folio *src, unsigned long data)
+{
+ return alloc_hooks(compaction_alloc_noprof(src, data));
+}
+
/*
* This is a migrate-callback that "frees" freepages back to the isolated
* freelist. All pages on the freelist are from the same zone, so there is no
@@ -3345,7 +3350,6 @@ static struct ctl_table vm_compaction[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- { }
};
static int __init kcompactd_init(void)
diff --git a/mm/debug.c b/mm/debug.c
index c1c1a6a484e4c0..b71186f1fb0b1b 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -55,18 +55,14 @@ static void __dump_folio(struct folio *folio, struct page *page,
unsigned long pfn, unsigned long idx)
{
struct address_space *mapping = folio_mapping(folio);
- int mapcount = 0;
+ int mapcount = atomic_read(&page->_mapcount) + 1;
char *type = "";
- /*
- * page->_mapcount space in struct page is used by slab pages to
- * encode own info, and we must avoid calling page_folio() again.
- */
- if (!folio_test_slab(folio)) {
- mapcount = atomic_read(&page->_mapcount) + 1;
- if (folio_test_large(folio))
- mapcount += folio_entire_mapcount(folio);
- }
+ /* Open-code page_mapcount() to avoid looking up a stale folio */
+ if (mapcount < 0)
+ mapcount = 0;
+ if (folio_test_large(folio))
+ mapcount += folio_entire_mapcount(folio);
pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
folio_ref_count(folio), mapcount, mapping,
@@ -99,7 +95,8 @@ static void __dump_folio(struct folio *folio, struct page *page,
*/
pr_warn("%sflags: %pGp%s\n", type, &folio->flags,
is_migrate_cma_folio(folio, pfn) ? " CMA" : "");
- pr_warn("page_type: %pGt\n", &folio->page.page_type);
+ if (page_has_type(&folio->page))
+ pr_warn("page_type: %pGt\n", &folio->page.page_type);
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
@@ -180,9 +177,6 @@ EXPORT_SYMBOL(dump_vma);
void dump_mm(const struct mm_struct *mm)
{
pr_emerg("mm %px task_size %lu\n"
-#ifdef CONFIG_MMU
- "get_unmapped_area %px\n"
-#endif
"mmap_base %lu mmap_legacy_base %lu\n"
"pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
@@ -208,9 +202,6 @@ void dump_mm(const struct mm_struct *mm)
"def_flags: %#lx(%pGv)\n",
mm, mm->task_size,
-#ifdef CONFIG_MMU
- mm->get_unmapped_area,
-#endif
mm->mmap_base, mm->mmap_legacy_base,
mm->pgd, atomic_read(&mm->mm_users),
atomic_read(&mm->mm_count),
diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c
index 6755f0c9d4a398..d46acf989dde88 100644
--- a/mm/debug_page_alloc.c
+++ b/mm/debug_page_alloc.c
@@ -32,8 +32,7 @@ static int __init debug_guardpage_minorder_setup(char *buf)
}
early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
-bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
- int migratetype)
+bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order)
{
if (order >= debug_guardpage_minorder())
return false;
@@ -41,19 +40,12 @@ bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
__SetPageGuard(page);
INIT_LIST_HEAD(&page->buddy_list);
set_page_private(page, order);
- /* Guard pages are not available for any usage */
- if (!is_migrate_isolate(migratetype))
- __mod_zone_freepage_state(zone, -(1 << order), migratetype);
return true;
}
-void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order,
- int migratetype)
+void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order)
{
__ClearPageGuard(page);
-
set_page_private(page, 0);
- if (!is_migrate_isolate(migratetype))
- __mod_zone_freepage_state(zone, (1 << order), migratetype);
}
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 65c19025da3dfe..f1c9a2c5abc04c 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -30,6 +30,7 @@
#include <linux/start_kernel.h>
#include <linux/sched/mm.h>
#include <linux/io.h>
+#include <linux/vmalloc.h>
#include <asm/cacheflush.h>
#include <asm/pgalloc.h>
diff --git a/mm/filemap.c b/mm/filemap.c
index 30de18c4fd28a9..18f4bce3741ec3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -852,23 +852,18 @@ noinline int __filemap_add_folio(struct address_space *mapping,
struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, index);
- bool huge = folio_test_hugetlb(folio);
- bool charged = false;
- long nr = 1;
+ void *alloced_shadow = NULL;
+ int alloced_order = 0;
+ bool huge;
+ long nr;
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
mapping_set_update(&xas, mapping);
- if (!huge) {
- int error = mem_cgroup_charge(folio, NULL, gfp);
- if (error)
- return error;
- charged = true;
- }
-
VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
xas_set_order(&xas, index, folio_order(folio));
+ huge = folio_test_hugetlb(folio);
nr = folio_nr_pages(folio);
gfp &= GFP_RECLAIM_MASK;
@@ -876,13 +871,10 @@ noinline int __filemap_add_folio(struct address_space *mapping,
folio->mapping = mapping;
folio->index = xas.xa_index;
- do {
- unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+ for (;;) {
+ int order = -1, split_order = 0;
void *entry, *old = NULL;
- if (order > folio_order(folio))
- xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
- order, gfp);
xas_lock_irq(&xas);
xas_for_each_conflict(&xas, entry) {
old = entry;
@@ -890,19 +882,33 @@ noinline int __filemap_add_folio(struct address_space *mapping,
xas_set_err(&xas, -EEXIST);
goto unlock;
}
+ /*
+ * If a larger entry exists,
+ * it will be the first and only entry iterated.
+ */
+ if (order == -1)
+ order = xas_get_order(&xas);
+ }
+
+ /* entry may have changed before we re-acquire the lock */
+ if (alloced_order && (old != alloced_shadow || order != alloced_order)) {
+ xas_destroy(&xas);
+ alloced_order = 0;
}
if (old) {
- if (shadowp)
- *shadowp = old;
- /* entry may have been split before we acquired lock */
- order = xa_get_order(xas.xa, xas.xa_index);
- if (order > folio_order(folio)) {
+ if (order > 0 && order > folio_order(folio)) {
/* How to handle large swap entries? */
BUG_ON(shmem_mapping(mapping));
+ if (!alloced_order) {
+ split_order = order;
+ goto unlock;
+ }
xas_split(&xas, old, order);
xas_reset(&xas);
}
+ if (shadowp)
+ *shadowp = old;
}
xas_store(&xas, folio);
@@ -918,9 +924,24 @@ noinline int __filemap_add_folio(struct address_space *mapping,
__lruvec_stat_mod_folio(folio,
NR_FILE_THPS, nr);
}
+
unlock:
xas_unlock_irq(&xas);
- } while (xas_nomem(&xas, gfp));
+
+ /* split needed, alloc here and retry. */
+ if (split_order) {
+ xas_split_alloc(&xas, old, split_order, gfp);
+ if (xas_error(&xas))
+ goto error;
+ alloced_shadow = old;
+ alloced_order = split_order;
+ xas_reset(&xas);
+ continue;
+ }
+
+ if (!xas_nomem(&xas, gfp))
+ break;
+ }
if (xas_error(&xas))
goto error;
@@ -928,8 +949,6 @@ unlock:
trace_mm_filemap_add_to_page_cache(folio);
return 0;
error:
- if (charged)
- mem_cgroup_uncharge(folio);
folio->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
folio_put_refs(folio, nr);
@@ -943,11 +962,16 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
void *shadow = NULL;
int ret;
+ ret = mem_cgroup_charge(folio, NULL, gfp);
+ if (ret)
+ return ret;
+
__folio_set_locked(folio);
ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
- if (unlikely(ret))
+ if (unlikely(ret)) {
+ mem_cgroup_uncharge(folio);
__folio_clear_locked(folio);
- else {
+ } else {
/*
* The folio might have been evicted from cache only
* recently, in which case it should be activated like
@@ -966,7 +990,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
EXPORT_SYMBOL_GPL(filemap_add_folio);
#ifdef CONFIG_NUMA
-struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
+struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
{
int n;
struct folio *folio;
@@ -981,9 +1005,9 @@ struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
return folio;
}
- return folio_alloc(gfp, order);
+ return folio_alloc_noprof(gfp, order);
}
-EXPORT_SYMBOL(filemap_alloc_folio);
+EXPORT_SYMBOL(filemap_alloc_folio_noprof);
#endif
/*
@@ -1786,7 +1810,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
* C. Return the page to the page allocator
*
* This means that any page may have its reference count temporarily
- * increased by a speculative page cache (or fast GUP) lookup as it can
+ * increased by a speculative page cache (or GUP-fast) lookup as it can
* be allocated by another user before the RCU grace period expires.
* Because the refcount temporarily acquired here may end up being the
* last refcount on the page, any page allocation must be freeable by
@@ -3492,7 +3516,15 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
if (PageHWPoison(page + count))
goto skip;
- (*mmap_miss)++;
+ /*
+ * If there are too many folios that are recently evicted
+ * in a file, they will probably continue to be evicted.
+ * In such situation, read-ahead is only a waste of IO.
+ * Don't decrease mmap_miss in this scenario to make sure
+ * we can stop read-ahead.
+ */
+ if (!folio_test_workingset(folio))
+ (*mmap_miss)++;
/*
* NOTE: If there're PTE markers, we'll leave them to be
@@ -3541,7 +3573,9 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
if (PageHWPoison(page))
return ret;
- (*mmap_miss)++;
+ /* See comment of filemap_map_folio_range() */
+ if (!folio_test_workingset(folio))
+ (*mmap_miss)++;
/*
* NOTE: If there're PTE markers, we'll leave them to be
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 50412014f16f72..f31e0ce65b11d4 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -58,12 +58,6 @@ bool set_page_dirty(struct page *page)
}
EXPORT_SYMBOL(set_page_dirty);
-int __set_page_dirty_nobuffers(struct page *page)
-{
- return filemap_dirty_folio(page_mapping(page), page_folio(page));
-}
-EXPORT_SYMBOL(__set_page_dirty_nobuffers);
-
bool clear_page_dirty_for_io(struct page *page)
{
return folio_clear_dirty_for_io(page_folio(page));
diff --git a/mm/gup.c b/mm/gup.c
index 1611e73b1121b1..8dcbeae714e268 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -440,7 +440,7 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
}
EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
-static void unpin_user_pages_lockless(struct page **pages, unsigned long npages)
+static void gup_fast_unpin_user_pages(struct page **pages, unsigned long npages)
{
unsigned long i;
struct folio *folio;
@@ -500,23 +500,330 @@ static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
}
#ifdef CONFIG_MMU
+
+#if defined(CONFIG_ARCH_HAS_HUGEPD) || defined(CONFIG_HAVE_GUP_FAST)
+static int record_subpages(struct page *page, unsigned long sz,
+ unsigned long addr, unsigned long end,
+ struct page **pages)
+{
+ struct page *start_page;
+ int nr;
+
+ start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT);
+ for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
+ pages[nr] = nth_page(start_page, nr);
+
+ return nr;
+}
+#endif /* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_GUP_FAST */
+
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
+ unsigned long sz)
+{
+ unsigned long __boundary = (addr + sz) & ~(sz-1);
+ return (__boundary - 1 < end - 1) ? __boundary : end;
+}
+
+static int gup_fast_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+ unsigned long end, unsigned int flags, struct page **pages,
+ int *nr)
+{
+ unsigned long pte_end;
+ struct page *page;
+ struct folio *folio;
+ pte_t pte;
+ int refs;
+
+ pte_end = (addr + sz) & ~(sz-1);
+ if (pte_end < end)
+ end = pte_end;
+
+ pte = huge_ptep_get(ptep);
+
+ if (!pte_access_permitted(pte, flags & FOLL_WRITE))
+ return 0;
+
+ /* hugepages are never "special" */
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+ page = pte_page(pte);
+ refs = record_subpages(page, sz, addr, end, pages + *nr);
+
+ folio = try_grab_folio(page, refs, flags);
+ if (!folio)
+ return 0;
+
+ if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
+ if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
+ *nr += refs;
+ folio_set_referenced(folio);
+ return 1;
+}
+
+/*
+ * NOTE: currently GUP for a hugepd is only possible on hugetlbfs file
+ * systems on Power, which does not have issue with folio writeback against
+ * GUP updates. When hugepd will be extended to support non-hugetlbfs or
+ * even anonymous memory, we need to do extra check as what we do with most
+ * of the other folios. See writable_file_mapping_allowed() and
+ * gup_fast_folio_allowed() for more information.
+ */
+static int gup_fast_hugepd(hugepd_t hugepd, unsigned long addr,
+ unsigned int pdshift, unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ pte_t *ptep;
+ unsigned long sz = 1UL << hugepd_shift(hugepd);
+ unsigned long next;
+
+ ptep = hugepte_offset(hugepd, addr, pdshift);
+ do {
+ next = hugepte_addr_end(addr, end, sz);
+ if (!gup_fast_hugepte(ptep, sz, addr, end, flags, pages, nr))
+ return 0;
+ } while (ptep++, addr = next, addr != end);
+
+ return 1;
+}
+
+static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
+ unsigned long addr, unsigned int pdshift,
+ unsigned int flags,
+ struct follow_page_context *ctx)
+{
+ struct page *page;
+ struct hstate *h;
+ spinlock_t *ptl;
+ int nr = 0, ret;
+ pte_t *ptep;
+
+ /* Only hugetlb supports hugepd */
+ if (WARN_ON_ONCE(!is_vm_hugetlb_page(vma)))
+ return ERR_PTR(-EFAULT);
+
+ h = hstate_vma(vma);
+ ptep = hugepte_offset(hugepd, addr, pdshift);
+ ptl = huge_pte_lock(h, vma->vm_mm, ptep);
+ ret = gup_fast_hugepd(hugepd, addr, pdshift, addr + PAGE_SIZE,
+ flags, &page, &nr);
+ spin_unlock(ptl);
+
+ if (ret) {
+ WARN_ON_ONCE(nr != 1);
+ ctx->page_mask = (1U << huge_page_order(h)) - 1;
+ return page;
+ }
+
+ return NULL;
+}
+#else /* CONFIG_ARCH_HAS_HUGEPD */
+static inline int gup_fast_hugepd(hugepd_t hugepd, unsigned long addr,
+ unsigned int pdshift, unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ return 0;
+}
+
+static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
+ unsigned long addr, unsigned int pdshift,
+ unsigned int flags,
+ struct follow_page_context *ctx)
+{
+ return NULL;
+}
+#endif /* CONFIG_ARCH_HAS_HUGEPD */
+
+
static struct page *no_page_table(struct vm_area_struct *vma,
- unsigned int flags)
+ unsigned int flags, unsigned long address)
{
+ if (!(flags & FOLL_DUMP))
+ return NULL;
+
/*
- * When core dumping an enormous anonymous area that nobody
- * has touched so far, we don't want to allocate unnecessary pages or
+ * When core dumping, we don't want to allocate unnecessary pages or
* page tables. Return error instead of NULL to skip handle_mm_fault,
* then get_dump_page() will return NULL to leave a hole in the dump.
* But we can only make this optimization where a hole would surely
* be zero-filled if handle_mm_fault() actually did handle it.
*/
- if ((flags & FOLL_DUMP) &&
- (vma_is_anonymous(vma) || !vma->vm_ops->fault))
+ if (is_vm_hugetlb_page(vma)) {
+ struct hstate *h = hstate_vma(vma);
+
+ if (!hugetlbfs_pagecache_present(h, vma, address))
+ return ERR_PTR(-EFAULT);
+ } else if ((vma_is_anonymous(vma) || !vma->vm_ops->fault)) {
return ERR_PTR(-EFAULT);
+ }
+
return NULL;
}
+#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
+static struct page *follow_huge_pud(struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pudp,
+ int flags, struct follow_page_context *ctx)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page;
+ pud_t pud = *pudp;
+ unsigned long pfn = pud_pfn(pud);
+ int ret;
+
+ assert_spin_locked(pud_lockptr(mm, pudp));
+
+ if ((flags & FOLL_WRITE) && !pud_write(pud))
+ return NULL;
+
+ if (!pud_present(pud))
+ return NULL;
+
+ pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
+
+ if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
+ pud_devmap(pud)) {
+ /*
+ * device mapped pages can only be returned if the caller
+ * will manage the page reference count.
+ *
+ * At least one of FOLL_GET | FOLL_PIN must be set, so
+ * assert that here:
+ */
+ if (!(flags & (FOLL_GET | FOLL_PIN)))
+ return ERR_PTR(-EEXIST);
+
+ if (flags & FOLL_TOUCH)
+ touch_pud(vma, addr, pudp, flags & FOLL_WRITE);
+
+ ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap);
+ if (!ctx->pgmap)
+ return ERR_PTR(-EFAULT);
+ }
+
+ page = pfn_to_page(pfn);
+
+ if (!pud_devmap(pud) && !pud_write(pud) &&
+ gup_must_unshare(vma, flags, page))
+ return ERR_PTR(-EMLINK);
+
+ ret = try_grab_page(page, flags);
+ if (ret)
+ page = ERR_PTR(ret);
+ else
+ ctx->page_mask = HPAGE_PUD_NR - 1;
+
+ return page;
+}
+
+/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
+static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int flags)
+{
+ /* If the pmd is writable, we can write to the page. */
+ if (pmd_write(pmd))
+ return true;
+
+ /* Maybe FOLL_FORCE is set to override it? */
+ if (!(flags & FOLL_FORCE))
+ return false;
+
+ /* But FOLL_FORCE has no effect on shared mappings */
+ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+ return false;
+
+ /* ... or read-only private ones */
+ if (!(vma->vm_flags & VM_MAYWRITE))
+ return false;
+
+ /* ... or already writable ones that just need to take a write fault */
+ if (vma->vm_flags & VM_WRITE)
+ return false;
+
+ /*
+ * See can_change_pte_writable(): we broke COW and could map the page
+ * writable if we have an exclusive anonymous page ...
+ */
+ if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ return false;
+
+ /* ... and a write-fault isn't required for other reasons. */
+ if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
+ return false;
+ return !userfaultfd_huge_pmd_wp(vma, pmd);
+}
+
+static struct page *follow_huge_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd,
+ unsigned int flags,
+ struct follow_page_context *ctx)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t pmdval = *pmd;
+ struct page *page;
+ int ret;
+
+ assert_spin_locked(pmd_lockptr(mm, pmd));
+
+ page = pmd_page(pmdval);
+ if ((flags & FOLL_WRITE) &&
+ !can_follow_write_pmd(pmdval, page, vma, flags))
+ return NULL;
+
+ /* Avoid dumping huge zero page */
+ if ((flags & FOLL_DUMP) && is_huge_zero_pmd(pmdval))
+ return ERR_PTR(-EFAULT);
+
+ if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags))
+ return NULL;
+
+ if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page))
+ return ERR_PTR(-EMLINK);
+
+ VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+ !PageAnonExclusive(page), page);
+
+ ret = try_grab_page(page, flags);
+ if (ret)
+ return ERR_PTR(ret);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(pmdval) && (flags & FOLL_TOUCH))
+ touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+ page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
+ ctx->page_mask = HPAGE_PMD_NR - 1;
+
+ return page;
+}
+
+#else /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
+static struct page *follow_huge_pud(struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pudp,
+ int flags, struct follow_page_context *ctx)
+{
+ return NULL;
+}
+
+static struct page *follow_huge_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd,
+ unsigned int flags,
+ struct follow_page_context *ctx)
+{
+ return NULL;
+}
+#endif /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
+
static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
pte_t *pte, unsigned int flags)
{
@@ -593,7 +900,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!ptep)
- return no_page_table(vma, flags);
+ return no_page_table(vma, flags, address);
pte = ptep_get(ptep);
if (!pte_present(pte))
goto no_page;
@@ -685,7 +992,7 @@ no_page:
pte_unmap_unlock(ptep, ptl);
if (!pte_none(pte))
return NULL;
- return no_page_table(vma, flags);
+ return no_page_table(vma, flags, address);
}
static struct page *follow_pmd_mask(struct vm_area_struct *vma,
@@ -701,42 +1008,45 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
pmd = pmd_offset(pudp, address);
pmdval = pmdp_get_lockless(pmd);
if (pmd_none(pmdval))
- return no_page_table(vma, flags);
+ return no_page_table(vma, flags, address);
if (!pmd_present(pmdval))
- return no_page_table(vma, flags);
+ return no_page_table(vma, flags, address);
+ if (unlikely(is_hugepd(__hugepd(pmd_val(pmdval)))))
+ return follow_hugepd(vma, __hugepd(pmd_val(pmdval)),
+ address, PMD_SHIFT, flags, ctx);
if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
spin_unlock(ptl);
if (page)
return page;
- return no_page_table(vma, flags);
+ return no_page_table(vma, flags, address);
}
- if (likely(!pmd_trans_huge(pmdval)))
+ if (likely(!pmd_leaf(pmdval)))
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags))
- return no_page_table(vma, flags);
+ return no_page_table(vma, flags, address);
ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_present(*pmd))) {
+ pmdval = *pmd;
+ if (unlikely(!pmd_present(pmdval))) {
spin_unlock(ptl);
- return no_page_table(vma, flags);
+ return no_page_table(vma, flags, address);
}
- if (unlikely(!pmd_trans_huge(*pmd))) {
+ if (unlikely(!pmd_leaf(pmdval))) {
spin_unlock(ptl);
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
- if (flags & FOLL_SPLIT_PMD) {
+ if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) {
spin_unlock(ptl);
split_huge_pmd(vma, pmd, address);
/* If pmd was left empty, stuff a page table in there quickly */
return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) :
follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
- page = follow_trans_huge_pmd(vma, address, pmd, flags);
+ page = follow_huge_pmd(vma, address, pmd, flags, ctx);
spin_unlock(ptl);
- ctx->page_mask = HPAGE_PMD_NR - 1;
return page;
}
@@ -745,26 +1055,30 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
unsigned int flags,
struct follow_page_context *ctx)
{
- pud_t *pud;
+ pud_t *pudp, pud;
spinlock_t *ptl;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
- pud = pud_offset(p4dp, address);
- if (pud_none(*pud))
- return no_page_table(vma, flags);
- if (pud_devmap(*pud)) {
- ptl = pud_lock(mm, pud);
- page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
+ pudp = pud_offset(p4dp, address);
+ pud = READ_ONCE(*pudp);
+ if (!pud_present(pud))
+ return no_page_table(vma, flags, address);
+ if (unlikely(is_hugepd(__hugepd(pud_val(pud)))))
+ return follow_hugepd(vma, __hugepd(pud_val(pud)),
+ address, PUD_SHIFT, flags, ctx);
+ if (pud_leaf(pud)) {
+ ptl = pud_lock(mm, pudp);
+ page = follow_huge_pud(vma, address, pudp, flags, ctx);
spin_unlock(ptl);
if (page)
return page;
- return no_page_table(vma, flags);
+ return no_page_table(vma, flags, address);
}
- if (unlikely(pud_bad(*pud)))
- return no_page_table(vma, flags);
+ if (unlikely(pud_bad(pud)))
+ return no_page_table(vma, flags, address);
- return follow_pmd_mask(vma, address, pud, flags, ctx);
+ return follow_pmd_mask(vma, address, pudp, flags, ctx);
}
static struct page *follow_p4d_mask(struct vm_area_struct *vma,
@@ -772,16 +1086,20 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
unsigned int flags,
struct follow_page_context *ctx)
{
- p4d_t *p4d;
+ p4d_t *p4dp, p4d;
- p4d = p4d_offset(pgdp, address);
- if (p4d_none(*p4d))
- return no_page_table(vma, flags);
- BUILD_BUG_ON(p4d_huge(*p4d));
- if (unlikely(p4d_bad(*p4d)))
- return no_page_table(vma, flags);
+ p4dp = p4d_offset(pgdp, address);
+ p4d = READ_ONCE(*p4dp);
+ BUILD_BUG_ON(p4d_leaf(p4d));
+
+ if (unlikely(is_hugepd(__hugepd(p4d_val(p4d)))))
+ return follow_hugepd(vma, __hugepd(p4d_val(p4d)),
+ address, P4D_SHIFT, flags, ctx);
- return follow_pud_mask(vma, address, p4d, flags, ctx);
+ if (!p4d_present(p4d) || p4d_bad(p4d))
+ return no_page_table(vma, flags, address);
+
+ return follow_pud_mask(vma, address, p4dp, flags, ctx);
}
/**
@@ -814,24 +1132,24 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
{
pgd_t *pgd;
struct mm_struct *mm = vma->vm_mm;
+ struct page *page;
- ctx->page_mask = 0;
-
- /*
- * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
- * special hugetlb page table walking code. This eliminates the
- * need to check for hugetlb entries in the general walking code.
- */
- if (is_vm_hugetlb_page(vma))
- return hugetlb_follow_page_mask(vma, address, flags,
- &ctx->page_mask);
+ vma_pgtable_walk_begin(vma);
+ ctx->page_mask = 0;
pgd = pgd_offset(mm, address);
- if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
- return no_page_table(vma, flags);
+ if (unlikely(is_hugepd(__hugepd(pgd_val(*pgd)))))
+ page = follow_hugepd(vma, __hugepd(pgd_val(*pgd)),
+ address, PGDIR_SHIFT, flags, ctx);
+ else if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ page = no_page_table(vma, flags, address);
+ else
+ page = follow_p4d_mask(vma, address, pgd, flags, ctx);
- return follow_p4d_mask(vma, address, pgd, flags, ctx);
+ vma_pgtable_walk_end(vma);
+
+ return page;
}
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
@@ -2144,6 +2462,7 @@ static int migrate_longterm_unpinnable_pages(
struct migration_target_control mtc = {
.nid = NUMA_NO_NODE,
.gfp_mask = GFP_USER | __GFP_NOWARN,
+ .reason = MR_LONGTERM_PIN,
};
if (migrate_pages(movable_page_list, alloc_migration_target,
@@ -2431,7 +2750,7 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
EXPORT_SYMBOL(get_user_pages_unlocked);
/*
- * Fast GUP
+ * GUP-fast
*
* get_user_pages_fast attempts to pin user pages by walking the page
* tables directly and avoids taking locks. Thus the walker needs to be
@@ -2445,7 +2764,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
*
* Another way to achieve this is to batch up page table containing pages
* belonging to more than one mm_user, then rcu_sched a callback to free those
- * pages. Disabling interrupts will allow the fast_gup walker to both block
+ * pages. Disabling interrupts will allow the gup_fast() walker to both block
* the rcu_sched callback, and an IPI that we broadcast for splitting THPs
* (which is a relatively rare event). The code below adopts this strategy.
*
@@ -2463,15 +2782,17 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
*
* This code is based heavily on the PowerPC implementation by Nick Piggin.
*/
-#ifdef CONFIG_HAVE_FAST_GUP
+#ifdef CONFIG_HAVE_GUP_FAST
/*
- * Used in the GUP-fast path to determine whether a pin is permitted for a
- * specific folio.
+ * Used in the GUP-fast path to determine whether GUP is permitted to work on
+ * a specific folio.
*
* This call assumes the caller has pinned the folio, that the lowest page table
* level still points to this folio, and that interrupts have been disabled.
*
+ * GUP-fast must reject all secretmem folios.
+ *
* Writing to pinned file-backed dirty tracked folios is inherently problematic
* (see comment describing the writable_file_mapping_allowed() function). We
* therefore try to avoid the most egregious case of a long-term mapping doing
@@ -2481,25 +2802,34 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* in the fast path, so instead we whitelist known good cases and if in doubt,
* fall back to the slow path.
*/
-static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags)
+static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
{
+ bool reject_file_backed = false;
struct address_space *mapping;
+ bool check_secretmem = false;
unsigned long mapping_flags;
/*
* If we aren't pinning then no problematic write can occur. A long term
* pin is the most egregious case so this is the one we disallow.
*/
- if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) !=
+ if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) ==
(FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE))
- return true;
+ reject_file_backed = true;
+
+ /* We hold a folio reference, so we can safely access folio fields. */
- /* The folio is pinned, so we can safely access folio fields. */
+ /* secretmem folios are always order-0 folios. */
+ if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio))
+ check_secretmem = true;
+
+ if (!reject_file_backed && !check_secretmem)
+ return true;
if (WARN_ON_ONCE(folio_test_slab(folio)))
return false;
- /* hugetlb mappings do not require dirty-tracking. */
+ /* hugetlb neither requires dirty-tracking nor can be secretmem. */
if (folio_test_hugetlb(folio))
return true;
@@ -2535,15 +2865,16 @@ static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags)
/*
* At this point, we know the mapping is non-null and points to an
- * address_space object. The only remaining whitelisted file system is
- * shmem.
+ * address_space object.
*/
- return shmem_mapping(mapping);
+ if (check_secretmem && secretmem_mapping(mapping))
+ return false;
+ /* The only remaining allowed file system is shmem. */
+ return !reject_file_backed || shmem_mapping(mapping);
}
-static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
- unsigned int flags,
- struct page **pages)
+static void __maybe_unused gup_fast_undo_dev_pagemap(int *nr, int nr_start,
+ unsigned int flags, struct page **pages)
{
while ((*nr) - nr_start) {
struct page *page = pages[--(*nr)];
@@ -2558,27 +2889,27 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
/*
- * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * GUP-fast relies on pte change detection to avoid concurrent pgtable
* operations.
*
- * To pin the page, fast-gup needs to do below in order:
+ * To pin the page, GUP-fast needs to do below in order:
* (1) pin the page (by prefetching pte), then (2) check pte not changed.
*
* For the rest of pgtable operations where pgtable updates can be racy
- * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * with GUP-fast, we need to do (1) clear pte, then (2) check whether page
* is pinned.
*
* Above will work for all pte-level operations, including THP split.
*
- * For THP collapse, it's a bit more complicated because fast-gup may be
+ * For THP collapse, it's a bit more complicated because GUP-fast may be
* walking a pgtable page that is being freed (pte is still valid but pmd
* can be cleared already). To avoid race in such condition, we need to
* also check pmd here to make sure pmd doesn't change (corresponds to
* pmdp_collapse_flush() in the THP collapse code path).
*/
-static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
- unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags, struct page **pages,
+ int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -2611,7 +2942,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
if (unlikely(!pgmap)) {
- undo_dev_pagemap(nr, nr_start, flags, pages);
+ gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
goto pte_unmap;
}
} else if (pte_special(pte))
@@ -2624,18 +2955,13 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
if (!folio)
goto pte_unmap;
- if (unlikely(folio_is_secretmem(folio))) {
- gup_put_folio(folio, 1, flags);
- goto pte_unmap;
- }
-
if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
- if (!folio_fast_pin_allowed(folio, flags)) {
+ if (!gup_fast_folio_allowed(folio, flags)) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
@@ -2680,20 +3006,19 @@ pte_unmap:
*
* For a futex to be placed on a THP tail page, get_futex_key requires a
* get_user_pages_fast_only implementation that can pin pages. Thus it's still
- * useful to have gup_huge_pmd even if we can't operate on ptes.
+ * useful to have gup_fast_pmd_leaf even if we can't operate on ptes.
*/
-static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
- unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags, struct page **pages,
+ int *nr)
{
return 0;
}
#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-static int __gup_device_huge(unsigned long pfn, unsigned long addr,
- unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr,
+ unsigned long end, unsigned int flags, struct page **pages, int *nr)
{
int nr_start = *nr;
struct dev_pagemap *pgmap = NULL;
@@ -2703,19 +3028,19 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
pgmap = get_dev_pagemap(pfn, pgmap);
if (unlikely(!pgmap)) {
- undo_dev_pagemap(nr, nr_start, flags, pages);
+ gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
break;
}
if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) {
- undo_dev_pagemap(nr, nr_start, flags, pages);
+ gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
break;
}
SetPageReferenced(page);
pages[*nr] = page;
if (unlikely(try_grab_page(page, flags))) {
- undo_dev_pagemap(nr, nr_start, flags, pages);
+ gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
break;
}
(*nr)++;
@@ -2726,156 +3051,62 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
return addr == end;
}
-static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags, struct page **pages,
+ int *nr)
{
unsigned long fault_pfn;
int nr_start = *nr;
fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
+ if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr))
return 0;
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
- undo_dev_pagemap(nr, nr_start, flags, pages);
+ gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
return 0;
}
return 1;
}
-static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
- unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+static int gup_fast_devmap_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
+ unsigned long end, unsigned int flags, struct page **pages,
+ int *nr)
{
unsigned long fault_pfn;
int nr_start = *nr;
fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
+ if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr))
return 0;
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
- undo_dev_pagemap(nr, nr_start, flags, pages);
+ gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
return 0;
}
return 1;
}
#else
-static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags, struct page **pages,
+ int *nr)
{
BUILD_BUG();
return 0;
}
-static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
- unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+static int gup_fast_devmap_pud_leaf(pud_t pud, pud_t *pudp, unsigned long addr,
+ unsigned long end, unsigned int flags, struct page **pages,
+ int *nr)
{
BUILD_BUG();
return 0;
}
#endif
-static int record_subpages(struct page *page, unsigned long addr,
- unsigned long end, struct page **pages)
-{
- int nr;
-
- for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
- pages[nr] = nth_page(page, nr);
-
- return nr;
-}
-
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
- unsigned long sz)
-{
- unsigned long __boundary = (addr + sz) & ~(sz-1);
- return (__boundary - 1 < end - 1) ? __boundary : end;
-}
-
-static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
- unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
-{
- unsigned long pte_end;
- struct page *page;
- struct folio *folio;
- pte_t pte;
- int refs;
-
- pte_end = (addr + sz) & ~(sz-1);
- if (pte_end < end)
- end = pte_end;
-
- pte = huge_ptep_get(ptep);
-
- if (!pte_access_permitted(pte, flags & FOLL_WRITE))
- return 0;
-
- /* hugepages are never "special" */
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
- page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT);
- refs = record_subpages(page, addr, end, pages + *nr);
-
- folio = try_grab_folio(page, refs, flags);
- if (!folio)
- return 0;
-
- if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- if (!folio_fast_pin_allowed(folio, flags)) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
- gup_put_folio(folio, refs, flags);
- return 0;
- }
-
- *nr += refs;
- folio_set_referenced(folio);
- return 1;
-}
-
-static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
- unsigned int pdshift, unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
-{
- pte_t *ptep;
- unsigned long sz = 1UL << hugepd_shift(hugepd);
- unsigned long next;
-
- ptep = hugepte_offset(hugepd, addr, pdshift);
- do {
- next = hugepte_addr_end(addr, end, sz);
- if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
- return 0;
- } while (ptep++, addr = next, addr != end);
-
- return 1;
-}
-#else
-static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
- unsigned int pdshift, unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
-{
- return 0;
-}
-#endif /* CONFIG_ARCH_HAS_HUGEPD */
-
-static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags, struct page **pages,
+ int *nr)
{
struct page *page;
struct folio *folio;
@@ -2887,12 +3118,12 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
if (pmd_devmap(orig)) {
if (unlikely(flags & FOLL_LONGTERM))
return 0;
- return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
- pages, nr);
+ return gup_fast_devmap_pmd_leaf(orig, pmdp, addr, end, flags,
+ pages, nr);
}
- page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
- refs = record_subpages(page, addr, end, pages + *nr);
+ page = pmd_page(orig);
+ refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr);
folio = try_grab_folio(page, refs, flags);
if (!folio)
@@ -2903,7 +3134,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
- if (!folio_fast_pin_allowed(folio, flags)) {
+ if (!gup_fast_folio_allowed(folio, flags)) {
gup_put_folio(folio, refs, flags);
return 0;
}
@@ -2917,9 +3148,9 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 1;
}
-static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
- unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
+ unsigned long end, unsigned int flags, struct page **pages,
+ int *nr)
{
struct page *page;
struct folio *folio;
@@ -2931,12 +3162,12 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
if (pud_devmap(orig)) {
if (unlikely(flags & FOLL_LONGTERM))
return 0;
- return __gup_device_huge_pud(orig, pudp, addr, end, flags,
- pages, nr);
+ return gup_fast_devmap_pud_leaf(orig, pudp, addr, end, flags,
+ pages, nr);
}
- page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT);
- refs = record_subpages(page, addr, end, pages + *nr);
+ page = pud_page(orig);
+ refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr);
folio = try_grab_folio(page, refs, flags);
if (!folio)
@@ -2947,7 +3178,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
- if (!folio_fast_pin_allowed(folio, flags)) {
+ if (!gup_fast_folio_allowed(folio, flags)) {
gup_put_folio(folio, refs, flags);
return 0;
}
@@ -2962,9 +3193,9 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 1;
}
-static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
- unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr,
+ unsigned long end, unsigned int flags, struct page **pages,
+ int *nr)
{
int refs;
struct page *page;
@@ -2975,8 +3206,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
BUILD_BUG_ON(pgd_devmap(orig));
- page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT);
- refs = record_subpages(page, addr, end, pages + *nr);
+ page = pgd_page(orig);
+ refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr);
folio = try_grab_folio(page, refs, flags);
if (!folio)
@@ -2992,7 +3223,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
}
- if (!folio_fast_pin_allowed(folio, flags)) {
+ if (!gup_fast_folio_allowed(folio, flags)) {
gup_put_folio(folio, refs, flags);
return 0;
}
@@ -3002,8 +3233,9 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 1;
}
-static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
+ unsigned long end, unsigned int flags, struct page **pages,
+ int *nr)
{
unsigned long next;
pmd_t *pmdp;
@@ -3016,13 +3248,12 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
if (!pmd_present(pmd))
return 0;
- if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
- pmd_devmap(pmd))) {
- /* See gup_pte_range() */
+ if (unlikely(pmd_leaf(pmd))) {
+ /* See gup_fast_pte_range() */
if (pmd_protnone(pmd))
return 0;
- if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
+ if (!gup_fast_pmd_leaf(pmd, pmdp, addr, next, flags,
pages, nr))
return 0;
@@ -3031,18 +3262,20 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
* architecture have different format for hugetlbfs
* pmd format and THP pmd format
*/
- if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
- PMD_SHIFT, next, flags, pages, nr))
+ if (!gup_fast_hugepd(__hugepd(pmd_val(pmd)), addr,
+ PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
+ } else if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags,
+ pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
return 1;
}
-static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr,
+ unsigned long end, unsigned int flags, struct page **pages,
+ int *nr)
{
unsigned long next;
pud_t *pudp;
@@ -3054,23 +3287,25 @@ static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned lo
next = pud_addr_end(addr, end);
if (unlikely(!pud_present(pud)))
return 0;
- if (unlikely(pud_huge(pud) || pud_devmap(pud))) {
- if (!gup_huge_pud(pud, pudp, addr, next, flags,
- pages, nr))
+ if (unlikely(pud_leaf(pud))) {
+ if (!gup_fast_pud_leaf(pud, pudp, addr, next, flags,
+ pages, nr))
return 0;
} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
- if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
- PUD_SHIFT, next, flags, pages, nr))
+ if (!gup_fast_hugepd(__hugepd(pud_val(pud)), addr,
+ PUD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
+ } else if (!gup_fast_pmd_range(pudp, pud, addr, next, flags,
+ pages, nr))
return 0;
} while (pudp++, addr = next, addr != end);
return 1;
}
-static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
+ unsigned long end, unsigned int flags, struct page **pages,
+ int *nr)
{
unsigned long next;
p4d_t *p4dp;
@@ -3080,21 +3315,22 @@ static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned lo
p4d_t p4d = READ_ONCE(*p4dp);
next = p4d_addr_end(addr, end);
- if (p4d_none(p4d))
+ if (!p4d_present(p4d))
return 0;
- BUILD_BUG_ON(p4d_huge(p4d));
+ BUILD_BUG_ON(p4d_leaf(p4d));
if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
- if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
- P4D_SHIFT, next, flags, pages, nr))
+ if (!gup_fast_hugepd(__hugepd(p4d_val(p4d)), addr,
+ P4D_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
+ } else if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags,
+ pages, nr))
return 0;
} while (p4dp++, addr = next, addr != end);
return 1;
}
-static void gup_pgd_range(unsigned long addr, unsigned long end,
+static void gup_fast_pgd_range(unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
@@ -3107,24 +3343,25 @@ static void gup_pgd_range(unsigned long addr, unsigned long end,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
return;
- if (unlikely(pgd_huge(pgd))) {
- if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
- pages, nr))
+ if (unlikely(pgd_leaf(pgd))) {
+ if (!gup_fast_pgd_leaf(pgd, pgdp, addr, next, flags,
+ pages, nr))
return;
} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
- if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
- PGDIR_SHIFT, next, flags, pages, nr))
+ if (!gup_fast_hugepd(__hugepd(pgd_val(pgd)), addr,
+ PGDIR_SHIFT, next, flags, pages, nr))
return;
- } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
+ } else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
+ pages, nr))
return;
} while (pgdp++, addr = next, addr != end);
}
#else
-static inline void gup_pgd_range(unsigned long addr, unsigned long end,
+static inline void gup_fast_pgd_range(unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
}
-#endif /* CONFIG_HAVE_FAST_GUP */
+#endif /* CONFIG_HAVE_GUP_FAST */
#ifndef gup_fast_permitted
/*
@@ -3137,16 +3374,14 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end)
}
#endif
-static unsigned long lockless_pages_from_mm(unsigned long start,
- unsigned long end,
- unsigned int gup_flags,
- struct page **pages)
+static unsigned long gup_fast(unsigned long start, unsigned long end,
+ unsigned int gup_flags, struct page **pages)
{
unsigned long flags;
int nr_pinned = 0;
unsigned seq;
- if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
+ if (!IS_ENABLED(CONFIG_HAVE_GUP_FAST) ||
!gup_fast_permitted(start, end))
return 0;
@@ -3168,16 +3403,16 @@ static unsigned long lockless_pages_from_mm(unsigned long start,
* that come from THPs splitting.
*/
local_irq_save(flags);
- gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
+ gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);
local_irq_restore(flags);
/*
* When pinning pages for DMA there could be a concurrent write protect
- * from fork() via copy_page_range(), in this case always fail fast GUP.
+ * from fork() via copy_page_range(), in this case always fail GUP-fast.
*/
if (gup_flags & FOLL_PIN) {
if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
- unpin_user_pages_lockless(pages, nr_pinned);
+ gup_fast_unpin_user_pages(pages, nr_pinned);
return 0;
} else {
sanity_check_pinned_pages(pages, nr_pinned);
@@ -3186,10 +3421,8 @@ static unsigned long lockless_pages_from_mm(unsigned long start,
return nr_pinned;
}
-static int internal_get_user_pages_fast(unsigned long start,
- unsigned long nr_pages,
- unsigned int gup_flags,
- struct page **pages)
+static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages)
{
unsigned long len, end;
unsigned long nr_pinned;
@@ -3217,7 +3450,7 @@ static int internal_get_user_pages_fast(unsigned long start,
if (unlikely(!access_ok((void __user *)start, len)))
return -EFAULT;
- nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
+ nr_pinned = gup_fast(start, end, gup_flags, pages);
if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
return nr_pinned;
@@ -3271,7 +3504,7 @@ int get_user_pages_fast_only(unsigned long start, int nr_pages,
FOLL_GET | FOLL_FAST_ONLY))
return -EINVAL;
- return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
+ return gup_fast_fallback(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
@@ -3302,7 +3535,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
*/
if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET))
return -EINVAL;
- return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
+ return gup_fast_fallback(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
@@ -3330,7 +3563,7 @@ int pin_user_pages_fast(unsigned long start, int nr_pages,
{
if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
return -EINVAL;
- return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
+ return gup_fast_fallback(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);
diff --git a/mm/hmm.c b/mm/hmm.c
index 277ddcab4947d9..93aebd9cc130ea 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -424,22 +424,17 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
walk->action = ACTION_CONTINUE;
pud = READ_ONCE(*pudp);
- if (pud_none(pud)) {
+ if (!pud_present(pud)) {
spin_unlock(ptl);
return hmm_vma_walk_hole(start, end, -1, walk);
}
- if (pud_huge(pud) && pud_devmap(pud)) {
+ if (pud_leaf(pud) && pud_devmap(pud)) {
unsigned long i, npages, pfn;
unsigned int required_fault;
unsigned long *hmm_pfns;
unsigned long cpu_flags;
- if (!pud_present(pud)) {
- spin_unlock(ptl);
- return hmm_vma_walk_hole(start, end, -1, walk);
- }
-
i = (addr - range->start) >> PAGE_SHIFT;
npages = (end - addr) >> PAGE_SHIFT;
hmm_pfns = &range->hmm_pfns[i];
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 89f58c7603b255..14f04fbc22d9aa 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -38,6 +38,7 @@
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/compat.h>
+#include <linux/pgalloc_tag.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -73,7 +74,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc);
static atomic_t huge_zero_refcount;
-struct page *huge_zero_page __read_mostly;
+struct folio *huge_zero_folio __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
@@ -191,24 +192,24 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
static bool get_huge_zero_page(void)
{
- struct page *zero_page;
+ struct folio *zero_folio;
retry:
if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
return true;
- zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+ zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
- if (!zero_page) {
+ if (!zero_folio) {
count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
return false;
}
preempt_disable();
- if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
+ if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
preempt_enable();
- __free_pages(zero_page, compound_order(zero_page));
+ folio_put(zero_folio);
goto retry;
}
- WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
+ WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));
/* We take additional reference here. It will be put back by shrinker */
atomic_set(&huge_zero_refcount, 2);
@@ -226,10 +227,10 @@ static void put_huge_zero_page(void)
BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}
-struct page *mm_get_huge_zero_page(struct mm_struct *mm)
+struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
- return READ_ONCE(huge_zero_page);
+ return READ_ONCE(huge_zero_folio);
if (!get_huge_zero_page())
return NULL;
@@ -237,10 +238,10 @@ struct page *mm_get_huge_zero_page(struct mm_struct *mm)
if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
put_huge_zero_page();
- return READ_ONCE(huge_zero_page);
+ return READ_ONCE(huge_zero_folio);
}
-void mm_put_huge_zero_page(struct mm_struct *mm)
+void mm_put_huge_zero_folio(struct mm_struct *mm)
{
if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
put_huge_zero_page();
@@ -257,10 +258,10 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
- struct page *zero_page = xchg(&huge_zero_page, NULL);
- BUG_ON(zero_page == NULL);
+ struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
+ BUG_ON(zero_folio == NULL);
WRITE_ONCE(huge_zero_pfn, ~0UL);
- __free_pages(zero_page, compound_order(zero_page));
+ folio_put(zero_folio);
return HPAGE_PMD_NR;
}
@@ -788,27 +789,19 @@ struct deferred_split *get_deferred_split_queue(struct folio *folio)
}
#endif
-void folio_prep_large_rmappable(struct folio *folio)
-{
- if (!folio || !folio_test_large(folio))
- return;
- if (folio_order(folio) > 1)
- INIT_LIST_HEAD(&folio->_deferred_list);
- folio_set_large_rmappable(folio);
-}
-
-static inline bool is_transparent_hugepage(struct folio *folio)
+static inline bool is_transparent_hugepage(const struct folio *folio)
{
if (!folio_test_large(folio))
return false;
- return is_huge_zero_page(&folio->page) ||
+ return is_huge_zero_folio(folio) ||
folio_test_large_rmappable(folio);
}
static unsigned long __thp_get_unmapped_area(struct file *filp,
unsigned long addr, unsigned long len,
- loff_t off, unsigned long flags, unsigned long size)
+ loff_t off, unsigned long flags, unsigned long size,
+ vm_flags_t vm_flags)
{
loff_t off_end = off + len;
loff_t off_align = round_up(off, size);
@@ -824,8 +817,8 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
if (len_pad < len || (off + len_pad) < off)
return 0;
- ret = current->mm->get_unmapped_area(filp, addr, len_pad,
- off >> PAGE_SHIFT, flags);
+ ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
+ off >> PAGE_SHIFT, flags, vm_flags);
/*
* The failure might be due to length padding. The caller will retry
@@ -843,25 +836,32 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
off_sub = (off - ret) & (size - 1);
- if (current->mm->get_unmapped_area == arch_get_unmapped_area_topdown &&
- !off_sub)
+ if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
return ret + size;
ret += off_sub;
return ret;
}
-unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
+unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags,
+ vm_flags_t vm_flags)
{
unsigned long ret;
loff_t off = (loff_t)pgoff << PAGE_SHIFT;
- ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
+ ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
if (ret)
return ret;
- return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+ return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
+ vm_flags);
+}
+
+unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
@@ -979,14 +979,14 @@ gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
}
/* Caller must hold page table lock. */
-static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
- struct page *zero_page)
+ struct folio *zero_folio)
{
pmd_t entry;
if (!pmd_none(*pmd))
return;
- entry = mk_pmd(zero_page, vma->vm_page_prot);
+ entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
@@ -1010,13 +1010,14 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
!mm_forbids_zeropage(vma->vm_mm) &&
transparent_hugepage_use_zero_page()) {
pgtable_t pgtable;
- struct page *zero_page;
+ struct folio *zero_folio;
vm_fault_t ret;
+
pgtable = pte_alloc_one(vma->vm_mm);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
- zero_page = mm_get_huge_zero_page(vma->vm_mm);
- if (unlikely(!zero_page)) {
+ zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
+ if (unlikely(!zero_folio)) {
pte_free(vma->vm_mm, pgtable);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -1034,8 +1035,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
ret = handle_userfault(vmf, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
} else {
- set_huge_zero_page(pgtable, vma->vm_mm, vma,
- haddr, vmf->pmd, zero_page);
+ set_huge_zero_folio(pgtable, vma->vm_mm, vma,
+ haddr, vmf->pmd, zero_folio);
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
spin_unlock(vmf->ptl);
}
@@ -1228,8 +1229,8 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, bool write)
+void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, bool write)
{
pmd_t _pmd;
@@ -1344,11 +1345,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
*/
if (is_huge_zero_pmd(pmd)) {
/*
- * get_huge_zero_page() will never allocate a new page here,
- * since we already have a zero page to copy. It just takes a
- * reference.
+ * mm_get_huge_zero_folio() will never allocate a new
+ * folio here, since we already have a zero page to
+ * copy. It just takes a reference.
*/
- mm_get_huge_zero_page(dst_mm);
+ mm_get_huge_zero_folio(dst_mm);
goto out_zero_page;
}
@@ -1385,8 +1386,8 @@ out:
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, bool write)
+void touch_pud(struct vm_area_struct *vma, unsigned long addr,
+ pud_t *pud, bool write)
{
pud_t _pud;
@@ -1398,49 +1399,6 @@ static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
update_mmu_cache_pud(vma, addr, pud);
}
-struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, int flags, struct dev_pagemap **pgmap)
-{
- unsigned long pfn = pud_pfn(*pud);
- struct mm_struct *mm = vma->vm_mm;
- struct page *page;
- int ret;
-
- assert_spin_locked(pud_lockptr(mm, pud));
-
- if (flags & FOLL_WRITE && !pud_write(*pud))
- return NULL;
-
- if (pud_present(*pud) && pud_devmap(*pud))
- /* pass */;
- else
- return NULL;
-
- if (flags & FOLL_TOUCH)
- touch_pud(vma, addr, pud, flags & FOLL_WRITE);
-
- /*
- * device mapped pages can only be returned if the
- * caller will manage the page reference count.
- *
- * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
- */
- if (!(flags & (FOLL_GET | FOLL_PIN)))
- return ERR_PTR(-EEXIST);
-
- pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
- *pgmap = get_dev_pagemap(pfn, *pgmap);
- if (!*pgmap)
- return ERR_PTR(-EFAULT);
- page = pfn_to_page(pfn);
-
- ret = try_grab_page(page, flags);
- if (ret)
- page = ERR_PTR(ret);
-
- return page;
-}
-
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
struct vm_area_struct *vma)
@@ -1627,88 +1585,6 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
return pmd_dirty(pmd);
}
-/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
-static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
- struct vm_area_struct *vma,
- unsigned int flags)
-{
- /* If the pmd is writable, we can write to the page. */
- if (pmd_write(pmd))
- return true;
-
- /* Maybe FOLL_FORCE is set to override it? */
- if (!(flags & FOLL_FORCE))
- return false;
-
- /* But FOLL_FORCE has no effect on shared mappings */
- if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
- return false;
-
- /* ... or read-only private ones */
- if (!(vma->vm_flags & VM_MAYWRITE))
- return false;
-
- /* ... or already writable ones that just need to take a write fault */
- if (vma->vm_flags & VM_WRITE)
- return false;
-
- /*
- * See can_change_pte_writable(): we broke COW and could map the page
- * writable if we have an exclusive anonymous page ...
- */
- if (!page || !PageAnon(page) || !PageAnonExclusive(page))
- return false;
-
- /* ... and a write-fault isn't required for other reasons. */
- if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
- return false;
- return !userfaultfd_huge_pmd_wp(vma, pmd);
-}
-
-struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
- unsigned long addr,
- pmd_t *pmd,
- unsigned int flags)
-{
- struct mm_struct *mm = vma->vm_mm;
- struct page *page;
- int ret;
-
- assert_spin_locked(pmd_lockptr(mm, pmd));
-
- page = pmd_page(*pmd);
- VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
-
- if ((flags & FOLL_WRITE) &&
- !can_follow_write_pmd(*pmd, page, vma, flags))
- return NULL;
-
- /* Avoid dumping huge zero page */
- if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
- return ERR_PTR(-EFAULT);
-
- if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags))
- return NULL;
-
- if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page))
- return ERR_PTR(-EMLINK);
-
- VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
- !PageAnonExclusive(page), page);
-
- ret = try_grab_page(page, flags);
- if (ret)
- return ERR_PTR(ret);
-
- if (flags & FOLL_TOUCH)
- touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
-
- page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
- VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
-
- return page;
-}
-
/* NUMA hinting page fault entry point for trans huge pmds */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
@@ -1754,7 +1630,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
*/
if (node_is_toptier(nid))
last_cpupid = folio_last_cpupid(folio);
- target_nid = numa_migrate_prep(folio, vma, haddr, nid, &flags);
+ target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
if (target_nid == NUMA_NO_NODE) {
folio_put(folio);
goto out_map;
@@ -1824,12 +1700,12 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
goto out;
}
- folio = pfn_folio(pmd_pfn(orig_pmd));
+ folio = pmd_folio(orig_pmd);
/*
* If other processes are mapping this folio, we couldn't discard
* the folio unless they all do MADV_FREE so let's skip the folio.
*/
- if (folio_estimated_sharers(folio) != 1)
+ if (folio_likely_mapped_shared(folio))
goto out;
if (!folio_trylock(folio))
@@ -2094,7 +1970,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (pmd_protnone(*pmd))
goto unlock;
- folio = page_folio(pmd_page(*pmd));
+ folio = pmd_folio(*pmd);
toptier = node_is_toptier(folio_nid(folio));
/*
* Skip scanning top tier node if normal numa
@@ -2671,7 +2547,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* It's safe to call pmd_page when folio is set because it's
* guaranteed that pmd is present.
*/
- if (folio && folio != page_folio(pmd_page(*pmd)))
+ if (folio && folio != pmd_folio(*pmd))
goto out;
__split_huge_pmd_locked(vma, pmd, range.start, freeze);
}
@@ -2863,7 +2739,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
clear_compound_head(page_tail);
if (new_order) {
prep_compound_page(page_tail, new_order);
- folio_prep_large_rmappable(new_folio);
+ folio_set_large_rmappable(new_folio);
}
/* Finally unfreeze refcount. Additional reference from page cache. */
@@ -2946,6 +2822,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
/* Caller disabled irqs, so they are still disabled here */
split_page_owner(head, order, new_order);
+ pgalloc_tag_split(head, 1 << order);
/* See comment in __split_huge_page_tail() */
if (folio_test_anon(folio)) {
@@ -2967,9 +2844,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
shmem_uncharge(folio->mapping->host, nr_dropped);
remap_page(folio, nr);
- if (folio_test_swapcache(folio))
- split_swap_cluster(folio->swap);
-
/*
* set page to its compound_head when split to non order-0 pages, so
* we can skip unlocking it below, since PG_locked is transferred to
@@ -3013,28 +2887,40 @@ bool can_split_folio(struct folio *folio, int *pextra_pins)
}
/*
- * This function splits huge page into pages in @new_order. @page can point to
- * any subpage of huge page to split. Split doesn't change the position of
- * @page.
+ * This function splits a large folio into smaller folios of order @new_order.
+ * @page can point to any page of the large folio to split. The split operation
+ * does not change the position of @page.
+ *
+ * Prerequisites:
+ *
+ * 1) The caller must hold a reference on the @page's owning folio, also known
+ * as the large folio.
+ *
+ * 2) The large folio must be locked.
+ *
+ * 3) The folio must not be pinned. Any unexpected folio references, including
+ * GUP pins, will result in the folio not getting split; instead, the caller
+ * will receive an -EBUSY.
*
- * NOTE: order-1 anonymous folio is not supported because _deferred_list,
- * which is used by partially mapped folios, is stored in subpage 2 and an
- * order-1 folio only has subpage 0 and 1. File-backed order-1 folios are OK,
- * since they do not use _deferred_list.
+ * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not
+ * supported for non-file-backed folios, because folio->_deferred_list, which
+ * is used by partially mapped folios, is stored in subpage 2, but an order-1
+ * folio only has subpages 0 and 1. File-backed order-1 folios are supported,
+ * since they do not use _deferred_list.
*
- * Only caller must hold pin on the @page, otherwise split fails with -EBUSY.
- * The huge page must be locked.
+ * After splitting, the caller's folio reference will be transferred to @page,
+ * resulting in a raised refcount of @page after this call. The other pages may
+ * be freed if they are not mapped.
*
* If @list is null, tail pages will be added to LRU list, otherwise, to @list.
*
- * Pages in new_order will inherit mapping, flags, and so on from the hugepage.
+ * Pages in @new_order will inherit the mapping, flags, and so on from the
+ * huge page.
*
- * GUP pin and PG_locked transferred to @page or the compound page @page belongs
- * to. Rest subpages can be freed if they are not mapped.
+ * Returns 0 if the huge page was split successfully.
*
- * Returns 0 if the hugepage is split successfully.
- * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
- * us.
+ * Returns -EBUSY if @page's folio is pinned, or if the anon_vma disappeared
+ * from under us.
*/
int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
unsigned int new_order)
@@ -3045,6 +2931,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
+ bool is_thp = folio_test_pmd_mappable(folio);
int extra_pins, ret;
pgoff_t end;
bool is_hzp;
@@ -3080,7 +2967,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
}
- is_hzp = is_huge_zero_page(&folio->page);
+ is_hzp = is_huge_zero_folio(folio);
if (is_hzp) {
pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
return -EBUSY;
@@ -3223,7 +3110,8 @@ out_unlock:
i_mmap_unlock_read(mapping);
out:
xas_destroy(&xas);
- count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+ if (is_thp)
+ count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
return ret;
}
@@ -3285,7 +3173,8 @@ void deferred_split_folio(struct folio *folio)
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (list_empty(&folio->_deferred_list)) {
- count_vm_event(THP_DEFERRED_SPLIT_PAGE);
+ if (folio_test_pmd_mappable(folio))
+ count_vm_event(THP_DEFERRED_SPLIT_PAGE);
list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
ds_queue->split_queue_len++;
#ifdef CONFIG_MEMCG
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ce7be5c244429f..5dc3f5ea3a2ef2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1619,19 +1619,11 @@ static inline void destroy_compound_gigantic_folio(struct folio *folio,
unsigned int order) { }
#endif
-static inline void __clear_hugetlb_destructor(struct hstate *h,
- struct folio *folio)
-{
- lockdep_assert_held(&hugetlb_lock);
-
- __folio_clear_hugetlb(folio);
-}
-
/*
* Remove hugetlb folio from lists.
- * If vmemmap exists for the folio, update dtor so that the folio appears
- * as just a compound page. Otherwise, wait until after allocating vmemmap
- * to update dtor.
+ * If vmemmap exists for the folio, clear the hugetlb flag so that the
+ * folio appears as just a compound page. Otherwise, wait until after
+ * allocating vmemmap to clear the flag.
*
* A reference is held on the folio, except in the case of demote.
*
@@ -1662,12 +1654,12 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
}
/*
- * We can only clear the hugetlb destructor after allocating vmemmap
+ * We can only clear the hugetlb flag after allocating vmemmap
* pages. Otherwise, someone (memory error handling) may try to write
* to tail struct pages.
*/
if (!folio_test_hugetlb_vmemmap_optimized(folio))
- __clear_hugetlb_destructor(h, folio);
+ __folio_clear_hugetlb(folio);
/*
* In the case of demote we do not ref count the page as it will soon
@@ -1734,14 +1726,14 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
*/
return;
- arch_clear_hugepage_flags(&folio->page);
+ arch_clear_hugetlb_flags(folio);
enqueue_hugetlb_folio(h, folio);
}
static void __update_and_free_hugetlb_folio(struct hstate *h,
struct folio *folio)
{
- bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio);
+ bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio);
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
@@ -1754,11 +1746,11 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
return;
/*
- * If folio is not vmemmap optimized (!clear_dtor), then the folio
+ * If folio is not vmemmap optimized (!clear_flag), then the folio
* is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio
* can only be passed hugetlb pages and will BUG otherwise.
*/
- if (clear_dtor && hugetlb_vmemmap_restore_folio(h, folio)) {
+ if (clear_flag && hugetlb_vmemmap_restore_folio(h, folio)) {
spin_lock_irq(&hugetlb_lock);
/*
* If we cannot allocate vmemmap pages, just refuse to free the
@@ -1779,11 +1771,11 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
/*
* If vmemmap pages were allocated above, then we need to clear the
- * hugetlb destructor under the hugetlb lock.
+ * hugetlb flag under the hugetlb lock.
*/
if (folio_test_hugetlb(folio)) {
spin_lock_irq(&hugetlb_lock);
- __clear_hugetlb_destructor(h, folio);
+ __folio_clear_hugetlb(folio);
spin_unlock_irq(&hugetlb_lock);
}
@@ -1796,7 +1788,8 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
destroy_compound_gigantic_folio(folio, huge_page_order(h));
free_gigantic_folio(folio, huge_page_order(h));
} else {
- __free_pages(&folio->page, huge_page_order(h));
+ INIT_LIST_HEAD(&folio->_deferred_list);
+ folio_put(folio);
}
}
@@ -1884,7 +1877,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h,
list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) {
list_del(&folio->lru);
spin_lock_irq(&hugetlb_lock);
- __clear_hugetlb_destructor(h, folio);
+ __folio_clear_hugetlb(folio);
spin_unlock_irq(&hugetlb_lock);
update_and_free_hugetlb_folio(h, folio, false);
cond_resched();
@@ -1909,7 +1902,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h,
} else {
list_del(&folio->lru);
spin_lock_irq(&hugetlb_lock);
- __clear_hugetlb_destructor(h, folio);
+ __folio_clear_hugetlb(folio);
spin_unlock_irq(&hugetlb_lock);
update_and_free_hugetlb_folio(h, folio, false);
cond_resched();
@@ -1942,14 +1935,14 @@ retry:
* should only be pages on the non_hvo_folios list.
* Do note that the non_hvo_folios list could be empty.
* Without HVO enabled, ret will be 0 and there is no need to call
- * __clear_hugetlb_destructor as this was done previously.
+ * __folio_clear_hugetlb as this was done previously.
*/
VM_WARN_ON(!list_empty(folio_list));
VM_WARN_ON(ret < 0);
if (!list_empty(&non_hvo_folios) && ret) {
spin_lock_irq(&hugetlb_lock);
list_for_each_entry(folio, &non_hvo_folios, lru)
- __clear_hugetlb_destructor(h, folio);
+ __folio_clear_hugetlb(folio);
spin_unlock_irq(&hugetlb_lock);
}
@@ -1974,7 +1967,7 @@ void free_huge_folio(struct folio *folio)
{
/*
* Can't pass hstate in here because it is called from the
- * compound page destructor.
+ * generic mm code.
*/
struct hstate *h = folio_hstate(folio);
int nid = folio_nid(folio);
@@ -2031,7 +2024,7 @@ void free_huge_folio(struct folio *folio)
spin_unlock_irqrestore(&hugetlb_lock, flags);
update_and_free_hugetlb_folio(h, folio, true);
} else {
- arch_clear_hugepage_flags(&folio->page);
+ arch_clear_hugetlb_flags(folio);
enqueue_hugetlb_folio(h, folio);
spin_unlock_irqrestore(&hugetlb_lock, flags);
}
@@ -2124,7 +2117,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
set_compound_head(p, &folio->page);
}
__folio_set_head(folio);
- /* we rely on prep_new_hugetlb_folio to set the destructor */
+ /* we rely on prep_new_hugetlb_folio to set the hugetlb flag */
folio_set_order(folio, order);
atomic_set(&folio->_entire_mapcount, -1);
atomic_set(&folio->_nr_pages_mapped, 0);
@@ -2184,13 +2177,13 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
nodemask_t *node_alloc_noretry)
{
int order = huge_page_order(h);
- struct page *page;
+ struct folio *folio;
bool alloc_try_hard = true;
bool retry = true;
/*
- * By default we always try hard to allocate the page with
- * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
+ * By default we always try hard to allocate the folio with
+ * __GFP_RETRY_MAYFAIL flag. However, if we are allocating folios in
* a loop (to adjust global huge page counts) and previous allocation
* failed, do not continue to try hard on the same node. Use the
* node_alloc_noretry bitmap to manage this state information.
@@ -2203,43 +2196,42 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
retry:
- page = __alloc_pages(gfp_mask, order, nid, nmask);
+ folio = __folio_alloc(gfp_mask, order, nid, nmask);
- /* Freeze head page */
- if (page && !page_ref_freeze(page, 1)) {
- __free_pages(page, order);
+ if (folio && !folio_ref_freeze(folio, 1)) {
+ folio_put(folio);
if (retry) { /* retry once */
retry = false;
goto retry;
}
/* WOW! twice in a row. */
- pr_warn("HugeTLB head page unexpected inflated ref count\n");
- page = NULL;
+ pr_warn("HugeTLB unexpected inflated folio ref count\n");
+ folio = NULL;
}
/*
- * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
- * indicates an overall state change. Clear bit so that we resume
- * normal 'try hard' allocations.
+ * If we did not specify __GFP_RETRY_MAYFAIL, but still got a
+ * folio this indicates an overall state change. Clear bit so
+ * that we resume normal 'try hard' allocations.
*/
- if (node_alloc_noretry && page && !alloc_try_hard)
+ if (node_alloc_noretry && folio && !alloc_try_hard)
node_clear(nid, *node_alloc_noretry);
/*
- * If we tried hard to get a page but failed, set bit so that
+ * If we tried hard to get a folio but failed, set bit so that
* subsequent attempts will not try as hard until there is an
* overall state change.
*/
- if (node_alloc_noretry && !page && alloc_try_hard)
+ if (node_alloc_noretry && !folio && alloc_try_hard)
node_set(nid, *node_alloc_noretry);
- if (!page) {
+ if (!folio) {
__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
return NULL;
}
__count_vm_event(HTLB_BUDDY_PGALLOC);
- return page_folio(page);
+ return folio;
}
static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h,
@@ -2605,7 +2597,7 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h,
/* folio migration callback function */
struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
- nodemask_t *nmask, gfp_t gfp_mask)
+ nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback)
{
spin_lock_irq(&hugetlb_lock);
if (available_huge_pages(h)) {
@@ -2620,6 +2612,10 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
}
spin_unlock_irq(&hugetlb_lock);
+ /* We cannot fallback to other nodes, as we could break the per-node pool. */
+ if (!allow_alloc_fallback)
+ gfp_mask |= __GFP_THISNODE;
+
return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
}
@@ -5032,7 +5028,6 @@ static struct ctl_table hugetlb_table[] = {
.mode = 0644,
.proc_handler = hugetlb_overcommit_handler,
},
- { }
};
static void hugetlb_sysctl_init(void)
@@ -5923,19 +5918,18 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* cannot race with other handlers or page migration.
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
-static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep, unsigned int flags,
- struct folio *pagecache_folio, spinlock_t *ptl,
+static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
struct vm_fault *vmf)
{
- const bool unshare = flags & FAULT_FLAG_UNSHARE;
- pte_t pte = huge_ptep_get(ptep);
+ struct vm_area_struct *vma = vmf->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
+ pte_t pte = huge_ptep_get(vmf->pte);
struct hstate *h = hstate_vma(vma);
struct folio *old_folio;
struct folio *new_folio;
int outside_reserve = 0;
vm_fault_t ret = 0;
- unsigned long haddr = address & huge_page_mask(h);
struct mmu_notifier_range range;
/*
@@ -5958,7 +5952,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
/* Let's take out MAP_SHARED mappings first. */
if (vma->vm_flags & VM_MAYSHARE) {
- set_huge_ptep_writable(vma, haddr, ptep);
+ set_huge_ptep_writable(vma, vmf->address, vmf->pte);
return 0;
}
@@ -5977,7 +5971,7 @@ retry_avoidcopy:
SetPageAnonExclusive(&old_folio->page);
}
if (likely(!unshare))
- set_huge_ptep_writable(vma, haddr, ptep);
+ set_huge_ptep_writable(vma, vmf->address, vmf->pte);
delayacct_wpcopy_end();
return 0;
@@ -6004,8 +5998,8 @@ retry_avoidcopy:
* Drop page table lock as buddy allocator may be called. It will
* be acquired again before returning to the caller, as expected.
*/
- spin_unlock(ptl);
- new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve);
+ spin_unlock(vmf->ptl);
+ new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve);
if (IS_ERR(new_folio)) {
/*
@@ -6030,19 +6024,21 @@ retry_avoidcopy:
*
* Reacquire both after unmap operation.
*/
- idx = vma_hugecache_offset(h, vma, haddr);
+ idx = vma_hugecache_offset(h, vma, vmf->address);
hash = hugetlb_fault_mutex_hash(mapping, idx);
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- unmap_ref_private(mm, vma, &old_folio->page, haddr);
+ unmap_ref_private(mm, vma, &old_folio->page,
+ vmf->address);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vma_lock_read(vma);
- spin_lock(ptl);
- ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
- if (likely(ptep &&
- pte_same(huge_ptep_get(ptep), pte)))
+ spin_lock(vmf->ptl);
+ vmf->pte = hugetlb_walk(vma, vmf->address,
+ huge_page_size(h));
+ if (likely(vmf->pte &&
+ pte_same(huge_ptep_get(vmf->pte), pte)))
goto retry_avoidcopy;
/*
* race occurs while re-acquiring page table
@@ -6064,37 +6060,38 @@ retry_avoidcopy:
if (unlikely(ret))
goto out_release_all;
- if (copy_user_large_folio(new_folio, old_folio, address, vma)) {
+ if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) {
ret = VM_FAULT_HWPOISON_LARGE;
goto out_release_all;
}
__folio_mark_uptodate(new_folio);
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr,
- haddr + huge_page_size(h));
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address,
+ vmf->address + huge_page_size(h));
mmu_notifier_invalidate_range_start(&range);
/*
* Retake the page table lock to check for racing updates
* before the page tables are altered
*/
- spin_lock(ptl);
- ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
- if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
+ spin_lock(vmf->ptl);
+ vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h));
+ if (likely(vmf->pte && pte_same(huge_ptep_get(vmf->pte), pte))) {
pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
/* Break COW or unshare */
- huge_ptep_clear_flush(vma, haddr, ptep);
+ huge_ptep_clear_flush(vma, vmf->address, vmf->pte);
hugetlb_remove_rmap(old_folio);
- hugetlb_add_new_anon_rmap(new_folio, vma, haddr);
+ hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address);
if (huge_pte_uffd_wp(pte))
newpte = huge_pte_mkuffd_wp(newpte);
- set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h));
+ set_huge_pte_at(mm, vmf->address, vmf->pte, newpte,
+ huge_page_size(h));
folio_set_hugetlb_migratable(new_folio);
/* Make the old page be freed below */
new_folio = old_folio;
}
- spin_unlock(ptl);
+ spin_unlock(vmf->ptl);
mmu_notifier_invalidate_range_end(&range);
out_release_all:
/*
@@ -6102,12 +6099,12 @@ out_release_all:
* unshare)
*/
if (new_folio != old_folio)
- restore_reserve_on_error(h, vma, haddr, new_folio);
+ restore_reserve_on_error(h, vma, vmf->address, new_folio);
folio_put(new_folio);
out_release_old:
folio_put(old_folio);
- spin_lock(ptl); /* Caller expects lock to be held */
+ spin_lock(vmf->ptl); /* Caller expects lock to be held */
delayacct_wpcopy_end();
return ret;
@@ -6116,8 +6113,8 @@ out_release_old:
/*
* Return whether there is a pagecache page to back given address within VMA.
*/
-static bool hugetlbfs_pagecache_present(struct hstate *h,
- struct vm_area_struct *vma, unsigned long address)
+bool hugetlbfs_pagecache_present(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
{
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t idx = linear_page_index(vma, address);
@@ -6193,23 +6190,19 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,
return same;
}
-static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping, pgoff_t idx,
- unsigned long address, pte_t *ptep,
- pte_t old_pte, unsigned int flags,
+static vm_fault_t hugetlb_no_page(struct address_space *mapping,
struct vm_fault *vmf)
{
+ struct vm_area_struct *vma = vmf->vma;
+ struct mm_struct *mm = vma->vm_mm;
struct hstate *h = hstate_vma(vma);
vm_fault_t ret = VM_FAULT_SIGBUS;
int anon_rmap = 0;
unsigned long size;
struct folio *folio;
pte_t new_pte;
- spinlock_t *ptl;
- unsigned long haddr = address & huge_page_mask(h);
bool new_folio, new_pagecache_folio = false;
- u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
+ u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
/*
* Currently, we are forced to kill the process in the event the
@@ -6228,10 +6221,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* before we get page_table_lock.
*/
new_folio = false;
- folio = filemap_lock_hugetlb_folio(h, mapping, idx);
+ folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff);
if (IS_ERR(folio)) {
size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
+ if (vmf->pgoff >= size)
goto out;
/* Check for page in userfault range */
if (userfaultfd_missing(vma)) {
@@ -6252,7 +6245,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* never happen on the page after UFFDIO_COPY has
* correctly installed the page and returned.
*/
- if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+ if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) {
ret = 0;
goto out;
}
@@ -6267,7 +6260,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
goto out;
}
- folio = alloc_hugetlb_folio(vma, haddr, 0);
+ folio = alloc_hugetlb_folio(vma, vmf->address, 0);
if (IS_ERR(folio)) {
/*
* Returning error will result in faulting task being
@@ -6281,18 +6274,20 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* here. Before returning error, get ptl and make
* sure there really is no pte entry.
*/
- if (hugetlb_pte_stable(h, mm, ptep, old_pte))
+ if (hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte))
ret = vmf_error(PTR_ERR(folio));
else
ret = 0;
goto out;
}
- clear_huge_page(&folio->page, address, pages_per_huge_page(h));
+ clear_huge_page(&folio->page, vmf->real_address,
+ pages_per_huge_page(h));
__folio_mark_uptodate(folio);
new_folio = true;
if (vma->vm_flags & VM_MAYSHARE) {
- int err = hugetlb_add_to_page_cache(folio, mapping, idx);
+ int err = hugetlb_add_to_page_cache(folio, mapping,
+ vmf->pgoff);
if (err) {
/*
* err can't be -EEXIST which implies someone
@@ -6301,7 +6296,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* to the page cache. So it's safe to call
* restore_reserve_on_error() here.
*/
- restore_reserve_on_error(h, vma, haddr, folio);
+ restore_reserve_on_error(h, vma, vmf->address,
+ folio);
folio_put(folio);
ret = VM_FAULT_SIGBUS;
goto out;
@@ -6328,7 +6324,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
folio_unlock(folio);
folio_put(folio);
/* See comment in userfaultfd_missing() block above */
- if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+ if (!hugetlb_pte_stable(h, mm, vmf->pte, vmf->orig_pte)) {
ret = 0;
goto out;
}
@@ -6343,23 +6339,23 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* any allocations necessary to record that reservation occur outside
* the spinlock.
*/
- if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
- if (vma_needs_reservation(h, vma, haddr) < 0) {
+ if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+ if (vma_needs_reservation(h, vma, vmf->address) < 0) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
}
/* Just decrements count, does not deallocate */
- vma_end_reservation(h, vma, haddr);
+ vma_end_reservation(h, vma, vmf->address);
}
- ptl = huge_pte_lock(h, mm, ptep);
+ vmf->ptl = huge_pte_lock(h, mm, vmf->pte);
ret = 0;
/* If pte changed from under us, retry */
- if (!pte_same(huge_ptep_get(ptep), old_pte))
+ if (!pte_same(huge_ptep_get(vmf->pte), vmf->orig_pte))
goto backout;
if (anon_rmap)
- hugetlb_add_new_anon_rmap(folio, vma, haddr);
+ hugetlb_add_new_anon_rmap(folio, vma, vmf->address);
else
hugetlb_add_file_rmap(folio);
new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE)
@@ -6368,17 +6364,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* If this pte was previously wr-protected, keep it wr-protected even
* if populated.
*/
- if (unlikely(pte_marker_uffd_wp(old_pte)))
+ if (unlikely(pte_marker_uffd_wp(vmf->orig_pte)))
new_pte = huge_pte_mkuffd_wp(new_pte);
- set_huge_pte_at(mm, haddr, ptep, new_pte, huge_page_size(h));
+ set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h));
hugetlb_count_add(pages_per_huge_page(h), mm);
- if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+ if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl, vmf);
+ ret = hugetlb_wp(folio, vmf);
}
- spin_unlock(ptl);
+ spin_unlock(vmf->ptl);
/*
* Only set hugetlb_migratable in newly allocated pages. Existing pages
@@ -6395,10 +6391,10 @@ out:
return ret;
backout:
- spin_unlock(ptl);
+ spin_unlock(vmf->ptl);
backout_unlocked:
if (new_folio && !new_pagecache_folio)
- restore_reserve_on_error(h, vma, haddr, folio);
+ restore_reserve_on_error(h, vma, vmf->address, folio);
folio_unlock(folio);
folio_put(folio);
@@ -6432,8 +6428,6 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
- pte_t *ptep, entry;
- spinlock_t *ptl;
vm_fault_t ret;
u32 hash;
struct folio *folio = NULL;
@@ -6441,13 +6435,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
int need_wait_lock = 0;
- unsigned long haddr = address & huge_page_mask(h);
struct vm_fault vmf = {
.vma = vma,
- .address = haddr,
+ .address = address & huge_page_mask(h),
.real_address = address,
.flags = flags,
- .pgoff = vma_hugecache_offset(h, vma, haddr),
+ .pgoff = vma_hugecache_offset(h, vma,
+ address & huge_page_mask(h)),
/* TODO: Track hugetlb faults using vm_fault */
/*
@@ -6467,22 +6461,22 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Acquire vma lock before calling huge_pte_alloc and hold
- * until finished with ptep. This prevents huge_pmd_unshare from
- * being called elsewhere and making the ptep no longer valid.
+ * until finished with vmf.pte. This prevents huge_pmd_unshare from
+ * being called elsewhere and making the vmf.pte no longer valid.
*/
hugetlb_vma_lock_read(vma);
- ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
- if (!ptep) {
+ vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h));
+ if (!vmf.pte) {
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return VM_FAULT_OOM;
}
- entry = huge_ptep_get(ptep);
- if (huge_pte_none_mostly(entry)) {
- if (is_pte_marker(entry)) {
+ vmf.orig_pte = huge_ptep_get(vmf.pte);
+ if (huge_pte_none_mostly(vmf.orig_pte)) {
+ if (is_pte_marker(vmf.orig_pte)) {
pte_marker marker =
- pte_marker_get(pte_to_swp_entry(entry));
+ pte_marker_get(pte_to_swp_entry(vmf.orig_pte));
if (marker & PTE_MARKER_POISONED) {
ret = VM_FAULT_HWPOISON_LARGE;
@@ -6496,21 +6490,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* hugetlb_no_page will drop vma lock and hugetlb fault
* mutex internally, which make us return immediately.
*/
- return hugetlb_no_page(mm, vma, mapping, vmf.pgoff, address,
- ptep, entry, flags, &vmf);
+ return hugetlb_no_page(mapping, &vmf);
}
ret = 0;
/*
- * entry could be a migration/hwpoison entry at this point, so this
- * check prevents the kernel from going below assuming that we have
- * an active hugepage in pagecache. This goto expects the 2nd page
- * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
- * properly handle it.
+ * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this
+ * point, so this check prevents the kernel from going below assuming
+ * that we have an active hugepage in pagecache. This goto expects
+ * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned)
+ * check will properly handle it.
*/
- if (!pte_present(entry)) {
- if (unlikely(is_hugetlb_entry_migration(entry))) {
+ if (!pte_present(vmf.orig_pte)) {
+ if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) {
/*
* Release the hugetlb fault lock now, but retain
* the vma lock, because it is needed to guard the
@@ -6519,9 +6512,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* be released there.
*/
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- migration_entry_wait_huge(vma, ptep);
+ migration_entry_wait_huge(vma, vmf.pte);
return 0;
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte)))
ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
@@ -6535,13 +6528,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* determine if a reservation has been consumed.
*/
if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
- !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
- if (vma_needs_reservation(h, vma, haddr) < 0) {
+ !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) {
+ if (vma_needs_reservation(h, vma, vmf.address) < 0) {
ret = VM_FAULT_OOM;
goto out_mutex;
}
/* Just decrements count, does not deallocate */
- vma_end_reservation(h, vma, haddr);
+ vma_end_reservation(h, vma, vmf.address);
pagecache_folio = filemap_lock_hugetlb_folio(h, mapping,
vmf.pgoff);
@@ -6549,17 +6542,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pagecache_folio = NULL;
}
- ptl = huge_pte_lock(h, mm, ptep);
+ vmf.ptl = huge_pte_lock(h, mm, vmf.pte);
/* Check for a racing update before calling hugetlb_wp() */
- if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
+ if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(vmf.pte))))
goto out_ptl;
/* Handle userfault-wp first, before trying to lock more pages */
- if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
- (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(vmf.pte)) &&
+ (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {
if (!userfaultfd_wp_async(vma)) {
- spin_unlock(ptl);
+ spin_unlock(vmf.ptl);
if (pagecache_folio) {
folio_unlock(pagecache_folio);
folio_put(pagecache_folio);
@@ -6569,18 +6562,18 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return handle_userfault(&vmf, VM_UFFD_WP);
}
- entry = huge_pte_clear_uffd_wp(entry);
- set_huge_pte_at(mm, haddr, ptep, entry,
+ vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte);
+ set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte,
huge_page_size(hstate_vma(vma)));
/* Fallthrough to CoW */
}
/*
- * hugetlb_wp() requires page locks of pte_page(entry) and
+ * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and
* pagecache_folio, so here we need take the former one
* when folio != pagecache_folio or !pagecache_folio.
*/
- folio = page_folio(pte_page(entry));
+ folio = page_folio(pte_page(vmf.orig_pte));
if (folio != pagecache_folio)
if (!folio_trylock(folio)) {
need_wait_lock = 1;
@@ -6590,24 +6583,23 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
folio_get(folio);
if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
- if (!huge_pte_write(entry)) {
- ret = hugetlb_wp(mm, vma, address, ptep, flags,
- pagecache_folio, ptl, &vmf);
+ if (!huge_pte_write(vmf.orig_pte)) {
+ ret = hugetlb_wp(pagecache_folio, &vmf);
goto out_put_page;
} else if (likely(flags & FAULT_FLAG_WRITE)) {
- entry = huge_pte_mkdirty(entry);
+ vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
}
}
- entry = pte_mkyoung(entry);
- if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
+ vmf.orig_pte = pte_mkyoung(vmf.orig_pte);
+ if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
flags & FAULT_FLAG_WRITE))
- update_mmu_cache(vma, haddr, ptep);
+ update_mmu_cache(vma, vmf.address, vmf.pte);
out_put_page:
if (folio != pagecache_folio)
folio_unlock(folio);
folio_put(folio);
out_ptl:
- spin_unlock(ptl);
+ spin_unlock(vmf.ptl);
if (pagecache_folio) {
folio_unlock(pagecache_folio);
@@ -6643,7 +6635,13 @@ static struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
gfp_mask = htlb_alloc_mask(h);
node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
+ /*
+ * This is used to allocate a temporary hugetlb to hold the copied
+ * content, which will then be copied again to the final hugetlb
+ * consuming a reservation. Set the alloc_fallback to false to indicate
+ * that breaking the per-node hugetlb pool is not allowed in this case.
+ */
+ folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, false);
mpol_cond_put(mpol);
return folio;
@@ -6873,77 +6871,6 @@ out_release_nounlock:
}
#endif /* CONFIG_USERFAULTFD */
-struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags,
- unsigned int *page_mask)
-{
- struct hstate *h = hstate_vma(vma);
- struct mm_struct *mm = vma->vm_mm;
- unsigned long haddr = address & huge_page_mask(h);
- struct page *page = NULL;
- spinlock_t *ptl;
- pte_t *pte, entry;
- int ret;
-
- hugetlb_vma_lock_read(vma);
- pte = hugetlb_walk(vma, haddr, huge_page_size(h));
- if (!pte)
- goto out_unlock;
-
- ptl = huge_pte_lock(h, mm, pte);
- entry = huge_ptep_get(pte);
- if (pte_present(entry)) {
- page = pte_page(entry);
-
- if (!huge_pte_write(entry)) {
- if (flags & FOLL_WRITE) {
- page = NULL;
- goto out;
- }
-
- if (gup_must_unshare(vma, flags, page)) {
- /* Tell the caller to do unsharing */
- page = ERR_PTR(-EMLINK);
- goto out;
- }
- }
-
- page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT));
-
- /*
- * Note that page may be a sub-page, and with vmemmap
- * optimizations the page struct may be read only.
- * try_grab_page() will increase the ref count on the
- * head page, so this will be OK.
- *
- * try_grab_page() should always be able to get the page here,
- * because we hold the ptl lock and have verified pte_present().
- */
- ret = try_grab_page(page, flags);
-
- if (WARN_ON_ONCE(ret)) {
- page = ERR_PTR(ret);
- goto out;
- }
-
- *page_mask = (1U << huge_page_order(h)) - 1;
- }
-out:
- spin_unlock(ptl);
-out_unlock:
- hugetlb_vma_unlock_read(vma);
-
- /*
- * Fixup retval for dump requests: if pagecache doesn't exist,
- * don't try to allocate a new page but just skip it.
- */
- if (!page && (flags & FOLL_DUMP) &&
- !hugetlbfs_pagecache_present(h, vma, address))
- page = ERR_PTR(-EFAULT);
-
- return page;
-}
-
long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
@@ -7867,9 +7794,9 @@ void __init hugetlb_cma_reserve(int order)
* huge page demotion.
*/
res = cma_declare_contiguous_nid(0, size, 0,
- PAGE_SIZE << HUGETLB_PAGE_ORDER,
- 0, false, name,
- &hugetlb_cma[nid], nid);
+ PAGE_SIZE << HUGETLB_PAGE_ORDER,
+ HUGETLB_PAGE_ORDER, false, name,
+ &hugetlb_cma[nid], nid);
if (res) {
pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
res, nid);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index da177e49d95648..b9a55322e52ce9 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -679,7 +679,6 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dobool,
},
- { }
};
static int __init hugetlb_vmemmap_init(void)
diff --git a/mm/internal.h b/mm/internal.h
index 07ad2675a88b47..22152e0c8494a8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,6 +11,8 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include <linux/tracepoint-defs.h>
struct folio_batch;
@@ -71,12 +73,27 @@ void page_writeback_init(void);
* How many individual pages have an elevated _mapcount. Excludes
* the folio's entire_mapcount.
*/
-static inline int folio_nr_pages_mapped(struct folio *folio)
+static inline int folio_nr_pages_mapped(const struct folio *folio)
{
return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED;
}
-static inline void *folio_raw_mapping(struct folio *folio)
+/*
+ * Retrieve the first entry of a folio based on a provided entry within the
+ * folio. We cannot rely on folio->swap as there is no guarantee that it has
+ * been initialized. Used for calling arch_swap_restore()
+ */
+static inline swp_entry_t folio_swap(swp_entry_t entry,
+ const struct folio *folio)
+{
+ swp_entry_t swap = {
+ .val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)),
+ };
+
+ return swap;
+}
+
+static inline void *folio_raw_mapping(const struct folio *folio)
{
unsigned long mapping = (unsigned long)folio->mapping;
@@ -113,6 +130,8 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
* @flags: Flags to modify the PTE batch semantics.
* @any_writable: Optional pointer to indicate whether any entry except the
* first one is writable.
+ * @any_young: Optional pointer to indicate whether any entry except the
+ * first one is young.
*
* Detect a PTE batch: consecutive (present) PTEs that map consecutive
* pages of the same large folio.
@@ -128,16 +147,18 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
*/
static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
- bool *any_writable)
+ bool *any_writable, bool *any_young)
{
unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
const pte_t *end_ptep = start_ptep + max_nr;
pte_t expected_pte, *ptep;
- bool writable;
+ bool writable, young;
int nr;
if (any_writable)
*any_writable = false;
+ if (any_young)
+ *any_young = false;
VM_WARN_ON_FOLIO(!pte_present(pte), folio);
VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
@@ -151,6 +172,8 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
pte = ptep_get(ptep);
if (any_writable)
writable = !!pte_write(pte);
+ if (any_young)
+ young = !!pte_young(pte);
pte = __pte_batch_clear_ignored(pte, flags);
if (!pte_same(pte, expected_pte))
@@ -166,6 +189,8 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
if (any_writable)
*any_writable |= writable;
+ if (any_young)
+ *any_young |= young;
nr = pte_batch_hint(ptep, pte);
expected_pte = pte_advance_pfn(expected_pte, nr);
@@ -174,6 +199,68 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
return min(ptep - start_ptep, max_nr);
}
+
+/**
+ * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
+ * @pte: The initial pte state; is_swap_pte(pte) must be true and
+ * non_swap_entry() must be false.
+ *
+ * Increments the swap offset, while maintaining all other fields, including
+ * swap type, and any swp pte bits. The resulting pte is returned.
+ */
+static inline pte_t pte_next_swp_offset(pte_t pte)
+{
+ swp_entry_t entry = pte_to_swp_entry(pte);
+ pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
+ (swp_offset(entry) + 1)));
+
+ if (pte_swp_soft_dirty(pte))
+ new = pte_swp_mksoft_dirty(new);
+ if (pte_swp_exclusive(pte))
+ new = pte_swp_mkexclusive(new);
+ if (pte_swp_uffd_wp(pte))
+ new = pte_swp_mkuffd_wp(new);
+
+ return new;
+}
+
+/**
+ * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
+ * @start_ptep: Page table pointer for the first entry.
+ * @max_nr: The maximum number of table entries to consider.
+ * @pte: Page table entry for the first entry.
+ *
+ * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs
+ * containing swap entries all with consecutive offsets and targeting the same
+ * swap type, all with matching swp pte bits.
+ *
+ * max_nr must be at least one and must be limited by the caller so scanning
+ * cannot exceed a single page table.
+ *
+ * Return: the number of table entries in the batch.
+ */
+static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
+{
+ pte_t expected_pte = pte_next_swp_offset(pte);
+ const pte_t *end_ptep = start_ptep + max_nr;
+ pte_t *ptep = start_ptep + 1;
+
+ VM_WARN_ON(max_nr < 1);
+ VM_WARN_ON(!is_swap_pte(pte));
+ VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
+
+ while (ptep < end_ptep) {
+ pte = ptep_get(ptep);
+
+ if (!pte_same(pte, expected_pte))
+ break;
+
+ expected_pte = pte_next_swp_offset(expected_pte);
+ ptep++;
+ }
+
+ return ptep - start_ptep;
+}
#endif /* CONFIG_MMU */
void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
@@ -491,6 +578,7 @@ extern void __putback_isolated_page(struct page *page, unsigned int order,
extern void memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order);
extern void __free_pages_core(struct page *page, unsigned int order);
+extern void kernel_init_pages(struct page *page, int numpages);
/*
* This will have no effect, other than possibly generating a warning, if the
@@ -513,7 +601,8 @@ static inline struct folio *page_rmappable_folio(struct page *page)
{
struct folio *folio = (struct folio *)page;
- folio_prep_large_rmappable(folio);
+ if (folio && folio_test_large(folio))
+ folio_set_large_rmappable(folio);
return folio;
}
@@ -525,6 +614,8 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
atomic_set(&folio->_entire_mapcount, -1);
atomic_set(&folio->_nr_pages_mapped, 0);
atomic_set(&folio->_pincount, 0);
+ if (order > 1)
+ INIT_LIST_HEAD(&folio->_deferred_list);
}
static inline void prep_compound_tail(struct page *head, int tail_idx)
@@ -559,10 +650,6 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
unsigned long, enum meminit_context, struct vmem_altmap *, int);
-
-int split_free_page(struct page *free_page,
- unsigned int order, unsigned long split_pfn_offset);
-
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/*
@@ -789,13 +876,17 @@ void mlock_drain_remote(int cpu);
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
-/*
- * Return the start of user virtual address at the specific offset within
- * a vma.
+/**
+ * vma_address - Find the virtual address a page range is mapped at
+ * @vma: The vma which maps this object.
+ * @pgoff: The page offset within its object.
+ * @nr_pages: The number of pages to consider.
+ *
+ * If any page in this range is mapped by this VMA, return the first address
+ * where any of these pages appear. Otherwise, return -EFAULT.
*/
-static inline unsigned long
-vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages,
- struct vm_area_struct *vma)
+static inline unsigned long vma_address(struct vm_area_struct *vma,
+ pgoff_t pgoff, unsigned long nr_pages)
{
unsigned long address;
@@ -815,18 +906,6 @@ vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages,
}
/*
- * Return the start of user virtual address of a page within a vma.
- * Returns -EFAULT if all of the page is outside the range of vma.
- * If page is a compound head, the entire compound page is considered.
- */
-static inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
-{
- VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
- return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma);
-}
-
-/*
* Then at what user virtual address will none of the range be found in vma?
* Assumes that vma_address() already returned a good starting address.
*/
@@ -1040,17 +1119,13 @@ static inline bool is_migrate_highatomic(enum migratetype migratetype)
return migratetype == MIGRATE_HIGHATOMIC;
}
-static inline bool is_migrate_highatomic_page(struct page *page)
-{
- return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC;
-}
-
void setup_zone_pageset(struct zone *zone);
struct migration_target_control {
int nid; /* preferred node id */
nodemask_t *nmask;
gfp_t gfp_mask;
+ enum migrate_reason reason;
};
/*
@@ -1087,10 +1162,10 @@ void vunmap_range_noflush(unsigned long start, unsigned long end);
void __vunmap_range_noflush(unsigned long start, unsigned long end);
-int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
+int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
unsigned long addr, int page_nid, int *flags);
-void free_zone_device_page(struct page *page);
+void free_zone_device_folio(struct folio *folio);
int migrate_device_coherent_page(struct page *page);
/*
@@ -1102,9 +1177,10 @@ int __must_check try_grab_page(struct page *page, unsigned int flags);
/*
* mm/huge_memory.c
*/
-struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmd,
- unsigned int flags);
+void touch_pud(struct vm_area_struct *vma, unsigned long addr,
+ pud_t *pud, bool write);
+void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, bool write);
/*
* mm/mmap.c
@@ -1189,20 +1265,10 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
}
/* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */
- if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
smp_rmb();
/*
- * During GUP-fast we might not get called on the head page for a
- * hugetlb page that is mapped using cont-PTE, because GUP-fast does
- * not work with the abstracted hugetlb PTEs that always point at the
- * head page. For hugetlb, PageAnonExclusive only applies on the head
- * page (as it cannot be partially COW-shared), so lookup the head page.
- */
- if (unlikely(!PageHead(page) && PageHuge(page)))
- page = compound_head(page);
-
- /*
* Note that PageKsm() pages cannot be exclusive, and consequently,
* cannot get pinned.
*/
@@ -1245,6 +1311,35 @@ static inline void vma_iter_config(struct vma_iterator *vmi,
__mas_set_range(&vmi->mas, index, last - 1);
}
+static inline void vma_iter_reset(struct vma_iterator *vmi)
+{
+ mas_reset(&vmi->mas);
+}
+
+static inline
+struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min)
+{
+ return mas_prev_range(&vmi->mas, min);
+}
+
+static inline
+struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max)
+{
+ return mas_next_range(&vmi->mas, max);
+}
+
+static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min,
+ unsigned long max, unsigned long size)
+{
+ return mas_empty_area(&vmi->mas, min, max - 1, size);
+}
+
+static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min,
+ unsigned long max, unsigned long size)
+{
+ return mas_empty_area_rev(&vmi->mas, min, max - 1, size);
+}
+
/*
* VMA Iterator functions shared between nommu and mmap
*/
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 2b994092a2d430..9958ebc15d3830 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -16,6 +16,7 @@
#include <linux/static_key.h>
#include <linux/string.h>
#include <linux/types.h>
+#include <linux/vmalloc.h>
#include "kasan.h"
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 8350f5c06f2e0a..964b8482275ba0 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -595,9 +595,9 @@ static unsigned long kfence_init_pool(void)
continue;
__folio_set_slab(slab_folio(slab));
-#ifdef CONFIG_MEMCG
- slab->memcg_data = (unsigned long)&kfence_metadata_init[i / 2 - 1].objcg |
- MEMCG_DATA_OBJCGS;
+#ifdef CONFIG_MEMCG_KMEM
+ slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts |
+ MEMCG_DATA_OBJEXTS;
#endif
}
@@ -645,8 +645,8 @@ reset_slab:
if (!i || (i % 2))
continue;
-#ifdef CONFIG_MEMCG
- slab->memcg_data = 0;
+#ifdef CONFIG_MEMCG_KMEM
+ slab->obj_exts = 0;
#endif
__folio_clear_slab(slab_folio(slab));
}
@@ -1139,8 +1139,8 @@ void __kfence_free(void *addr)
{
struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
-#ifdef CONFIG_MEMCG
- KFENCE_WARN_ON(meta->objcg);
+#ifdef CONFIG_MEMCG_KMEM
+ KFENCE_WARN_ON(meta->obj_exts.objcg);
#endif
/*
* If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index f46fbb03062b9c..084f5f36e8e76a 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -97,8 +97,8 @@ struct kfence_metadata {
struct kfence_track free_track;
/* For updating alloc_covered on frees. */
u32 alloc_stack_hash;
-#ifdef CONFIG_MEMCG
- struct obj_cgroup *objcg;
+#ifdef CONFIG_MEMCG_KMEM
+ struct slabobj_ext obj_exts;
#endif
};
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 38830174608fba..89e2624fb3ff11 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -767,7 +767,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
* Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC.
*
* @pte: starting of the PTEs to copy from
- * @page: the new hugepage to copy contents to
+ * @folio: the new hugepage to copy contents to
* @pmd: pointer to the new hugepage's PMD
* @orig_pmd: the original raw pages' PMD
* @vma: the original raw pages' virtual memory area
@@ -775,33 +775,29 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
* @ptl: lock on raw pages' PTEs
* @compound_pagelist: list that stores compound pages
*/
-static int __collapse_huge_page_copy(pte_t *pte,
- struct page *page,
- pmd_t *pmd,
- pmd_t orig_pmd,
- struct vm_area_struct *vma,
- unsigned long address,
- spinlock_t *ptl,
- struct list_head *compound_pagelist)
+static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
+ pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
+ unsigned long address, spinlock_t *ptl,
+ struct list_head *compound_pagelist)
{
- struct page *src_page;
- pte_t *_pte;
- pte_t pteval;
- unsigned long _address;
+ unsigned int i;
int result = SCAN_SUCCEED;
/*
* Copying pages' contents is subject to memory poison at any iteration.
*/
- for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR;
- _pte++, page++, _address += PAGE_SIZE) {
- pteval = ptep_get(_pte);
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ pte_t pteval = ptep_get(pte + i);
+ struct page *page = folio_page(folio, i);
+ unsigned long src_addr = address + i * PAGE_SIZE;
+ struct page *src_page;
+
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
- clear_user_highpage(page, _address);
+ clear_user_highpage(page, src_addr);
continue;
}
src_page = pte_page(pteval);
- if (copy_mc_user_highpage(page, src_page, _address, vma) > 0) {
+ if (copy_mc_user_highpage(page, src_page, src_addr, vma) > 0) {
result = SCAN_COPY_MC;
break;
}
@@ -891,20 +887,6 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc)
}
#endif
-static bool hpage_collapse_alloc_folio(struct folio **folio, gfp_t gfp, int node,
- nodemask_t *nmask)
-{
- *folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, nmask);
-
- if (unlikely(!*folio)) {
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- return false;
- }
-
- count_vm_event(THP_COLLAPSE_ALLOC);
- return true;
-}
-
/*
* If mmap_lock temporarily dropped, revalidate vma
* before taking mmap_lock.
@@ -1059,7 +1041,7 @@ out:
return result;
}
-static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
+static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
struct collapse_control *cc)
{
gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
@@ -1067,20 +1049,23 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
int node = hpage_collapse_find_target_node(cc);
struct folio *folio;
- if (!hpage_collapse_alloc_folio(&folio, gfp, node, &cc->alloc_nmask)) {
- *hpage = NULL;
+ folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
+ if (!folio) {
+ *foliop = NULL;
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
return SCAN_ALLOC_HUGE_PAGE_FAIL;
}
+ count_vm_event(THP_COLLAPSE_ALLOC);
if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
folio_put(folio);
- *hpage = NULL;
+ *foliop = NULL;
return SCAN_CGROUP_CHARGE_FAIL;
}
count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
- *hpage = folio_page(folio, 0);
+ *foliop = folio;
return SCAN_SUCCEED;
}
@@ -1093,7 +1078,6 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
pte_t *pte;
pgtable_t pgtable;
struct folio *folio;
- struct page *hpage;
spinlock_t *pmd_ptl, *pte_ptl;
int result = SCAN_FAIL;
struct vm_area_struct *vma;
@@ -1109,7 +1093,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
*/
mmap_read_unlock(mm);
- result = alloc_charge_hpage(&hpage, mm, cc);
+ result = alloc_charge_folio(&folio, mm, cc);
if (result != SCAN_SUCCEED)
goto out_nolock;
@@ -1169,7 +1153,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
* huge and small TLB entries for the same virtual address to
* avoid the risk of CPU bugs in that area.
*
- * Parallel fast GUP is fine since fast GUP will back off when
+ * Parallel GUP-fast is fine since GUP-fast will back off when
* it detects PMD is changed.
*/
_pmd = pmdp_collapse_flush(vma, address, pmd);
@@ -1208,14 +1192,13 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
*/
anon_vma_unlock_write(vma->anon_vma);
- result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd,
+ result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
vma, address, pte_ptl,
&compound_pagelist);
pte_unmap(pte);
if (unlikely(result != SCAN_SUCCEED))
goto out_up_write;
- folio = page_folio(hpage);
/*
* The smp_wmb() inside __folio_mark_uptodate() ensures the
* copy_huge_page writes become visible before the set_pmd_at()
@@ -1224,7 +1207,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
__folio_mark_uptodate(folio);
pgtable = pmd_pgtable(_pmd);
- _pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
+ _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot);
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
spin_lock(pmd_ptl);
@@ -1236,14 +1219,14 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
update_mmu_cache_pmd(vma, address, pmd);
spin_unlock(pmd_ptl);
- hpage = NULL;
+ folio = NULL;
result = SCAN_SUCCEED;
out_up_write:
mmap_write_unlock(mm);
out_nolock:
- if (hpage)
- put_page(hpage);
+ if (folio)
+ folio_put(folio);
trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
return result;
}
@@ -1797,29 +1780,26 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
struct collapse_control *cc)
{
struct address_space *mapping = file->f_mapping;
- struct page *hpage;
- struct page *page;
- struct page *tmp;
- struct folio *folio;
+ struct page *dst;
+ struct folio *folio, *tmp, *new_folio;
pgoff_t index = 0, end = start + HPAGE_PMD_NR;
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
int nr_none = 0, result = SCAN_SUCCEED;
bool is_shmem = shmem_file(file);
- int nr = 0;
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
- result = alloc_charge_hpage(&hpage, mm, cc);
+ result = alloc_charge_folio(&new_folio, mm, cc);
if (result != SCAN_SUCCEED)
goto out;
- __SetPageLocked(hpage);
+ __folio_set_locked(new_folio);
if (is_shmem)
- __SetPageSwapBacked(hpage);
- hpage->index = start;
- hpage->mapping = mapping;
+ __folio_set_swapbacked(new_folio);
+ new_folio->index = start;
+ new_folio->mapping = mapping;
/*
* Ensure we have slots for all the pages in the range. This is
@@ -1839,11 +1819,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
for (index = start; index < end; index++) {
xas_set(&xas, index);
- page = xas_load(&xas);
+ folio = xas_load(&xas);
VM_BUG_ON(index != xas.xa_index);
if (is_shmem) {
- if (!page) {
+ if (!folio) {
/*
* Stop if extent has been truncated or
* hole-punched, and is now completely
@@ -1859,7 +1839,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
continue;
}
- if (xa_is_value(page) || !PageUptodate(page)) {
+ if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
xas_unlock_irq(&xas);
/* swap in or instantiate fallocated page */
if (shmem_get_folio(mapping->host, index,
@@ -1869,28 +1849,27 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
}
/* drain lru cache to help isolate_lru_page() */
lru_add_drain();
- page = folio_file_page(folio, index);
- } else if (trylock_page(page)) {
- get_page(page);
+ } else if (folio_trylock(folio)) {
+ folio_get(folio);
xas_unlock_irq(&xas);
} else {
result = SCAN_PAGE_LOCK;
goto xa_locked;
}
} else { /* !is_shmem */
- if (!page || xa_is_value(page)) {
+ if (!folio || xa_is_value(folio)) {
xas_unlock_irq(&xas);
page_cache_sync_readahead(mapping, &file->f_ra,
file, index,
end - index);
/* drain lru cache to help isolate_lru_page() */
lru_add_drain();
- page = find_lock_page(mapping, index);
- if (unlikely(page == NULL)) {
+ folio = filemap_lock_folio(mapping, index);
+ if (IS_ERR(folio)) {
result = SCAN_FAIL;
goto xa_unlocked;
}
- } else if (PageDirty(page)) {
+ } else if (folio_test_dirty(folio)) {
/*
* khugepaged only works on read-only fd,
* so this page is dirty because it hasn't
@@ -1908,12 +1887,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
filemap_flush(mapping);
result = SCAN_FAIL;
goto xa_unlocked;
- } else if (PageWriteback(page)) {
+ } else if (folio_test_writeback(folio)) {
xas_unlock_irq(&xas);
result = SCAN_FAIL;
goto xa_unlocked;
- } else if (trylock_page(page)) {
- get_page(page);
+ } else if (folio_trylock(folio)) {
+ folio_get(folio);
xas_unlock_irq(&xas);
} else {
result = SCAN_PAGE_LOCK;
@@ -1922,35 +1901,31 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
}
/*
- * The page must be locked, so we can drop the i_pages lock
+ * The folio must be locked, so we can drop the i_pages lock
* without racing with truncate.
*/
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- /* make sure the page is up to date */
- if (unlikely(!PageUptodate(page))) {
+ /* make sure the folio is up to date */
+ if (unlikely(!folio_test_uptodate(folio))) {
result = SCAN_FAIL;
goto out_unlock;
}
/*
* If file was truncated then extended, or hole-punched, before
- * we locked the first page, then a THP might be there already.
+ * we locked the first folio, then a THP might be there already.
* This will be discovered on the first iteration.
*/
- if (PageTransCompound(page)) {
- struct page *head = compound_head(page);
-
- result = compound_order(head) == HPAGE_PMD_ORDER &&
- head->index == start
+ if (folio_test_large(folio)) {
+ result = folio_order(folio) == HPAGE_PMD_ORDER &&
+ folio->index == start
/* Maybe PMD-mapped */
? SCAN_PTE_MAPPED_HUGEPAGE
: SCAN_PAGE_COMPOUND;
goto out_unlock;
}
- folio = page_folio(page);
-
if (folio_mapping(folio) != mapping) {
result = SCAN_TRUNCATED;
goto out_unlock;
@@ -1960,7 +1935,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
folio_test_writeback(folio))) {
/*
* khugepaged only works on read-only fd, so this
- * page is dirty because it hasn't been flushed
+ * folio is dirty because it hasn't been flushed
* since first write.
*/
result = SCAN_FAIL;
@@ -1984,33 +1959,34 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
xas_lock_irq(&xas);
- VM_BUG_ON_PAGE(page != xa_load(xas.xa, index), page);
+ VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio);
/*
- * We control three references to the page:
+ * We control three references to the folio:
* - we hold a pin on it;
* - one reference from page cache;
- * - one from isolate_lru_page;
- * If those are the only references, then any new usage of the
- * page will have to fetch it from the page cache. That requires
- * locking the page to handle truncate, so any new usage will be
- * blocked until we unlock page after collapse/during rollback.
+ * - one from lru_isolate_folio;
+ * If those are the only references, then any new usage
+ * of the folio will have to fetch it from the page
+ * cache. That requires locking the folio to handle
+ * truncate, so any new usage will be blocked until we
+ * unlock folio after collapse/during rollback.
*/
- if (page_count(page) != 3) {
+ if (folio_ref_count(folio) != 3) {
result = SCAN_PAGE_COUNT;
xas_unlock_irq(&xas);
- putback_lru_page(page);
+ folio_putback_lru(folio);
goto out_unlock;
}
/*
- * Accumulate the pages that are being collapsed.
+ * Accumulate the folios that are being collapsed.
*/
- list_add_tail(&page->lru, &pagelist);
+ list_add_tail(&folio->lru, &pagelist);
continue;
out_unlock:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
goto xa_unlocked;
}
@@ -2049,23 +2025,27 @@ xa_unlocked:
}
/*
- * The old pages are locked, so they won't change anymore.
+ * The old folios are locked, so they won't change anymore.
*/
index = start;
- list_for_each_entry(page, &pagelist, lru) {
- while (index < page->index) {
- clear_highpage(hpage + (index % HPAGE_PMD_NR));
+ dst = folio_page(new_folio, 0);
+ list_for_each_entry(folio, &pagelist, lru) {
+ while (index < folio->index) {
+ clear_highpage(dst);
index++;
+ dst++;
}
- if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR), page) > 0) {
+ if (copy_mc_highpage(dst, folio_page(folio, 0)) > 0) {
result = SCAN_COPY_MC;
goto rollback;
}
index++;
+ dst++;
}
while (index < end) {
- clear_highpage(hpage + (index % HPAGE_PMD_NR));
+ clear_highpage(dst);
index++;
+ dst++;
}
if (nr_none) {
@@ -2093,16 +2073,17 @@ xa_unlocked:
}
/*
- * If userspace observed a missing page in a VMA with a MODE_MISSING
- * userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that
- * page. If so, we need to roll back to avoid suppressing such an
- * event. Since wp/minor userfaultfds don't give userspace any
- * guarantees that the kernel doesn't fill a missing page with a zero
- * page, so they don't matter here.
+ * If userspace observed a missing page in a VMA with
+ * a MODE_MISSING userfaultfd, then it might expect a
+ * UFFD_EVENT_PAGEFAULT for that page. If so, we need to
+ * roll back to avoid suppressing such an event. Since
+ * wp/minor userfaultfds don't give userspace any
+ * guarantees that the kernel doesn't fill a missing
+ * page with a zero page, so they don't matter here.
*
- * Any userfaultfds registered after this point will not be able to
- * observe any missing pages due to the previously inserted retry
- * entries.
+ * Any userfaultfds registered after this point will
+ * not be able to observe any missing pages due to the
+ * previously inserted retry entries.
*/
vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
if (userfaultfd_missing(vma)) {
@@ -2127,33 +2108,32 @@ immap_locked:
xas_lock_irq(&xas);
}
- folio = page_folio(hpage);
- nr = folio_nr_pages(folio);
if (is_shmem)
- __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
+ __lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
else
- __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr);
+ __lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);
if (nr_none) {
- __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_none);
+ __lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
/* nr_none is always 0 for non-shmem. */
- __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_none);
+ __lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
}
/*
- * Mark hpage as uptodate before inserting it into the page cache so
- * that it isn't mistaken for an fallocated but unwritten page.
+ * Mark new_folio as uptodate before inserting it into the
+ * page cache so that it isn't mistaken for an fallocated but
+ * unwritten page.
*/
- folio_mark_uptodate(folio);
- folio_ref_add(folio, HPAGE_PMD_NR - 1);
+ folio_mark_uptodate(new_folio);
+ folio_ref_add(new_folio, HPAGE_PMD_NR - 1);
if (is_shmem)
- folio_mark_dirty(folio);
- folio_add_lru(folio);
+ folio_mark_dirty(new_folio);
+ folio_add_lru(new_folio);
/* Join all the small entries into a single multi-index entry. */
xas_set_order(&xas, start, HPAGE_PMD_ORDER);
- xas_store(&xas, folio);
+ xas_store(&xas, new_folio);
WARN_ON_ONCE(xas_error(&xas));
xas_unlock_irq(&xas);
@@ -2164,18 +2144,18 @@ immap_locked:
retract_page_tables(mapping, start);
if (cc && !cc->is_khugepaged)
result = SCAN_PTE_MAPPED_HUGEPAGE;
- folio_unlock(folio);
+ folio_unlock(new_folio);
/*
- * The collapse has succeeded, so free the old pages.
+ * The collapse has succeeded, so free the old folios.
*/
- list_for_each_entry_safe(page, tmp, &pagelist, lru) {
- list_del(&page->lru);
- page->mapping = NULL;
- ClearPageActive(page);
- ClearPageUnevictable(page);
- unlock_page(page);
- folio_put_refs(page_folio(page), 3);
+ list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
+ list_del(&folio->lru);
+ folio->mapping = NULL;
+ folio_clear_active(folio);
+ folio_clear_unevictable(folio);
+ folio_unlock(folio);
+ folio_put_refs(folio, 3);
}
goto out;
@@ -2189,11 +2169,11 @@ rollback:
shmem_uncharge(mapping->host, nr_none);
}
- list_for_each_entry_safe(page, tmp, &pagelist, lru) {
- list_del(&page->lru);
- unlock_page(page);
- putback_lru_page(page);
- put_page(page);
+ list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
+ list_del(&folio->lru);
+ folio_unlock(folio);
+ folio_putback_lru(folio);
+ folio_put(folio);
}
/*
* Undo the updates of filemap_nr_thps_inc for non-SHMEM
@@ -2209,13 +2189,13 @@ rollback:
smp_mb();
}
- hpage->mapping = NULL;
+ new_folio->mapping = NULL;
- unlock_page(hpage);
- put_page(hpage);
+ folio_unlock(new_folio);
+ folio_put(new_folio);
out:
VM_BUG_ON(!list_empty(&pagelist));
- trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result);
+ trace_mm_khugepaged_collapse_file(mm, new_folio, index, is_shmem, addr, file, HPAGE_PMD_NR, result);
return result;
}
@@ -2223,7 +2203,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
struct file *file, pgoff_t start,
struct collapse_control *cc)
{
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct address_space *mapping = file->f_mapping;
XA_STATE(xas, &mapping->i_pages, start);
int present, swap;
@@ -2235,11 +2215,11 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
rcu_read_lock();
- xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) {
+ if (xas_retry(&xas, folio))
continue;
- if (xa_is_value(page)) {
+ if (xa_is_value(folio)) {
++swap;
if (cc->is_khugepaged &&
swap > khugepaged_max_ptes_swap) {
@@ -2254,11 +2234,9 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
* TODO: khugepaged should compact smaller compound pages
* into a PMD sized page
*/
- if (PageTransCompound(page)) {
- struct page *head = compound_head(page);
-
- result = compound_order(head) == HPAGE_PMD_ORDER &&
- head->index == start
+ if (folio_test_large(folio)) {
+ result = folio_order(folio) == HPAGE_PMD_ORDER &&
+ folio->index == start
/* Maybe PMD-mapped */
? SCAN_PTE_MAPPED_HUGEPAGE
: SCAN_PAGE_COMPOUND;
@@ -2271,28 +2249,29 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
break;
}
- node = page_to_nid(page);
+ node = folio_nid(folio);
if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
break;
}
cc->node_load[node]++;
- if (!PageLRU(page)) {
+ if (!folio_test_lru(folio)) {
result = SCAN_PAGE_LRU;
break;
}
- if (page_count(page) !=
- 1 + page_mapcount(page) + page_has_private(page)) {
+ if (folio_ref_count(folio) !=
+ 1 + folio_mapcount(folio) + folio_test_private(folio)) {
result = SCAN_PAGE_COUNT;
break;
}
/*
- * We probably should check if the page is referenced here, but
- * nobody would transfer pte_young() to PageReferenced() for us.
- * And rmap walk here is just too costly...
+ * We probably should check if the folio is referenced
+ * here, but nobody would transfer pte_young() to
+ * folio_test_referenced() for us. And rmap walk here
+ * is just too costly...
*/
present++;
@@ -2314,7 +2293,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
}
}
- trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result);
+ trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
return result;
}
#else
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 6a540c2b27c524..fdcf01f622028f 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -158,9 +158,9 @@ struct kmemleak_object {
int count;
/* checksum for detecting modified objects */
u32 checksum;
+ depot_stack_handle_t trace_handle;
/* memory ranges to be scanned inside an object (empty for all) */
struct hlist_head area_list;
- depot_stack_handle_t trace_handle;
unsigned long jiffies; /* creation timestamp */
pid_t pid; /* pid of the current task */
char comm[TASK_COMM_LEN]; /* executable name */
@@ -463,7 +463,7 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
/* try the slab allocator first */
if (object_cache) {
- object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
+ object = kmem_cache_alloc_noprof(object_cache, gfp_kmemleak_mask(gfp));
if (object)
return object;
}
@@ -947,7 +947,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer);
if (scan_area_cache)
- area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
+ area = kmem_cache_alloc_noprof(scan_area_cache, gfp_kmemleak_mask(gfp));
raw_spin_lock_irqsave(&object->lock, flags);
if (!area) {
diff --git a/mm/madvise.c b/mm/madvise.c
index 1a073fcc4c0c02..f59169888b8ee2 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -336,6 +336,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
LIST_HEAD(folio_list);
bool pageout_anon_only_filter;
unsigned int batch_count = 0;
+ int nr;
if (fatal_signal_pending(current))
return -EINTR;
@@ -363,10 +364,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
goto huge_unlock;
}
- folio = pfn_folio(pmd_pfn(orig_pmd));
+ folio = pmd_folio(orig_pmd);
/* Do not interfere with other mappings of this folio */
- if (folio_estimated_sharers(folio) != 1)
+ if (folio_likely_mapped_shared(folio))
goto huge_unlock;
if (pageout_anon_only_filter && !folio_test_anon(folio))
@@ -423,7 +424,8 @@ restart:
return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
- for (; addr < end; pte++, addr += PAGE_SIZE) {
+ for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
+ nr = 1;
ptent = ptep_get(pte);
if (++batch_count == SWAP_CLUSTER_MAX) {
@@ -447,55 +449,66 @@ restart:
continue;
/*
- * Creating a THP page is expensive so split it only if we
- * are sure it's worth. Split it if we are only owner.
+ * If we encounter a large folio, only split it if it is not
+ * fully mapped within the range we are operating on. Otherwise
+ * leave it as is so that it can be swapped out whole. If we
+ * fail to split a folio, leave it in place and advance to the
+ * next pte in the range.
*/
if (folio_test_large(folio)) {
- int err;
-
- if (folio_estimated_sharers(folio) > 1)
- break;
- if (pageout_anon_only_filter && !folio_test_anon(folio))
- break;
- if (!folio_trylock(folio))
- break;
- folio_get(folio);
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(start_pte, ptl);
- start_pte = NULL;
- err = split_folio(folio);
- folio_unlock(folio);
- folio_put(folio);
- if (err)
- break;
- start_pte = pte =
- pte_offset_map_lock(mm, pmd, addr, &ptl);
- if (!start_pte)
- break;
- arch_enter_lazy_mmu_mode();
- pte--;
- addr -= PAGE_SIZE;
- continue;
+ const fpb_t fpb_flags = FPB_IGNORE_DIRTY |
+ FPB_IGNORE_SOFT_DIRTY;
+ int max_nr = (end - addr) / PAGE_SIZE;
+ bool any_young;
+
+ nr = folio_pte_batch(folio, addr, pte, ptent, max_nr,
+ fpb_flags, NULL, &any_young);
+ if (any_young)
+ ptent = pte_mkyoung(ptent);
+
+ if (nr < folio_nr_pages(folio)) {
+ int err;
+
+ if (folio_likely_mapped_shared(folio))
+ continue;
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
+ continue;
+ if (!folio_trylock(folio))
+ continue;
+ folio_get(folio);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ start_pte = NULL;
+ err = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ start_pte = pte =
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!start_pte)
+ break;
+ arch_enter_lazy_mmu_mode();
+ if (!err)
+ nr = 0;
+ continue;
+ }
}
/*
* Do not interfere with other mappings of this folio and
- * non-LRU folio.
+ * non-LRU folio. If we have a large folio at this point, we
+ * know it is fully mapped so if its mapcount is the same as its
+ * number of pages, it must be exclusive.
*/
- if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
+ if (!folio_test_lru(folio) ||
+ folio_mapcount(folio) != folio_nr_pages(folio))
continue;
if (pageout_anon_only_filter && !folio_test_anon(folio))
continue;
- VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
if (!pageout && pte_young(ptent)) {
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
- ptent = pte_mkold(ptent);
- set_pte_at(mm, addr, pte, ptent);
- tlb_remove_tlb_entry(tlb, pte, addr);
+ mkold_ptes(vma, addr, pte, nr);
+ tlb_remove_tlb_entries(tlb, pte, nr, addr);
}
/*
@@ -628,6 +641,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
struct folio *folio;
int nr_swap = 0;
unsigned long next;
+ int nr, max_nr;
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd))
@@ -640,7 +654,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
- for (; addr != end; pte++, addr += PAGE_SIZE) {
+ for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
+ nr = 1;
ptent = ptep_get(pte);
if (pte_none(ptent))
@@ -655,9 +670,11 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
entry = pte_to_swp_entry(ptent);
if (!non_swap_entry(entry)) {
- nr_swap--;
- free_swap_and_cache(entry);
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ max_nr = (end - addr) / PAGE_SIZE;
+ nr = swap_pte_batch(pte, max_nr, ptent);
+ nr_swap -= nr;
+ free_swap_and_cache_nr(entry, nr);
+ clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
} else if (is_hwpoison_entry(entry) ||
is_poisoned_swp_entry(entry)) {
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
@@ -677,7 +694,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (folio_test_large(folio)) {
int err;
- if (folio_estimated_sharers(folio) != 1)
+ if (folio_likely_mapped_shared(folio))
break;
if (!folio_trylock(folio))
break;
@@ -901,26 +918,19 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
return -EINVAL;
}
-static long madvise_populate(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end,
- int behavior)
+static long madvise_populate(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int behavior)
{
const bool write = behavior == MADV_POPULATE_WRITE;
- struct mm_struct *mm = vma->vm_mm;
int locked = 1;
long pages;
- *prev = vma;
-
while (start < end) {
/* Populate (prefault) page tables readable/writable. */
pages = faultin_page_range(mm, start, end, write, &locked);
if (!locked) {
mmap_read_lock(mm);
locked = 1;
- *prev = NULL;
- vma = NULL;
}
if (pages < 0) {
switch (pages) {
@@ -1021,9 +1031,6 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
- case MADV_POPULATE_READ:
- case MADV_POPULATE_WRITE:
- return madvise_populate(vma, prev, start, end, behavior);
case MADV_NORMAL:
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
break;
@@ -1425,8 +1432,16 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
end = start + len;
blk_start_plug(&plug);
- error = madvise_walk_vmas(mm, start, end, behavior,
- madvise_vma_behavior);
+ switch (behavior) {
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ error = madvise_populate(mm, start, end, behavior);
+ break;
+ default:
+ error = madvise_walk_vmas(mm, start, end, behavior,
+ madvise_vma_behavior);
+ break;
+ }
blk_finish_plug(&plug);
if (write)
mmap_write_unlock(mm);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fabce2b50c6955..29fa5b17a1ef58 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -350,7 +350,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
/*
* A lot of the calls to the cache allocation functions are expected to be
- * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
+ * inlined by the compiler. Since the calls to memcg_slab_post_alloc_hook() are
* conditional to this static branch, we'll have to allow modules that does
* kmem_cache_alloc and the such to see this symbol as well
*/
@@ -2369,8 +2369,7 @@ static void drain_local_stock(struct work_struct *dummy)
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- if (old)
- obj_cgroup_put(old);
+ obj_cgroup_put(old);
}
/*
@@ -2978,13 +2977,6 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
}
#ifdef CONFIG_MEMCG_KMEM
-/*
- * The allocated objcg pointers array is not accounted directly.
- * Moreover, it should not come from DMA buffer and is not readily
- * reclaimable. So those GFP bits should be masked off.
- */
-#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \
- __GFP_ACCOUNT | __GFP_NOFAIL)
/*
* mod_objcg_mlstate() may be called with irq enabled, so
@@ -3004,62 +2996,27 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
rcu_read_unlock();
}
-int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
- gfp_t gfp, bool new_slab)
-{
- unsigned int objects = objs_per_slab(s, slab);
- unsigned long memcg_data;
- void *vec;
-
- gfp &= ~OBJCGS_CLEAR_MASK;
- vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
- slab_nid(slab));
- if (!vec)
- return -ENOMEM;
-
- memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
- if (new_slab) {
- /*
- * If the slab is brand new and nobody can yet access its
- * memcg_data, no synchronization is required and memcg_data can
- * be simply assigned.
- */
- slab->memcg_data = memcg_data;
- } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) {
- /*
- * If the slab is already in use, somebody can allocate and
- * assign obj_cgroups in parallel. In this case the existing
- * objcg vector should be reused.
- */
- kfree(vec);
- return 0;
- }
-
- kmemleak_not_leak(vec);
- return 0;
-}
-
static __always_inline
struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
{
/*
* Slab objects are accounted individually, not per-page.
* Memcg membership data for each individual object is saved in
- * slab->memcg_data.
+ * slab->obj_exts.
*/
if (folio_test_slab(folio)) {
- struct obj_cgroup **objcgs;
+ struct slabobj_ext *obj_exts;
struct slab *slab;
unsigned int off;
slab = folio_slab(folio);
- objcgs = slab_objcgs(slab);
- if (!objcgs)
+ obj_exts = slab_obj_exts(slab);
+ if (!obj_exts)
return NULL;
off = obj_to_index(slab->slab_cache, slab, p);
- if (objcgs[off])
- return obj_cgroup_memcg(objcgs[off]);
+ if (obj_exts[off].objcg)
+ return obj_cgroup_memcg(obj_exts[off].objcg);
return NULL;
}
@@ -3067,7 +3024,7 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
/*
* folio_memcg_check() is used here, because in theory we can encounter
* a folio where the slab flag has been cleared already, but
- * slab->memcg_data has not been freed yet
+ * slab->obj_exts has not been freed yet
* folio_memcg_check() will guarantee that a proper memory
* cgroup pointer or NULL will be returned.
*/
@@ -3145,8 +3102,7 @@ static struct obj_cgroup *current_objcg_update(void)
if (old) {
old = (struct obj_cgroup *)
((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG);
- if (old)
- obj_cgroup_put(old);
+ obj_cgroup_put(old);
old = NULL;
}
@@ -3418,8 +3374,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
mod_objcg_mlstate(objcg, pgdat, idx, nr);
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- if (old)
- obj_cgroup_put(old);
+ obj_cgroup_put(old);
}
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
@@ -3546,8 +3501,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
}
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- if (old)
- obj_cgroup_put(old);
+ obj_cgroup_put(old);
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
@@ -3602,6 +3556,96 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
refill_obj_stock(objcg, size, true);
}
+static inline size_t obj_full_size(struct kmem_cache *s)
+{
+ /*
+ * For each accounted object there is an extra space which is used
+ * to store obj_cgroup membership. Charge it too.
+ */
+ return s->size + sizeof(struct obj_cgroup *);
+}
+
+bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
+ gfp_t flags, size_t size, void **p)
+{
+ struct obj_cgroup *objcg;
+ struct slab *slab;
+ unsigned long off;
+ size_t i;
+
+ /*
+ * The obtained objcg pointer is safe to use within the current scope,
+ * defined by current task or set_active_memcg() pair.
+ * obj_cgroup_get() is used to get a permanent reference.
+ */
+ objcg = current_obj_cgroup();
+ if (!objcg)
+ return true;
+
+ /*
+ * slab_alloc_node() avoids the NULL check, so we might be called with a
+ * single NULL object. kmem_cache_alloc_bulk() aborts if it can't fill
+ * the whole requested size.
+ * return success as there's nothing to free back
+ */
+ if (unlikely(*p == NULL))
+ return true;
+
+ flags &= gfp_allowed_mask;
+
+ if (lru) {
+ int ret;
+ struct mem_cgroup *memcg;
+
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ ret = memcg_list_lru_alloc(memcg, lru, flags);
+ css_put(&memcg->css);
+
+ if (ret)
+ return false;
+ }
+
+ if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s)))
+ return false;
+
+ for (i = 0; i < size; i++) {
+ slab = virt_to_slab(p[i]);
+
+ if (!slab_obj_exts(slab) &&
+ alloc_slab_obj_exts(slab, s, flags, false)) {
+ obj_cgroup_uncharge(objcg, obj_full_size(s));
+ continue;
+ }
+
+ off = obj_to_index(s, slab, p[i]);
+ obj_cgroup_get(objcg);
+ slab_obj_exts(slab)[off].objcg = objcg;
+ mod_objcg_state(objcg, slab_pgdat(slab),
+ cache_vmstat_idx(s), obj_full_size(s));
+ }
+
+ return true;
+}
+
+void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
+ void **p, int objects, struct slabobj_ext *obj_exts)
+{
+ for (int i = 0; i < objects; i++) {
+ struct obj_cgroup *objcg;
+ unsigned int off;
+
+ off = obj_to_index(s, slab, p[i]);
+ objcg = obj_exts[off].objcg;
+ if (!objcg)
+ continue;
+
+ obj_exts[off].objcg = NULL;
+ obj_cgroup_uncharge(objcg, obj_full_size(s));
+ mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
+ -obj_full_size(s));
+ obj_cgroup_put(objcg);
+ }
+}
#endif /* CONFIG_MEMCG_KMEM */
/*
@@ -5468,8 +5512,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- if (memcg->orig_objcg)
- obj_cgroup_put(memcg->orig_objcg);
+ obj_cgroup_put(memcg->orig_objcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
@@ -6620,8 +6663,7 @@ static void mem_cgroup_exit(struct task_struct *task)
objcg = (struct obj_cgroup *)
((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG);
- if (objcg)
- obj_cgroup_put(objcg);
+ obj_cgroup_put(objcg);
/*
* Some kernel allocations can happen after this point,
@@ -7448,6 +7490,9 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
struct obj_cgroup *objcg;
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
+ !folio_test_hugetlb(folio) &&
+ !list_empty(&folio->_deferred_list), folio);
/*
* Nobody should be changing or seriously looking at
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9e62a00b46ddee..4f22330600ffbc 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -141,7 +141,6 @@ static struct ctl_table memory_failure_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- { }
};
/*
@@ -455,7 +454,7 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma);
if (is_zone_device_page(p)) {
if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
- tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
+ tk->addr = vma_address(vma, fsdax_pgoff, 1);
tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
} else
tk->size_shift = page_shift(compound_head(p));
@@ -1251,7 +1250,6 @@ static int me_huge_page(struct page_state *ps, struct page *p)
#define mlock (1UL << PG_mlocked)
#define lru (1UL << PG_lru)
#define head (1UL << PG_head)
-#define slab (1UL << PG_slab)
#define reserved (1UL << PG_reserved)
static struct page_state error_states[] = {
@@ -1261,13 +1259,6 @@ static struct page_state error_states[] = {
* PG_buddy pages only make a small fraction of all free pages.
*/
- /*
- * Could in theory check if slab page is free or if we can drop
- * currently unused objects without touching them. But just
- * treat it as standard kernel for now.
- */
- { slab, slab, MF_MSG_SLAB, me_kernel },
-
{ head, head, MF_MSG_HUGE, me_huge_page },
{ sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
@@ -1294,7 +1285,6 @@ static struct page_state error_states[] = {
#undef mlock
#undef lru
#undef head
-#undef slab
#undef reserved
static void update_per_node_mf_stats(unsigned long pfn,
@@ -2678,6 +2668,7 @@ static int soft_offline_in_use_page(struct page *page)
struct migration_target_control mtc = {
.nid = NUMA_NO_NODE,
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ .reason = MR_MEMORY_FAILURE,
};
if (!huge && folio_test_large(folio)) {
diff --git a/mm/memory.c b/mm/memory.c
index d2155ced45f8f8..59c05dc8b18a7a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -989,7 +989,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
flags |= FPB_IGNORE_SOFT_DIRTY;
nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
- &any_writable);
+ &any_writable, NULL);
folio_ref_add(folio, nr);
if (folio_test_anon(folio)) {
if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
@@ -1506,6 +1506,12 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
}
+
+ if (want_init_mlocked_on_free() && folio_test_mlocked(folio) &&
+ !delay_rmap && folio_test_anon(folio)) {
+ kernel_init_pages(page, folio_nr_pages(folio));
+ }
+
if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
*force_flush = true;
*force_break = true;
@@ -1553,7 +1559,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
*/
if (unlikely(folio_test_large(folio) && max_nr != 1)) {
nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
- NULL);
+ NULL, NULL);
zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
addr, details, rss, force_flush,
@@ -1631,12 +1637,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
folio_remove_rmap_pte(folio, page, vma);
folio_put(folio);
} else if (!non_swap_entry(entry)) {
- /* Genuine swap entry, hence a private anon page */
+ max_nr = (end - addr) / PAGE_SIZE;
+ nr = swap_pte_batch(pte, max_nr, ptent);
+ /* Genuine swap entries, hence a private anon pages */
if (!should_zap_cows(details))
continue;
- rss[MM_SWAPENTS]--;
- if (unlikely(!free_swap_and_cache(entry)))
- print_bad_pte(vma, addr, ptent, NULL);
+ rss[MM_SWAPENTS] -= nr;
+ free_swap_and_cache_nr(entry, nr);
} else if (is_migration_entry(entry)) {
folio = pfn_swap_entry_folio(entry);
if (!should_zap_folio(details, folio))
@@ -1659,8 +1666,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
WARN_ON_ONCE(1);
}
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
- zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
+ clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+ zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
add_mm_rss_vec(mm, rss);
@@ -2765,7 +2772,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
unsigned long next;
int err = 0;
- BUG_ON(pud_huge(*pud));
+ BUG_ON(pud_leaf(*pud));
if (create) {
pmd = pmd_alloc_track(mm, pud, addr, mask);
@@ -4190,7 +4197,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* when reading from swap. This metadata may be indexed by swap entry
* so this must be called before swap_free().
*/
- arch_swap_restore(entry, folio);
+ arch_swap_restore(folio_swap(entry, folio), folio);
/*
* Remove the swap entry and conditionally try to free up the swapcache.
@@ -4352,6 +4359,9 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
pte_unmap(pte);
+ if (!orders)
+ goto fallback;
+
/* Try allocating the highest of the remaining orders. */
gfp = vma_thp_gfp_mask(vma);
while (orders) {
@@ -5035,9 +5045,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
return ret;
}
-int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
+int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
unsigned long addr, int page_nid, int *flags)
{
+ struct vm_area_struct *vma = vmf->vma;
+
folio_get(folio);
/* Record the current PID acceesing VMA */
@@ -5049,7 +5061,55 @@ int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
*flags |= TNF_FAULT_LOCAL;
}
- return mpol_misplaced(folio, vma, addr);
+ return mpol_misplaced(folio, vmf, addr);
+}
+
+static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
+ unsigned long fault_addr, pte_t *fault_pte,
+ bool writable)
+{
+ pte_t pte, old_pte;
+
+ old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte);
+ pte = pte_modify(old_pte, vma->vm_page_prot);
+ pte = pte_mkyoung(pte);
+ if (writable)
+ pte = pte_mkwrite(pte, vma);
+ ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, pte);
+ update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1);
+}
+
+static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
+ struct folio *folio, pte_t fault_pte,
+ bool ignore_writable, bool pte_write_upgrade)
+{
+ int nr = pte_pfn(fault_pte) - folio_pfn(folio);
+ unsigned long start = max(vmf->address - nr * PAGE_SIZE, vma->vm_start);
+ unsigned long end = min(vmf->address + (folio_nr_pages(folio) - nr) * PAGE_SIZE, vma->vm_end);
+ pte_t *start_ptep = vmf->pte - (vmf->address - start) / PAGE_SIZE;
+ unsigned long addr;
+
+ /* Restore all PTEs' mapping of the large folio */
+ for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) {
+ pte_t ptent = ptep_get(start_ptep);
+ bool writable = false;
+
+ if (!pte_present(ptent) || !pte_protnone(ptent))
+ continue;
+
+ if (pfn_folio(pte_pfn(ptent)) != folio)
+ continue;
+
+ if (!ignore_writable) {
+ ptent = pte_modify(ptent, vma->vm_page_prot);
+ writable = pte_write(ptent);
+ if (!writable && pte_write_upgrade &&
+ can_change_pte_writable(vma, addr, ptent))
+ writable = true;
+ }
+
+ numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable);
+ }
}
static vm_fault_t do_numa_page(struct vm_fault *vmf)
@@ -5057,11 +5117,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct folio *folio = NULL;
int nid = NUMA_NO_NODE;
- bool writable = false;
+ bool writable = false, ignore_writable = false;
+ bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
int last_cpupid;
int target_nid;
pte_t pte, old_pte;
- int flags = 0;
+ int flags = 0, nr_pages;
/*
* The pte cannot be used safely until we verify, while holding the page
@@ -5083,7 +5144,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
* is only valid while holding the PT lock.
*/
writable = pte_write(pte);
- if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
+ if (!writable && pte_write_upgrade &&
can_change_pte_writable(vma, vmf->address, pte))
writable = true;
@@ -5091,10 +5152,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
if (!folio || folio_is_zone_device(folio))
goto out_map;
- /* TODO: handle PTE-mapped THP */
- if (folio_test_large(folio))
- goto out_map;
-
/*
* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
* much anyway since they can be in shared cache state. This misses
@@ -5110,10 +5167,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
* Flag if the folio is shared between multiple address spaces. This
* is later used when determining whether to group tasks together
*/
- if (folio_estimated_sharers(folio) > 1 && (vma->vm_flags & VM_SHARED))
+ if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
flags |= TNF_SHARED;
nid = folio_nid(folio);
+ nr_pages = folio_nr_pages(folio);
/*
* For memory tiering mode, cpupid of slow memory page is used
* to record page access time. So use default value.
@@ -5123,13 +5181,14 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
last_cpupid = (-1 & LAST_CPUPID_MASK);
else
last_cpupid = folio_last_cpupid(folio);
- target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags);
+ target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags);
if (target_nid == NUMA_NO_NODE) {
folio_put(folio);
goto out_map;
}
pte_unmap_unlock(vmf->pte, vmf->ptl);
writable = false;
+ ignore_writable = true;
/* Migrate to the requested node */
if (migrate_misplaced_folio(folio, vma, target_nid)) {
@@ -5150,20 +5209,19 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
out:
if (nid != NUMA_NO_NODE)
- task_numa_fault(last_cpupid, nid, 1, flags);
+ task_numa_fault(last_cpupid, nid, nr_pages, flags);
return 0;
out_map:
/*
* Make it present again, depending on how arch implements
* non-accessible ptes, some can allow access by kernel mode.
*/
- old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
- pte = pte_modify(old_pte, vma->vm_page_prot);
- pte = pte_mkyoung(pte);
- if (writable)
- pte = pte_mkwrite(pte, vma);
- ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
- update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
+ if (folio && folio_test_large(folio))
+ numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable,
+ pte_write_upgrade);
+ else
+ numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte,
+ writable);
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
@@ -5882,8 +5940,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
* Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
* should be taken for read.
*
- * KVM uses this function. While it is arguably less bad than ``follow_pfn``,
- * it is not a good general-purpose API.
+ * KVM uses this function. While it is arguably less bad than the historic
+ * ``follow_pfn``, it is not a good general-purpose API.
*
* Return: zero on success, -ve otherwise.
*/
@@ -5925,71 +5983,7 @@ out:
}
EXPORT_SYMBOL_GPL(follow_pte);
-/**
- * follow_pfn - look up PFN at a user virtual address
- * @vma: memory mapping
- * @address: user virtual address
- * @pfn: location to store found PFN
- *
- * Only IO mappings and raw PFN mappings are allowed.
- *
- * This function does not allow the caller to read the permissions
- * of the PTE. Do not use it.
- *
- * Return: zero and the pfn at @pfn on success, -ve otherwise.
- */
-int follow_pfn(struct vm_area_struct *vma, unsigned long address,
- unsigned long *pfn)
-{
- int ret = -EINVAL;
- spinlock_t *ptl;
- pte_t *ptep;
-
- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
- return ret;
-
- ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
- if (ret)
- return ret;
- *pfn = pte_pfn(ptep_get(ptep));
- pte_unmap_unlock(ptep, ptl);
- return 0;
-}
-EXPORT_SYMBOL(follow_pfn);
-
#ifdef CONFIG_HAVE_IOREMAP_PROT
-int follow_phys(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags,
- unsigned long *prot, resource_size_t *phys)
-{
- int ret = -EINVAL;
- pte_t *ptep, pte;
- spinlock_t *ptl;
-
- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
- goto out;
-
- if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
- goto out;
- pte = ptep_get(ptep);
-
- /* Never return PFNs of anon folios in COW mappings. */
- if (vm_normal_folio(vma, address, pte))
- goto unlock;
-
- if ((flags & FOLL_WRITE) && !pte_write(pte))
- goto unlock;
-
- *prot = pgprot_val(pte_pgprot(pte));
- *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
-
- ret = 0;
-unlock:
- pte_unmap_unlock(ptep, ptl);
-out:
- return ret;
-}
-
/**
* generic_access_phys - generic implementation for iomem mmap access
* @vma: the vma to access
@@ -6440,3 +6434,15 @@ void ptlock_free(struct ptdesc *ptdesc)
kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
}
#endif
+
+void vma_pgtable_walk_begin(struct vm_area_struct *vma)
+{
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_vma_lock_read(vma);
+}
+
+void vma_pgtable_walk_end(struct vm_area_struct *vma)
+{
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_vma_unlock_read(vma);
+}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a444e2d7dd2bff..b79ba36e09e034 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1841,6 +1841,7 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
struct migration_target_control mtc = {
.nmask = &nmask,
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ .reason = MR_MEMORY_HOTPLUG,
};
int ret;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0fe77738d971de..aec756ae563779 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -509,8 +509,8 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
qp->nr_failed++;
return;
}
- folio = pfn_folio(pmd_pfn(*pmd));
- if (is_huge_zero_page(&folio->page)) {
+ folio = pmd_folio(*pmd);
+ if (is_huge_zero_folio(folio)) {
walk->action = ACTION_CONTINUE;
return;
}
@@ -642,12 +642,11 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
* Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
* Choosing not to migrate a shared folio is not counted as a failure.
*
- * To check if the folio is shared, ideally we want to make sure
- * every page is mapped to the same process. Doing that is very
- * expensive, so check the estimated sharers of the folio instead.
+ * See folio_likely_mapped_shared() on possible imprecision when we
+ * cannot easily detect if a folio is shared.
*/
if ((flags & MPOL_MF_MOVE_ALL) ||
- (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte)))
+ (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
if (!isolate_hugetlb(folio, qp->pagelist))
qp->nr_failed++;
unlock:
@@ -1032,11 +1031,10 @@ static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
* Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
* Choosing not to migrate a shared folio is not counted as a failure.
*
- * To check if the folio is shared, ideally we want to make sure
- * every page is mapped to the same process. Doing that is very
- * expensive, so check the estimated sharers of the folio instead.
+ * See folio_likely_mapped_shared() on possible imprecision when we
+ * cannot easily detect if a folio is shared.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
+ if ((flags & MPOL_MF_MOVE_ALL) || !folio_likely_mapped_shared(folio)) {
if (folio_isolate_lru(folio)) {
list_add_tail(&folio->lru, foliolist);
node_stat_mod_folio(folio,
@@ -1070,6 +1068,7 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest,
struct migration_target_control mtc = {
.nid = dest,
.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ .reason = MR_SYSCALL,
};
nodes_clear(nmask);
@@ -1227,7 +1226,8 @@ static struct folio *alloc_migration_target_by_mpol(struct folio *src,
h = folio_hstate(src);
gfp = htlb_alloc_mask(h);
nodemask = policy_nodemask(gfp, pol, ilx, &nid);
- return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp);
+ return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
+ htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
}
if (folio_test_large(src))
@@ -1504,9 +1504,10 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
return -EINVAL;
if (*flags & MPOL_F_NUMA_BALANCING) {
- if (*mode != MPOL_BIND)
+ if (*mode == MPOL_BIND || *mode == MPOL_PREFERRED_MANY)
+ *flags |= (MPOL_F_MOF | MPOL_F_MORON);
+ else
return -EINVAL;
- *flags |= (MPOL_F_MOF | MPOL_F_MORON);
}
return 0;
}
@@ -2200,9 +2201,9 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
*/
preferred_gfp = gfp | __GFP_NOWARN;
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
- page = __alloc_pages(preferred_gfp, order, nid, nodemask);
+ page = __alloc_pages_noprof(preferred_gfp, order, nid, nodemask);
if (!page)
- page = __alloc_pages(gfp, order, nid, NULL);
+ page = __alloc_pages_noprof(gfp, order, nid, NULL);
return page;
}
@@ -2217,7 +2218,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
*
* Return: The page on success or NULL if allocation fails.
*/
-struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
struct mempolicy *pol, pgoff_t ilx, int nid)
{
nodemask_t *nodemask;
@@ -2248,7 +2249,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
* First, try to allocate THP only on local node, but
* don't reclaim unnecessarily, just compact.
*/
- page = __alloc_pages_node(nid,
+ page = __alloc_pages_node_noprof(nid,
gfp | __GFP_THISNODE | __GFP_NORETRY, order);
if (page || !(gfp & __GFP_DIRECT_RECLAIM))
return page;
@@ -2261,7 +2262,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
}
}
- page = __alloc_pages(gfp, order, nid, nodemask);
+ page = __alloc_pages_noprof(gfp, order, nid, nodemask);
if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) {
/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
@@ -2292,7 +2293,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
*
* Return: The folio on success or NULL if allocation fails.
*/
-struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
+struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, bool hugepage)
{
struct mempolicy *pol;
@@ -2300,12 +2301,12 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
struct page *page;
pol = get_vma_policy(vma, addr, order, &ilx);
- page = alloc_pages_mpol(gfp | __GFP_COMP, order,
- pol, ilx, numa_node_id());
+ page = alloc_pages_mpol_noprof(gfp | __GFP_COMP, order,
+ pol, ilx, numa_node_id());
mpol_cond_put(pol);
return page_rmappable_folio(page);
}
-EXPORT_SYMBOL(vma_alloc_folio);
+EXPORT_SYMBOL(vma_alloc_folio_noprof);
/**
* alloc_pages - Allocate pages.
@@ -2321,7 +2322,7 @@ EXPORT_SYMBOL(vma_alloc_folio);
* flags are used.
* Return: The page on success or NULL if allocation fails.
*/
-struct page *alloc_pages(gfp_t gfp, unsigned int order)
+struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
{
struct mempolicy *pol = &default_policy;
@@ -2332,16 +2333,16 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order)
if (!in_interrupt() && !(gfp & __GFP_THISNODE))
pol = get_task_policy(current);
- return alloc_pages_mpol(gfp, order,
- pol, NO_INTERLEAVE_INDEX, numa_node_id());
+ return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX,
+ numa_node_id());
}
-EXPORT_SYMBOL(alloc_pages);
+EXPORT_SYMBOL(alloc_pages_noprof);
-struct folio *folio_alloc(gfp_t gfp, unsigned int order)
+struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
- return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order));
+ return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order));
}
-EXPORT_SYMBOL(folio_alloc);
+EXPORT_SYMBOL(folio_alloc_noprof);
static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
struct mempolicy *pol, unsigned long nr_pages,
@@ -2360,13 +2361,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
for (i = 0; i < nodes; i++) {
if (delta) {
- nr_allocated = __alloc_pages_bulk(gfp,
+ nr_allocated = alloc_pages_bulk_noprof(gfp,
interleave_nodes(pol), NULL,
nr_pages_per_node + 1, NULL,
page_array);
delta--;
} else {
- nr_allocated = __alloc_pages_bulk(gfp,
+ nr_allocated = alloc_pages_bulk_noprof(gfp,
interleave_nodes(pol), NULL,
nr_pages_per_node, NULL, page_array);
}
@@ -2503,11 +2504,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
preferred_gfp = gfp | __GFP_NOWARN;
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
- nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
+ nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
nr_pages, NULL, page_array);
if (nr_allocated < nr_pages)
- nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
+ nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
nr_pages - nr_allocated, NULL,
page_array + nr_allocated);
return nr_allocated;
@@ -2519,7 +2520,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
* It can accelerate memory allocation especially interleaving
* allocate memory.
*/
-unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
+unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
unsigned long nr_pages, struct page **page_array)
{
struct mempolicy *pol = &default_policy;
@@ -2543,8 +2544,8 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
nid = numa_node_id();
nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
- return __alloc_pages_bulk(gfp, nid, nodemask,
- nr_pages, NULL, page_array);
+ return alloc_pages_bulk_noprof(gfp, nid, nodemask,
+ nr_pages, NULL, page_array);
}
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
@@ -2718,7 +2719,7 @@ static void sp_free(struct sp_node *n)
* mpol_misplaced - check whether current folio node is valid in policy
*
* @folio: folio to be checked
- * @vma: vm area where folio mapped
+ * @vmf: structure describing the fault
* @addr: virtual address in @vma for shared policy lookup and interleave policy
*
* Lookup current policy node id for vma,addr and "compare to" folio's
@@ -2728,18 +2729,24 @@ static void sp_free(struct sp_node *n)
* Return: NUMA_NO_NODE if the page is in a node that is valid for this
* policy, or a suitable node ID to allocate a replacement folio from.
*/
-int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
+int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
unsigned long addr)
{
struct mempolicy *pol;
pgoff_t ilx;
struct zoneref *z;
int curnid = folio_nid(folio);
+ struct vm_area_struct *vma = vmf->vma;
int thiscpu = raw_smp_processor_id();
- int thisnid = cpu_to_node(thiscpu);
+ int thisnid = numa_node_id();
int polnid = NUMA_NO_NODE;
int ret = NUMA_NO_NODE;
+ /*
+ * Make sure ptl is held so that we don't preempt and we
+ * have a stable smp processor id
+ */
+ lockdep_assert_held(vmf->ptl);
pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
if (!(pol->flags & MPOL_F_MOF))
goto out;
@@ -2764,15 +2771,26 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
break;
case MPOL_BIND:
- /* Optimize placement among multiple nodes via NUMA balancing */
+ case MPOL_PREFERRED_MANY:
+ /*
+ * Even though MPOL_PREFERRED_MANY can allocate pages outside
+ * policy nodemask we don't allow numa migration to nodes
+ * outside policy nodemask for now. This is done so that if we
+ * want demotion to slow memory to happen, before allocating
+ * from some DRAM node say 'x', we will end up using a
+ * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
+ * we should not promote to node 'x' from slow memory node.
+ */
if (pol->flags & MPOL_F_MORON) {
+ /*
+ * Optimize placement among multiple nodes
+ * via NUMA balancing
+ */
if (node_isset(thisnid, pol->nodes))
break;
goto out;
}
- fallthrough;
- case MPOL_PREFERRED_MANY:
/*
* use current page if in policy nodemask,
* else select nearest allowed node, if any.
@@ -2781,7 +2799,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
if (node_isset(curnid, pol->nodes))
goto out;
z = first_zones_zonelist(
- node_zonelist(numa_node_id(), GFP_HIGHUSER),
+ node_zonelist(thisnid, GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->nodes);
polnid = zone_to_nid(z->zone);
diff --git a/mm/mempool.c b/mm/mempool.c
index 076c736f5f1ff8..6ece63a00acf58 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -240,22 +240,24 @@ EXPORT_SYMBOL(mempool_init_node);
*
* Return: %0 on success, negative error code otherwise.
*/
-int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
- mempool_free_t *free_fn, void *pool_data)
+int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data)
{
return mempool_init_node(pool, min_nr, alloc_fn, free_fn,
pool_data, GFP_KERNEL, NUMA_NO_NODE);
}
-EXPORT_SYMBOL(mempool_init);
+EXPORT_SYMBOL(mempool_init_noprof);
/**
- * mempool_create - create a memory pool
+ * mempool_create_node - create a memory pool
* @min_nr: the minimum number of elements guaranteed to be
* allocated for this pool.
* @alloc_fn: user-defined element-allocation function.
* @free_fn: user-defined element-freeing function.
* @pool_data: optional private data available to the user-defined functions.
+ * @gfp_mask: memory allocation flags
+ * @node_id: numa node to allocate on
*
* this function creates and allocates a guaranteed size, preallocated
* memory pool. The pool can be used from the mempool_alloc() and mempool_free()
@@ -265,17 +267,9 @@ EXPORT_SYMBOL(mempool_init);
*
* Return: pointer to the created memory pool object or %NULL on error.
*/
-mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
- mempool_free_t *free_fn, void *pool_data)
-{
- return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data,
- GFP_KERNEL, NUMA_NO_NODE);
-}
-EXPORT_SYMBOL(mempool_create);
-
-mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
- mempool_free_t *free_fn, void *pool_data,
- gfp_t gfp_mask, int node_id)
+mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data,
+ gfp_t gfp_mask, int node_id)
{
mempool_t *pool;
@@ -291,7 +285,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
return pool;
}
-EXPORT_SYMBOL(mempool_create_node);
+EXPORT_SYMBOL(mempool_create_node_noprof);
/**
* mempool_resize - resize an existing memory pool
@@ -387,7 +381,7 @@ EXPORT_SYMBOL(mempool_resize);
*
* Return: pointer to the allocated element or %NULL on error.
*/
-void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
+void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask)
{
void *element;
unsigned long flags;
@@ -454,7 +448,7 @@ repeat_alloc:
finish_wait(&pool->wait, &wait);
goto repeat_alloc;
}
-EXPORT_SYMBOL(mempool_alloc);
+EXPORT_SYMBOL(mempool_alloc_noprof);
/**
* mempool_alloc_preallocated - allocate an element from preallocated elements
@@ -562,7 +556,7 @@ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
{
struct kmem_cache *mem = pool_data;
VM_BUG_ON(mem->ctor);
- return kmem_cache_alloc(mem, gfp_mask);
+ return kmem_cache_alloc_noprof(mem, gfp_mask);
}
EXPORT_SYMBOL(mempool_alloc_slab);
@@ -580,7 +574,7 @@ EXPORT_SYMBOL(mempool_free_slab);
void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
{
size_t size = (size_t)pool_data;
- return kmalloc(size, gfp_mask);
+ return kmalloc_noprof(size, gfp_mask);
}
EXPORT_SYMBOL(mempool_kmalloc);
@@ -610,7 +604,7 @@ EXPORT_SYMBOL(mempool_kvfree);
void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
{
int order = (int)(long)pool_data;
- return alloc_pages(gfp_mask, order);
+ return alloc_pages_noprof(gfp_mask, order);
}
EXPORT_SYMBOL(mempool_alloc_pages);
diff --git a/mm/memremap.c b/mm/memremap.c
index 9e9fb1972fff0a..e1776693e2eaed 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -456,21 +456,23 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
}
EXPORT_SYMBOL_GPL(get_dev_pagemap);
-void free_zone_device_page(struct page *page)
+void free_zone_device_folio(struct folio *folio)
{
- if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free))
+ if (WARN_ON_ONCE(!folio->page.pgmap->ops ||
+ !folio->page.pgmap->ops->page_free))
return;
- mem_cgroup_uncharge(page_folio(page));
+ mem_cgroup_uncharge(folio);
/*
* Note: we don't expect anonymous compound pages yet. Once supported
* and we could PTE-map them similar to THP, we'd have to clear
* PG_anon_exclusive on all tail pages.
*/
- VM_BUG_ON_PAGE(PageAnon(page) && PageCompound(page), page);
- if (PageAnon(page))
- __ClearPageAnonExclusive(page);
+ if (folio_test_anon(folio)) {
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
+ __ClearPageAnonExclusive(folio_page(folio, 0));
+ }
/*
* When a device managed page is freed, the folio->mapping field
@@ -481,20 +483,20 @@ void free_zone_device_page(struct page *page)
*
* For other types of ZONE_DEVICE pages, migration is either
* handled differently or not done at all, so there is no need
- * to clear page->mapping.
+ * to clear folio->mapping.
*/
- page->mapping = NULL;
- page->pgmap->ops->page_free(page);
+ folio->mapping = NULL;
+ folio->page.pgmap->ops->page_free(folio_page(folio, 0));
- if (page->pgmap->type != MEMORY_DEVICE_PRIVATE &&
- page->pgmap->type != MEMORY_DEVICE_COHERENT)
+ if (folio->page.pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ folio->page.pgmap->type != MEMORY_DEVICE_COHERENT)
/*
- * Reset the page count to 1 to prepare for handing out the page
+ * Reset the refcount to 1 to prepare for handing out the page
* again.
*/
- set_page_count(page, 1);
+ folio_set_count(folio, 1);
else
- put_dev_pagemap(page->pgmap);
+ put_dev_pagemap(folio->page.pgmap);
}
void zone_device_page_init(struct page *page)
diff --git a/mm/migrate.c b/mm/migrate.c
index 73a052a382f13a..285072bca29ce2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -113,7 +113,7 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode)
if (!mops->isolate_page(&folio->page, mode))
goto out_no_isolated;
- /* Driver shouldn't use PG_isolated bit of page->flags */
+ /* Driver shouldn't use the isolated flag */
WARN_ON_ONCE(folio_test_isolated(folio));
folio_set_isolated(folio);
folio_unlock(folio);
@@ -1653,6 +1653,29 @@ static int migrate_pages_batch(struct list_head *from,
cond_resched();
/*
+ * The rare folio on the deferred split list should
+ * be split now. It should not count as a failure.
+ * Only check it without removing it from the list.
+ * Since the folio can be on deferred_split_scan()
+ * local list and removing it can cause the local list
+ * corruption. Folio split process below can handle it
+ * with the help of folio_ref_freeze().
+ *
+ * nr_pages > 2 is needed to avoid checking order-1
+ * page cache folios. They exist, in contrast to
+ * non-existent order-1 anonymous folios, and do not
+ * use _deferred_list.
+ */
+ if (nr_pages > 2 &&
+ !list_empty(&folio->_deferred_list)) {
+ if (try_split_folio(folio, split_folios) == 0) {
+ stats->nr_thp_split += is_thp;
+ stats->nr_split++;
+ continue;
+ }
+ }
+
+ /*
* Large folio migration might be unsupported or
* the allocation might be failed so we should retry
* on the same folio with the large folio split
@@ -2022,7 +2045,8 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private)
gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
return alloc_hugetlb_folio_nodemask(h, nid,
- mtc->nmask, gfp_mask);
+ mtc->nmask, gfp_mask,
+ htlb_allow_alloc_fallback(mtc->reason));
}
if (folio_test_large(src)) {
@@ -2060,6 +2084,7 @@ static int do_move_pages_to_node(struct list_head *pagelist, int node)
struct migration_target_control mtc = {
.nid = node,
.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ .reason = MR_SYSCALL,
};
err = migrate_pages(pagelist, alloc_migration_target, NULL,
@@ -2568,11 +2593,11 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
/*
* Don't migrate file folios that are mapped in multiple processes
* with execute permissions as they are probably shared libraries.
- * To check if the folio is shared, ideally we want to make sure
- * every page is mapped to the same process. Doing that is very
- * expensive, so check the estimated mapcount of the folio instead.
+ *
+ * See folio_likely_mapped_shared() on possible imprecision when we
+ * cannot easily detect if a folio is shared.
*/
- if (folio_estimated_sharers(folio) != 1 && folio_is_file_lru(folio) &&
+ if (folio_likely_mapped_shared(folio) && folio_is_file_lru(folio) &&
(vma->vm_flags & VM_EXEC))
goto out;
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index b6c27c76e1a0b2..d40b46ae9d65b7 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -71,7 +71,7 @@ again:
return migrate_vma_collect_hole(start, end, -1, walk);
if (pmd_trans_huge(*pmdp)) {
- struct page *page;
+ struct folio *folio;
ptl = pmd_lock(mm, pmdp);
if (unlikely(!pmd_trans_huge(*pmdp))) {
@@ -79,21 +79,21 @@ again:
goto again;
}
- page = pmd_page(*pmdp);
- if (is_huge_zero_page(page)) {
+ folio = pmd_folio(*pmdp);
+ if (is_huge_zero_folio(folio)) {
spin_unlock(ptl);
split_huge_pmd(vma, pmdp, addr);
} else {
int ret;
- get_page(page);
+ folio_get(folio);
spin_unlock(ptl);
- if (unlikely(!trylock_page(page)))
+ if (unlikely(!folio_trylock(folio)))
return migrate_vma_collect_skip(start, end,
walk);
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
+ ret = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
if (ret)
return migrate_vma_collect_skip(start, end,
walk);
diff --git a/mm/mlock.c b/mm/mlock.c
index 1ed2f2ab37cd18..30b51cdea89dec 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -378,7 +378,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
goto out;
if (is_huge_zero_pmd(*pmd))
goto out;
- folio = page_folio(pmd_page(*pmd));
+ folio = pmd_folio(*pmd);
if (vma->vm_flags & VM_LOCKED)
mlock_folio(folio);
else
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 549e76af8f82a8..2c8f3af4430f5c 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -24,6 +24,7 @@
#include <linux/page_ext.h>
#include <linux/pti.h>
#include <linux/pgtable.h>
+#include <linux/stackdepot.h>
#include <linux/swap.h>
#include <linux/cma.h>
#include <linux/crash_dump.h>
@@ -226,7 +227,6 @@ static unsigned long required_movablecore_percent __initdata;
static unsigned long nr_kernel_pages __initdata;
static unsigned long nr_all_pages __initdata;
-static unsigned long dma_reserve __initdata;
static bool deferred_struct_pages __meminitdata;
@@ -1144,7 +1144,7 @@ static void __init adjust_zone_range_for_zone_movable(int nid,
* Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
* then all holes in the requested range will be accounted for.
*/
-unsigned long __init __absent_pages_in_range(int nid,
+static unsigned long __init __absent_pages_in_range(int nid,
unsigned long range_start_pfn,
unsigned long range_end_pfn)
{
@@ -1265,6 +1265,30 @@ static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat)
pr_debug("On node %d totalpages: 0\n", pgdat->node_id);
}
+static void __init calc_nr_kernel_pages(void)
+{
+ unsigned long start_pfn, end_pfn;
+ phys_addr_t start_addr, end_addr;
+ u64 u;
+#ifdef CONFIG_HIGHMEM
+ unsigned long high_zone_low = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
+#endif
+
+ for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
+ start_pfn = PFN_UP(start_addr);
+ end_pfn = PFN_DOWN(end_addr);
+
+ if (start_pfn < end_pfn) {
+ nr_all_pages += end_pfn - start_pfn;
+#ifdef CONFIG_HIGHMEM
+ start_pfn = clamp(start_pfn, 0, high_zone_low);
+ end_pfn = clamp(end_pfn, 0, high_zone_low);
+#endif
+ nr_kernel_pages += end_pfn - start_pfn;
+ }
+ }
+}
+
static void __init calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long node_start_pfn,
unsigned long node_end_pfn)
@@ -1308,26 +1332,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
}
-static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
- unsigned long present_pages)
-{
- unsigned long pages = spanned_pages;
-
- /*
- * Provide a more accurate estimation if there are holes within
- * the zone and SPARSEMEM is in use. If there are holes within the
- * zone, each populated memory region may cost us one or two extra
- * memmap pages due to alignment because memmap pages for each
- * populated regions may not be naturally aligned on page boundary.
- * So the (present_pages >> 4) heuristic is a tradeoff for that.
- */
- if (spanned_pages > present_pages + (present_pages >> 4) &&
- IS_ENABLED(CONFIG_SPARSEMEM))
- pages = present_pages;
-
- return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
-}
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void pgdat_init_split_queue(struct pglist_data *pgdat)
{
@@ -1542,15 +1546,6 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
}
#endif
-/*
- * Set up the zone data structures:
- * - mark all pages reserved
- * - mark all memory queues empty
- * - clear the memory bitmaps
- *
- * NOTE: pgdat should get zeroed by caller.
- * NOTE: this function is only called during early init.
- */
static void __init free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
@@ -1561,47 +1556,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
- unsigned long size, freesize, memmap_pages;
-
- size = zone->spanned_pages;
- freesize = zone->present_pages;
-
- /*
- * Adjust freesize so that it accounts for how much memory
- * is used by this zone for memmap. This affects the watermark
- * and per-cpu initialisations
- */
- memmap_pages = calc_memmap_size(size, freesize);
- if (!is_highmem_idx(j)) {
- if (freesize >= memmap_pages) {
- freesize -= memmap_pages;
- if (memmap_pages)
- pr_debug(" %s zone: %lu pages used for memmap\n",
- zone_names[j], memmap_pages);
- } else
- pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n",
- zone_names[j], memmap_pages, freesize);
- }
-
- /* Account for reserved pages */
- if (j == 0 && freesize > dma_reserve) {
- freesize -= dma_reserve;
- pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
- }
-
- if (!is_highmem_idx(j))
- nr_kernel_pages += freesize;
- /* Charge for highmem memmap if there are enough kernel pages */
- else if (nr_kernel_pages > memmap_pages * 2)
- nr_kernel_pages -= memmap_pages;
- nr_all_pages += freesize;
+ unsigned long size = zone->spanned_pages;
/*
- * Set an approximate value for lowmem here, it will be adjusted
- * when the bootmem allocator frees pages into the buddy system.
- * And all highmem pages will be managed by the buddy system.
+ * Initialize zone->managed_pages as 0 , it will be reset
+ * when memblock allocator frees pages into buddy system.
*/
- zone_init_internals(zone, j, nid, freesize);
+ zone_init_internals(zone, j, nid, zone->present_pages);
if (!size)
continue;
@@ -1874,30 +1835,26 @@ void __init free_area_init(unsigned long *max_zone_pfn)
panic("Cannot allocate %zuB for node %d.\n",
sizeof(*pgdat), nid);
arch_refresh_nodedata(nid, pgdat);
- free_area_init_node(nid);
-
- /*
- * We do not want to confuse userspace by sysfs
- * files/directories for node without any memory
- * attached to it, so this node is not marked as
- * N_MEMORY and not marked online so that no sysfs
- * hierarchy will be created via register_one_node for
- * it. The pgdat will get fully initialized by
- * hotadd_init_pgdat() when memory is hotplugged into
- * this node.
- */
- continue;
}
pgdat = NODE_DATA(nid);
free_area_init_node(nid);
- /* Any memory on that node */
- if (pgdat->node_present_pages)
+ /*
+ * No sysfs hierarcy will be created via register_one_node()
+ *for memory-less node because here it's not marked as N_MEMORY
+ *and won't be set online later. The benefit is userspace
+ *program won't be confused by sysfs files/directories of
+ *memory-less node. The pgdat will get fully initialized by
+ *hotadd_init_pgdat() when memory is hotplugged into this node.
+ */
+ if (pgdat->node_present_pages) {
node_set_state(nid, N_MEMORY);
- check_for_memory(pgdat);
+ check_for_memory(pgdat);
+ }
}
+ calc_nr_kernel_pages();
memmap_init();
/* disable hash distribution for systems with a single node */
@@ -2057,7 +2014,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone,
__init_single_page(page, pfn, zid, nid);
nr_pages++;
}
- return (nr_pages);
+ return nr_pages;
}
/*
@@ -2259,10 +2216,6 @@ zone_empty:
* Return true when zone was grown, otherwise return false. We return true even
* when we grow less than requested, to let the caller decide if there are
* enough pages to satisfy the allocation.
- *
- * Note: We use noinline because this function is needed only during boot, and
- * it is called from a __ref function _deferred_grow_zone. This way we are
- * making sure that it is not inlined into permanent text section.
*/
bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
{
@@ -2412,17 +2365,6 @@ void __init page_alloc_init_late(void)
page_alloc_sysctl_init();
}
-#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
-/*
- * Returns the number of pages that arch has reserved but
- * is not known to alloc_large_system_hash().
- */
-static unsigned long __init arch_reserved_kernel_pages(void)
-{
- return 0;
-}
-#endif
-
/*
* Adaptive scale is meant to reduce sizes of hash tables on large memory
* machines. As memory size is increased the scale is also increased but at
@@ -2465,7 +2407,6 @@ void *__init alloc_large_system_hash(const char *tablename,
if (!numentries) {
/* round applicable memory size up to nearest megabyte */
numentries = nr_kernel_pages;
- numentries -= arch_reserved_kernel_pages();
/* It isn't necessary when PAGE_SIZE >= 1MB */
if (PAGE_SIZE < SZ_1M)
@@ -2547,26 +2488,9 @@ void *__init alloc_large_system_hash(const char *tablename,
return table;
}
-/**
- * set_dma_reserve - set the specified number of pages reserved in the first zone
- * @new_dma_reserve: The number of pages to mark reserved
- *
- * The per-cpu batchsize and zone watermarks are determined by managed_pages.
- * In the DMA zone, a significant percentage may be consumed by kernel image
- * and other unfreeable allocations which can skew the watermarks badly. This
- * function may optionally be used to account for unfreeable pages in the
- * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
- * smaller per-cpu batchsize.
- */
-void __init set_dma_reserve(unsigned long new_dma_reserve)
-{
- dma_reserve = new_dma_reserve;
-}
-
void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
{
-
if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) {
int nid = early_pfn_to_nid(pfn);
@@ -2578,6 +2502,17 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
/* KMSAN will take care of these pages. */
return;
}
+
+ /* pages were reserved and not allocated */
+ if (mem_alloc_profiling_enabled()) {
+ union codetag_ref *ref = get_page_tag_ref(page);
+
+ if (ref) {
+ set_codetag_empty(ref);
+ put_page_tag_ref(ref);
+ }
+ }
+
__free_pages_core(page, order);
}
@@ -2587,6 +2522,9 @@ EXPORT_SYMBOL(init_on_alloc);
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
EXPORT_SYMBOL(init_on_free);
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free);
+EXPORT_SYMBOL(init_mlocked_on_free);
+
static bool _init_on_alloc_enabled_early __read_mostly
= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
static int __init early_init_on_alloc(char *buf)
@@ -2604,6 +2542,14 @@ static int __init early_init_on_free(char *buf)
}
early_param("init_on_free", early_init_on_free);
+static bool _init_mlocked_on_free_enabled_early __read_mostly
+ = IS_ENABLED(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON);
+static int __init early_init_mlocked_on_free(char *buf)
+{
+ return kstrtobool(buf, &_init_mlocked_on_free_enabled_early);
+}
+early_param("init_mlocked_on_free", early_init_mlocked_on_free);
+
DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
/*
@@ -2631,12 +2577,21 @@ static void __init mem_debugging_and_hardening_init(void)
}
#endif
- if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
+ if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early ||
+ _init_mlocked_on_free_enabled_early) &&
page_poisoning_requested) {
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
- "will take precedence over init_on_alloc and init_on_free\n");
+ "will take precedence over init_on_alloc, init_on_free "
+ "and init_mlocked_on_free\n");
_init_on_alloc_enabled_early = false;
_init_on_free_enabled_early = false;
+ _init_mlocked_on_free_enabled_early = false;
+ }
+
+ if (_init_mlocked_on_free_enabled_early && _init_on_free_enabled_early) {
+ pr_info("mem auto-init: init_on_free is on, "
+ "will take precedence over init_mlocked_on_free\n");
+ _init_mlocked_on_free_enabled_early = false;
}
if (_init_on_alloc_enabled_early) {
@@ -2653,9 +2608,17 @@ static void __init mem_debugging_and_hardening_init(void)
static_branch_disable(&init_on_free);
}
- if (IS_ENABLED(CONFIG_KMSAN) &&
- (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
- pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
+ if (_init_mlocked_on_free_enabled_early) {
+ want_check_pages = true;
+ static_branch_enable(&init_mlocked_on_free);
+ } else {
+ static_branch_disable(&init_mlocked_on_free);
+ }
+
+ if (IS_ENABLED(CONFIG_KMSAN) && (_init_on_alloc_enabled_early ||
+ _init_on_free_enabled_early || _init_mlocked_on_free_enabled_early))
+ pr_info("mem auto-init: please make sure init_on_alloc, init_on_free and "
+ "init_mlocked_on_free are disabled when running KMSAN\n");
#ifdef CONFIG_DEBUG_PAGEALLOC
if (debug_pagealloc_enabled()) {
@@ -2694,9 +2657,10 @@ static void __init report_meminit(void)
else
stack = "off";
- pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
+ pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s, mlocked free:%s\n",
stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off",
- want_init_on_free() ? "on" : "off");
+ want_init_on_free() ? "on" : "off",
+ want_init_mlocked_on_free() ? "on" : "off");
if (want_init_on_free())
pr_info("mem auto-init: clearing system memory may take some time...\n");
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 6dbda99a47da1c..d3c2ca8efa534c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1114,21 +1114,21 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_
*/
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
- MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end);
struct anon_vma *anon_vma = NULL;
struct vm_area_struct *prev, *next;
+ VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);
/* Try next first. */
- next = mas_walk(&mas);
+ next = vma_iter_load(&vmi);
if (next) {
anon_vma = reusable_anon_vma(next, vma, next);
if (anon_vma)
return anon_vma;
}
- prev = mas_prev(&mas, 0);
+ prev = vma_prev(&vmi);
VM_BUG_ON_VMA(prev != vma, vma);
- prev = mas_prev(&mas, 0);
+ prev = vma_prev(&vmi);
/* Try prev next. */
if (prev)
anon_vma = reusable_anon_vma(prev, prev, vma);
@@ -1255,18 +1255,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
- /* Obtain the address to map to. we verify (or select) it and ensure
- * that it represents a valid section of the address space.
- */
- addr = get_unmapped_area(file, addr, len, pgoff, flags);
- if (IS_ERR_VALUE(addr))
- return addr;
-
- if (flags & MAP_FIXED_NOREPLACE) {
- if (find_vma_intersection(mm, addr, addr + len))
- return -EEXIST;
- }
-
if (prot == PROT_EXEC) {
pkey = execute_only_pkey(mm);
if (pkey < 0)
@@ -1280,6 +1268,18 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+ /* Obtain the address to map to. we verify (or select) it and ensure
+ * that it represents a valid section of the address space.
+ */
+ addr = __get_unmapped_area(file, addr, len, pgoff, flags, vm_flags);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
+ if (flags & MAP_FIXED_NOREPLACE) {
+ if (find_vma_intersection(mm, addr, addr + len))
+ return -EEXIST;
+ }
+
if (flags & MAP_LOCKED)
if (!can_do_mlock())
return -EPERM;
@@ -1576,11 +1576,10 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
unsigned long length, gap;
unsigned long low_limit, high_limit;
struct vm_area_struct *tmp;
-
- MA_STATE(mas, &current->mm->mm_mt, 0, 0);
+ VMA_ITERATOR(vmi, current->mm, 0);
/* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask;
+ length = info->length + info->align_mask + info->start_gap;
if (length < info->length)
return -ENOMEM;
@@ -1589,23 +1588,29 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
low_limit = mmap_min_addr;
high_limit = info->high_limit;
retry:
- if (mas_empty_area(&mas, low_limit, high_limit - 1, length))
+ if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
return -ENOMEM;
- gap = mas.index;
+ /*
+ * Adjust for the gap first so it doesn't interfere with the
+ * later alignment. The first step is the minimum needed to
+ * fulill the start gap, the next steps is the minimum to align
+ * that. It is the minimum needed to fulill both.
+ */
+ gap = vma_iter_addr(&vmi) + info->start_gap;
gap += (info->align_offset - gap) & info->align_mask;
- tmp = mas_next(&mas, ULONG_MAX);
+ tmp = vma_next(&vmi);
if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
if (vm_start_gap(tmp) < gap + length - 1) {
low_limit = tmp->vm_end;
- mas_reset(&mas);
+ vma_iter_reset(&vmi);
goto retry;
}
} else {
- tmp = mas_prev(&mas, 0);
+ tmp = vma_prev(&vmi);
if (tmp && vm_end_gap(tmp) > gap) {
low_limit = vm_end_gap(tmp);
- mas_reset(&mas);
+ vma_iter_reset(&vmi);
goto retry;
}
}
@@ -1628,10 +1633,10 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
unsigned long length, gap, gap_end;
unsigned long low_limit, high_limit;
struct vm_area_struct *tmp;
+ VMA_ITERATOR(vmi, current->mm, 0);
- MA_STATE(mas, &current->mm->mm_mt, 0, 0);
/* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask;
+ length = info->length + info->align_mask + info->start_gap;
if (length < info->length)
return -ENOMEM;
@@ -1640,24 +1645,24 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
low_limit = mmap_min_addr;
high_limit = info->high_limit;
retry:
- if (mas_empty_area_rev(&mas, low_limit, high_limit - 1, length))
+ if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
return -ENOMEM;
- gap = mas.last + 1 - info->length;
+ gap = vma_iter_end(&vmi) - info->length;
gap -= (gap - info->align_offset) & info->align_mask;
- gap_end = mas.last;
- tmp = mas_next(&mas, ULONG_MAX);
+ gap_end = vma_iter_end(&vmi);
+ tmp = vma_next(&vmi);
if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
- if (vm_start_gap(tmp) <= gap_end) {
+ if (vm_start_gap(tmp) < gap_end) {
high_limit = vm_start_gap(tmp);
- mas_reset(&mas);
+ vma_iter_reset(&vmi);
goto retry;
}
} else {
- tmp = mas_prev(&mas, 0);
+ tmp = vma_prev(&vmi);
if (tmp && vm_end_gap(tmp) > gap) {
high_limit = tmp->vm_start;
- mas_reset(&mas);
+ vma_iter_reset(&vmi);
goto retry;
}
}
@@ -1705,7 +1710,7 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
if (len > mmap_end - mmap_min_addr)
@@ -1723,12 +1728,9 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr,
return addr;
}
- info.flags = 0;
info.length = len;
info.low_limit = mm->mmap_base;
info.high_limit = mmap_end;
- info.align_mask = 0;
- info.align_offset = 0;
return vm_unmapped_area(&info);
}
@@ -1753,7 +1755,7 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
{
struct vm_area_struct *vma, *prev;
struct mm_struct *mm = current->mm;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
/* requested length too big for entire address space */
@@ -1777,8 +1779,6 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
- info.align_mask = 0;
- info.align_offset = 0;
addr = vm_unmapped_area(&info);
/*
@@ -1808,12 +1808,41 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
}
#endif
+#ifndef HAVE_ARCH_UNMAPPED_AREA_VMFLAGS
+unsigned long
+arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
+{
+ return arch_get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+
+unsigned long
+arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, vm_flags_t vm_flags)
+{
+ return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags);
+}
+#endif
+
+unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags,
+ vm_flags_t vm_flags)
+{
+ if (test_bit(MMF_TOPDOWN, &mm->flags))
+ return arch_get_unmapped_area_topdown_vmflags(filp, addr, len, pgoff,
+ flags, vm_flags);
+ return arch_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, vm_flags);
+}
+
unsigned long
-get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags)
+__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
{
unsigned long (*get_area)(struct file *, unsigned long,
- unsigned long, unsigned long, unsigned long);
+ unsigned long, unsigned long, unsigned long)
+ = NULL;
unsigned long error = arch_mmap_check(addr, len, flags);
if (error)
@@ -1823,7 +1852,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
if (len > TASK_SIZE)
return -ENOMEM;
- get_area = current->mm->get_unmapped_area;
if (file) {
if (file->f_op->get_unmapped_area)
get_area = file->f_op->get_unmapped_area;
@@ -1833,16 +1861,22 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
* so use shmem's get_unmapped_area in case it can be huge.
*/
get_area = shmem_get_unmapped_area;
- } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- /* Ensures that larger anonymous mappings are THP aligned. */
- get_area = thp_get_unmapped_area;
}
/* Always treat pgoff as zero for anonymous memory. */
if (!file)
pgoff = 0;
- addr = get_area(file, addr, len, pgoff, flags);
+ if (get_area) {
+ addr = get_area(file, addr, len, pgoff, flags);
+ } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ /* Ensures that larger anonymous mappings are THP aligned. */
+ addr = thp_get_unmapped_area_vmflags(file, addr, len,
+ pgoff, flags, vm_flags);
+ } else {
+ addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len,
+ pgoff, flags, vm_flags);
+ }
if (IS_ERR_VALUE(addr))
return addr;
@@ -1855,7 +1889,16 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
return error ? error : addr;
}
-EXPORT_SYMBOL(get_unmapped_area);
+unsigned long
+mm_get_unmapped_area(struct mm_struct *mm, struct file *file,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ if (test_bit(MMF_TOPDOWN, &mm->flags))
+ return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags);
+ return arch_get_unmapped_area(file, addr, len, pgoff, flags);
+}
+EXPORT_SYMBOL(mm_get_unmapped_area);
/**
* find_vma_intersection() - Look up the first VMA which intersects the interval
@@ -1912,12 +1955,12 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
struct vm_area_struct *vma;
- MA_STATE(mas, &mm->mm_mt, addr, addr);
+ VMA_ITERATOR(vmi, mm, addr);
- vma = mas_walk(&mas);
- *pprev = mas_prev(&mas, 0);
+ vma = vma_iter_load(&vmi);
+ *pprev = vma_prev(&vmi);
if (!vma)
- vma = mas_next(&mas, ULONG_MAX);
+ vma = vma_next(&vmi);
return vma;
}
@@ -1971,7 +2014,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
struct vm_area_struct *next;
unsigned long gap_addr;
int error = 0;
- MA_STATE(mas, &mm->mm_mt, vma->vm_start, address);
+ VMA_ITERATOR(vmi, mm, vma->vm_start);
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
@@ -1997,15 +2040,15 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
if (next)
- mas_prev_range(&mas, address);
+ vma_iter_prev_range_limit(&vmi, address);
- __mas_set_range(&mas, vma->vm_start, address - 1);
- if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ vma_iter_config(&vmi, vma->vm_start, address);
+ if (vma_iter_prealloc(&vmi, vma))
return -ENOMEM;
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma))) {
- mas_destroy(&mas);
+ vma_iter_free(&vmi);
return -ENOMEM;
}
@@ -2045,7 +2088,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
/* Overwrite old entry in mtree. */
- mas_store_prealloc(&mas, vma);
+ vma_iter_store(&vmi, vma);
anon_vma_interval_tree_post_update_vma(vma);
spin_unlock(&mm->page_table_lock);
@@ -2054,7 +2097,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
}
anon_vma_unlock_write(vma->anon_vma);
- mas_destroy(&mas);
+ vma_iter_free(&vmi);
validate_mm(mm);
return error;
}
@@ -2067,9 +2110,9 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
int expand_downwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
- MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
struct vm_area_struct *prev;
int error = 0;
+ VMA_ITERATOR(vmi, mm, vma->vm_start);
if (!(vma->vm_flags & VM_GROWSDOWN))
return -EFAULT;
@@ -2079,7 +2122,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
return -EPERM;
/* Enforce stack_guard_gap */
- prev = mas_prev(&mas, 0);
+ prev = vma_prev(&vmi);
/* Check that both stack segments have the same anon_vma? */
if (prev) {
if (!(prev->vm_flags & VM_GROWSDOWN) &&
@@ -2089,15 +2132,15 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
}
if (prev)
- mas_next_range(&mas, vma->vm_start);
+ vma_iter_next_range_limit(&vmi, vma->vm_start);
- __mas_set_range(&mas, address, vma->vm_end - 1);
- if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ vma_iter_config(&vmi, address, vma->vm_end);
+ if (vma_iter_prealloc(&vmi, vma))
return -ENOMEM;
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma))) {
- mas_destroy(&mas);
+ vma_iter_free(&vmi);
return -ENOMEM;
}
@@ -2138,7 +2181,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
vma->vm_start = address;
vma->vm_pgoff -= grow;
/* Overwrite old entry in mtree. */
- mas_store_prealloc(&mas, vma);
+ vma_iter_store(&vmi, vma);
anon_vma_interval_tree_post_update_vma(vma);
spin_unlock(&mm->page_table_lock);
@@ -2147,7 +2190,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
}
}
anon_vma_unlock_write(vma->anon_vma);
- mas_destroy(&mas);
+ vma_iter_free(&vmi);
validate_mm(mm);
return error;
}
@@ -3242,7 +3285,7 @@ void exit_mmap(struct mm_struct *mm)
struct mmu_gather tlb;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
- MA_STATE(mas, &mm->mm_mt, 0, 0);
+ VMA_ITERATOR(vmi, mm, 0);
int count = 0;
/* mm's last user has gone, and its about to be pulled down */
@@ -3251,7 +3294,7 @@ void exit_mmap(struct mm_struct *mm)
mmap_read_lock(mm);
arch_exit_mmap(mm);
- vma = mas_find(&mas, ULONG_MAX);
+ vma = vma_next(&vmi);
if (!vma || unlikely(xa_is_zero(vma))) {
/* Can happen if dup_mmap() received an OOM */
mmap_read_unlock(mm);
@@ -3264,7 +3307,7 @@ void exit_mmap(struct mm_struct *mm)
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
- unmap_vmas(&tlb, &mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
+ unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
mmap_read_unlock(mm);
/*
@@ -3274,8 +3317,8 @@ void exit_mmap(struct mm_struct *mm)
set_bit(MMF_OOM_SKIP, &mm->flags);
mmap_write_lock(mm);
mt_clear_in_rcu(&mm->mm_mt);
- mas_set(&mas, vma->vm_end);
- free_pgtables(&tlb, &mas, vma, FIRST_USER_ADDRESS,
+ vma_iter_set(&vmi, vma->vm_end);
+ free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
USER_PGTABLES_CEILING, true);
tlb_finish_mmu(&tlb);
@@ -3284,14 +3327,14 @@ void exit_mmap(struct mm_struct *mm)
* enabled, without holding any MM locks besides the unreachable
* mmap_write_lock.
*/
- mas_set(&mas, vma->vm_end);
+ vma_iter_set(&vmi, vma->vm_end);
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
remove_vma(vma, true);
count++;
cond_resched();
- vma = mas_find(&mas, ULONG_MAX);
+ vma = vma_next(&vmi);
} while (vma && likely(!xa_is_zero(vma)));
BUG_ON(count != mm->map_count);
@@ -3713,7 +3756,7 @@ int mm_take_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
- MA_STATE(mas, &mm->mm_mt, 0, 0);
+ VMA_ITERATOR(vmi, mm, 0);
mmap_assert_write_locked(mm);
@@ -3725,14 +3768,14 @@ int mm_take_all_locks(struct mm_struct *mm)
* being written to until mmap_write_unlock() or mmap_write_downgrade()
* is reached.
*/
- mas_for_each(&mas, vma, ULONG_MAX) {
+ for_each_vma(vmi, vma) {
if (signal_pending(current))
goto out_unlock;
vma_start_write(vma);
}
- mas_set(&mas, 0);
- mas_for_each(&mas, vma, ULONG_MAX) {
+ vma_iter_init(&vmi, mm, 0);
+ for_each_vma(vmi, vma) {
if (signal_pending(current))
goto out_unlock;
if (vma->vm_file && vma->vm_file->f_mapping &&
@@ -3740,8 +3783,8 @@ int mm_take_all_locks(struct mm_struct *mm)
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
- mas_set(&mas, 0);
- mas_for_each(&mas, vma, ULONG_MAX) {
+ vma_iter_init(&vmi, mm, 0);
+ for_each_vma(vmi, vma) {
if (signal_pending(current))
goto out_unlock;
if (vma->vm_file && vma->vm_file->f_mapping &&
@@ -3749,8 +3792,8 @@ int mm_take_all_locks(struct mm_struct *mm)
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
- mas_set(&mas, 0);
- mas_for_each(&mas, vma, ULONG_MAX) {
+ vma_iter_init(&vmi, mm, 0);
+ for_each_vma(vmi, vma) {
if (signal_pending(current))
goto out_unlock;
if (vma->anon_vma)
@@ -3809,12 +3852,12 @@ void mm_drop_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
- MA_STATE(mas, &mm->mm_mt, 0, 0);
+ VMA_ITERATOR(vmi, mm, 0);
mmap_assert_write_locked(mm);
BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
- mas_for_each(&mas, vma, ULONG_MAX) {
+ for_each_vma(vmi, vma) {
if (vma->anon_vma)
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
vm_unlock_anon_vma(avc->anon_vma);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index f8a4544b4601db..94878c39ee32dd 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -129,7 +129,8 @@ static long change_pte_range(struct mmu_gather *tlb,
/* Also skip shared copy-on-write pages */
if (is_cow_mapping(vma->vm_flags) &&
- folio_ref_count(folio) != 1)
+ (folio_maybe_dma_pinned(folio) ||
+ folio_likely_mapped_shared(folio)))
continue;
/*
diff --git a/mm/mremap.c b/mm/mremap.c
index 38d98465f3d825..f5aba752d35f76 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -205,7 +205,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
*/
if (pte_present(pte))
force_flush = true;
- pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
+ pte = move_pte(pte, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
set_pte_at(mm, new_addr, new_pte, pte);
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 5ec8f44e7ce976..331d2f778695c9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -110,55 +110,34 @@ unsigned int kobjsize(const void *objp)
return page_size(page);
}
-/**
- * follow_pfn - look up PFN at a user virtual address
- * @vma: memory mapping
- * @address: user virtual address
- * @pfn: location to store found PFN
- *
- * Only IO mappings and raw PFN mappings are allowed.
- *
- * Returns zero and the pfn at @pfn on success, -ve otherwise.
- */
-int follow_pfn(struct vm_area_struct *vma, unsigned long address,
- unsigned long *pfn)
-{
- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
- return -EINVAL;
-
- *pfn = address >> PAGE_SHIFT;
- return 0;
-}
-EXPORT_SYMBOL(follow_pfn);
-
void vfree(const void *addr)
{
kfree(addr);
}
EXPORT_SYMBOL(vfree);
-void *__vmalloc(unsigned long size, gfp_t gfp_mask)
+void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
{
/*
* You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
* returns only a logical address.
*/
- return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
+ return kmalloc_noprof(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
}
-EXPORT_SYMBOL(__vmalloc);
+EXPORT_SYMBOL(__vmalloc_noprof);
-void *__vmalloc_node_range(unsigned long size, unsigned long align,
+void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
const void *caller)
{
- return __vmalloc(size, gfp_mask);
+ return __vmalloc_noprof(size, gfp_mask);
}
-void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
+void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask,
int node, const void *caller)
{
- return __vmalloc(size, gfp_mask);
+ return __vmalloc_noprof(size, gfp_mask);
}
static void *__vmalloc_user_flags(unsigned long size, gfp_t flags)
@@ -179,11 +158,11 @@ static void *__vmalloc_user_flags(unsigned long size, gfp_t flags)
return ret;
}
-void *vmalloc_user(unsigned long size)
+void *vmalloc_user_noprof(unsigned long size)
{
return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO);
}
-EXPORT_SYMBOL(vmalloc_user);
+EXPORT_SYMBOL(vmalloc_user_noprof);
struct page *vmalloc_to_page(const void *addr)
{
@@ -217,13 +196,13 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
* For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*/
-void *vmalloc(unsigned long size)
+void *vmalloc_noprof(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL);
+ return __vmalloc_noprof(size, GFP_KERNEL);
}
-EXPORT_SYMBOL(vmalloc);
+EXPORT_SYMBOL(vmalloc_noprof);
-void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc);
+void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof);
/*
* vzalloc - allocate virtually contiguous memory with zero fill
@@ -237,11 +216,11 @@ void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc)
* For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*/
-void *vzalloc(unsigned long size)
+void *vzalloc_noprof(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ return __vmalloc_noprof(size, GFP_KERNEL | __GFP_ZERO);
}
-EXPORT_SYMBOL(vzalloc);
+EXPORT_SYMBOL(vzalloc_noprof);
/**
* vmalloc_node - allocate memory on a specific node
@@ -254,11 +233,11 @@ EXPORT_SYMBOL(vzalloc);
* For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*/
-void *vmalloc_node(unsigned long size, int node)
+void *vmalloc_node_noprof(unsigned long size, int node)
{
- return vmalloc(size);
+ return vmalloc_noprof(size);
}
-EXPORT_SYMBOL(vmalloc_node);
+EXPORT_SYMBOL(vmalloc_node_noprof);
/**
* vzalloc_node - allocate memory on a specific node with zero fill
@@ -272,11 +251,11 @@ EXPORT_SYMBOL(vmalloc_node);
* For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*/
-void *vzalloc_node(unsigned long size, int node)
+void *vzalloc_node_noprof(unsigned long size, int node)
{
- return vzalloc(size);
+ return vzalloc_noprof(size);
}
-EXPORT_SYMBOL(vzalloc_node);
+EXPORT_SYMBOL(vzalloc_node_noprof);
/**
* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
@@ -285,11 +264,11 @@ EXPORT_SYMBOL(vzalloc_node);
* Allocate enough 32bit PA addressable pages to cover @size from the
* page level allocator and map them into contiguous kernel virtual space.
*/
-void *vmalloc_32(unsigned long size)
+void *vmalloc_32_noprof(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL);
+ return __vmalloc_noprof(size, GFP_KERNEL);
}
-EXPORT_SYMBOL(vmalloc_32);
+EXPORT_SYMBOL(vmalloc_32_noprof);
/**
* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
@@ -301,15 +280,15 @@ EXPORT_SYMBOL(vmalloc_32);
* VM_USERMAP is set on the corresponding VMA so that subsequent calls to
* remap_vmalloc_range() are permissible.
*/
-void *vmalloc_32_user(unsigned long size)
+void *vmalloc_32_user_noprof(unsigned long size)
{
/*
* We'll have to sort out the ZONE_DMA bits for 64-bit,
* but for now this can simply use vmalloc_user() directly.
*/
- return vmalloc_user(size);
+ return vmalloc_user_noprof(size);
}
-EXPORT_SYMBOL(vmalloc_32_user);
+EXPORT_SYMBOL(vmalloc_32_user_noprof);
void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
{
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8d6a207c3c5905..4d7a0004df2cac 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -724,7 +724,6 @@ static struct ctl_table vm_oom_kill_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {}
};
#endif
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3e19b87049db17..fba324e1a010d2 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2291,7 +2291,6 @@ static struct ctl_table vm_page_writeback_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- {}
};
#endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 14d39f34d3367f..22e8b9f1d710b9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -54,6 +54,7 @@
#include <linux/khugepaged.h>
#include <linux/delayacct.h>
#include <linux/cacheinfo.h>
+#include <linux/pgalloc_tag.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
@@ -206,24 +207,6 @@ EXPORT_SYMBOL(node_states);
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
-/*
- * A cached value of the page's pageblock's migratetype, used when the page is
- * put on a pcplist. Used to avoid the pageblock migratetype lookup when
- * freeing from pcplists in most cases, at the cost of possibly becoming stale.
- * Also the migratetype set in the page does not necessarily match the pcplist
- * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
- * other index - this ensures that it will be put on the correct CMA freelist.
- */
-static inline int get_pcppage_migratetype(struct page *page)
-{
- return page->index;
-}
-
-static inline void set_pcppage_migratetype(struct page *page, int migratetype)
-{
- page->index = migratetype;
-}
-
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif
@@ -332,7 +315,7 @@ static inline bool deferred_pages_enabled(void)
static bool __ref
_deferred_grow_zone(struct zone *zone, unsigned int order)
{
- return deferred_grow_zone(zone, order);
+ return deferred_grow_zone(zone, order);
}
#else
static inline bool deferred_pages_enabled(void)
@@ -523,7 +506,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (order > PAGE_ALLOC_COSTLY_ORDER) {
- VM_BUG_ON(order != pageblock_order);
+ VM_BUG_ON(order != HPAGE_PMD_ORDER);
return NR_LOWORDER_PCP_LISTS;
}
#else
@@ -539,7 +522,7 @@ static inline int pindex_to_order(unsigned int pindex)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pindex == NR_LOWORDER_PCP_LISTS)
- order = pageblock_order;
+ order = HPAGE_PMD_ORDER;
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif
@@ -552,20 +535,12 @@ static inline bool pcp_allowed_order(unsigned int order)
if (order <= PAGE_ALLOC_COSTLY_ORDER)
return true;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (order == pageblock_order)
+ if (order == HPAGE_PMD_ORDER)
return true;
#endif
return false;
}
-static inline void free_the_page(struct page *page, unsigned int order)
-{
- if (pcp_allowed_order(order)) /* Via pcp? */
- free_unref_page(page, order);
- else
- __free_pages_ok(page, order, FPI_NONE);
-}
-
/*
* Higher-order pages are called "compound pages". They are structured thusly:
*
@@ -590,20 +565,6 @@ void prep_compound_page(struct page *page, unsigned int order)
prep_compound_head(page, order);
}
-void destroy_large_folio(struct folio *folio)
-{
- if (folio_test_hugetlb(folio)) {
- free_huge_folio(folio);
- return;
- }
-
- if (folio_test_large_rmappable(folio))
- folio_undo_large_rmappable(folio);
-
- mem_cgroup_uncharge(folio);
- free_the_page(&folio->page, folio_order(folio));
-}
-
static inline void set_buddy_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
@@ -660,23 +621,33 @@ compaction_capture(struct capture_control *capc, struct page *page,
}
#endif /* CONFIG_COMPACTION */
-/* Used for pages not on another list */
-static inline void add_to_free_list(struct page *page, struct zone *zone,
- unsigned int order, int migratetype)
+static inline void account_freepages(struct zone *zone, int nr_pages,
+ int migratetype)
{
- struct free_area *area = &zone->free_area[order];
+ if (is_migrate_isolate(migratetype))
+ return;
- list_add(&page->buddy_list, &area->free_list[migratetype]);
- area->nr_free++;
+ __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
+
+ if (is_migrate_cma(migratetype))
+ __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
}
/* Used for pages not on another list */
-static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
- unsigned int order, int migratetype)
+static inline void __add_to_free_list(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype,
+ bool tail)
{
struct free_area *area = &zone->free_area[order];
- list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
+ VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
+ "page type is %lu, passed migratetype is %d (nr=%d)\n",
+ get_pageblock_migratetype(page), migratetype, 1 << order);
+
+ if (tail)
+ list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
+ else
+ list_add(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;
}
@@ -686,16 +657,28 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
* allocation again (e.g., optimization for memory onlining).
*/
static inline void move_to_free_list(struct page *page, struct zone *zone,
- unsigned int order, int migratetype)
+ unsigned int order, int old_mt, int new_mt)
{
struct free_area *area = &zone->free_area[order];
- list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
+ /* Free page moving can fail, so it happens before the type update */
+ VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt,
+ "page type is %lu, passed migratetype is %d (nr=%d)\n",
+ get_pageblock_migratetype(page), old_mt, 1 << order);
+
+ list_move_tail(&page->buddy_list, &area->free_list[new_mt]);
+
+ account_freepages(zone, -(1 << order), old_mt);
+ account_freepages(zone, 1 << order, new_mt);
}
-static inline void del_page_from_free_list(struct page *page, struct zone *zone,
- unsigned int order)
+static inline void __del_page_from_free_list(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype)
{
+ VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
+ "page type is %lu, passed migratetype is %d (nr=%d)\n",
+ get_pageblock_migratetype(page), migratetype, 1 << order);
+
/* clear reported state and update reported page count */
if (page_reported(page))
__ClearPageReported(page);
@@ -706,6 +689,13 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone,
zone->free_area[order].nr_free--;
}
+static inline void del_page_from_free_list(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype)
+{
+ __del_page_from_free_list(page, zone, order, migratetype);
+ account_freepages(zone, -(1 << order), migratetype);
+}
+
static inline struct page *get_page_from_free_area(struct free_area *area,
int migratetype)
{
@@ -777,16 +767,16 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
- if (likely(!is_migrate_isolate(migratetype)))
- __mod_zone_freepage_state(zone, 1 << order, migratetype);
-
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
+ account_freepages(zone, 1 << order, migratetype);
+
while (order < MAX_PAGE_ORDER) {
+ int buddy_mt = migratetype;
+
if (compaction_capture(capc, page, order, migratetype)) {
- __mod_zone_freepage_state(zone, -(1 << order),
- migratetype);
+ account_freepages(zone, -(1 << order), migratetype);
return;
}
@@ -801,11 +791,11 @@ static inline void __free_one_page(struct page *page,
* pageblock isolation could cause incorrect freepage or CMA
* accounting or HIGHATOMIC accounting.
*/
- int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn);
+ buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn);
- if (migratetype != buddy_mt
- && (!migratetype_is_mergeable(migratetype) ||
- !migratetype_is_mergeable(buddy_mt)))
+ if (migratetype != buddy_mt &&
+ (!migratetype_is_mergeable(migratetype) ||
+ !migratetype_is_mergeable(buddy_mt)))
goto done_merging;
}
@@ -814,9 +804,19 @@ static inline void __free_one_page(struct page *page,
* merge with it and move up one order.
*/
if (page_is_guard(buddy))
- clear_page_guard(zone, buddy, order, migratetype);
+ clear_page_guard(zone, buddy, order);
else
- del_page_from_free_list(buddy, zone, order);
+ __del_page_from_free_list(buddy, zone, order, buddy_mt);
+
+ if (unlikely(buddy_mt != migratetype)) {
+ /*
+ * Match buddy type. This ensures that an
+ * expand() down the line puts the sub-blocks
+ * on the right freelists.
+ */
+ set_pageblock_migratetype(buddy, migratetype);
+ }
+
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
@@ -833,74 +833,13 @@ done_merging:
else
to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
- if (to_tail)
- add_to_free_list_tail(page, zone, order, migratetype);
- else
- add_to_free_list(page, zone, order, migratetype);
+ __add_to_free_list(page, zone, order, migratetype, to_tail);
/* Notify page reporting subsystem of freed page */
if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
page_reporting_notify_free(order);
}
-/**
- * split_free_page() -- split a free page at split_pfn_offset
- * @free_page: the original free page
- * @order: the order of the page
- * @split_pfn_offset: split offset within the page
- *
- * Return -ENOENT if the free page is changed, otherwise 0
- *
- * It is used when the free page crosses two pageblocks with different migratetypes
- * at split_pfn_offset within the page. The split free page will be put into
- * separate migratetype lists afterwards. Otherwise, the function achieves
- * nothing.
- */
-int split_free_page(struct page *free_page,
- unsigned int order, unsigned long split_pfn_offset)
-{
- struct zone *zone = page_zone(free_page);
- unsigned long free_page_pfn = page_to_pfn(free_page);
- unsigned long pfn;
- unsigned long flags;
- int free_page_order;
- int mt;
- int ret = 0;
-
- if (split_pfn_offset == 0)
- return ret;
-
- spin_lock_irqsave(&zone->lock, flags);
-
- if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
- ret = -ENOENT;
- goto out;
- }
-
- mt = get_pfnblock_migratetype(free_page, free_page_pfn);
- if (likely(!is_migrate_isolate(mt)))
- __mod_zone_freepage_state(zone, -(1UL << order), mt);
-
- del_page_from_free_list(free_page, zone, order);
- for (pfn = free_page_pfn;
- pfn < free_page_pfn + (1UL << order);) {
- int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
-
- free_page_order = min_t(unsigned int,
- pfn ? __ffs(pfn) : order,
- __fls(split_pfn_offset));
- __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
- mt, FPI_NONE);
- pfn += 1UL << free_page_order;
- split_pfn_offset -= (1UL << free_page_order);
- /* we have done the first part, now switch to second part */
- if (split_pfn_offset == 0)
- split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
- }
-out:
- spin_unlock_irqrestore(&zone->lock, flags);
- return ret;
-}
/*
* A bad page could be due to a number of fields. Instead of multiple branches,
* try and check multiple fields with one check. The caller must do a detailed
@@ -1006,10 +945,11 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
}
break;
case 2:
- /*
- * the second tail page: ->mapping is
- * deferred_list.next -- ignore value.
- */
+ /* the second tail page: deferred_list overlaps ->mapping */
+ if (unlikely(!list_empty(&folio->_deferred_list))) {
+ bad_page(page, "on deferred list");
+ goto out;
+ }
break;
default:
if (page->mapping != TAIL_MAPPING) {
@@ -1070,7 +1010,7 @@ static inline bool should_skip_kasan_poison(struct page *page)
return page_kasan_tag(page) == KASAN_TAG_KERNEL;
}
-static void kernel_init_pages(struct page *page, int numpages)
+void kernel_init_pages(struct page *page, int numpages)
{
int i;
@@ -1101,6 +1041,7 @@ __always_inline bool free_pages_prepare(struct page *page,
/* Do not let hwpoison pages hit pcplists/buddy */
reset_page_owner(page, order);
page_table_check_free(page, order);
+ pgalloc_tag_sub(page, 1 << order);
return false;
}
@@ -1140,6 +1081,7 @@ __always_inline bool free_pages_prepare(struct page *page,
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
reset_page_owner(page, order);
page_table_check_free(page, order);
+ pgalloc_tag_sub(page, 1 << order);
if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),
@@ -1191,7 +1133,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
{
unsigned long flags;
unsigned int order;
- bool isolated_pageblocks;
struct page *page;
/*
@@ -1204,7 +1145,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
pindex = pindex - 1;
spin_lock_irqsave(&zone->lock, flags);
- isolated_pageblocks = has_isolate_pageblock(zone);
while (count > 0) {
struct list_head *list;
@@ -1220,23 +1160,19 @@ static void free_pcppages_bulk(struct zone *zone, int count,
order = pindex_to_order(pindex);
nr_pages = 1 << order;
do {
+ unsigned long pfn;
int mt;
page = list_last_entry(list, struct page, pcp_list);
- mt = get_pcppage_migratetype(page);
+ pfn = page_to_pfn(page);
+ mt = get_pfnblock_migratetype(page, pfn);
/* must delete to avoid corrupting pcp list */
list_del(&page->pcp_list);
count -= nr_pages;
pcp->count -= nr_pages;
- /* MIGRATE_ISOLATE page should not go to pcplists */
- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
- /* Pageblock could have been isolated meanwhile */
- if (unlikely(isolated_pageblocks))
- mt = get_pageblock_migratetype(page);
-
- __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
+ __free_one_page(page, pfn, zone, order, mt, FPI_NONE);
trace_mm_page_pcpu_drain(page, order, mt);
} while (count > 0 && !list_empty(list));
}
@@ -1244,18 +1180,15 @@ static void free_pcppages_bulk(struct zone *zone, int count,
spin_unlock_irqrestore(&zone->lock, flags);
}
-static void free_one_page(struct zone *zone,
- struct page *page, unsigned long pfn,
- unsigned int order,
- int migratetype, fpi_t fpi_flags)
+static void free_one_page(struct zone *zone, struct page *page,
+ unsigned long pfn, unsigned int order,
+ fpi_t fpi_flags)
{
unsigned long flags;
+ int migratetype;
spin_lock_irqsave(&zone->lock, flags);
- if (unlikely(has_isolate_pageblock(zone) ||
- is_migrate_isolate(migratetype))) {
- migratetype = get_pfnblock_migratetype(page, pfn);
- }
+ migratetype = get_pfnblock_migratetype(page, pfn);
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -1263,21 +1196,13 @@ static void free_one_page(struct zone *zone,
static void __free_pages_ok(struct page *page, unsigned int order,
fpi_t fpi_flags)
{
- int migratetype;
unsigned long pfn = page_to_pfn(page);
struct zone *zone = page_zone(page);
if (!free_pages_prepare(page, order))
return;
- /*
- * Calling get_pfnblock_migratetype() without spin_lock_irqsave() here
- * is used to avoid calling get_pfnblock_migratetype() under the lock.
- * This will reduce the lock holding time.
- */
- migratetype = get_pfnblock_migratetype(page, pfn);
-
- free_one_page(zone, page, pfn, order, migratetype, fpi_flags);
+ free_one_page(zone, page, pfn, order, fpi_flags);
__count_vm_events(PGFREE, 1 << order);
}
@@ -1388,6 +1313,7 @@ static inline void expand(struct zone *zone, struct page *page,
int low, int high, int migratetype)
{
unsigned long size = 1 << high;
+ unsigned long nr_added = 0;
while (high > low) {
high--;
@@ -1400,12 +1326,14 @@ static inline void expand(struct zone *zone, struct page *page,
* Corresponding page table entries will not be touched,
* pages will stay not present in virtual address space
*/
- if (set_page_guard(zone, &page[size], high, migratetype))
+ if (set_page_guard(zone, &page[size], high))
continue;
- add_to_free_list(&page[size], zone, high, migratetype);
+ __add_to_free_list(&page[size], zone, high, migratetype, false);
set_buddy_order(&page[size], high);
+ nr_added += size;
}
+ account_freepages(zone, nr_added, migratetype);
}
static void check_new_page_bad(struct page *page)
@@ -1533,6 +1461,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
set_page_owner(page, order, gfp_flags);
page_table_check_alloc(page, order);
+ pgalloc_tag_add(page, current, 1 << order);
}
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
@@ -1573,9 +1502,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
- del_page_from_free_list(page, zone, current_order);
+ del_page_from_free_list(page, zone, current_order, migratetype);
expand(zone, page, order, current_order, migratetype);
- set_pcppage_migratetype(page, migratetype);
trace_mm_page_alloc_zone_locked(page, order, migratetype,
pcp_allowed_order(order) &&
migratetype < MIGRATE_PCPTYPES);
@@ -1592,7 +1520,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
*
* The other migratetypes do not have fallbacks.
*/
-static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = {
+static int fallbacks[MIGRATE_PCPTYPES][MIGRATE_PCPTYPES - 1] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE },
@@ -1610,30 +1538,23 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
#endif
/*
- * Move the free pages in a range to the freelist tail of the requested type.
- * Note that start_page and end_pages are not aligned on a pageblock
- * boundary. If alignment is required, use move_freepages_block()
+ * Change the type of a block and move all its free pages to that
+ * type's freelist.
*/
-static int move_freepages(struct zone *zone,
- unsigned long start_pfn, unsigned long end_pfn,
- int migratetype, int *num_movable)
+static int __move_freepages_block(struct zone *zone, unsigned long start_pfn,
+ int old_mt, int new_mt)
{
struct page *page;
- unsigned long pfn;
+ unsigned long pfn, end_pfn;
unsigned int order;
int pages_moved = 0;
- for (pfn = start_pfn; pfn <= end_pfn;) {
+ VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1));
+ end_pfn = pageblock_end_pfn(start_pfn);
+
+ for (pfn = start_pfn; pfn < end_pfn;) {
page = pfn_to_page(pfn);
if (!PageBuddy(page)) {
- /*
- * We assume that pages that could be isolated for
- * migration are movable. But we don't actually try
- * isolating, as that would be expensive.
- */
- if (num_movable &&
- (PageLRU(page) || __PageMovable(page)))
- (*num_movable)++;
pfn++;
continue;
}
@@ -1643,36 +1564,187 @@ static int move_freepages(struct zone *zone,
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
order = buddy_order(page);
- move_to_free_list(page, zone, order, migratetype);
+
+ move_to_free_list(page, zone, order, old_mt, new_mt);
+
pfn += 1 << order;
pages_moved += 1 << order;
}
+ set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);
+
return pages_moved;
}
-int move_freepages_block(struct zone *zone, struct page *page,
- int migratetype, int *num_movable)
+static bool prep_move_freepages_block(struct zone *zone, struct page *page,
+ unsigned long *start_pfn,
+ int *num_free, int *num_movable)
{
- unsigned long start_pfn, end_pfn, pfn;
+ unsigned long pfn, start, end;
- if (num_movable)
+ pfn = page_to_pfn(page);
+ start = pageblock_start_pfn(pfn);
+ end = pageblock_end_pfn(pfn);
+
+ /*
+ * The caller only has the lock for @zone, don't touch ranges
+ * that straddle into other zones. While we could move part of
+ * the range that's inside the zone, this call is usually
+ * accompanied by other operations such as migratetype updates
+ * which also should be locked.
+ */
+ if (!zone_spans_pfn(zone, start))
+ return false;
+ if (!zone_spans_pfn(zone, end - 1))
+ return false;
+
+ *start_pfn = start;
+
+ if (num_free) {
+ *num_free = 0;
*num_movable = 0;
+ for (pfn = start; pfn < end;) {
+ page = pfn_to_page(pfn);
+ if (PageBuddy(page)) {
+ int nr = 1 << buddy_order(page);
- pfn = page_to_pfn(page);
- start_pfn = pageblock_start_pfn(pfn);
- end_pfn = pageblock_end_pfn(pfn) - 1;
+ *num_free += nr;
+ pfn += nr;
+ continue;
+ }
+ /*
+ * We assume that pages that could be isolated for
+ * migration are movable. But we don't actually try
+ * isolating, as that would be expensive.
+ */
+ if (PageLRU(page) || __PageMovable(page))
+ (*num_movable)++;
+ pfn++;
+ }
+ }
- /* Do not cross zone boundaries */
- if (!zone_spans_pfn(zone, start_pfn))
- start_pfn = pfn;
- if (!zone_spans_pfn(zone, end_pfn))
- return 0;
+ return true;
+}
+
+static int move_freepages_block(struct zone *zone, struct page *page,
+ int old_mt, int new_mt)
+{
+ unsigned long start_pfn;
+
+ if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
+ return -1;
+
+ return __move_freepages_block(zone, start_pfn, old_mt, new_mt);
+}
+
+#ifdef CONFIG_MEMORY_ISOLATION
+/* Look for a buddy that straddles start_pfn */
+static unsigned long find_large_buddy(unsigned long start_pfn)
+{
+ int order = 0;
+ struct page *page;
+ unsigned long pfn = start_pfn;
+
+ while (!PageBuddy(page = pfn_to_page(pfn))) {
+ /* Nothing found */
+ if (++order > MAX_PAGE_ORDER)
+ return start_pfn;
+ pfn &= ~0UL << order;
+ }
+
+ /*
+ * Found a preceding buddy, but does it straddle?
+ */
+ if (pfn + (1 << buddy_order(page)) > start_pfn)
+ return pfn;
- return move_freepages(zone, start_pfn, end_pfn, migratetype,
- num_movable);
+ /* Nothing found */
+ return start_pfn;
}
+/* Split a multi-block free page into its individual pageblocks */
+static void split_large_buddy(struct zone *zone, struct page *page,
+ unsigned long pfn, int order)
+{
+ unsigned long end_pfn = pfn + (1 << order);
+
+ VM_WARN_ON_ONCE(order <= pageblock_order);
+ VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1));
+
+ /* Caller removed page from freelist, buddy info cleared! */
+ VM_WARN_ON_ONCE(PageBuddy(page));
+
+ while (pfn != end_pfn) {
+ int mt = get_pfnblock_migratetype(page, pfn);
+
+ __free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE);
+ pfn += pageblock_nr_pages;
+ page = pfn_to_page(pfn);
+ }
+}
+
+/**
+ * move_freepages_block_isolate - move free pages in block for page isolation
+ * @zone: the zone
+ * @page: the pageblock page
+ * @migratetype: migratetype to set on the pageblock
+ *
+ * This is similar to move_freepages_block(), but handles the special
+ * case encountered in page isolation, where the block of interest
+ * might be part of a larger buddy spanning multiple pageblocks.
+ *
+ * Unlike the regular page allocator path, which moves pages while
+ * stealing buddies off the freelist, page isolation is interested in
+ * arbitrary pfn ranges that may have overlapping buddies on both ends.
+ *
+ * This function handles that. Straddling buddies are split into
+ * individual pageblocks. Only the block of interest is moved.
+ *
+ * Returns %true if pages could be moved, %false otherwise.
+ */
+bool move_freepages_block_isolate(struct zone *zone, struct page *page,
+ int migratetype)
+{
+ unsigned long start_pfn, pfn;
+
+ if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
+ return false;
+
+ /* No splits needed if buddies can't span multiple blocks */
+ if (pageblock_order == MAX_PAGE_ORDER)
+ goto move;
+
+ /* We're a tail block in a larger buddy */
+ pfn = find_large_buddy(start_pfn);
+ if (pfn != start_pfn) {
+ struct page *buddy = pfn_to_page(pfn);
+ int order = buddy_order(buddy);
+
+ del_page_from_free_list(buddy, zone, order,
+ get_pfnblock_migratetype(buddy, pfn));
+ set_pageblock_migratetype(page, migratetype);
+ split_large_buddy(zone, buddy, pfn, order);
+ return true;
+ }
+
+ /* We're the starting block of a larger buddy */
+ if (PageBuddy(page) && buddy_order(page) > pageblock_order) {
+ int order = buddy_order(page);
+
+ del_page_from_free_list(page, zone, order,
+ get_pfnblock_migratetype(page, pfn));
+ set_pageblock_migratetype(page, migratetype);
+ split_large_buddy(zone, page, pfn, order);
+ return true;
+ }
+move:
+ __move_freepages_block(zone, start_pfn,
+ get_pfnblock_migratetype(page, start_pfn),
+ migratetype);
+ return true;
+}
+#endif /* CONFIG_MEMORY_ISOLATION */
+
static void change_pageblock_range(struct page *pageblock_page,
int start_order, int migratetype)
{
@@ -1755,33 +1827,37 @@ static inline bool boost_watermark(struct zone *zone)
}
/*
- * This function implements actual steal behaviour. If order is large enough,
- * we can steal whole pageblock. If not, we first move freepages in this
- * pageblock to our migratetype and determine how many already-allocated pages
- * are there in the pageblock with a compatible migratetype. If at least half
- * of pages are free or compatible, we can change migratetype of the pageblock
- * itself, so pages freed in the future will be put on the correct free list.
+ * This function implements actual steal behaviour. If order is large enough, we
+ * can claim the whole pageblock for the requested migratetype. If not, we check
+ * the pageblock for constituent pages; if at least half of the pages are free
+ * or compatible, we can still claim the whole block, so pages freed in the
+ * future will be put on the correct free list. Otherwise, we isolate exactly
+ * the order we need from the fallback block and leave its migratetype alone.
*/
-static void steal_suitable_fallback(struct zone *zone, struct page *page,
- unsigned int alloc_flags, int start_type, bool whole_block)
+static struct page *
+steal_suitable_fallback(struct zone *zone, struct page *page,
+ int current_order, int order, int start_type,
+ unsigned int alloc_flags, bool whole_block)
{
- unsigned int current_order = buddy_order(page);
int free_pages, movable_pages, alike_pages;
- int old_block_type;
+ unsigned long start_pfn;
+ int block_type;
- old_block_type = get_pageblock_migratetype(page);
+ block_type = get_pageblock_migratetype(page);
/*
* This can happen due to races and we want to prevent broken
* highatomic accounting.
*/
- if (is_migrate_highatomic(old_block_type))
+ if (is_migrate_highatomic(block_type))
goto single_page;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
+ del_page_from_free_list(page, zone, current_order, block_type);
change_pageblock_range(page, current_order, start_type);
- goto single_page;
+ expand(zone, page, order, current_order, start_type);
+ return page;
}
/*
@@ -1796,10 +1872,9 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
if (!whole_block)
goto single_page;
- free_pages = move_freepages_block(zone, page, start_type,
- &movable_pages);
/* moving whole block can fail due to zone boundary conditions */
- if (!free_pages)
+ if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages,
+ &movable_pages))
goto single_page;
/*
@@ -1817,7 +1892,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
* vice versa, be conservative since we can't distinguish the
* exact migratetype of non-movable pages.
*/
- if (old_block_type == MIGRATE_MOVABLE)
+ if (block_type == MIGRATE_MOVABLE)
alike_pages = pageblock_nr_pages
- (free_pages + movable_pages);
else
@@ -1828,13 +1903,15 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
* compatible migratability as our allocation, claim the whole block.
*/
if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
- page_group_by_mobility_disabled)
- set_pageblock_migratetype(page, start_type);
-
- return;
+ page_group_by_mobility_disabled) {
+ __move_freepages_block(zone, start_pfn, block_type, start_type);
+ return __rmqueue_smallest(zone, order, start_type);
+ }
single_page:
- move_to_free_list(page, zone, current_order, start_type);
+ del_page_from_free_list(page, zone, current_order, block_type);
+ expand(zone, page, order, current_order, block_type);
+ return page;
}
/*
@@ -1901,11 +1978,10 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone)
/* Yoink! */
mt = get_pageblock_migratetype(page);
/* Only reserve normal pageblocks (i.e., they can merge with others) */
- if (migratetype_is_mergeable(mt)) {
- zone->nr_reserved_highatomic += pageblock_nr_pages;
- set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
- move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
- }
+ if (migratetype_is_mergeable(mt))
+ if (move_freepages_block(zone, page, mt,
+ MIGRATE_HIGHATOMIC) != -1)
+ zone->nr_reserved_highatomic += pageblock_nr_pages;
out_unlock:
spin_unlock_irqrestore(&zone->lock, flags);
@@ -1929,7 +2005,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
struct zone *zone;
struct page *page;
int order;
- bool ret;
+ int ret;
for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
ac->nodemask) {
@@ -1944,11 +2020,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < NR_PAGE_ORDERS; order++) {
struct free_area *area = &(zone->free_area[order]);
+ int mt;
page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
if (!page)
continue;
+ mt = get_pageblock_migratetype(page);
/*
* In page freeing path, migratetype change is racy so
* we can counter several free pages in a pageblock
@@ -1956,7 +2034,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* from highatomic to ac->migratetype. So we should
* adjust the count once.
*/
- if (is_migrate_highatomic_page(page)) {
+ if (is_migrate_highatomic(mt)) {
/*
* It should never happen but changes to
* locking could inadvertently allow a per-cpu
@@ -1978,10 +2056,14 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* of pageblocks that cannot be completely freed
* may increase.
*/
- set_pageblock_migratetype(page, ac->migratetype);
- ret = move_freepages_block(zone, page, ac->migratetype,
- NULL);
- if (ret) {
+ ret = move_freepages_block(zone, page, mt,
+ ac->migratetype);
+ /*
+ * Reserving this block already succeeded, so this should
+ * not fail on zone boundaries.
+ */
+ WARN_ON_ONCE(ret == -1);
+ if (ret > 0) {
spin_unlock_irqrestore(&zone->lock, flags);
return ret;
}
@@ -2002,7 +2084,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* deviation from the rest of this file, to make the for loop
* condition simpler.
*/
-static __always_inline bool
+static __always_inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
@@ -2049,7 +2131,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
goto do_steal;
}
- return false;
+ return NULL;
find_smallest:
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
@@ -2069,14 +2151,14 @@ find_smallest:
do_steal:
page = get_page_from_free_area(area, fallback_mt);
- steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
- can_steal);
+ /* take off list, maybe claim block, expand remainder */
+ page = steal_suitable_fallback(zone, page, current_order, order,
+ start_migratetype, alloc_flags, can_steal);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
- return true;
-
+ return page;
}
/*
@@ -2103,15 +2185,15 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
return page;
}
}
-retry:
+
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page)) {
if (alloc_flags & ALLOC_CMA)
page = __rmqueue_cma_fallback(zone, order);
- if (!page && __rmqueue_fallback(zone, order, migratetype,
- alloc_flags))
- goto retry;
+ if (!page)
+ page = __rmqueue_fallback(zone, order, migratetype,
+ alloc_flags);
}
return page;
}
@@ -2146,12 +2228,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* pages are ordered properly.
*/
list_add_tail(&page->pcp_list, list);
- if (is_migrate_cma(get_pcppage_migratetype(page)))
- __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
- -(1 << order));
}
-
- __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock_irqrestore(&zone->lock, flags);
return i;
@@ -2216,12 +2293,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
*/
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
- struct per_cpu_pages *pcp;
+ struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ int count = READ_ONCE(pcp->count);
+
+ while (count) {
+ int to_drain = min(count, pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX);
+ count -= to_drain;
- pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
- if (pcp->count) {
spin_lock(&pcp->lock);
- free_pcppages_bulk(zone, pcp->count, pcp, 0);
+ free_pcppages_bulk(zone, to_drain, pcp, 0);
spin_unlock(&pcp->lock);
}
}
@@ -2339,19 +2419,6 @@ void drain_all_pages(struct zone *zone)
__drain_all_pages(zone, false);
}
-static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
- unsigned int order)
-{
- int migratetype;
-
- if (!free_pages_prepare(page, order))
- return false;
-
- migratetype = get_pfnblock_migratetype(page, pfn);
- set_pcppage_migratetype(page, migratetype);
- return true;
-}
-
static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high)
{
int min_nr_free, max_nr_free;
@@ -2482,9 +2549,14 @@ void free_unref_page(struct page *page, unsigned int order)
struct per_cpu_pages *pcp;
struct zone *zone;
unsigned long pfn = page_to_pfn(page);
- int migratetype, pcpmigratetype;
+ int migratetype;
- if (!free_unref_page_prepare(page, pfn, order))
+ if (!pcp_allowed_order(order)) {
+ __free_pages_ok(page, order, FPI_NONE);
+ return;
+ }
+
+ if (!free_pages_prepare(page, order))
return;
/*
@@ -2494,23 +2566,23 @@ void free_unref_page(struct page *page, unsigned int order)
* get those areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
- migratetype = pcpmigratetype = get_pcppage_migratetype(page);
+ migratetype = get_pfnblock_migratetype(page, pfn);
if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
+ free_one_page(page_zone(page), page, pfn, order, FPI_NONE);
return;
}
- pcpmigratetype = MIGRATE_MOVABLE;
+ migratetype = MIGRATE_MOVABLE;
}
zone = page_zone(page);
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (pcp) {
- free_unref_page_commit(zone, pcp, page, pcpmigratetype, order);
+ free_unref_page_commit(zone, pcp, page, migratetype, order);
pcp_spin_unlock(pcp);
} else {
- free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
+ free_one_page(zone, page, pfn, order, FPI_NONE);
}
pcp_trylock_finish(UP_flags);
}
@@ -2523,7 +2595,7 @@ void free_unref_folios(struct folio_batch *folios)
unsigned long __maybe_unused UP_flags;
struct per_cpu_pages *pcp = NULL;
struct zone *locked_zone = NULL;
- int i, j, migratetype;
+ int i, j;
/* Prepare folios for freeing */
for (i = 0, j = 0; i < folios->nr; i++) {
@@ -2533,18 +2605,15 @@ void free_unref_folios(struct folio_batch *folios)
if (order > 0 && folio_test_large_rmappable(folio))
folio_undo_large_rmappable(folio);
- if (!free_unref_page_prepare(&folio->page, pfn, order))
+ if (!free_pages_prepare(&folio->page, order))
continue;
-
/*
- * Free isolated folios and orders not handled on the PCP
- * directly to the allocator, see comment in free_unref_page.
+ * Free orders not handled on the PCP directly to the
+ * allocator.
*/
- migratetype = get_pcppage_migratetype(&folio->page);
- if (!pcp_allowed_order(order) ||
- is_migrate_isolate(migratetype)) {
- free_one_page(folio_zone(folio), &folio->page, pfn,
- order, migratetype, FPI_NONE);
+ if (!pcp_allowed_order(order)) {
+ free_one_page(folio_zone(folio), &folio->page,
+ pfn, order, FPI_NONE);
continue;
}
folio->private = (void *)(unsigned long)order;
@@ -2557,16 +2626,31 @@ void free_unref_folios(struct folio_batch *folios)
for (i = 0; i < folios->nr; i++) {
struct folio *folio = folios->folios[i];
struct zone *zone = folio_zone(folio);
+ unsigned long pfn = folio_pfn(folio);
unsigned int order = (unsigned long)folio->private;
+ int migratetype;
folio->private = NULL;
- migratetype = get_pcppage_migratetype(&folio->page);
+ migratetype = get_pfnblock_migratetype(&folio->page, pfn);
/* Different zone requires a different pcp lock */
- if (zone != locked_zone) {
+ if (zone != locked_zone ||
+ is_migrate_isolate(migratetype)) {
if (pcp) {
pcp_spin_unlock(pcp);
pcp_trylock_finish(UP_flags);
+ locked_zone = NULL;
+ pcp = NULL;
+ }
+
+ /*
+ * Free isolated pages directly to the
+ * allocator, see comment in free_unref_page.
+ */
+ if (is_migrate_isolate(migratetype)) {
+ free_one_page(zone, &folio->page, pfn,
+ order, FPI_NONE);
+ continue;
}
/*
@@ -2577,10 +2661,8 @@ void free_unref_folios(struct folio_batch *folios)
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (unlikely(!pcp)) {
pcp_trylock_finish(UP_flags);
- free_one_page(zone, &folio->page,
- folio_pfn(folio), order,
- migratetype, FPI_NONE);
- locked_zone = NULL;
+ free_one_page(zone, &folio->page, pfn,
+ order, FPI_NONE);
continue;
}
locked_zone = zone;
@@ -2623,6 +2705,7 @@ void split_page(struct page *page, unsigned int order)
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
split_page_owner(page, order, 0);
+ pgalloc_tag_split(page, 1 << order);
split_page_memcg(page, order, 0);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -2643,11 +2726,9 @@ int __isolate_free_page(struct page *page, unsigned int order)
watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
return 0;
-
- __mod_zone_freepage_state(zone, -(1UL << order), mt);
}
- del_page_from_free_list(page, zone, order);
+ del_page_from_free_list(page, zone, order, mt);
/*
* Set the pageblock if the isolated page is at least half of a
@@ -2662,8 +2743,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
* with others)
*/
if (migratetype_is_mergeable(mt))
- set_pageblock_migratetype(page,
- MIGRATE_MOVABLE);
+ move_freepages_block(zone, page, mt,
+ MIGRATE_MOVABLE);
}
}
@@ -2747,8 +2828,6 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
return NULL;
}
}
- __mod_zone_freepage_state(zone, -(1 << order),
- get_pcppage_migratetype(page));
spin_unlock_irqrestore(&zone->lock, flags);
} while (check_new_pages(page, order));
@@ -4384,7 +4463,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
*
* Returns the number of pages on the list or array.
*/
-unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
+unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
nodemask_t *nodemask, int nr_pages,
struct list_head *page_list,
struct page **page_array)
@@ -4520,7 +4599,7 @@ failed_irq:
pcp_trylock_finish(UP_flags);
failed:
- page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
+ page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask);
if (page) {
if (page_list)
list_add(&page->lru, page_list);
@@ -4531,13 +4610,13 @@ failed:
goto out;
}
-EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
+EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
/*
* This is the 'heart' of the zoned buddy allocator.
*/
-struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
- nodemask_t *nodemask)
+struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
+ int preferred_nid, nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
@@ -4599,38 +4678,38 @@ out:
return page;
}
-EXPORT_SYMBOL(__alloc_pages);
+EXPORT_SYMBOL(__alloc_pages_noprof);
-struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
+struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
- struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
+ struct page *page = __alloc_pages_noprof(gfp | __GFP_COMP, order,
preferred_nid, nodemask);
return page_rmappable_folio(page);
}
-EXPORT_SYMBOL(__folio_alloc);
+EXPORT_SYMBOL(__folio_alloc_noprof);
/*
* Common helper functions. Never use with __GFP_HIGHMEM because the returned
* address cannot represent highmem pages. Use alloc_pages and then kmap if
* you need to access high mem.
*/
-unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
+unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order)
{
struct page *page;
- page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
+ page = alloc_pages_noprof(gfp_mask & ~__GFP_HIGHMEM, order);
if (!page)
return 0;
return (unsigned long) page_address(page);
}
-EXPORT_SYMBOL(__get_free_pages);
+EXPORT_SYMBOL(get_free_pages_noprof);
-unsigned long get_zeroed_page(gfp_t gfp_mask)
+unsigned long get_zeroed_page_noprof(gfp_t gfp_mask)
{
- return __get_free_page(gfp_mask | __GFP_ZERO);
+ return get_free_pages_noprof(gfp_mask | __GFP_ZERO, 0);
}
-EXPORT_SYMBOL(get_zeroed_page);
+EXPORT_SYMBOL(get_zeroed_page_noprof);
/**
* __free_pages - Free pages allocated with alloc_pages().
@@ -4656,12 +4735,15 @@ void __free_pages(struct page *page, unsigned int order)
{
/* get PageHead before we drop reference */
int head = PageHead(page);
+ struct alloc_tag *tag = pgalloc_tag_get(page);
if (put_page_testzero(page))
- free_the_page(page, order);
- else if (!head)
+ free_unref_page(page, order);
+ else if (!head) {
+ pgalloc_tag_sub_pages(tag, (1 << order) - 1);
while (order-- > 0)
- free_the_page(page + (1 << order), order);
+ free_unref_page(page + (1 << order), order);
+ }
}
EXPORT_SYMBOL(__free_pages);
@@ -4722,7 +4804,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
if (page_ref_sub_and_test(page, count))
- free_the_page(page, compound_order(page));
+ free_unref_page(page, compound_order(page));
}
EXPORT_SYMBOL(__page_frag_cache_drain);
@@ -4763,7 +4845,7 @@ refill:
goto refill;
if (unlikely(nc->pfmemalloc)) {
- free_the_page(page, compound_order(page));
+ free_unref_page(page, compound_order(page));
goto refill;
}
@@ -4807,7 +4889,7 @@ void page_frag_free(void *addr)
struct page *page = virt_to_head_page(addr);
if (unlikely(put_page_testzero(page)))
- free_the_page(page, compound_order(page));
+ free_unref_page(page, compound_order(page));
}
EXPORT_SYMBOL(page_frag_free);
@@ -4820,6 +4902,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
struct page *last = page + nr;
split_page_owner(page, order, 0);
+ pgalloc_tag_split(page, 1 << order);
split_page_memcg(page, order, 0);
while (page < --last)
set_page_refcounted(last);
@@ -4846,7 +4929,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
*
* Return: pointer to the allocated area or %NULL in case of error.
*/
-void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
+void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask)
{
unsigned int order = get_order(size);
unsigned long addr;
@@ -4854,10 +4937,10 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
- addr = __get_free_pages(gfp_mask, order);
+ addr = get_free_pages_noprof(gfp_mask, order);
return make_alloc_exact(addr, order, size);
}
-EXPORT_SYMBOL(alloc_pages_exact);
+EXPORT_SYMBOL(alloc_pages_exact_noprof);
/**
* alloc_pages_exact_nid - allocate an exact number of physically-contiguous
@@ -4871,7 +4954,7 @@ EXPORT_SYMBOL(alloc_pages_exact);
*
* Return: pointer to the allocated area or %NULL in case of error.
*/
-void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
+void * __meminit alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask)
{
unsigned int order = get_order(size);
struct page *p;
@@ -4879,7 +4962,7 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
- p = alloc_pages_node(nid, gfp_mask, order);
+ p = alloc_pages_node_noprof(nid, gfp_mask, order);
if (!p)
return NULL;
return make_alloc_exact((unsigned long)page_address(p), order, size);
@@ -5180,37 +5263,13 @@ static void setup_min_slab_ratio(void);
static void build_zonelists(pg_data_t *pgdat)
{
- int node, local_node;
struct zoneref *zonerefs;
int nr_zones;
- local_node = pgdat->node_id;
-
zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
nr_zones = build_zonerefs_node(pgdat, zonerefs);
zonerefs += nr_zones;
- /*
- * Now we build the zonelist so that it contains the zones
- * of all the other nodes.
- * We don't want to pressure a particular node, so when
- * building the zones for node N, we make sure that the
- * zones coming right after the local ones are those from
- * node N+1 (modulo N)
- */
- for (node = local_node + 1; node < MAX_NUMNODES; node++) {
- if (!node_online(node))
- continue;
- nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
- zonerefs += nr_zones;
- }
- for (node = 0; node < local_node; node++) {
- if (!node_online(node))
- continue;
- nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
- zonerefs += nr_zones;
- }
-
zonerefs->zone = NULL;
zonerefs->zone_idx = 0;
}
@@ -5827,10 +5886,11 @@ static void setup_per_zone_lowmem_reserve(void)
for (j = i + 1; j < MAX_NR_ZONES; j++) {
struct zone *upper_zone = &pgdat->node_zones[j];
+ bool empty = !zone_managed_pages(upper_zone);
managed_pages += zone_managed_pages(upper_zone);
- if (clear)
+ if (clear || empty)
zone->lowmem_reserve[j] = 0;
else
zone->lowmem_reserve[j] = managed_pages / ratio;
@@ -6211,7 +6271,6 @@ static struct ctl_table page_alloc_sysctl_table[] = {
.extra2 = SYSCTL_ONE_HUNDRED,
},
#endif
- {}
};
void __init page_alloc_sysctl_init(void)
@@ -6251,6 +6310,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
struct migration_target_control mtc = {
.nid = zone_to_nid(cc->zone),
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ .reason = MR_CONTIG_RANGE,
};
struct page *page;
unsigned long total_mapped = 0;
@@ -6336,11 +6396,10 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
* pages which PFN is in [start, end) are allocated for the caller and
* need to be freed with free_contig_range().
*/
-int alloc_contig_range(unsigned long start, unsigned long end,
+int alloc_contig_range_noprof(unsigned long start, unsigned long end,
unsigned migratetype, gfp_t gfp_mask)
{
unsigned long outer_start, outer_end;
- int order;
int ret = 0;
struct compact_control cc = {
@@ -6413,29 +6472,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* We don't have to hold zone->lock here because the pages are
* isolated thus they won't get removed from buddy.
*/
-
- order = 0;
- outer_start = start;
- while (!PageBuddy(pfn_to_page(outer_start))) {
- if (++order > MAX_PAGE_ORDER) {
- outer_start = start;
- break;
- }
- outer_start &= ~0UL << order;
- }
-
- if (outer_start != start) {
- order = buddy_order(pfn_to_page(outer_start));
-
- /*
- * outer_start page could be small order buddy page and
- * it doesn't include start page. Adjust outer_start
- * in this case to report failed page properly
- * on tracepoint in test_pages_isolated()
- */
- if (outer_start + (1UL << order) <= start)
- outer_start = start;
- }
+ outer_start = find_large_buddy(start);
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, 0)) {
@@ -6460,15 +6497,15 @@ done:
undo_isolate_page_range(start, end, migratetype);
return ret;
}
-EXPORT_SYMBOL(alloc_contig_range);
+EXPORT_SYMBOL(alloc_contig_range_noprof);
static int __alloc_contig_pages(unsigned long start_pfn,
unsigned long nr_pages, gfp_t gfp_mask)
{
unsigned long end_pfn = start_pfn + nr_pages;
- return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
- gfp_mask);
+ return alloc_contig_range_noprof(start_pfn, end_pfn, MIGRATE_MOVABLE,
+ gfp_mask);
}
static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
@@ -6523,8 +6560,8 @@ static bool zone_spans_last_pfn(const struct zone *zone,
*
* Return: pointer to contiguous pages on success, or NULL if not successful.
*/
-struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
- int nid, nodemask_t *nodemask)
+struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
{
unsigned long ret, pfn, flags;
struct zonelist *zonelist;
@@ -6655,8 +6692,9 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
+ VM_WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE);
order = buddy_order(page);
- del_page_from_free_list(page, zone, order);
+ del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE);
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -6666,16 +6704,16 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
/*
* This function returns a stable result only if called under zone lock.
*/
-bool is_free_buddy_page(struct page *page)
+bool is_free_buddy_page(const struct page *page)
{
unsigned long pfn = page_to_pfn(page);
unsigned int order;
for (order = 0; order < NR_PAGE_ORDERS; order++) {
- struct page *page_head = page - (pfn & ((1 << order) - 1));
+ const struct page *head = page - (pfn & ((1 << order) - 1));
- if (PageBuddy(page_head) &&
- buddy_order_unsafe(page_head) >= order)
+ if (PageBuddy(head) &&
+ buddy_order_unsafe(head) >= order)
break;
}
@@ -6684,6 +6722,14 @@ bool is_free_buddy_page(struct page *page)
EXPORT_SYMBOL(is_free_buddy_page);
#ifdef CONFIG_MEMORY_FAILURE
+static inline void add_to_free_list(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype,
+ bool tail)
+{
+ __add_to_free_list(page, zone, order, migratetype, tail);
+ account_freepages(zone, 1 << order, migratetype);
+}
+
/*
* Break down a higher-order page in sub-pages, and keep our target out of
* buddy allocator.
@@ -6706,10 +6752,10 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page,
current_buddy = page + size;
}
- if (set_page_guard(zone, current_buddy, high, migratetype))
+ if (set_page_guard(zone, current_buddy, high))
continue;
- add_to_free_list(current_buddy, zone, high, migratetype);
+ add_to_free_list(current_buddy, zone, high, migratetype, false);
set_buddy_order(current_buddy, high);
}
}
@@ -6735,12 +6781,11 @@ bool take_page_off_buddy(struct page *page)
int migratetype = get_pfnblock_migratetype(page_head,
pfn_head);
- del_page_from_free_list(page_head, zone, page_order);
+ del_page_from_free_list(page_head, zone, page_order,
+ migratetype);
break_down_buddy_pages(zone, page_head, page, 0,
page_order, migratetype);
SetPageHWPoisonTakenOff(page);
- if (!is_migrate_isolate(migratetype))
- __mod_zone_freepage_state(zone, -1, migratetype);
ret = true;
break;
}
@@ -6757,13 +6802,14 @@ bool take_page_off_buddy(struct page *page)
bool put_page_back_buddy(struct page *page)
{
struct zone *zone = page_zone(page);
- unsigned long pfn = page_to_pfn(page);
unsigned long flags;
- int migratetype = get_pfnblock_migratetype(page, pfn);
bool ret = false;
spin_lock_irqsave(&zone->lock, flags);
if (put_page_testzero(page)) {
+ unsigned long pfn = page_to_pfn(page);
+ int migratetype = get_pfnblock_migratetype(page, pfn);
+
ClearPageHWPoisonTakenOff(page);
__free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
if (TestClearPageHWPoison(page)) {
@@ -6847,7 +6893,7 @@ static bool try_to_accept_memory_one(struct zone *zone)
list_del(&page->lru);
last = list_empty(&zone->unaccepted_pages);
- __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+ account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -6899,7 +6945,7 @@ static bool __free_unaccepted(struct page *page)
spin_lock_irqsave(&zone->lock, flags);
first = list_empty(&zone->unaccepted_pages);
list_add_tail(&page->lru, &zone->unaccepted_pages);
- __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+ account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
spin_unlock_irqrestore(&zone->lock, flags);
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 4548fcc66d74d0..95dd8ffeaf811a 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -10,6 +10,7 @@
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/rcupdate.h>
+#include <linux/pgalloc_tag.h>
/*
* struct page extension
@@ -82,6 +83,9 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
&page_idle_ops,
#endif
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+ &page_alloc_tagging_ops,
+#endif
#ifdef CONFIG_PAGE_TABLE_CHECK
&page_table_check_ops,
#endif
@@ -91,7 +95,16 @@ unsigned long page_ext_size;
static unsigned long total_usage;
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+/*
+ * To ensure correct allocation tagging for pages, page_ext should be available
+ * before the first page allocation. Otherwise early task stacks will be
+ * allocated before page_ext initialization and missing tags will be flagged.
+ */
+bool early_page_ext __meminitdata = true;
+#else
bool early_page_ext __meminitdata;
+#endif
static int __init setup_early_page_ext(char *str)
{
early_page_ext = true;
@@ -501,7 +514,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
* Context: Any context. Caller may not sleep until they have called
* page_ext_put().
*/
-struct page_ext *page_ext_get(struct page *page)
+struct page_ext *page_ext_get(const struct page *page)
{
struct page_ext *page_ext;
diff --git a/mm/page_io.c b/mm/page_io.c
index ae2b49055e4357..a9a7c236aeccfb 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -189,7 +189,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
* Arch code may have to preserve more data than just the page
* contents, e.g. memory tags.
*/
- ret = arch_prepare_to_swap(&folio->page);
+ ret = arch_prepare_to_swap(folio);
if (ret) {
folio_mark_dirty(folio);
folio_unlock(folio);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index a5c8fa4c2a75c3..042937d5abe4b8 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -178,15 +178,11 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
migratetype, isol_flags);
if (!unmovable) {
- unsigned long nr_pages;
- int mt = get_pageblock_migratetype(page);
-
- set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+ if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return -EBUSY;
+ }
zone->nr_isolate_pageblock++;
- nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE,
- NULL);
-
- __mod_zone_freepage_state(zone, -nr_pages, mt);
spin_unlock_irqrestore(&zone->lock, flags);
return 0;
}
@@ -206,7 +202,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
static void unset_migratetype_isolate(struct page *page, int migratetype)
{
struct zone *zone;
- unsigned long flags, nr_pages;
+ unsigned long flags;
bool isolated_page = false;
unsigned int order;
struct page *buddy;
@@ -252,12 +248,15 @@ static void unset_migratetype_isolate(struct page *page, int migratetype)
* allocation.
*/
if (!isolated_page) {
- nr_pages = move_freepages_block(zone, page, migratetype, NULL);
- __mod_zone_freepage_state(zone, nr_pages, migratetype);
- }
- set_pageblock_migratetype(page, migratetype);
- if (isolated_page)
+ /*
+ * Isolating this block already succeeded, so this
+ * should not fail on zone boundaries.
+ */
+ WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype));
+ } else {
+ set_pageblock_migratetype(page, migratetype);
__putback_isolated_page(page, order, migratetype);
+ }
zone->nr_isolate_pageblock--;
out:
spin_unlock_irqrestore(&zone->lock, flags);
@@ -367,26 +366,29 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
VM_BUG_ON(!page);
pfn = page_to_pfn(page);
- /*
- * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any
- * free pages in [start_pfn, boundary_pfn), its head page will
- * always be in the range.
- */
+
if (PageBuddy(page)) {
int order = buddy_order(page);
- if (pfn + (1UL << order) > boundary_pfn) {
- /* free page changed before split, check it again */
- if (split_free_page(page, order, boundary_pfn - pfn))
- continue;
- }
+ /* move_freepages_block_isolate() handled this */
+ VM_WARN_ON_ONCE(pfn + (1 << order) > boundary_pfn);
pfn += 1UL << order;
continue;
}
+
/*
- * migrate compound pages then let the free page handling code
- * above do the rest. If migration is not possible, just fail.
+ * If a compound page is straddling our block, attempt
+ * to migrate it out of the way.
+ *
+ * We don't have to worry about this creating a large
+ * free page that straddles into our block: gigantic
+ * pages are freed as order-0 chunks, and LRU pages
+ * (currently) do not exceed pageblock_order.
+ *
+ * The block of interest has already been marked
+ * MIGRATE_ISOLATE above, so when migration is done it
+ * will free its pages onto the correct freelists.
*/
if (PageCompound(page)) {
struct page *head = compound_head(page);
@@ -397,16 +399,10 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
pfn = head_pfn + nr_pages;
continue;
}
+
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
- /*
- * hugetlb, lru compound (THP), and movable compound pages
- * can be migrated. Otherwise, fail the isolation.
- */
- if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) {
- int order;
- unsigned long outer_pfn;
+ if (PageHuge(page)) {
int page_mt = get_pageblock_migratetype(page);
- bool isolate_page = !is_migrate_isolate_page(page);
struct compact_control cc = {
.nr_migratepages = 0,
.order = -1,
@@ -419,56 +415,25 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
};
INIT_LIST_HEAD(&cc.migratepages);
- /*
- * XXX: mark the page as MIGRATE_ISOLATE so that
- * no one else can grab the freed page after migration.
- * Ideally, the page should be freed as two separate
- * pages to be added into separate migratetype free
- * lists.
- */
- if (isolate_page) {
- ret = set_migratetype_isolate(page, page_mt,
- flags, head_pfn, head_pfn + nr_pages);
- if (ret)
- goto failed;
- }
-
ret = __alloc_contig_migrate_range(&cc, head_pfn,
head_pfn + nr_pages, page_mt);
-
- /*
- * restore the page's migratetype so that it can
- * be split into separate migratetype free lists
- * later.
- */
- if (isolate_page)
- unset_migratetype_isolate(page, page_mt);
-
if (ret)
goto failed;
- /*
- * reset pfn to the head of the free page, so
- * that the free page handling code above can split
- * the free page to the right migratetype list.
- *
- * head_pfn is not used here as a hugetlb page order
- * can be bigger than MAX_PAGE_ORDER, but after it is
- * freed, the free page order is not. Use pfn within
- * the range to find the head of the free page.
- */
- order = 0;
- outer_pfn = pfn;
- while (!PageBuddy(pfn_to_page(outer_pfn))) {
- /* stop if we cannot find the free page */
- if (++order > MAX_PAGE_ORDER)
- goto failed;
- outer_pfn &= ~0UL << order;
- }
- pfn = outer_pfn;
+ pfn = head_pfn + nr_pages;
continue;
- } else
+ }
+
+ /*
+ * These pages are movable too, but they're
+ * not expected to exceed pageblock_order.
+ *
+ * Let us know when they do, so we can add
+ * proper free and split handling for them.
+ */
+ VM_WARN_ON_ONCE_PAGE(PageLRU(page), page);
+ VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page);
#endif
- goto failed;
+ goto failed;
}
pfn++;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 8eed0f3dc0853c..0a448d76e0af73 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -515,7 +515,7 @@ static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
if (!memcg_data)
goto out_unlock;
- if (memcg_data & MEMCG_DATA_OBJCGS)
+ if (memcg_data & MEMCG_DATA_OBJEXTS)
ret += scnprintf(kbuf + ret, count - ret,
"Slab cache page\n");
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 74d2de15fb5e09..53b8868ede61c6 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -325,6 +325,8 @@ next_pte:
*/
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
{
+ struct folio *folio = page_folio(page);
+ pgoff_t pgoff = folio->index + folio_page_idx(folio, page);
struct page_vma_mapped_walk pvmw = {
.pfn = page_to_pfn(page),
.nr_pages = 1,
@@ -332,7 +334,7 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
.flags = PVMW_SYNC,
};
- pvmw.address = vma_address(page, vma);
+ pvmw.address = vma_address(vma, pgoff, 1);
if (pvmw.address == -EFAULT)
return 0;
if (!page_vma_mapped_walk(&pvmw))
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index cdd0aa597a8196..7e42f0ca3b7bb3 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -32,6 +32,19 @@ struct pcpu_block_md {
int nr_bits; /* total bits responsible for */
};
+struct pcpuobj_ext {
+#ifdef CONFIG_MEMCG_KMEM
+ struct obj_cgroup *cgroup;
+#endif
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+ union codetag_ref tag;
+#endif
+};
+
+#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MEM_ALLOC_PROFILING)
+#define NEED_PCPUOBJ_EXT
+#endif
+
struct pcpu_chunk {
#ifdef CONFIG_PERCPU_STATS
int nr_alloc; /* # of allocations */
@@ -64,8 +77,8 @@ struct pcpu_chunk {
int end_offset; /* additional area required to
have the region end page
aligned */
-#ifdef CONFIG_MEMCG_KMEM
- struct obj_cgroup **obj_cgroups; /* vector of object cgroups */
+#ifdef NEED_PCPUOBJ_EXT
+ struct pcpuobj_ext *obj_exts; /* vector of object cgroups */
#endif
int nr_pages; /* # of pages served by this chunk */
@@ -74,6 +87,15 @@ struct pcpu_chunk {
unsigned long populated[]; /* populated bitmap */
};
+static inline bool need_pcpuobj_ext(void)
+{
+ if (IS_ENABLED(CONFIG_MEM_ALLOC_PROFILING))
+ return true;
+ if (!mem_cgroup_kmem_disabled())
+ return true;
+ return false;
+}
+
extern spinlock_t pcpu_lock;
extern struct list_head *pcpu_chunk_lists;
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 2054c9213c4339..cd69caf6aa8d8e 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -231,10 +231,10 @@ static int pcpu_map_pages(struct pcpu_chunk *chunk,
return 0;
err:
for_each_possible_cpu(tcpu) {
- if (tcpu == cpu)
- break;
__pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
page_end - page_start);
+ if (tcpu == cpu)
+ break;
}
pcpu_post_unmap_tlb_flush(chunk, page_start, page_end);
return err;
diff --git a/mm/percpu.c b/mm/percpu.c
index 4e11fc1e6deff0..474e3683b74d17 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1392,9 +1392,9 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
panic("%s: Failed to allocate %zu bytes\n", __func__,
alloc_size);
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef NEED_PCPUOBJ_EXT
/* first chunk is free to use */
- chunk->obj_cgroups = NULL;
+ chunk->obj_exts = NULL;
#endif
pcpu_init_md_blocks(chunk);
@@ -1463,12 +1463,12 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
if (!chunk->md_blocks)
goto md_blocks_fail;
-#ifdef CONFIG_MEMCG_KMEM
- if (!mem_cgroup_kmem_disabled()) {
- chunk->obj_cgroups =
+#ifdef NEED_PCPUOBJ_EXT
+ if (need_pcpuobj_ext()) {
+ chunk->obj_exts =
pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
- sizeof(struct obj_cgroup *), gfp);
- if (!chunk->obj_cgroups)
+ sizeof(struct pcpuobj_ext), gfp);
+ if (!chunk->obj_exts)
goto objcg_fail;
}
#endif
@@ -1480,7 +1480,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
return chunk;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef NEED_PCPUOBJ_EXT
objcg_fail:
pcpu_mem_free(chunk->md_blocks);
#endif
@@ -1498,8 +1498,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
-#ifdef CONFIG_MEMCG_KMEM
- pcpu_mem_free(chunk->obj_cgroups);
+#ifdef NEED_PCPUOBJ_EXT
+ pcpu_mem_free(chunk->obj_exts);
#endif
pcpu_mem_free(chunk->md_blocks);
pcpu_mem_free(chunk->bound_map);
@@ -1646,9 +1646,9 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
if (!objcg)
return;
- if (likely(chunk && chunk->obj_cgroups)) {
+ if (likely(chunk && chunk->obj_exts)) {
obj_cgroup_get(objcg);
- chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
+ chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = objcg;
rcu_read_lock();
mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
@@ -1663,13 +1663,13 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
struct obj_cgroup *objcg;
- if (unlikely(!chunk->obj_cgroups))
+ if (unlikely(!chunk->obj_exts))
return;
- objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
+ objcg = chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup;
if (!objcg)
return;
- chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
+ chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = NULL;
obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
@@ -1699,6 +1699,32 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
}
#endif /* CONFIG_MEMCG_KMEM */
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
+ size_t size)
+{
+ if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) {
+ alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag,
+ current->alloc_tag, size);
+ }
+}
+
+static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
+{
+ if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts))
+ alloc_tag_sub(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size);
+}
+#else
+static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
+ size_t size)
+{
+}
+
+static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
+{
+}
+#endif
+
/**
* pcpu_alloc - the percpu allocator
* @size: size of area to allocate in bytes
@@ -1714,7 +1740,7 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
-static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
+void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
gfp_t gfp)
{
gfp_t pcpu_gfp;
@@ -1881,6 +1907,8 @@ area_found:
pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
+ pcpu_alloc_tag_alloc_hook(chunk, off, size);
+
return ptr;
fail_unlock:
@@ -1909,61 +1937,7 @@ fail:
return NULL;
}
-
-/**
- * __alloc_percpu_gfp - allocate dynamic percpu area
- * @size: size of area to allocate in bytes
- * @align: alignment of area (max PAGE_SIZE)
- * @gfp: allocation flags
- *
- * Allocate zero-filled percpu area of @size bytes aligned at @align. If
- * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
- * be called from any context but is a lot more likely to fail. If @gfp
- * has __GFP_NOWARN then no warning will be triggered on invalid or failed
- * allocation requests.
- *
- * RETURNS:
- * Percpu pointer to the allocated area on success, NULL on failure.
- */
-void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
-{
- return pcpu_alloc(size, align, false, gfp);
-}
-EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
-
-/**
- * __alloc_percpu - allocate dynamic percpu area
- * @size: size of area to allocate in bytes
- * @align: alignment of area (max PAGE_SIZE)
- *
- * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
- */
-void __percpu *__alloc_percpu(size_t size, size_t align)
-{
- return pcpu_alloc(size, align, false, GFP_KERNEL);
-}
-EXPORT_SYMBOL_GPL(__alloc_percpu);
-
-/**
- * __alloc_reserved_percpu - allocate reserved percpu area
- * @size: size of area to allocate in bytes
- * @align: alignment of area (max PAGE_SIZE)
- *
- * Allocate zero-filled percpu area of @size bytes aligned at @align
- * from reserved percpu area if arch has set it up; otherwise,
- * allocation is served from the same dynamic area. Might sleep.
- * Might trigger writeouts.
- *
- * CONTEXT:
- * Does GFP_KERNEL allocation.
- *
- * RETURNS:
- * Percpu pointer to the allocated area on success, NULL on failure.
- */
-void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
-{
- return pcpu_alloc(size, align, true, GFP_KERNEL);
-}
+EXPORT_SYMBOL_GPL(pcpu_alloc_noprof);
/**
* pcpu_balance_free - manage the amount of free chunks
@@ -2302,6 +2276,8 @@ void free_percpu(void __percpu *ptr)
spin_lock_irqsave(&pcpu_lock, flags);
size = pcpu_free_area(chunk, off);
+ pcpu_alloc_tag_free_hook(chunk, off, size);
+
pcpu_memcg_free_hook(chunk, off, size);
/*
diff --git a/mm/readahead.c b/mm/readahead.c
index d55138e9560b54..c1b23989d9caf3 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -228,6 +228,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
*/
for (i = 0; i < nr_to_read; i++) {
struct folio *folio = xa_load(&mapping->i_pages, index + i);
+ int ret;
if (folio && !xa_is_value(folio)) {
/*
@@ -247,9 +248,12 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
folio = filemap_alloc_folio(gfp_mask, 0);
if (!folio)
break;
- if (filemap_add_folio(mapping, folio, index + i,
- gfp_mask) < 0) {
+
+ ret = filemap_add_folio(mapping, folio, index + i, gfp_mask);
+ if (ret < 0) {
folio_put(folio);
+ if (ret == -ENOMEM)
+ break;
read_pages(ractl);
ractl->_index++;
i = ractl->_index + ractl->_nr_pages - index - 1;
diff --git a/mm/rmap.c b/mm/rmap.c
index 3746a553101832..56b313aa2ebfb2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,7 +23,7 @@
* inode->i_rwsem (while writing or truncating, not reading or faulting)
* mm->mmap_lock
* mapping->invalidate_lock (in filemap_fault)
- * page->flags PG_locked (lock_page)
+ * folio_lock
* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
* vma_start_write
* mapping->i_mmap_rwsem
@@ -50,7 +50,7 @@
* hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
* vma_lock (hugetlb specific lock for pmd_sharing)
* mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
- * page->flags PG_locked (lock_page)
+ * folio_lock
*/
#include <linux/mm.h>
@@ -775,6 +775,8 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
struct folio *folio = page_folio(page);
+ pgoff_t pgoff;
+
if (folio_test_anon(folio)) {
struct anon_vma *page__anon_vma = folio_anon_vma(folio);
/*
@@ -790,7 +792,9 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
return -EFAULT;
}
- return vma_address(page, vma);
+ /* The !page__anon_vma above handles KSM folios */
+ pgoff = folio->index + folio_page_idx(folio, page);
+ return vma_address(vma, pgoff, 1);
}
/*
@@ -1128,13 +1132,13 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
if (invalid_mkclean_vma(vma, NULL))
return 0;
- pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma);
+ pvmw.address = vma_address(vma, pgoff, nr_pages);
VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);
return page_vma_mkclean_one(&pvmw);
}
-int folio_total_mapcount(struct folio *folio)
+int folio_total_mapcount(const struct folio *folio)
{
int mapcount = folio_entire_mapcount(folio);
int nr_pages;
@@ -2588,7 +2592,8 @@ static void rmap_walk_anon(struct folio *folio,
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
pgoff_start, pgoff_end) {
struct vm_area_struct *vma = avc->vma;
- unsigned long address = vma_address(&folio->page, vma);
+ unsigned long address = vma_address(vma, pgoff_start,
+ folio_nr_pages(folio));
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
@@ -2649,7 +2654,8 @@ static void rmap_walk_file(struct folio *folio,
lookup:
vma_interval_tree_foreach(vma, &mapping->i_mmap,
pgoff_start, pgoff_end) {
- unsigned long address = vma_address(&folio->page, vma);
+ unsigned long address = vma_address(vma, pgoff_start,
+ folio_nr_pages(folio));
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
diff --git a/mm/shmem.c b/mm/shmem.c
index 94ab99b6b574a4..fa2a0ed97507d1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1907,7 +1907,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
* Some architectures may have to restore extra metadata to the
* folio after reading from swap.
*/
- arch_swap_restore(swap, folio);
+ arch_swap_restore(folio_swap(swap, folio), folio);
if (shmem_should_replace_folio(folio, gfp)) {
error = shmem_replace_folio(&folio, gfp, info, index);
@@ -2267,8 +2267,6 @@ unsigned long shmem_get_unmapped_area(struct file *file,
unsigned long uaddr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
- unsigned long (*get_area)(struct file *,
- unsigned long, unsigned long, unsigned long, unsigned long);
unsigned long addr;
unsigned long offset;
unsigned long inflated_len;
@@ -2278,8 +2276,8 @@ unsigned long shmem_get_unmapped_area(struct file *file,
if (len > TASK_SIZE)
return -ENOMEM;
- get_area = current->mm->get_unmapped_area;
- addr = get_area(file, uaddr, len, pgoff, flags);
+ addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff,
+ flags);
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return addr;
@@ -2336,7 +2334,8 @@ unsigned long shmem_get_unmapped_area(struct file *file,
if (inflated_len < len)
return addr;
- inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
+ inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr,
+ inflated_len, 0, flags);
if (IS_ERR_VALUE(inflated_addr))
return addr;
if (inflated_addr & ~PAGE_MASK)
@@ -4801,7 +4800,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
- return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+ return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
}
#endif
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 8dcfafbd283c12..bdb439551eefcb 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -423,4 +423,30 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
#ifdef CONFIG_MEMORY_FAILURE
printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
#endif
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+ {
+ struct codetag_bytes tags[10];
+ size_t i, nr;
+
+ nr = alloc_tag_top_users(tags, ARRAY_SIZE(tags), false);
+ if (nr) {
+ pr_notice("Memory allocations:\n");
+ for (i = 0; i < nr; i++) {
+ struct codetag *ct = tags[i].ct;
+ struct alloc_tag *tag = ct_to_alloc_tag(ct);
+ struct alloc_tag_counters counter = alloc_tag_read(tag);
+
+ /* Same as alloc_tag_to_text() but w/o intermediate buffer */
+ if (ct->modname)
+ pr_notice("%12lli %8llu %s:%u [%s] func:%s\n",
+ counter.bytes, counter.calls, ct->filename,
+ ct->lineno, ct->modname, ct->function);
+ else
+ pr_notice("%12lli %8llu %s:%u func:%s\n",
+ counter.bytes, counter.calls, ct->filename,
+ ct->lineno, ct->function);
+ }
+ }
+ }
+#endif
}
diff --git a/mm/slab.h b/mm/slab.h
index d2bc9b19122292..411251b9bdd15c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -84,11 +84,11 @@ struct slab {
};
struct rcu_head rcu_head;
};
- unsigned int __unused;
+ unsigned int __page_type;
atomic_t __page_refcount;
-#ifdef CONFIG_MEMCG
- unsigned long memcg_data;
+#ifdef CONFIG_SLAB_OBJ_EXT
+ unsigned long obj_exts;
#endif
};
@@ -97,8 +97,8 @@ struct slab {
SLAB_MATCH(flags, __page_flags);
SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */
SLAB_MATCH(_refcount, __page_refcount);
-#ifdef CONFIG_MEMCG
-SLAB_MATCH(memcg_data, memcg_data);
+#ifdef CONFIG_SLAB_OBJ_EXT
+SLAB_MATCH(memcg_data, obj_exts);
#endif
#undef SLAB_MATCH
static_assert(sizeof(struct slab) <= sizeof(struct page));
@@ -536,42 +536,54 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
return false;
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_SLAB_OBJ_EXT
+
/*
- * slab_objcgs - get the object cgroups vector associated with a slab
+ * slab_obj_exts - get the pointer to the slab object extension vector
+ * associated with a slab.
* @slab: a pointer to the slab struct
*
- * Returns a pointer to the object cgroups vector associated with the slab,
+ * Returns a pointer to the object extension vector associated with the slab,
* or NULL if no such vector has been associated yet.
*/
-static inline struct obj_cgroup **slab_objcgs(struct slab *slab)
+static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
{
- unsigned long memcg_data = READ_ONCE(slab->memcg_data);
+ unsigned long obj_exts = READ_ONCE(slab->obj_exts);
- VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS),
+#ifdef CONFIG_MEMCG
+ VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS),
slab_page(slab));
- VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab));
-
- return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab));
+#endif
+ return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK);
}
-int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
- gfp_t gfp, bool new_slab);
-void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
- enum node_stat_item idx, int nr);
-#else /* CONFIG_MEMCG_KMEM */
-static inline struct obj_cgroup **slab_objcgs(struct slab *slab)
+int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
+ gfp_t gfp, bool new_slab);
+
+#else /* CONFIG_SLAB_OBJ_EXT */
+
+static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
{
return NULL;
}
-static inline int memcg_alloc_slab_cgroups(struct slab *slab,
- struct kmem_cache *s, gfp_t gfp,
- bool new_slab)
+#endif /* CONFIG_SLAB_OBJ_EXT */
+
+static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s)
{
- return 0;
+ return (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B;
}
-#endif /* CONFIG_MEMCG_KMEM */
+
+#ifdef CONFIG_MEMCG_KMEM
+bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
+ gfp_t flags, size_t size, void **p);
+void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
+ void **p, int objects, struct slabobj_ext *obj_exts);
+void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr);
+#endif
size_t __ksize(const void *objp);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f5234672f03cea..3179a6aeffc56e 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1189,7 +1189,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
return (void *)p;
}
- ret = kmalloc_track_caller(new_size, flags);
+ ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_);
if (ret && p) {
/* Disable KASAN checks as the object's redzone is accessed. */
kasan_disable_current();
@@ -1213,7 +1213,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
*
* Return: pointer to the allocated memory or %NULL in case of error
*/
-void *krealloc(const void *p, size_t new_size, gfp_t flags)
+void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
{
void *ret;
@@ -1228,7 +1228,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
return ret;
}
-EXPORT_SYMBOL(krealloc);
+EXPORT_SYMBOL(krealloc_noprof);
/**
* kfree_sensitive - Clear sensitive information in memory before freeing
diff --git a/mm/slub.c b/mm/slub.c
index 1bb2a93cf7b6a4..3e33ff900d35ea 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -616,18 +616,12 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
*/
static __always_inline void slab_lock(struct slab *slab)
{
- struct page *page = slab_page(slab);
-
- VM_BUG_ON_PAGE(PageTail(page), page);
- bit_spin_lock(PG_locked, &page->flags);
+ bit_spin_lock(PG_locked, &slab->__page_flags);
}
static __always_inline void slab_unlock(struct slab *slab)
{
- struct page *page = slab_page(slab);
-
- VM_BUG_ON_PAGE(PageTail(page), page);
- bit_spin_unlock(PG_locked, &page->flags);
+ bit_spin_unlock(PG_locked, &slab->__page_flags);
}
static inline bool
@@ -1865,198 +1859,278 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
#endif
#endif /* CONFIG_SLUB_DEBUG */
-static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s)
+#ifdef CONFIG_SLAB_OBJ_EXT
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+
+static inline void mark_objexts_empty(struct slabobj_ext *obj_exts)
{
- return (s->flags & SLAB_RECLAIM_ACCOUNT) ?
- NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B;
+ struct slabobj_ext *slab_exts;
+ struct slab *obj_exts_slab;
+
+ obj_exts_slab = virt_to_slab(obj_exts);
+ slab_exts = slab_obj_exts(obj_exts_slab);
+ if (slab_exts) {
+ unsigned int offs = obj_to_index(obj_exts_slab->slab_cache,
+ obj_exts_slab, obj_exts);
+ /* codetag should be NULL */
+ WARN_ON(slab_exts[offs].ref.ct);
+ set_codetag_empty(&slab_exts[offs].ref);
+ }
}
-#ifdef CONFIG_MEMCG_KMEM
-static inline void memcg_free_slab_cgroups(struct slab *slab)
+static inline void mark_failed_objexts_alloc(struct slab *slab)
{
- kfree(slab_objcgs(slab));
- slab->memcg_data = 0;
+ slab->obj_exts = OBJEXTS_ALLOC_FAIL;
}
-static inline size_t obj_full_size(struct kmem_cache *s)
+static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
+ struct slabobj_ext *vec, unsigned int objects)
{
/*
- * For each accounted object there is an extra space which is used
- * to store obj_cgroup membership. Charge it too.
+ * If vector previously failed to allocate then we have live
+ * objects with no tag reference. Mark all references in this
+ * vector as empty to avoid warnings later on.
*/
- return s->size + sizeof(struct obj_cgroup *);
+ if (obj_exts & OBJEXTS_ALLOC_FAIL) {
+ unsigned int i;
+
+ for (i = 0; i < objects; i++)
+ set_codetag_empty(&vec[i].ref);
+ }
}
+#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
+
+static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {}
+static inline void mark_failed_objexts_alloc(struct slab *slab) {}
+static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
+ struct slabobj_ext *vec, unsigned int objects) {}
+
+#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
+
/*
- * Returns false if the allocation should fail.
+ * The allocated objcg pointers array is not accounted directly.
+ * Moreover, it should not come from DMA buffer and is not readily
+ * reclaimable. So those GFP bits should be masked off.
*/
-static bool __memcg_slab_pre_alloc_hook(struct kmem_cache *s,
- struct list_lru *lru,
- struct obj_cgroup **objcgp,
- size_t objects, gfp_t flags)
+#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \
+ __GFP_ACCOUNT | __GFP_NOFAIL)
+
+int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
+ gfp_t gfp, bool new_slab)
+{
+ unsigned int objects = objs_per_slab(s, slab);
+ unsigned long new_exts;
+ unsigned long old_exts;
+ struct slabobj_ext *vec;
+
+ gfp &= ~OBJCGS_CLEAR_MASK;
+ /* Prevent recursive extension vector allocation */
+ gfp |= __GFP_NO_OBJ_EXT;
+ vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
+ slab_nid(slab));
+ if (!vec) {
+ /* Mark vectors which failed to allocate */
+ if (new_slab)
+ mark_failed_objexts_alloc(slab);
+
+ return -ENOMEM;
+ }
+
+ new_exts = (unsigned long)vec;
+#ifdef CONFIG_MEMCG
+ new_exts |= MEMCG_DATA_OBJEXTS;
+#endif
+ old_exts = slab->obj_exts;
+ handle_failed_objexts_alloc(old_exts, vec, objects);
+ if (new_slab) {
+ /*
+ * If the slab is brand new and nobody can yet access its
+ * obj_exts, no synchronization is required and obj_exts can
+ * be simply assigned.
+ */
+ slab->obj_exts = new_exts;
+ } else if (cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) {
+ /*
+ * If the slab is already in use, somebody can allocate and
+ * assign slabobj_exts in parallel. In this case the existing
+ * objcg vector should be reused.
+ */
+ mark_objexts_empty(vec);
+ kfree(vec);
+ return 0;
+ }
+
+ kmemleak_not_leak(vec);
+ return 0;
+}
+
+static inline void free_slab_obj_exts(struct slab *slab)
{
+ struct slabobj_ext *obj_exts;
+
+ obj_exts = slab_obj_exts(slab);
+ if (!obj_exts)
+ return;
+
/*
- * The obtained objcg pointer is safe to use within the current scope,
- * defined by current task or set_active_memcg() pair.
- * obj_cgroup_get() is used to get a permanent reference.
+ * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its
+ * corresponding extension will be NULL. alloc_tag_sub() will throw a
+ * warning if slab has extensions but the extension of an object is
+ * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that
+ * the extension for obj_exts is expected to be NULL.
*/
- struct obj_cgroup *objcg = current_obj_cgroup();
- if (!objcg)
+ mark_objexts_empty(obj_exts);
+ kfree(obj_exts);
+ slab->obj_exts = 0;
+}
+
+static inline bool need_slab_obj_ext(void)
+{
+ if (mem_alloc_profiling_enabled())
return true;
- if (lru) {
- int ret;
- struct mem_cgroup *memcg;
+ /*
+ * CONFIG_MEMCG_KMEM creates vector of obj_cgroup objects conditionally
+ * inside memcg_slab_post_alloc_hook. No other users for now.
+ */
+ return false;
+}
- memcg = get_mem_cgroup_from_objcg(objcg);
- ret = memcg_list_lru_alloc(memcg, lru, flags);
- css_put(&memcg->css);
+static inline struct slabobj_ext *
+prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
+{
+ struct slab *slab;
- if (ret)
- return false;
- }
+ if (!p)
+ return NULL;
- if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s)))
- return false;
+ if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
+ return NULL;
- *objcgp = objcg;
- return true;
+ if (flags & __GFP_NO_OBJ_EXT)
+ return NULL;
+
+ slab = virt_to_slab(p);
+ if (!slab_obj_exts(slab) &&
+ WARN(alloc_slab_obj_exts(slab, s, flags, false),
+ "%s, %s: Failed to create slab extension vector!\n",
+ __func__, s->name))
+ return NULL;
+
+ return slab_obj_exts(slab) + obj_to_index(s, slab, p);
}
-/*
- * Returns false if the allocation should fail.
- */
-static __fastpath_inline
-bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
- struct obj_cgroup **objcgp, size_t objects,
- gfp_t flags)
+static inline void
+alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
+ int objects)
{
- if (!memcg_kmem_online())
- return true;
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+ struct slabobj_ext *obj_exts;
+ int i;
- if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)))
- return true;
+ if (!mem_alloc_profiling_enabled())
+ return;
- return likely(__memcg_slab_pre_alloc_hook(s, lru, objcgp, objects,
- flags));
+ obj_exts = slab_obj_exts(slab);
+ if (!obj_exts)
+ return;
+
+ for (i = 0; i < objects; i++) {
+ unsigned int off = obj_to_index(s, slab, p[i]);
+
+ alloc_tag_sub(&obj_exts[off].ref, s->size);
+ }
+#endif
}
-static void __memcg_slab_post_alloc_hook(struct kmem_cache *s,
- struct obj_cgroup *objcg,
- gfp_t flags, size_t size,
- void **p)
+#else /* CONFIG_SLAB_OBJ_EXT */
+
+static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
+ gfp_t gfp, bool new_slab)
{
- struct slab *slab;
- unsigned long off;
- size_t i;
+ return 0;
+}
- flags &= gfp_allowed_mask;
+static inline void free_slab_obj_exts(struct slab *slab)
+{
+}
- for (i = 0; i < size; i++) {
- if (likely(p[i])) {
- slab = virt_to_slab(p[i]);
+static inline bool need_slab_obj_ext(void)
+{
+ return false;
+}
- if (!slab_objcgs(slab) &&
- memcg_alloc_slab_cgroups(slab, s, flags, false)) {
- obj_cgroup_uncharge(objcg, obj_full_size(s));
- continue;
- }
+static inline struct slabobj_ext *
+prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
+{
+ return NULL;
+}
- off = obj_to_index(s, slab, p[i]);
- obj_cgroup_get(objcg);
- slab_objcgs(slab)[off] = objcg;
- mod_objcg_state(objcg, slab_pgdat(slab),
- cache_vmstat_idx(s), obj_full_size(s));
- } else {
- obj_cgroup_uncharge(objcg, obj_full_size(s));
- }
- }
+static inline void
+alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
+ int objects)
+{
}
+#endif /* CONFIG_SLAB_OBJ_EXT */
+
+#ifdef CONFIG_MEMCG_KMEM
+
+static void memcg_alloc_abort_single(struct kmem_cache *s, void *object);
+
static __fastpath_inline
-void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg,
+bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
gfp_t flags, size_t size, void **p)
{
- if (likely(!memcg_kmem_online() || !objcg))
- return;
-
- return __memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
-}
+ if (likely(!memcg_kmem_online()))
+ return true;
-static void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
- void **p, int objects,
- struct obj_cgroup **objcgs)
-{
- for (int i = 0; i < objects; i++) {
- struct obj_cgroup *objcg;
- unsigned int off;
+ if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)))
+ return true;
- off = obj_to_index(s, slab, p[i]);
- objcg = objcgs[off];
- if (!objcg)
- continue;
+ if (likely(__memcg_slab_post_alloc_hook(s, lru, flags, size, p)))
+ return true;
- objcgs[off] = NULL;
- obj_cgroup_uncharge(objcg, obj_full_size(s));
- mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
- -obj_full_size(s));
- obj_cgroup_put(objcg);
+ if (likely(size == 1)) {
+ memcg_alloc_abort_single(s, *p);
+ *p = NULL;
+ } else {
+ kmem_cache_free_bulk(s, size, p);
}
+
+ return false;
}
static __fastpath_inline
void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
int objects)
{
- struct obj_cgroup **objcgs;
+ struct slabobj_ext *obj_exts;
if (!memcg_kmem_online())
return;
- objcgs = slab_objcgs(slab);
- if (likely(!objcgs))
+ obj_exts = slab_obj_exts(slab);
+ if (likely(!obj_exts))
return;
- __memcg_slab_free_hook(s, slab, p, objects, objcgs);
-}
-
-static inline
-void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects,
- struct obj_cgroup *objcg)
-{
- if (objcg)
- obj_cgroup_uncharge(objcg, objects * obj_full_size(s));
+ __memcg_slab_free_hook(s, slab, p, objects, obj_exts);
}
#else /* CONFIG_MEMCG_KMEM */
-static inline void memcg_free_slab_cgroups(struct slab *slab)
-{
-}
-
-static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
- struct list_lru *lru,
- struct obj_cgroup **objcgp,
- size_t objects, gfp_t flags)
-{
- return true;
-}
-
-static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
- struct obj_cgroup *objcg,
+static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s,
+ struct list_lru *lru,
gfp_t flags, size_t size,
void **p)
{
+ return true;
}
static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects)
{
}
-
-static inline
-void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects,
- struct obj_cgroup *objcg)
-{
-}
#endif /* CONFIG_MEMCG_KMEM */
/*
@@ -2106,9 +2180,9 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init)
return !kasan_slab_free(s, x, init);
}
-static inline bool slab_free_freelist_hook(struct kmem_cache *s,
- void **head, void **tail,
- int *cnt)
+static __fastpath_inline
+bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail,
+ int *cnt)
{
void *object;
@@ -2298,7 +2372,7 @@ static __always_inline void account_slab(struct slab *slab, int order,
struct kmem_cache *s, gfp_t gfp)
{
if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
- memcg_alloc_slab_cgroups(slab, s, gfp, true);
+ alloc_slab_obj_exts(slab, s, gfp, true);
mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
PAGE_SIZE << order);
@@ -2307,8 +2381,8 @@ static __always_inline void account_slab(struct slab *slab, int order,
static __always_inline void unaccount_slab(struct slab *slab, int order,
struct kmem_cache *s)
{
- if (memcg_kmem_online())
- memcg_free_slab_cgroups(slab);
+ if (memcg_kmem_online() || need_slab_obj_ext())
+ free_slab_obj_exts(slab);
mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
-(PAGE_SIZE << order));
@@ -3736,10 +3810,7 @@ noinline int should_failslab(struct kmem_cache *s, gfp_t gfpflags)
ALLOW_ERROR_INJECTION(should_failslab, ERRNO);
static __fastpath_inline
-struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
- struct list_lru *lru,
- struct obj_cgroup **objcgp,
- size_t size, gfp_t flags)
+struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
{
flags &= gfp_allowed_mask;
@@ -3748,18 +3819,16 @@ struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
if (unlikely(should_failslab(s, flags)))
return NULL;
- if (unlikely(!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags)))
- return NULL;
-
return s;
}
static __fastpath_inline
-void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg,
+bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
gfp_t flags, size_t size, void **p, bool init,
unsigned int orig_size)
{
unsigned int zero_size = s->object_size;
+ struct slabobj_ext *obj_exts;
bool kasan_init = init;
size_t i;
gfp_t init_flags = flags & gfp_allowed_mask;
@@ -3802,9 +3871,21 @@ void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg,
kmemleak_alloc_recursive(p[i], s->object_size, 1,
s->flags, init_flags);
kmsan_slab_alloc(s, p[i], init_flags);
+ if (need_slab_obj_ext()) {
+ obj_exts = prepare_slab_obj_exts_hook(s, flags, p[i]);
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+ /*
+ * Currently obj_exts is used only for allocation profiling.
+ * If other users appear then mem_alloc_profiling_enabled()
+ * check should be added before alloc_tag_add().
+ */
+ if (likely(obj_exts))
+ alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
+#endif
+ }
}
- memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
+ return memcg_slab_post_alloc_hook(s, lru, flags, size, p);
}
/*
@@ -3821,10 +3902,9 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
void *object;
- struct obj_cgroup *objcg = NULL;
bool init = false;
- s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags);
+ s = slab_pre_alloc_hook(s, gfpflags);
if (unlikely(!s))
return NULL;
@@ -3841,13 +3921,15 @@ out:
/*
* When init equals 'true', like for kzalloc() family, only
* @orig_size bytes might be zeroed instead of s->object_size
+ * In case this fails due to memcg_slab_post_alloc_hook(),
+ * object is set to NULL
*/
- slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size);
+ slab_post_alloc_hook(s, lru, gfpflags, 1, &object, init, orig_size);
return object;
}
-void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
+void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags)
{
void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_,
s->object_size);
@@ -3856,9 +3938,9 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc);
+EXPORT_SYMBOL(kmem_cache_alloc_noprof);
-void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
+void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
gfp_t gfpflags)
{
void *ret = slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, _RET_IP_,
@@ -3868,7 +3950,7 @@ void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_lru);
+EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof);
/**
* kmem_cache_alloc_node - Allocate an object on the specified node
@@ -3883,7 +3965,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_lru);
*
* Return: pointer to the new object or %NULL in case of error
*/
-void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
+void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node)
{
void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
@@ -3891,7 +3973,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_node);
+EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
/*
* To avoid unnecessary overhead, we pass through large allocation requests
@@ -3908,7 +3990,7 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node)
flags = kmalloc_fix_flags(flags);
flags |= __GFP_COMP;
- folio = (struct folio *)alloc_pages_node(node, flags, order);
+ folio = (struct folio *)alloc_pages_node_noprof(node, flags, order);
if (folio) {
ptr = folio_address(folio);
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
@@ -3923,7 +4005,7 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node)
return ptr;
}
-void *kmalloc_large(size_t size, gfp_t flags)
+void *kmalloc_large_noprof(size_t size, gfp_t flags)
{
void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE);
@@ -3931,9 +4013,9 @@ void *kmalloc_large(size_t size, gfp_t flags)
flags, NUMA_NO_NODE);
return ret;
}
-EXPORT_SYMBOL(kmalloc_large);
+EXPORT_SYMBOL(kmalloc_large_noprof);
-void *kmalloc_large_node(size_t size, gfp_t flags, int node)
+void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node)
{
void *ret = __kmalloc_large_node(size, flags, node);
@@ -3941,7 +4023,7 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node)
flags, node);
return ret;
}
-EXPORT_SYMBOL(kmalloc_large_node);
+EXPORT_SYMBOL(kmalloc_large_node_noprof);
static __always_inline
void *__do_kmalloc_node(size_t size, gfp_t flags, int node,
@@ -3968,26 +4050,26 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node,
return ret;
}
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
+void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node)
{
return __do_kmalloc_node(size, flags, node, _RET_IP_);
}
-EXPORT_SYMBOL(__kmalloc_node);
+EXPORT_SYMBOL(__kmalloc_node_noprof);
-void *__kmalloc(size_t size, gfp_t flags)
+void *__kmalloc_noprof(size_t size, gfp_t flags)
{
return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_);
}
-EXPORT_SYMBOL(__kmalloc);
+EXPORT_SYMBOL(__kmalloc_noprof);
-void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
- int node, unsigned long caller)
+void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags,
+ int node, unsigned long caller)
{
return __do_kmalloc_node(size, flags, node, caller);
}
-EXPORT_SYMBOL(__kmalloc_node_track_caller);
+EXPORT_SYMBOL(kmalloc_node_track_caller_noprof);
-void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
+void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE,
_RET_IP_, size);
@@ -3997,9 +4079,9 @@ void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
ret = kasan_kmalloc(s, ret, size, gfpflags);
return ret;
}
-EXPORT_SYMBOL(kmalloc_trace);
+EXPORT_SYMBOL(kmalloc_trace_noprof);
-void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
+void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags,
int node, size_t size)
{
void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
@@ -4009,7 +4091,7 @@ void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
ret = kasan_kmalloc(s, ret, size, gfpflags);
return ret;
}
-EXPORT_SYMBOL(kmalloc_node_trace);
+EXPORT_SYMBOL(kmalloc_node_trace_noprof);
static noinline void free_to_partial_list(
struct kmem_cache *s, struct slab *slab,
@@ -4276,16 +4358,28 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
unsigned long addr)
{
memcg_slab_free_hook(s, slab, &object, 1);
+ alloc_tagging_slab_free_hook(s, slab, &object, 1);
if (likely(slab_free_hook(s, object, slab_want_init_on_free(s))))
do_slab_free(s, slab, object, object, 1, addr);
}
+#ifdef CONFIG_MEMCG_KMEM
+/* Do not inline the rare memcg charging failed path into the allocation path */
+static noinline
+void memcg_alloc_abort_single(struct kmem_cache *s, void *object)
+{
+ if (likely(slab_free_hook(s, object, slab_want_init_on_free(s))))
+ do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_);
+}
+#endif
+
static __fastpath_inline
void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head,
void *tail, void **p, int cnt, unsigned long addr)
{
memcg_slab_free_hook(s, slab, p, cnt);
+ alloc_tagging_slab_free_hook(s, slab, p, cnt);
/*
* With KASAN enabled slab_free_freelist_hook modifies the freelist
* to remove objects, whose reuse must be delayed.
@@ -4612,36 +4706,33 @@ error:
#endif /* CONFIG_SLUB_TINY */
/* Note that interrupts must be enabled when calling this function. */
-int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
- void **p)
+int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
{
int i;
- struct obj_cgroup *objcg = NULL;
if (!size)
return 0;
- /* memcg and kmem_cache debug support */
- s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags);
+ s = slab_pre_alloc_hook(s, flags);
if (unlikely(!s))
return 0;
i = __kmem_cache_alloc_bulk(s, flags, size, p);
+ if (unlikely(i == 0))
+ return 0;
/*
* memcg and kmem_cache debug support and memory initialization.
* Done outside of the IRQ disabled fastpath loop.
*/
- if (likely(i != 0)) {
- slab_post_alloc_hook(s, objcg, flags, size, p,
- slab_want_init_on_alloc(flags, s), s->object_size);
- } else {
- memcg_slab_alloc_error_hook(s, size, objcg);
+ if (unlikely(!slab_post_alloc_hook(s, NULL, flags, size, p,
+ slab_want_init_on_alloc(flags, s), s->object_size))) {
+ return 0;
}
-
return i;
}
-EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof);
/*
@@ -5630,7 +5721,8 @@ void __init kmem_cache_init(void)
node_set(node, slab_nodes);
create_boot_cache(kmem_cache_node, "kmem_cache_node",
- sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
+ sizeof(struct kmem_cache_node),
+ SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
@@ -5640,7 +5732,7 @@ void __init kmem_cache_init(void)
create_boot_cache(kmem_cache, "kmem_cache",
offsetof(struct kmem_cache, node) +
nr_node_ids * sizeof(struct kmem_cache_node *),
- SLAB_HWCACHE_ALIGN, 0, 0);
+ SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
kmem_cache = bootstrap(&boot_kmem_cache);
kmem_cache_node = bootstrap(&boot_kmem_cache_node);
diff --git a/mm/sparse.c b/mm/sparse.c
index aed0951b87fa04..46e88549d1a6a9 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -226,19 +226,6 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en
{
unsigned long pfn;
-#ifdef CONFIG_SPARSEMEM_EXTREME
- if (unlikely(!mem_section)) {
- unsigned long size, align;
-
- size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
- align = 1 << (INTERNODE_CACHE_SHIFT);
- mem_section = memblock_alloc(size, align);
- if (!mem_section)
- panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
- __func__, size, align);
- }
-#endif
-
start &= PAGE_SECTION_MASK;
mminit_validate_memmodel_limits(&start, &end);
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
@@ -267,6 +254,19 @@ static void __init memblocks_present(void)
unsigned long start, end;
int i, nid;
+#ifdef CONFIG_SPARSEMEM_EXTREME
+ if (unlikely(!mem_section)) {
+ unsigned long size, align;
+
+ size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
+ align = 1 << (INTERNODE_CACHE_SHIFT);
+ mem_section = memblock_alloc(size, align);
+ if (!mem_section)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+ __func__, size, align);
+ }
+#endif
+
for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
memory_present(nid, start, end);
}
diff --git a/mm/swap.c b/mm/swap.c
index 500a09a48dfd3a..8ae5cd4ed180bd 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -112,34 +112,21 @@ static void page_cache_release(struct folio *folio)
unlock_page_lruvec_irqrestore(lruvec, flags);
}
-static void __folio_put_small(struct folio *folio)
+void __folio_put(struct folio *folio)
{
+ if (unlikely(folio_is_zone_device(folio))) {
+ free_zone_device_folio(folio);
+ return;
+ } else if (folio_test_hugetlb(folio)) {
+ free_huge_folio(folio);
+ return;
+ }
+
page_cache_release(folio);
+ if (folio_test_large(folio) && folio_test_large_rmappable(folio))
+ folio_undo_large_rmappable(folio);
mem_cgroup_uncharge(folio);
- free_unref_page(&folio->page, 0);
-}
-
-static void __folio_put_large(struct folio *folio)
-{
- /*
- * __page_cache_release() is supposed to be called for thp, not for
- * hugetlb. This is because hugetlb page does never have PageLRU set
- * (it's never listed to any LRU lists) and no memcg routines should
- * be called for hugetlb (it has a separate hugetlb_cgroup.)
- */
- if (!folio_test_hugetlb(folio))
- page_cache_release(folio);
- destroy_large_folio(folio);
-}
-
-void __folio_put(struct folio *folio)
-{
- if (unlikely(folio_is_zone_device(folio)))
- free_zone_device_page(&folio->page);
- else if (unlikely(folio_test_large(folio)))
- __folio_put_large(folio);
- else
- __folio_put_small(folio);
+ free_unref_page(&folio->page, folio_order(folio));
}
EXPORT_SYMBOL(__folio_put);
@@ -158,8 +145,8 @@ void put_pages_list(struct list_head *pages)
list_for_each_entry_safe(folio, next, pages, lru) {
if (!folio_put_testzero(folio))
continue;
- if (folio_test_large(folio)) {
- __folio_put_large(folio);
+ if (folio_test_hugetlb(folio)) {
+ free_huge_folio(folio);
continue;
}
/* LRU flag must be clear because it's passed using the lru */
@@ -985,7 +972,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
struct folio *folio = folios->folios[i];
unsigned int nr_refs = refs ? refs[i] : 1;
- if (is_huge_zero_page(&folio->page))
+ if (is_huge_zero_folio(folio))
continue;
if (folio_is_zone_device(folio)) {
@@ -996,7 +983,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
if (put_devmap_managed_page_refs(&folio->page, nr_refs))
continue;
if (folio_ref_sub_and_test(folio, nr_refs))
- free_zone_device_page(&folio->page);
+ free_zone_device_folio(folio);
continue;
}
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 90973ce7881db2..13ab3b771409c3 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -264,7 +264,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
cache->cur = 0;
if (swap_slot_cache_active)
cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
- cache->slots, 1);
+ cache->slots, 0);
return cache->nr;
}
@@ -310,8 +310,8 @@ swp_entry_t folio_alloc_swap(struct folio *folio)
entry.val = 0;
if (folio_test_large(folio)) {
- if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported())
- get_swap_pages(1, &entry, folio_nr_pages(folio));
+ if (IS_ENABLED(CONFIG_THP_SWAP))
+ get_swap_pages(1, &entry, folio_order(folio));
goto out;
}
@@ -343,7 +343,7 @@ repeat:
goto out;
}
- get_swap_pages(1, &entry, 1);
+ get_swap_pages(1, &entry, 0);
out:
if (mem_cgroup_try_charge_swap(folio, entry)) {
put_swap_folio(folio, entry);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index bfc7e8c58a6d34..642c30d8376c60 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -73,11 +73,11 @@ void *get_shadow_from_swap_cache(swp_entry_t entry)
{
struct address_space *address_space = swap_address_space(entry);
pgoff_t idx = swp_offset(entry);
- struct page *page;
+ void *shadow;
- page = xa_load(&address_space->i_pages, idx);
- if (xa_is_value(page))
- return page;
+ shadow = xa_load(&address_space->i_pages, idx);
+ if (xa_is_value(shadow))
+ return shadow;
return NULL;
}
@@ -301,7 +301,7 @@ void free_page_and_swap_cache(struct page *page)
struct folio *folio = page_folio(page);
free_swap_cache(folio);
- if (!is_huge_zero_page(page))
+ if (!is_huge_zero_folio(folio))
folio_put(folio);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4919423cce76a3..148ef08f19dd66 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -130,7 +130,11 @@ static inline unsigned char swap_count(unsigned char ent)
/* Reclaim the swap entry if swap is getting full*/
#define TTRS_FULL 0x4
-/* returns 1 if swap entry is freed */
+/*
+ * returns number of pages in the folio that backs the swap entry. If positive,
+ * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
+ * folio was associated with the swap entry.
+ */
static int __try_to_reclaim_swap(struct swap_info_struct *si,
unsigned long offset, unsigned long flags)
{
@@ -155,6 +159,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
ret = folio_free_swap(folio);
folio_unlock(folio);
}
+ ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio);
folio_put(folio);
return ret;
}
@@ -273,15 +278,15 @@ static void discard_swap_cluster(struct swap_info_struct *si,
#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER HPAGE_PMD_NR
-#define swap_entry_size(size) (size)
+#define swap_entry_order(order) (order)
#else
#define SWAPFILE_CLUSTER 256
/*
- * Define swap_entry_size() as constant to let compiler to optimize
+ * Define swap_entry_order() as constant to let compiler to optimize
* out some code if !CONFIG_THP_SWAP
*/
-#define swap_entry_size(size) 1
+#define swap_entry_order(order) 0
#endif
#define LATENCY_LIMIT 256
@@ -343,18 +348,6 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
info->data = 0;
}
-static inline bool cluster_is_huge(struct swap_cluster_info *info)
-{
- if (IS_ENABLED(CONFIG_THP_SWAP))
- return info->flags & CLUSTER_FLAG_HUGE;
- return false;
-}
-
-static inline void cluster_clear_huge(struct swap_cluster_info *info)
-{
- info->flags &= ~CLUSTER_FLAG_HUGE;
-}
-
static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
unsigned long offset)
{
@@ -558,10 +551,12 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx)
/*
* The cluster corresponding to page_nr will be used. The cluster will be
- * removed from free cluster list and its usage counter will be increased.
+ * removed from free cluster list and its usage counter will be increased by
+ * count.
*/
-static void inc_cluster_info_page(struct swap_info_struct *p,
- struct swap_cluster_info *cluster_info, unsigned long page_nr)
+static void add_cluster_info_page(struct swap_info_struct *p,
+ struct swap_cluster_info *cluster_info, unsigned long page_nr,
+ unsigned long count)
{
unsigned long idx = page_nr / SWAPFILE_CLUSTER;
@@ -570,9 +565,19 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
if (cluster_is_free(&cluster_info[idx]))
alloc_cluster(p, idx);
- VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
+ VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER);
cluster_set_count(&cluster_info[idx],
- cluster_count(&cluster_info[idx]) + 1);
+ cluster_count(&cluster_info[idx]) + count);
+}
+
+/*
+ * The cluster corresponding to page_nr will be used. The cluster will be
+ * removed from free cluster list and its usage counter will be increased by 1.
+ */
+static void inc_cluster_info_page(struct swap_info_struct *p,
+ struct swap_cluster_info *cluster_info, unsigned long page_nr)
+{
+ add_cluster_info_page(p, cluster_info, page_nr, 1);
}
/*
@@ -602,7 +607,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
*/
static bool
scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
- unsigned long offset)
+ unsigned long offset, int order)
{
struct percpu_cluster *percpu_cluster;
bool conflict;
@@ -616,27 +621,42 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
return false;
percpu_cluster = this_cpu_ptr(si->percpu_cluster);
- cluster_set_null(&percpu_cluster->index);
+ percpu_cluster->next[order] = SWAP_NEXT_INVALID;
+ return true;
+}
+
+static inline bool swap_range_empty(char *swap_map, unsigned int start,
+ unsigned int nr_pages)
+{
+ unsigned int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ if (swap_map[start + i])
+ return false;
+ }
+
return true;
}
/*
- * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
- * might involve allocating a new cluster for current CPU too.
+ * Try to get swap entries with specified order from current cpu's swap entry
+ * pool (a cluster). This might involve allocating a new cluster for current CPU
+ * too.
*/
static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
- unsigned long *offset, unsigned long *scan_base)
+ unsigned long *offset, unsigned long *scan_base, int order)
{
+ unsigned int nr_pages = 1 << order;
struct percpu_cluster *cluster;
struct swap_cluster_info *ci;
- unsigned long tmp, max;
+ unsigned int tmp, max;
new_cluster:
cluster = this_cpu_ptr(si->percpu_cluster);
- if (cluster_is_null(&cluster->index)) {
+ tmp = cluster->next[order];
+ if (tmp == SWAP_NEXT_INVALID) {
if (!cluster_list_empty(&si->free_clusters)) {
- cluster->index = si->free_clusters.head;
- cluster->next = cluster_next(&cluster->index) *
+ tmp = cluster_next(&si->free_clusters.head) *
SWAPFILE_CLUSTER;
} else if (!cluster_list_empty(&si->discard_clusters)) {
/*
@@ -654,27 +674,27 @@ new_cluster:
/*
* Other CPUs can use our cluster if they can't find a free cluster,
- * check if there is still free entry in the cluster
+ * check if there is still free entry in the cluster, maintaining
+ * natural alignment.
*/
- tmp = cluster->next;
- max = min_t(unsigned long, si->max,
- (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
+ max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER));
if (tmp < max) {
ci = lock_cluster(si, tmp);
while (tmp < max) {
- if (!si->swap_map[tmp])
+ if (swap_range_empty(si->swap_map, tmp, nr_pages))
break;
- tmp++;
+ tmp += nr_pages;
}
unlock_cluster(ci);
}
if (tmp >= max) {
- cluster_set_null(&cluster->index);
+ cluster->next[order] = SWAP_NEXT_INVALID;
goto new_cluster;
}
- cluster->next = tmp + 1;
*offset = tmp;
*scan_base = tmp;
+ tmp += nr_pages;
+ cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID;
return true;
}
@@ -804,13 +824,14 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si,
static int scan_swap_map_slots(struct swap_info_struct *si,
unsigned char usage, int nr,
- swp_entry_t slots[])
+ swp_entry_t slots[], int order)
{
struct swap_cluster_info *ci;
unsigned long offset;
unsigned long scan_base;
unsigned long last_in_cluster = 0;
int latency_ration = LATENCY_LIMIT;
+ unsigned int nr_pages = 1 << order;
int n_ret = 0;
bool scanned_many = false;
@@ -825,6 +846,25 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
* And we let swap pages go all over an SSD partition. Hugh
*/
+ if (order > 0) {
+ /*
+ * Should not even be attempting large allocations when huge
+ * page swap is disabled. Warn and fail the allocation.
+ */
+ if (!IS_ENABLED(CONFIG_THP_SWAP) ||
+ nr_pages > SWAPFILE_CLUSTER) {
+ VM_WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ /*
+ * Swapfile is not block device or not using clusters so unable
+ * to allocate large entries.
+ */
+ if (!(si->flags & SWP_BLKDEV) || !si->cluster_info)
+ return 0;
+ }
+
si->flags += SWP_SCANNING;
/*
* Use percpu scan base for SSD to reduce lock contention on
@@ -839,8 +879,11 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
/* SSD algorithm */
if (si->cluster_info) {
- if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+ if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) {
+ if (order > 0)
+ goto no_page;
goto scan;
+ }
} else if (unlikely(!si->cluster_nr--)) {
if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
si->cluster_nr = SWAPFILE_CLUSTER - 1;
@@ -882,13 +925,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
checks:
if (si->cluster_info) {
- while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
+ while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) {
/* take a break if we already got some slots */
if (n_ret)
goto done;
if (!scan_swap_map_try_ssd_cluster(si, &offset,
- &scan_base))
+ &scan_base, order)) {
+ if (order > 0)
+ goto no_page;
goto scan;
+ }
}
}
if (!(si->flags & SWP_WRITEOK))
@@ -907,7 +953,7 @@ checks:
swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
spin_lock(&si->lock);
/* entry was freed successfully, try to use this again */
- if (swap_was_freed)
+ if (swap_was_freed > 0)
goto checks;
goto scan; /* check next one */
}
@@ -919,11 +965,11 @@ checks:
else
goto done;
}
- WRITE_ONCE(si->swap_map[offset], usage);
- inc_cluster_info_page(si, si->cluster_info, offset);
+ memset(si->swap_map + offset, usage, nr_pages);
+ add_cluster_info_page(si, si->cluster_info, offset, nr_pages);
unlock_cluster(ci);
- swap_range_alloc(si, offset, 1);
+ swap_range_alloc(si, offset, nr_pages);
slots[n_ret++] = swp_entry(si->type, offset);
/* got enough slots or reach max slots? */
@@ -944,8 +990,10 @@ checks:
/* try to get more slots in cluster */
if (si->cluster_info) {
- if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+ if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order))
goto checks;
+ if (order > 0)
+ goto done;
} else if (si->cluster_nr && !si->swap_map[++offset]) {
/* non-ssd case, still more slots in cluster? */
--si->cluster_nr;
@@ -972,11 +1020,13 @@ checks:
}
done:
- set_cluster_next(si, offset + 1);
+ if (order == 0)
+ set_cluster_next(si, offset + 1);
si->flags -= SWP_SCANNING;
return n_ret;
scan:
+ VM_WARN_ON(order > 0);
spin_unlock(&si->lock);
while (++offset <= READ_ONCE(si->highest_bit)) {
if (unlikely(--latency_ration < 0)) {
@@ -1005,38 +1055,6 @@ no_page:
return n_ret;
}
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
-{
- unsigned long idx;
- struct swap_cluster_info *ci;
- unsigned long offset;
-
- /*
- * Should not even be attempting cluster allocations when huge
- * page swap is disabled. Warn and fail the allocation.
- */
- if (!IS_ENABLED(CONFIG_THP_SWAP)) {
- VM_WARN_ON_ONCE(1);
- return 0;
- }
-
- if (cluster_list_empty(&si->free_clusters))
- return 0;
-
- idx = cluster_list_first(&si->free_clusters);
- offset = idx * SWAPFILE_CLUSTER;
- ci = lock_cluster(si, offset);
- alloc_cluster(si, idx);
- cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
-
- memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER);
- unlock_cluster(ci);
- swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
- *slot = swp_entry(si->type, offset);
-
- return 1;
-}
-
static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
{
unsigned long offset = idx * SWAPFILE_CLUSTER;
@@ -1050,17 +1068,15 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
swap_range_free(si, offset, SWAPFILE_CLUSTER);
}
-int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
{
- unsigned long size = swap_entry_size(entry_size);
+ int order = swap_entry_order(entry_order);
+ unsigned long size = 1 << order;
struct swap_info_struct *si, *next;
long avail_pgs;
int n_ret = 0;
int node;
- /* Only single cluster request supported */
- WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
-
spin_lock(&swap_avail_lock);
avail_pgs = atomic_long_read(&nr_swap_pages) / size;
@@ -1096,14 +1112,10 @@ start_over:
spin_unlock(&si->lock);
goto nextsi;
}
- if (size == SWAPFILE_CLUSTER) {
- if (si->flags & SWP_BLKDEV)
- n_ret = swap_alloc_cluster(si, swp_entries);
- } else
- n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
- n_goal, swp_entries);
+ n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+ n_goal, swp_entries, order);
spin_unlock(&si->lock);
- if (n_ret || size == SWAPFILE_CLUSTER)
+ if (n_ret || size > 1)
goto check_out;
cond_resched();
@@ -1357,7 +1369,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
unsigned char *map;
unsigned int i, free_entries = 0;
unsigned char val;
- int size = swap_entry_size(folio_nr_pages(folio));
+ int size = 1 << swap_entry_order(folio_order(folio));
si = _swap_info_get(entry);
if (!si)
@@ -1365,7 +1377,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
ci = lock_cluster_or_swap_info(si, offset);
if (size == SWAPFILE_CLUSTER) {
- VM_BUG_ON(!cluster_is_huge(ci));
map = si->swap_map + offset;
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
val = map[i];
@@ -1373,7 +1384,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
if (val == SWAP_HAS_CACHE)
free_entries++;
}
- cluster_clear_huge(ci);
if (free_entries == SWAPFILE_CLUSTER) {
unlock_cluster_or_swap_info(si, ci);
spin_lock(&si->lock);
@@ -1395,23 +1405,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
unlock_cluster_or_swap_info(si, ci);
}
-#ifdef CONFIG_THP_SWAP
-int split_swap_cluster(swp_entry_t entry)
-{
- struct swap_info_struct *si;
- struct swap_cluster_info *ci;
- unsigned long offset = swp_offset(entry);
-
- si = _swap_info_get(entry);
- if (!si)
- return -EBUSY;
- ci = lock_cluster(si, offset);
- cluster_clear_huge(ci);
- unlock_cluster(ci);
- return 0;
-}
-#endif
-
static int swp_entry_cmp(const void *ent1, const void *ent2)
{
const swp_entry_t *e1 = ent1, *e2 = ent2;
@@ -1519,22 +1512,23 @@ out:
}
static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
- swp_entry_t entry)
+ swp_entry_t entry, int order)
{
struct swap_cluster_info *ci;
unsigned char *map = si->swap_map;
+ unsigned int nr_pages = 1 << order;
unsigned long roffset = swp_offset(entry);
- unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
+ unsigned long offset = round_down(roffset, nr_pages);
int i;
bool ret = false;
ci = lock_cluster_or_swap_info(si, offset);
- if (!ci || !cluster_is_huge(ci)) {
+ if (!ci || nr_pages == 1) {
if (swap_count(map[roffset]))
ret = true;
goto unlock_out;
}
- for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+ for (i = 0; i < nr_pages; i++) {
if (swap_count(map[offset + i])) {
ret = true;
break;
@@ -1556,7 +1550,7 @@ static bool folio_swapped(struct folio *folio)
if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
return swap_swapcount(si, entry) != 0;
- return swap_page_trans_huge_swapped(si, entry);
+ return swap_page_trans_huge_swapped(si, entry, folio_order(folio));
}
/**
@@ -1602,33 +1596,88 @@ bool folio_free_swap(struct folio *folio)
return true;
}
-/*
- * Free the swap entry like above, but also try to
- * free the page cache entry if it is the last user.
+/**
+ * free_swap_and_cache_nr() - Release reference on range of swap entries and
+ * reclaim their cache if no more references remain.
+ * @entry: First entry of range.
+ * @nr: Number of entries in range.
+ *
+ * For each swap entry in the contiguous range, release a reference. If any swap
+ * entries become free, try to reclaim their underlying folios, if present. The
+ * offset range is defined by [entry.offset, entry.offset + nr).
*/
-int free_swap_and_cache(swp_entry_t entry)
+void free_swap_and_cache_nr(swp_entry_t entry, int nr)
{
- struct swap_info_struct *p;
+ const unsigned long start_offset = swp_offset(entry);
+ const unsigned long end_offset = start_offset + nr;
+ unsigned int type = swp_type(entry);
+ struct swap_info_struct *si;
+ bool any_only_cache = false;
+ unsigned long offset;
unsigned char count;
if (non_swap_entry(entry))
- return 1;
+ return;
- p = get_swap_device(entry);
- if (p) {
- if (WARN_ON(data_race(!p->swap_map[swp_offset(entry)]))) {
- put_swap_device(p);
- return 0;
+ si = get_swap_device(entry);
+ if (!si)
+ return;
+
+ if (WARN_ON(end_offset > si->max))
+ goto out;
+
+ /*
+ * First free all entries in the range.
+ */
+ for (offset = start_offset; offset < end_offset; offset++) {
+ if (data_race(si->swap_map[offset])) {
+ count = __swap_entry_free(si, swp_entry(type, offset));
+ if (count == SWAP_HAS_CACHE)
+ any_only_cache = true;
+ } else {
+ WARN_ON_ONCE(1);
}
+ }
- count = __swap_entry_free(p, entry);
- if (count == SWAP_HAS_CACHE &&
- !swap_page_trans_huge_swapped(p, entry))
- __try_to_reclaim_swap(p, swp_offset(entry),
+ /*
+ * Short-circuit the below loop if none of the entries had their
+ * reference drop to zero.
+ */
+ if (!any_only_cache)
+ goto out;
+
+ /*
+ * Now go back over the range trying to reclaim the swap cache. This is
+ * more efficient for large folios because we will only try to reclaim
+ * the swap once per folio in the common case. If we do
+ * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
+ * latter will get a reference and lock the folio for every individual
+ * page but will only succeed once the swap slot for every subpage is
+ * zero.
+ */
+ for (offset = start_offset; offset < end_offset; offset += nr) {
+ nr = 1;
+ if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
+ /*
+ * Folios are always naturally aligned in swap so
+ * advance forward to the next boundary. Zero means no
+ * folio was found for the swap entry, so advance by 1
+ * in this case. Negative value means folio was found
+ * but could not be reclaimed. Here we can still advance
+ * to the next boundary.
+ */
+ nr = __try_to_reclaim_swap(si, offset,
TTRS_UNMAPPED | TTRS_FULL);
- put_swap_device(p);
+ if (nr == 0)
+ nr = 1;
+ else if (nr < 0)
+ nr = -nr;
+ nr = ALIGN(offset + 1, nr) - offset;
+ }
}
- return p != NULL;
+
+out:
+ put_swap_device(si);
}
#ifdef CONFIG_HIBERNATION
@@ -1643,7 +1692,7 @@ swp_entry_t get_swap_page_of_type(int type)
/* This is called for allocating swap entry, not cache */
spin_lock(&si->lock);
- if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry))
+ if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
atomic_long_dec(&nr_swap_pages);
spin_unlock(&si->lock);
fail:
@@ -1806,7 +1855,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
* when reading from swap. This metadata may be indexed by swap entry
* so this must be called before swap_free().
*/
- arch_swap_restore(entry, folio);
+ arch_swap_restore(folio_swap(entry, folio), folio);
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
@@ -3097,7 +3146,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->flags |= SWP_SYNCHRONOUS_IO;
if (p->bdev && bdev_nonrot(p->bdev)) {
- int cpu;
+ int cpu, i;
unsigned long ci, nr_cluster;
p->flags |= SWP_SOLIDSTATE;
@@ -3133,8 +3182,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
for_each_possible_cpu(cpu) {
struct percpu_cluster *cluster;
+
cluster = per_cpu_ptr(p->percpu_cluster, cpu);
- cluster_set_null(&cluster->index);
+ for (i = 0; i < SWAP_NR_ORDERS; i++)
+ cluster->next[i] = SWAP_NEXT_INVALID;
}
} else {
atomic_inc(&nr_rotate_swap);
diff --git a/mm/truncate.c b/mm/truncate.c
index 725b150e47ac4c..e99085bf3d34d9 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -764,15 +764,15 @@ EXPORT_SYMBOL(truncate_setsize);
* @from: original inode size
* @to: new inode size
*
- * Handle extension of inode size either caused by extending truncate or by
- * write starting after current i_size. We mark the page straddling current
- * i_size RO so that page_mkwrite() is called on the nearest write access to
- * the page. This way filesystem can be sure that page_mkwrite() is called on
- * the page before user writes to the page via mmap after the i_size has been
- * changed.
+ * Handle extension of inode size either caused by extending truncate or
+ * by write starting after current i_size. We mark the page straddling
+ * current i_size RO so that page_mkwrite() is called on the first
+ * write access to the page. The filesystem will update its per-block
+ * information before user writes to the page via mmap after the i_size
+ * has been changed.
*
* The function must be called after i_size is updated so that page fault
- * coming after we unlock the page will already see the new i_size.
+ * coming after we unlock the folio will already see the new i_size.
* The function must be called while we still hold i_rwsem - this not only
* makes sure i_size is stable but also that userspace cannot observe new
* i_size value before we are prepared to store mmap writes at new inode size.
@@ -781,31 +781,29 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
{
int bsize = i_blocksize(inode);
loff_t rounded_from;
- struct page *page;
- pgoff_t index;
+ struct folio *folio;
WARN_ON(to > inode->i_size);
- if (from >= to || bsize == PAGE_SIZE)
+ if (from >= to || bsize >= PAGE_SIZE)
return;
/* Page straddling @from will not have any hole block created? */
rounded_from = round_up(from, bsize);
if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1)))
return;
- index = from >> PAGE_SHIFT;
- page = find_lock_page(inode->i_mapping, index);
- /* Page not cached? Nothing to do */
- if (!page)
+ folio = filemap_lock_folio(inode->i_mapping, from / PAGE_SIZE);
+ /* Folio not cached? Nothing to do */
+ if (IS_ERR(folio))
return;
/*
- * See clear_page_dirty_for_io() for details why set_page_dirty()
+ * See folio_clear_dirty_for_io() for details why folio_mark_dirty()
* is needed.
*/
- if (page_mkclean(page))
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ if (folio_mkclean(folio))
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
}
EXPORT_SYMBOL(pagecache_isize_extended);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 3c3539c573e7fe..b70618e8dcd248 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1662,9 +1662,9 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
/* Check if we can move the pmd without splitting it. */
if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
!pmd_none(dst_pmdval)) {
- struct folio *folio = pfn_folio(pmd_pfn(*src_pmd));
+ struct folio *folio = pmd_folio(*src_pmd);
- if (!folio || (!is_huge_zero_page(&folio->page) &&
+ if (!folio || (!is_huge_zero_folio(folio) &&
!PageAnonExclusive(&folio->page))) {
spin_unlock(ptl);
err = -EBUSY;
diff --git a/mm/util.c b/mm/util.c
index 669397235787b9..c9e519e6811f55 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -124,16 +124,16 @@ EXPORT_SYMBOL(kstrndup);
* Return: newly allocated copy of @src or %NULL in case of error,
* result is physically contiguous. Use kfree() to free.
*/
-void *kmemdup(const void *src, size_t len, gfp_t gfp)
+void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp)
{
void *p;
- p = kmalloc_track_caller(len, gfp);
+ p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_);
if (p)
memcpy(p, src, len);
return p;
}
-EXPORT_SYMBOL(kmemdup);
+EXPORT_SYMBOL(kmemdup_noprof);
/**
* kmemdup_array - duplicate a given array.
@@ -469,17 +469,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
- mm->get_unmapped_area = arch_get_unmapped_area;
+ clear_bit(MMF_TOPDOWN, &mm->flags);
} else {
mm->mmap_base = mmap_base(random_factor, rlim_stack);
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ set_bit(MMF_TOPDOWN, &mm->flags);
}
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
- mm->get_unmapped_area = arch_get_unmapped_area;
+ clear_bit(MMF_TOPDOWN, &mm->flags);
}
#endif
@@ -609,7 +609,7 @@ EXPORT_SYMBOL(vm_mmap);
*
* Return: pointer to the allocated memory of %NULL in case of failure
*/
-void *kvmalloc_node(size_t size, gfp_t flags, int node)
+void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node)
{
gfp_t kmalloc_flags = flags;
void *ret;
@@ -631,7 +631,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
kmalloc_flags &= ~__GFP_NOFAIL;
}
- ret = kmalloc_node(size, kmalloc_flags, node);
+ ret = kmalloc_node_noprof(size, kmalloc_flags, node);
/*
* It doesn't really make sense to fallback to vmalloc for sub page
@@ -656,11 +656,11 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
* about the resulting pointer, and cannot play
* protection games.
*/
- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
node, __builtin_return_address(0));
}
-EXPORT_SYMBOL(kvmalloc_node);
+EXPORT_SYMBOL(kvmalloc_node_noprof);
/**
* kvfree() - Free memory.
@@ -699,7 +699,7 @@ void kvfree_sensitive(const void *addr, size_t len)
}
EXPORT_SYMBOL(kvfree_sensitive);
-void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
+void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
{
void *newp;
@@ -712,7 +712,7 @@ void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
kvfree(p);
return newp;
}
-EXPORT_SYMBOL(kvrealloc);
+EXPORT_SYMBOL(kvrealloc_noprof);
/**
* __vmalloc_array - allocate memory for a virtually contiguous array.
@@ -720,7 +720,7 @@ EXPORT_SYMBOL(kvrealloc);
* @size: element size.
* @flags: the type of memory to allocate (see kmalloc).
*/
-void *__vmalloc_array(size_t n, size_t size, gfp_t flags)
+void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
{
size_t bytes;
@@ -728,18 +728,18 @@ void *__vmalloc_array(size_t n, size_t size, gfp_t flags)
return NULL;
return __vmalloc(bytes, flags);
}
-EXPORT_SYMBOL(__vmalloc_array);
+EXPORT_SYMBOL(__vmalloc_array_noprof);
/**
* vmalloc_array - allocate memory for a virtually contiguous array.
* @n: number of elements.
* @size: element size.
*/
-void *vmalloc_array(size_t n, size_t size)
+void *vmalloc_array_noprof(size_t n, size_t size)
{
return __vmalloc_array(n, size, GFP_KERNEL);
}
-EXPORT_SYMBOL(vmalloc_array);
+EXPORT_SYMBOL(vmalloc_array_noprof);
/**
* __vcalloc - allocate and zero memory for a virtually contiguous array.
@@ -747,22 +747,22 @@ EXPORT_SYMBOL(vmalloc_array);
* @size: element size.
* @flags: the type of memory to allocate (see kmalloc).
*/
-void *__vcalloc(size_t n, size_t size, gfp_t flags)
+void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags)
{
return __vmalloc_array(n, size, flags | __GFP_ZERO);
}
-EXPORT_SYMBOL(__vcalloc);
+EXPORT_SYMBOL(__vcalloc_noprof);
/**
* vcalloc - allocate and zero memory for a virtually contiguous array.
* @n: number of elements.
* @size: element size.
*/
-void *vcalloc(size_t n, size_t size)
+void *vcalloc_noprof(size_t n, size_t size)
{
return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO);
}
-EXPORT_SYMBOL(vcalloc);
+EXPORT_SYMBOL(vcalloc_noprof);
struct anon_vma *folio_anon_vma(struct folio *folio)
{
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 125427cbdb87bc..a9fedb16ec17eb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1926,15 +1926,25 @@ node_alloc(unsigned long size, unsigned long align,
return va;
}
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
+ struct vmap_area *va, unsigned long flags, const void *caller)
+{
+ vm->flags = flags;
+ vm->addr = (void *)va->va_start;
+ vm->size = va->va_end - va->va_start;
+ vm->caller = caller;
+ va->vm = vm;
+}
+
/*
* Allocate a region of KVA of the specified size and alignment, within the
- * vstart and vend.
+ * vstart and vend. If vm is passed in, the two will also be bound.
*/
static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask,
- unsigned long va_flags)
+ unsigned long va_flags, struct vm_struct *vm)
{
struct vmap_node *vn;
struct vmap_area *va;
@@ -1997,6 +2007,12 @@ retry:
va->vm = NULL;
va->flags = (va_flags | vn_id);
+ if (vm) {
+ vm->addr = (void *)va->va_start;
+ vm->size = va->va_end - va->va_start;
+ va->vm = vm;
+ }
+
vn = addr_to_node(va->va_start);
spin_lock(&vn->busy.lock);
@@ -2574,7 +2590,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
VMALLOC_START, VMALLOC_END,
node, gfp_mask,
- VMAP_RAM|VMAP_BLOCK);
+ VMAP_RAM|VMAP_BLOCK, NULL);
if (IS_ERR(va)) {
kfree(vb);
return ERR_CAST(va);
@@ -2931,7 +2947,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
struct vmap_area *va;
va = alloc_vmap_area(size, PAGE_SIZE,
VMALLOC_START, VMALLOC_END,
- node, GFP_KERNEL, VMAP_RAM);
+ node, GFP_KERNEL, VMAP_RAM,
+ NULL);
if (IS_ERR(va))
return NULL;
@@ -3034,26 +3051,6 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}
-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
- struct vmap_area *va, unsigned long flags, const void *caller)
-{
- vm->flags = flags;
- vm->addr = (void *)va->va_start;
- vm->size = va->va_end - va->va_start;
- vm->caller = caller;
- va->vm = vm;
-}
-
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- struct vmap_node *vn = addr_to_node(va->va_start);
-
- spin_lock(&vn->busy.lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vn->busy.lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -3090,14 +3087,15 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
if (!(flags & VM_NO_GUARD))
size += PAGE_SIZE;
- va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
+ area->flags = flags;
+ area->caller = caller;
+
+ va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
if (IS_ERR(va)) {
kfree(area);
return NULL;
}
- setup_vmalloc_vm(area, va, flags, caller);
-
/*
* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
* best-effort approach, as they can be mapped outside of vmalloc code.
@@ -3523,12 +3521,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* but mempolicy wants to alloc memory by interleaving.
*/
if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
- nr = alloc_pages_bulk_array_mempolicy(bulk_gfp,
+ nr = alloc_pages_bulk_array_mempolicy_noprof(bulk_gfp,
nr_pages_request,
pages + nr_allocated);
else
- nr = alloc_pages_bulk_array_node(bulk_gfp, nid,
+ nr = alloc_pages_bulk_array_node_noprof(bulk_gfp, nid,
nr_pages_request,
pages + nr_allocated);
@@ -3558,9 +3556,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
break;
if (nid == NUMA_NO_NODE)
- page = alloc_pages(alloc_gfp, order);
+ page = alloc_pages_noprof(alloc_gfp, order);
else
- page = alloc_pages_node(nid, alloc_gfp, order);
+ page = alloc_pages_node_noprof(nid, alloc_gfp, order);
if (unlikely(!page)) {
if (!nofail)
break;
@@ -3617,10 +3615,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
- area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
+ area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node,
area->caller);
} else {
- area->pages = kmalloc_node(array_size, nested_gfp, node);
+ area->pages = kmalloc_node_noprof(array_size, nested_gfp, node);
}
if (!area->pages) {
@@ -3730,7 +3728,7 @@ fail:
*
* Return: the address of the area or %NULL on failure
*/
-void *__vmalloc_node_range(unsigned long size, unsigned long align,
+void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
const void *caller)
@@ -3877,10 +3875,10 @@ fail:
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *__vmalloc_node(unsigned long size, unsigned long align,
+void *__vmalloc_node_noprof(unsigned long size, unsigned long align,
gfp_t gfp_mask, int node, const void *caller)
{
- return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+ return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END,
gfp_mask, PAGE_KERNEL, 0, node, caller);
}
/*
@@ -3889,15 +3887,15 @@ void *__vmalloc_node(unsigned long size, unsigned long align,
* than that.
*/
#ifdef CONFIG_TEST_VMALLOC_MODULE
-EXPORT_SYMBOL_GPL(__vmalloc_node);
+EXPORT_SYMBOL_GPL(__vmalloc_node_noprof);
#endif
-void *__vmalloc(unsigned long size, gfp_t gfp_mask)
+void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
{
- return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
+ return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE,
__builtin_return_address(0));
}
-EXPORT_SYMBOL(__vmalloc);
+EXPORT_SYMBOL(__vmalloc_noprof);
/**
* vmalloc - allocate virtually contiguous memory
@@ -3911,12 +3909,12 @@ EXPORT_SYMBOL(__vmalloc);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc(unsigned long size)
+void *vmalloc_noprof(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
+ return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE,
__builtin_return_address(0));
}
-EXPORT_SYMBOL(vmalloc);
+EXPORT_SYMBOL(vmalloc_noprof);
/**
* vmalloc_huge - allocate virtually contiguous memory, allow huge pages
@@ -3930,13 +3928,13 @@ EXPORT_SYMBOL(vmalloc);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
+void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask)
{
- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
NUMA_NO_NODE, __builtin_return_address(0));
}
-EXPORT_SYMBOL_GPL(vmalloc_huge);
+EXPORT_SYMBOL_GPL(vmalloc_huge_noprof);
/**
* vzalloc - allocate virtually contiguous memory with zero fill
@@ -3951,12 +3949,12 @@ EXPORT_SYMBOL_GPL(vmalloc_huge);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *vzalloc(unsigned long size)
+void *vzalloc_noprof(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
+ return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
__builtin_return_address(0));
}
-EXPORT_SYMBOL(vzalloc);
+EXPORT_SYMBOL(vzalloc_noprof);
/**
* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
@@ -3967,14 +3965,14 @@ EXPORT_SYMBOL(vzalloc);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_user(unsigned long size)
+void *vmalloc_user_noprof(unsigned long size)
{
- return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
+ return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END,
GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
VM_USERMAP, NUMA_NO_NODE,
__builtin_return_address(0));
}
-EXPORT_SYMBOL(vmalloc_user);
+EXPORT_SYMBOL(vmalloc_user_noprof);
/**
* vmalloc_node - allocate memory on a specific node
@@ -3989,12 +3987,12 @@ EXPORT_SYMBOL(vmalloc_user);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_node(unsigned long size, int node)
+void *vmalloc_node_noprof(unsigned long size, int node)
{
- return __vmalloc_node(size, 1, GFP_KERNEL, node,
+ return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node,
__builtin_return_address(0));
}
-EXPORT_SYMBOL(vmalloc_node);
+EXPORT_SYMBOL(vmalloc_node_noprof);
/**
* vzalloc_node - allocate memory on a specific node with zero fill
@@ -4007,12 +4005,12 @@ EXPORT_SYMBOL(vmalloc_node);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *vzalloc_node(unsigned long size, int node)
+void *vzalloc_node_noprof(unsigned long size, int node)
{
- return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
+ return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node,
__builtin_return_address(0));
}
-EXPORT_SYMBOL(vzalloc_node);
+EXPORT_SYMBOL(vzalloc_node_noprof);
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
@@ -4035,12 +4033,12 @@ EXPORT_SYMBOL(vzalloc_node);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_32(unsigned long size)
+void *vmalloc_32_noprof(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
+ return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
__builtin_return_address(0));
}
-EXPORT_SYMBOL(vmalloc_32);
+EXPORT_SYMBOL(vmalloc_32_noprof);
/**
* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
@@ -4051,14 +4049,14 @@ EXPORT_SYMBOL(vmalloc_32);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_32_user(unsigned long size)
+void *vmalloc_32_user_noprof(unsigned long size)
{
- return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
+ return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END,
GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
VM_USERMAP, NUMA_NO_NODE,
__builtin_return_address(0));
}
-EXPORT_SYMBOL(vmalloc_32_user);
+EXPORT_SYMBOL(vmalloc_32_user_noprof);
/*
* Atomically zero bytes in the iterator.
@@ -4672,7 +4670,7 @@ retry:
spin_lock(&vn->busy.lock);
insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
- setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+ setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
spin_unlock(&vn->busy.lock);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3ef654addd44c2..da93213af981e9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -967,7 +967,8 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
__GFP_NOMEMALLOC | GFP_NOWAIT,
.nid = target_nid,
- .nmask = &allowed_mask
+ .nmask = &allowed_mask,
+ .reason = MR_DEMOTION,
};
if (list_empty(demote_folios))
@@ -1205,25 +1206,25 @@ retry:
if (!can_split_folio(folio, NULL))
goto activate_locked;
/*
- * Split folios without a PMD map right
- * away. Chances are some or all of the
- * tail pages can be freed without IO.
+ * Split partially mapped folios right away.
+ * We can free the unmapped pages without IO.
*/
- if (!folio_entire_mapcount(folio) &&
- split_folio_to_list(folio,
- folio_list))
+ if (data_race(!list_empty(&folio->_deferred_list)) &&
+ split_folio_to_list(folio, folio_list))
goto activate_locked;
}
if (!add_to_swap(folio)) {
if (!folio_test_large(folio))
goto activate_locked_split;
/* Fallback to swap normal pages */
- if (split_folio_to_list(folio,
- folio_list))
+ if (split_folio_to_list(folio, folio_list))
goto activate_locked;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1);
- count_vm_event(THP_SWPOUT_FALLBACK);
+ if (nr_pages >= HPAGE_PMD_NR) {
+ count_memcg_folio_events(folio,
+ THP_SWPOUT_FALLBACK, 1);
+ count_vm_event(THP_SWPOUT_FALLBACK);
+ }
#endif
if (!add_to_swap(folio))
goto activate_locked_split;
@@ -1256,6 +1257,20 @@ retry:
if (folio_test_pmd_mappable(folio))
flags |= TTU_SPLIT_HUGE_PMD;
+ /*
+ * Without TTU_SYNC, try_to_unmap will only begin to
+ * hold PTL from the first present PTE within a large
+ * folio. Some initial PTEs might be skipped due to
+ * races with parallel PTE writes in which PTEs can be
+ * cleared temporarily before being written new present
+ * values. This will lead to a large folio is still
+ * mapped while some subpages have been partially
+ * unmapped after try_to_unmap; TTU_SYNC helps
+ * try_to_unmap acquire PTL from the first PTE,
+ * eliminating the influence of temporary PTE values.
+ */
+ if (folio_test_large(folio) && list_empty(&folio->_deferred_list))
+ flags |= TTU_SYNC;
try_to_unmap(folio, flags);
if (folio_mapped(folio)) {
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 7ab05621052dc5..2ebfed32871bf3 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -1237,12 +1237,12 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
}
/**
- * z3fold_get_pool_size() - gets the z3fold pool size in pages
+ * z3fold_get_pool_pages() - gets the z3fold pool size in pages
* @pool: pool whose size is being queried
*
* Returns: size in pages of the given pool.
*/
-static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
+static u64 z3fold_get_pool_pages(struct z3fold_pool *pool)
{
return atomic64_read(&pool->pages_nr);
}
@@ -1402,9 +1402,9 @@ static void z3fold_zpool_unmap(void *pool, unsigned long handle)
z3fold_unmap(pool, handle);
}
-static u64 z3fold_zpool_total_size(void *pool)
+static u64 z3fold_zpool_total_pages(void *pool)
{
- return z3fold_get_pool_size(pool) * PAGE_SIZE;
+ return z3fold_get_pool_pages(pool);
}
static struct zpool_driver z3fold_zpool_driver = {
@@ -1417,7 +1417,7 @@ static struct zpool_driver z3fold_zpool_driver = {
.free = z3fold_zpool_free,
.map = z3fold_zpool_map,
.unmap = z3fold_zpool_unmap,
- .total_size = z3fold_zpool_total_size,
+ .total_pages = z3fold_zpool_total_pages,
};
MODULE_ALIAS("zpool-z3fold");
diff --git a/mm/zbud.c b/mm/zbud.c
index 2190cc1f37b3dc..e9836fff943811 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -365,13 +365,13 @@ static void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
}
/**
- * zbud_get_pool_size() - gets the zbud pool size in pages
+ * zbud_get_pool_pages() - gets the zbud pool size in pages
* @pool: pool whose size is being queried
*
* Returns: size in pages of the given pool. The pool lock need not be
* taken to access pages_nr.
*/
-static u64 zbud_get_pool_size(struct zbud_pool *pool)
+static u64 zbud_get_pool_pages(struct zbud_pool *pool)
{
return pool->pages_nr;
}
@@ -410,9 +410,9 @@ static void zbud_zpool_unmap(void *pool, unsigned long handle)
zbud_unmap(pool, handle);
}
-static u64 zbud_zpool_total_size(void *pool)
+static u64 zbud_zpool_total_pages(void *pool)
{
- return zbud_get_pool_size(pool) * PAGE_SIZE;
+ return zbud_get_pool_pages(pool);
}
static struct zpool_driver zbud_zpool_driver = {
@@ -425,7 +425,7 @@ static struct zpool_driver zbud_zpool_driver = {
.free = zbud_zpool_free,
.map = zbud_zpool_map,
.unmap = zbud_zpool_unmap,
- .total_size = zbud_zpool_total_size,
+ .total_pages = zbud_zpool_total_pages,
};
MODULE_ALIAS("zpool-zbud");
diff --git a/mm/zpool.c b/mm/zpool.c
index 846410479c2f41..b9fda1fa857da7 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -321,16 +321,16 @@ void zpool_unmap_handle(struct zpool *zpool, unsigned long handle)
}
/**
- * zpool_get_total_size() - The total size of the pool
+ * zpool_get_total_pages() - The total size of the pool
* @zpool: The zpool to check
*
- * This returns the total size in bytes of the pool.
+ * This returns the total size in pages of the pool.
*
- * Returns: Total size of the zpool in bytes.
+ * Returns: Total size of the zpool in pages.
*/
-u64 zpool_get_total_size(struct zpool *zpool)
+u64 zpool_get_total_pages(struct zpool *zpool)
{
- return zpool->driver->total_size(zpool->pool);
+ return zpool->driver->total_pages(zpool->pool);
}
/**
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 7d7cb3eaabe029..b42d3545ca856f 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -399,9 +399,9 @@ static void zs_zpool_unmap(void *pool, unsigned long handle)
zs_unmap_object(pool, handle);
}
-static u64 zs_zpool_total_size(void *pool)
+static u64 zs_zpool_total_pages(void *pool)
{
- return zs_get_total_pages(pool) << PAGE_SHIFT;
+ return zs_get_total_pages(pool);
}
static struct zpool_driver zs_zpool_driver = {
@@ -414,7 +414,7 @@ static struct zpool_driver zs_zpool_driver = {
.free = zs_zpool_free,
.map = zs_zpool_map,
.unmap = zs_zpool_unmap,
- .total_size = zs_zpool_total_size,
+ .total_pages = zs_zpool_total_pages,
};
MODULE_ALIAS("zpool-zsmalloc");
diff --git a/mm/zswap.c b/mm/zswap.c
index 6f8850c44b6166..e6cf6282f5fd01 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -20,7 +20,6 @@
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
-#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
@@ -43,8 +42,6 @@
/*********************************
* statistics
**********************************/
-/* Total bytes used by the compressed storage */
-u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
atomic_t zswap_stored_pages = ATOMIC_INIT(0);
/* The number of same-value filled pages currently stored in zswap */
@@ -183,8 +180,6 @@ struct zswap_pool {
/* Global LRU lists shared by all zswap pools. */
static struct list_lru zswap_list_lru;
-/* counter of pages stored in all zswap pools. */
-static atomic_t zswap_nr_stored = ATOMIC_INIT(0);
/* The lock protects zswap_next_shrink updates. */
static DEFINE_SPINLOCK(zswap_shrink_lock);
@@ -198,7 +193,6 @@ static struct shrinker *zswap_shrinker;
* This structure contains the metadata for tracking a single compressed
* page within zswap.
*
- * rbnode - links the entry into red-black tree for the appropriate swap type
* swpentry - associated swap entry, the offset indexes into the red-black tree
* length - the length in bytes of the compressed page data. Needed during
* decompression. For a same value filled page length is 0, and both
@@ -210,7 +204,6 @@ static struct shrinker *zswap_shrinker;
* lru - handle to the pool's lru used to evict pages.
*/
struct zswap_entry {
- struct rb_node rbnode;
swp_entry_t swpentry;
unsigned int length;
struct zswap_pool *pool;
@@ -222,12 +215,7 @@ struct zswap_entry {
struct list_head lru;
};
-struct zswap_tree {
- struct rb_root rbroot;
- spinlock_t lock;
-};
-
-static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
+static struct xarray *zswap_trees[MAX_SWAPFILES];
static unsigned int nr_zswap_trees[MAX_SWAPFILES];
/* RCU-protected iteration */
@@ -255,7 +243,7 @@ static bool zswap_has_pool;
* helpers and fwd declarations
**********************************/
-static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
+static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
{
return &zswap_trees[swp_type(swp)][swp_offset(swp)
>> SWAP_ADDRESS_SPACE_SHIFT];
@@ -265,45 +253,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
zpool_get_type((p)->zpools[0]))
-static bool zswap_is_full(void)
-{
- return totalram_pages() * zswap_max_pool_percent / 100 <
- DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
-}
-
-static bool zswap_can_accept(void)
-{
- return totalram_pages() * zswap_accept_thr_percent / 100 *
- zswap_max_pool_percent / 100 >
- DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
-}
-
-static u64 get_zswap_pool_size(struct zswap_pool *pool)
-{
- u64 pool_size = 0;
- int i;
-
- for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
- pool_size += zpool_get_total_size(pool->zpools[i]);
-
- return pool_size;
-}
-
-static void zswap_update_total_size(void)
-{
- struct zswap_pool *pool;
- u64 total = 0;
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(pool, &zswap_pools, list)
- total += get_zswap_pool_size(pool);
-
- rcu_read_unlock();
-
- zswap_pool_total_size = total;
-}
-
/*********************************
* pool functions
**********************************/
@@ -541,6 +490,33 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
return NULL;
}
+static unsigned long zswap_max_pages(void)
+{
+ return totalram_pages() * zswap_max_pool_percent / 100;
+}
+
+static unsigned long zswap_accept_thr_pages(void)
+{
+ return zswap_max_pages() * zswap_accept_thr_percent / 100;
+}
+
+unsigned long zswap_total_pages(void)
+{
+ struct zswap_pool *pool;
+ unsigned long total = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &zswap_pools, list) {
+ int i;
+
+ for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+ total += zpool_get_total_pages(pool->zpools[i]);
+ }
+ rcu_read_unlock();
+
+ return total;
+}
+
/*********************************
* param callbacks
**********************************/
@@ -807,63 +783,6 @@ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
}
/*********************************
-* rbtree functions
-**********************************/
-static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
-{
- struct rb_node *node = root->rb_node;
- struct zswap_entry *entry;
- pgoff_t entry_offset;
-
- while (node) {
- entry = rb_entry(node, struct zswap_entry, rbnode);
- entry_offset = swp_offset(entry->swpentry);
- if (entry_offset > offset)
- node = node->rb_left;
- else if (entry_offset < offset)
- node = node->rb_right;
- else
- return entry;
- }
- return NULL;
-}
-
-/*
- * In the case that a entry with the same offset is found, a pointer to
- * the existing entry is stored in dupentry and the function returns -EEXIST
- */
-static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
- struct zswap_entry **dupentry)
-{
- struct rb_node **link = &root->rb_node, *parent = NULL;
- struct zswap_entry *myentry;
- pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
-
- while (*link) {
- parent = *link;
- myentry = rb_entry(parent, struct zswap_entry, rbnode);
- myentry_offset = swp_offset(myentry->swpentry);
- if (myentry_offset > entry_offset)
- link = &(*link)->rb_left;
- else if (myentry_offset < entry_offset)
- link = &(*link)->rb_right;
- else {
- *dupentry = myentry;
- return -EEXIST;
- }
- }
- rb_link_node(&entry->rbnode, parent, link);
- rb_insert_color(&entry->rbnode, root);
- return 0;
-}
-
-static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
-{
- rb_erase(&entry->rbnode, root);
- RB_CLEAR_NODE(&entry->rbnode);
-}
-
-/*********************************
* zswap entry functions
**********************************/
static struct kmem_cache *zswap_entry_cache;
@@ -874,7 +793,6 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
if (!entry)
return NULL;
- RB_CLEAR_NODE(&entry->rbnode);
return entry;
}
@@ -885,12 +803,7 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
{
- int i = 0;
-
- if (ZSWAP_NR_ZPOOLS > 1)
- i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
-
- return entry->pool->zpools[i];
+ return entry->pool->zpools[hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS))];
}
/*
@@ -904,7 +817,6 @@ static void zswap_entry_free(struct zswap_entry *entry)
else {
zswap_lru_del(&zswap_list_lru, entry);
zpool_free(zswap_find_zpool(entry), entry->handle);
- atomic_dec(&zswap_nr_stored);
zswap_pool_put(entry->pool);
}
if (entry->objcg) {
@@ -913,18 +825,6 @@ static void zswap_entry_free(struct zswap_entry *entry)
}
zswap_entry_cache_free(entry);
atomic_dec(&zswap_stored_pages);
- zswap_update_total_size();
-}
-
-/*
- * The caller hold the tree lock and search the entry from the tree,
- * so it must be on the tree, remove it from the tree and free it.
- */
-static void zswap_invalidate_entry(struct zswap_tree *tree,
- struct zswap_entry *entry)
-{
- zswap_rb_erase(&tree->rbroot, entry);
- zswap_entry_free(entry);
}
/*********************************
@@ -1126,7 +1026,8 @@ static void zswap_decompress(struct zswap_entry *entry, struct page *page)
static int zswap_writeback_entry(struct zswap_entry *entry,
swp_entry_t swpentry)
{
- struct zswap_tree *tree;
+ struct xarray *tree;
+ pgoff_t offset = swp_offset(swpentry);
struct folio *folio;
struct mempolicy *mpol;
bool folio_was_allocated;
@@ -1163,19 +1064,13 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
* be dereferenced.
*/
tree = swap_zswap_tree(swpentry);
- spin_lock(&tree->lock);
- if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) {
- spin_unlock(&tree->lock);
+ if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) {
delete_from_swap_cache(folio);
folio_unlock(folio);
folio_put(folio);
return -ENOMEM;
}
- /* Safe to deref entry after the entry is verified above. */
- zswap_rb_erase(&tree->rbroot, entry);
- spin_unlock(&tree->lock);
-
zswap_decompress(entry, &folio->page);
count_vm_event(ZSWPWB);
@@ -1344,8 +1239,8 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
} else {
- nr_backing = zswap_pool_total_size >> PAGE_SHIFT;
- nr_stored = atomic_read(&zswap_nr_stored);
+ nr_backing = zswap_total_pages();
+ nr_stored = atomic_read(&zswap_stored_pages);
}
if (!nr_stored)
@@ -1365,6 +1260,11 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
* This ensures that the better zswap compresses memory, the fewer
* pages we will evict to swap (as it will otherwise incur IO for
* relatively small memory saving).
+ *
+ * The memory saving factor calculated here takes same-filled pages into
+ * account, but those are not freeable since they almost occupy no
+ * space. Hence, we may scale nr_freeable down a little bit more than we
+ * should if we have a lot of same-filled pages.
*/
return mult_frac(nr_freeable, nr_backing, nr_stored);
}
@@ -1412,6 +1312,10 @@ static void shrink_worker(struct work_struct *w)
{
struct mem_cgroup *memcg;
int ret, failures = 0;
+ unsigned long thr;
+
+ /* Reclaim down to the accept threshold */
+ thr = zswap_accept_thr_pages();
/* global reclaim will select cgroup in a round-robin fashion. */
do {
@@ -1459,10 +1363,9 @@ static void shrink_worker(struct work_struct *w)
break;
if (ret && ++failures == MAX_RECLAIM_RETRIES)
break;
-
resched:
cond_resched();
- } while (!zswap_can_accept());
+ } while (zswap_total_pages() > thr);
}
static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
@@ -1499,10 +1402,11 @@ bool zswap_store(struct folio *folio)
{
swp_entry_t swp = folio->swap;
pgoff_t offset = swp_offset(swp);
- struct zswap_tree *tree = swap_zswap_tree(swp);
- struct zswap_entry *entry, *dupentry;
+ struct xarray *tree = swap_zswap_tree(swp);
+ struct zswap_entry *entry, *old;
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg = NULL;
+ unsigned long max_pages, cur_pages;
VM_WARN_ON_ONCE(!folio_test_locked(folio));
VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1514,6 +1418,7 @@ bool zswap_store(struct folio *folio)
if (!zswap_enabled)
goto check_old;
+ /* Check cgroup limits */
objcg = get_obj_cgroup_from_folio(folio);
if (objcg && !obj_cgroup_may_zswap(objcg)) {
memcg = get_mem_cgroup_from_objcg(objcg);
@@ -1524,15 +1429,18 @@ bool zswap_store(struct folio *folio)
mem_cgroup_put(memcg);
}
- /* reclaim space if needed */
- if (zswap_is_full()) {
+ /* Check global limits */
+ cur_pages = zswap_total_pages();
+ max_pages = zswap_max_pages();
+
+ if (cur_pages >= max_pages) {
zswap_pool_limit_hit++;
zswap_pool_reached_full = true;
goto shrink;
}
if (zswap_pool_reached_full) {
- if (!zswap_can_accept())
+ if (cur_pages > zswap_accept_thr_pages())
goto shrink;
else
zswap_pool_reached_full = false;
@@ -1583,54 +1491,71 @@ bool zswap_store(struct folio *folio)
insert_entry:
entry->swpentry = swp;
entry->objcg = objcg;
+
+ old = xa_store(tree, offset, entry, GFP_KERNEL);
+ if (xa_is_err(old)) {
+ int err = xa_err(old);
+
+ WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
+ zswap_reject_alloc_fail++;
+ goto store_failed;
+ }
+
+ /*
+ * We may have had an existing entry that became stale when
+ * the folio was redirtied and now the new version is being
+ * swapped out. Get rid of the old.
+ */
+ if (old)
+ zswap_entry_free(old);
+
if (objcg) {
obj_cgroup_charge_zswap(objcg, entry->length);
- /* Account before objcg ref is moved to tree */
count_objcg_event(objcg, ZSWPOUT);
}
- /* map */
- spin_lock(&tree->lock);
/*
- * The folio may have been dirtied again, invalidate the
- * possibly stale entry before inserting the new entry.
+ * We finish initializing the entry while it's already in xarray.
+ * This is safe because:
+ *
+ * 1. Concurrent stores and invalidations are excluded by folio lock.
+ *
+ * 2. Writeback is excluded by the entry not being on the LRU yet.
+ * The publishing order matters to prevent writeback from seeing
+ * an incoherent entry.
*/
- if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
- zswap_invalidate_entry(tree, dupentry);
- WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry));
- }
if (entry->length) {
INIT_LIST_HEAD(&entry->lru);
zswap_lru_add(&zswap_list_lru, entry);
- atomic_inc(&zswap_nr_stored);
}
- spin_unlock(&tree->lock);
/* update stats */
atomic_inc(&zswap_stored_pages);
- zswap_update_total_size();
count_vm_event(ZSWPOUT);
return true;
+store_failed:
+ if (!entry->length)
+ atomic_dec(&zswap_same_filled_pages);
+ else {
+ zpool_free(zswap_find_zpool(entry), entry->handle);
put_pool:
- zswap_pool_put(entry->pool);
+ zswap_pool_put(entry->pool);
+ }
freepage:
zswap_entry_cache_free(entry);
reject:
- if (objcg)
- obj_cgroup_put(objcg);
+ obj_cgroup_put(objcg);
check_old:
/*
* If the zswap store fails or zswap is disabled, we must invalidate the
* possibly stale entry which was previously stored at this offset.
* Otherwise, writeback could overwrite the new data in the swapfile.
*/
- spin_lock(&tree->lock);
- entry = zswap_rb_search(&tree->rbroot, offset);
+ entry = xa_erase(tree, offset);
if (entry)
- zswap_invalidate_entry(tree, entry);
- spin_unlock(&tree->lock);
+ zswap_entry_free(entry);
return false;
shrink:
@@ -1644,18 +1569,12 @@ bool zswap_load(struct folio *folio)
pgoff_t offset = swp_offset(swp);
struct page *page = &folio->page;
bool swapcache = folio_test_swapcache(folio);
- struct zswap_tree *tree = swap_zswap_tree(swp);
+ struct xarray *tree = swap_zswap_tree(swp);
struct zswap_entry *entry;
u8 *dst;
VM_WARN_ON_ONCE(!folio_test_locked(folio));
- spin_lock(&tree->lock);
- entry = zswap_rb_search(&tree->rbroot, offset);
- if (!entry) {
- spin_unlock(&tree->lock);
- return false;
- }
/*
* When reading into the swapcache, invalidate our entry. The
* swapcache can be the authoritative owner of the page and
@@ -1669,8 +1588,12 @@ bool zswap_load(struct folio *folio)
* the fault fails. We remain the primary owner of the entry.)
*/
if (swapcache)
- zswap_rb_erase(&tree->rbroot, entry);
- spin_unlock(&tree->lock);
+ entry = xa_erase(tree, offset);
+ else
+ entry = xa_load(tree, offset);
+
+ if (!entry)
+ return false;
if (entry->length)
zswap_decompress(entry, page);
@@ -1695,19 +1618,17 @@ bool zswap_load(struct folio *folio)
void zswap_invalidate(swp_entry_t swp)
{
pgoff_t offset = swp_offset(swp);
- struct zswap_tree *tree = swap_zswap_tree(swp);
+ struct xarray *tree = swap_zswap_tree(swp);
struct zswap_entry *entry;
- spin_lock(&tree->lock);
- entry = zswap_rb_search(&tree->rbroot, offset);
+ entry = xa_erase(tree, offset);
if (entry)
- zswap_invalidate_entry(tree, entry);
- spin_unlock(&tree->lock);
+ zswap_entry_free(entry);
}
int zswap_swapon(int type, unsigned long nr_pages)
{
- struct zswap_tree *trees, *tree;
+ struct xarray *trees, *tree;
unsigned int nr, i;
nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
@@ -1717,11 +1638,8 @@ int zswap_swapon(int type, unsigned long nr_pages)
return -ENOMEM;
}
- for (i = 0; i < nr; i++) {
- tree = trees + i;
- tree->rbroot = RB_ROOT;
- spin_lock_init(&tree->lock);
- }
+ for (i = 0; i < nr; i++)
+ xa_init(trees + i);
nr_zswap_trees[type] = nr;
zswap_trees[type] = trees;
@@ -1730,7 +1648,7 @@ int zswap_swapon(int type, unsigned long nr_pages)
void zswap_swapoff(int type)
{
- struct zswap_tree *trees = zswap_trees[type];
+ struct xarray *trees = zswap_trees[type];
unsigned int i;
if (!trees)
@@ -1738,7 +1656,7 @@ void zswap_swapoff(int type)
/* try_to_unuse() invalidated all the entries already */
for (i = 0; i < nr_zswap_trees[type]; i++)
- WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot));
+ WARN_ON_ONCE(!xa_empty(trees + i));
kvfree(trees);
nr_zswap_trees[type] = 0;
@@ -1753,6 +1671,13 @@ void zswap_swapoff(int type)
static struct dentry *zswap_debugfs_root;
+static int debugfs_get_total_size(void *data, u64 *val)
+{
+ *val = zswap_total_pages() * PAGE_SIZE;
+ return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu\n");
+
static int zswap_debugfs_init(void)
{
if (!debugfs_initialized())
@@ -1774,8 +1699,8 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, &zswap_reject_compress_poor);
debugfs_create_u64("written_back_pages", 0444,
zswap_debugfs_root, &zswap_written_back_pages);
- debugfs_create_u64("pool_total_size", 0444,
- zswap_debugfs_root, &zswap_pool_total_size);
+ debugfs_create_file("pool_total_size", 0444,
+ zswap_debugfs_root, NULL, &total_size_fops);
debugfs_create_atomic_t("stored_pages", 0444,
zswap_debugfs_root, &zswap_stored_pages);
debugfs_create_atomic_t("same_filled_pages", 0444,
diff --git a/net/sunrpc/auth_gss/auth_gss_internal.h b/net/sunrpc/auth_gss/auth_gss_internal.h
index c53b329092d405..4ebc1b7043d917 100644
--- a/net/sunrpc/auth_gss/auth_gss_internal.h
+++ b/net/sunrpc/auth_gss/auth_gss_internal.h
@@ -23,7 +23,7 @@ simple_get_bytes(const void *p, const void *end, void *res, size_t len)
}
static inline const void *
-simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
+simple_get_netobj_noprof(const void *p, const void *end, struct xdr_netobj *dest)
{
const void *q;
unsigned int len;
@@ -35,7 +35,7 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
if (unlikely(q > end || q < p))
return ERR_PTR(-EFAULT);
if (len) {
- dest->data = kmemdup(p, len, GFP_KERNEL);
+ dest->data = kmemdup_noprof(p, len, GFP_KERNEL);
if (unlikely(dest->data == NULL))
return ERR_PTR(-ENOMEM);
} else
@@ -43,3 +43,5 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
dest->len = len;
return q;
}
+
+#define simple_get_netobj(...) alloc_hooks(simple_get_netobj_noprof(__VA_ARGS__))
diff --git a/rust/helpers.c b/rust/helpers.c
index 70e59efd92bc43..858d802abd115e 100644
--- a/rust/helpers.c
+++ b/rust/helpers.c
@@ -28,6 +28,7 @@
#include <linux/mutex.h>
#include <linux/refcount.h>
#include <linux/sched/signal.h>
+#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
@@ -157,6 +158,13 @@ void rust_helper_init_work_with_key(struct work_struct *work, work_func_t func,
}
EXPORT_SYMBOL_GPL(rust_helper_init_work_with_key);
+void * __must_check __realloc_size(2)
+rust_helper_krealloc(const void *objp, size_t new_size, gfp_t flags)
+{
+ return krealloc(objp, new_size, flags);
+}
+EXPORT_SYMBOL_GPL(rust_helper_krealloc);
+
/*
* `bindgen` binds the C `size_t` type as the Rust `usize` type, so we can
* use it in contexts where Rust expects a `usize` like slice (array) indices.
diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
index 653b92f6d4c8f4..47978efe4797c2 100644
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -204,6 +204,11 @@ static int symbol_in_range(const struct sym_entry *s,
return 0;
}
+static bool string_starts_with(const char *s, const char *prefix)
+{
+ return strncmp(s, prefix, strlen(prefix)) == 0;
+}
+
static int symbol_valid(const struct sym_entry *s)
{
const char *name = sym_name(s);
@@ -211,6 +216,14 @@ static int symbol_valid(const struct sym_entry *s)
/* if --all-symbols is not specified, then symbols outside the text
* and inittext sections are discarded */
if (!all_symbols) {
+ /*
+ * Symbols starting with __start and __stop are used to denote
+ * section boundaries, and should always be included:
+ */
+ if (string_starts_with(name, "__start_") ||
+ string_starts_with(name, "__stop_"))
+ return 1;
+
if (symbol_in_range(s, text_ranges,
ARRAY_SIZE(text_ranges)) == 0)
return 0;
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index cb1be22afc65ff..b463acecad401b 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -1723,6 +1723,7 @@ sub dump_function($$) {
$prototype =~ s/__must_check +//;
$prototype =~ s/__weak +//;
$prototype =~ s/__sched +//;
+ $prototype =~ s/_noprof//;
$prototype =~ s/__printf\s*\(\s*\d*\s*,\s*\d*\s*\) +//;
$prototype =~ s/__(?:re)?alloc_size\s*\(\s*\d+\s*(?:,\s*\d+\s*)?\) +//;
$prototype =~ s/__diagnose_as\s*\(\s*\S+\s*(?:,\s*\d+\s*)*\) +//;
diff --git a/scripts/module.lds.S b/scripts/module.lds.S
index bf5bcf2836d815..45c67a0994f3e3 100644
--- a/scripts/module.lds.S
+++ b/scripts/module.lds.S
@@ -9,6 +9,8 @@
#define DISCARD_EH_FRAME *(.eh_frame)
#endif
+#include <asm-generic/codetag.lds.h>
+
SECTIONS {
/DISCARD/ : {
*(.discard)
@@ -47,12 +49,17 @@ SECTIONS {
.data : {
*(.data .data.[0-9a-zA-Z_]*)
*(.data..L*)
+ CODETAG_SECTIONS()
}
.rodata : {
*(.rodata .rodata.[0-9a-zA-Z_]*)
*(.rodata..L*)
}
+#else
+ .data : {
+ CODETAG_SECTIONS()
+ }
#endif
}
diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening
index 2cff851ebfd7e1..effbf5982be10f 100644
--- a/security/Kconfig.hardening
+++ b/security/Kconfig.hardening
@@ -255,6 +255,21 @@ config INIT_ON_FREE_DEFAULT_ON
touching "cold" memory areas. Most cases see 3-5% impact. Some
synthetic workloads have measured as high as 8%.
+config INIT_MLOCKED_ON_FREE_DEFAULT_ON
+ bool "Enable mlocked memory zeroing on free"
+ depends on !KMSAN
+ help
+ This config has the effect of setting "init_mlocked_on_free=1"
+ on the kernel command line. If it is enabled, all mlocked process
+ memory is zeroed when freed. This restriction to mlocked memory
+ improves performance over "init_on_free" but can still be used to
+ protect confidential data like key material from content exposures
+ to other processes, as well as live forensics and cold boot attacks.
+ Any non-mlocked memory is not cleared before it is reassigned. This
+ configuration can be overwritten by setting "init_mlocked_on_free=0"
+ on the command line. The "init_on_free" boot option takes
+ precedence over "init_mlocked_on_free".
+
config CC_HAS_ZERO_CALL_USED_REGS
def_bool $(cc-option,-fzero-call-used-regs=used-gpr)
# https://github.com/ClangBuiltLinux/linux/issues/1766
diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c
index d3fa6e136744de..990b5bd717a1cb 100644
--- a/sound/pci/hda/cs35l41_hda.c
+++ b/sound/pci/hda/cs35l41_hda.c
@@ -13,6 +13,7 @@
#include <sound/soc.h>
#include <linux/pm_runtime.h>
#include <linux/spi/spi.h>
+#include <linux/vmalloc.h>
#include "hda_local.h"
#include "hda_auto_parser.h"
#include "hda_jack.h"
diff --git a/tools/cgroup/memcg_slabinfo.py b/tools/cgroup/memcg_slabinfo.py
index 1d3a90d93fe267..270c28a0d09801 100644
--- a/tools/cgroup/memcg_slabinfo.py
+++ b/tools/cgroup/memcg_slabinfo.py
@@ -146,12 +146,11 @@ def detect_kernel_config():
def for_each_slab(prog):
- PGSlab = 1 << prog.constant('PG_slab')
- PGHead = 1 << prog.constant('PG_head')
+ PGSlab = ~prog.constant('PG_slab')
for page in for_each_page(prog):
try:
- if page.flags.value_() & PGSlab:
+ if page.page_type.value_() == PGSlab:
yield cast('struct slab *', page)
except FaultError:
pass
diff --git a/tools/include/uapi/linux/memfd.h b/tools/include/uapi/linux/memfd.h
new file mode 100644
index 00000000000000..01c0324e773316
--- /dev/null
+++ b/tools/include/uapi/linux/memfd.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _LINUX_MEMFD_H
+#define _LINUX_MEMFD_H
+
+#include <asm-generic/hugetlb_encode.h>
+
+/* flags for memfd_create(2) (unsigned int) */
+#define MFD_CLOEXEC 0x0001U
+#define MFD_ALLOW_SEALING 0x0002U
+#define MFD_HUGETLB 0x0004U
+/* not executable and sealed to prevent changing to executable. */
+#define MFD_NOEXEC_SEAL 0x0008U
+/* executable */
+#define MFD_EXEC 0x0010U
+
+/*
+ * Huge page size encoding when MFD_HUGETLB is specified, and a huge page
+ * size other than the default is desired. See hugetlb_encode.h.
+ * All known huge page size encodings are provided here. It is the
+ * responsibility of the application to know which sizes are supported on
+ * the running system. See mmap(2) man page for details.
+ */
+#define MFD_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT
+#define MFD_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK
+
+#define MFD_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB
+#define MFD_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB
+#define MFD_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB
+#define MFD_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB
+#define MFD_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB
+#define MFD_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB
+#define MFD_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB
+#define MFD_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB
+#define MFD_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB
+#define MFD_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB
+#define MFD_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB
+#define MFD_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB
+
+#endif /* _LINUX_MEMFD_H */
diff --git a/tools/include/uapi/linux/userfaultfd.h b/tools/include/uapi/linux/userfaultfd.h
new file mode 100644
index 00000000000000..4283de22d5b659
--- /dev/null
+++ b/tools/include/uapi/linux/userfaultfd.h
@@ -0,0 +1,386 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * include/linux/userfaultfd.h
+ *
+ * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ */
+
+#ifndef _LINUX_USERFAULTFD_H
+#define _LINUX_USERFAULTFD_H
+
+#include <linux/types.h>
+
+/* ioctls for /dev/userfaultfd */
+#define USERFAULTFD_IOC 0xAA
+#define USERFAULTFD_IOC_NEW _IO(USERFAULTFD_IOC, 0x00)
+
+/*
+ * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and
+ * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In
+ * userfaultfd.h we assumed the kernel was reading (instead _IOC_READ
+ * means the userland is reading).
+ */
+#define UFFD_API ((__u64)0xAA)
+#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING | \
+ UFFDIO_REGISTER_MODE_WP | \
+ UFFDIO_REGISTER_MODE_MINOR)
+#define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
+ UFFD_FEATURE_EVENT_FORK | \
+ UFFD_FEATURE_EVENT_REMAP | \
+ UFFD_FEATURE_EVENT_REMOVE | \
+ UFFD_FEATURE_EVENT_UNMAP | \
+ UFFD_FEATURE_MISSING_HUGETLBFS | \
+ UFFD_FEATURE_MISSING_SHMEM | \
+ UFFD_FEATURE_SIGBUS | \
+ UFFD_FEATURE_THREAD_ID | \
+ UFFD_FEATURE_MINOR_HUGETLBFS | \
+ UFFD_FEATURE_MINOR_SHMEM | \
+ UFFD_FEATURE_EXACT_ADDRESS | \
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \
+ UFFD_FEATURE_WP_UNPOPULATED | \
+ UFFD_FEATURE_POISON | \
+ UFFD_FEATURE_WP_ASYNC | \
+ UFFD_FEATURE_MOVE)
+#define UFFD_API_IOCTLS \
+ ((__u64)1 << _UFFDIO_REGISTER | \
+ (__u64)1 << _UFFDIO_UNREGISTER | \
+ (__u64)1 << _UFFDIO_API)
+#define UFFD_API_RANGE_IOCTLS \
+ ((__u64)1 << _UFFDIO_WAKE | \
+ (__u64)1 << _UFFDIO_COPY | \
+ (__u64)1 << _UFFDIO_ZEROPAGE | \
+ (__u64)1 << _UFFDIO_MOVE | \
+ (__u64)1 << _UFFDIO_WRITEPROTECT | \
+ (__u64)1 << _UFFDIO_CONTINUE | \
+ (__u64)1 << _UFFDIO_POISON)
+#define UFFD_API_RANGE_IOCTLS_BASIC \
+ ((__u64)1 << _UFFDIO_WAKE | \
+ (__u64)1 << _UFFDIO_COPY | \
+ (__u64)1 << _UFFDIO_WRITEPROTECT | \
+ (__u64)1 << _UFFDIO_CONTINUE | \
+ (__u64)1 << _UFFDIO_POISON)
+
+/*
+ * Valid ioctl command number range with this API is from 0x00 to
+ * 0x3F. UFFDIO_API is the fixed number, everything else can be
+ * changed by implementing a different UFFD_API. If sticking to the
+ * same UFFD_API more ioctl can be added and userland will be aware of
+ * which ioctl the running kernel implements through the ioctl command
+ * bitmask written by the UFFDIO_API.
+ */
+#define _UFFDIO_REGISTER (0x00)
+#define _UFFDIO_UNREGISTER (0x01)
+#define _UFFDIO_WAKE (0x02)
+#define _UFFDIO_COPY (0x03)
+#define _UFFDIO_ZEROPAGE (0x04)
+#define _UFFDIO_MOVE (0x05)
+#define _UFFDIO_WRITEPROTECT (0x06)
+#define _UFFDIO_CONTINUE (0x07)
+#define _UFFDIO_POISON (0x08)
+#define _UFFDIO_API (0x3F)
+
+/* userfaultfd ioctl ids */
+#define UFFDIO 0xAA
+#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \
+ struct uffdio_api)
+#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \
+ struct uffdio_register)
+#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \
+ struct uffdio_range)
+#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \
+ struct uffdio_range)
+#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \
+ struct uffdio_copy)
+#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \
+ struct uffdio_zeropage)
+#define UFFDIO_MOVE _IOWR(UFFDIO, _UFFDIO_MOVE, \
+ struct uffdio_move)
+#define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
+ struct uffdio_writeprotect)
+#define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \
+ struct uffdio_continue)
+#define UFFDIO_POISON _IOWR(UFFDIO, _UFFDIO_POISON, \
+ struct uffdio_poison)
+
+/* read() structure */
+struct uffd_msg {
+ __u8 event;
+
+ __u8 reserved1;
+ __u16 reserved2;
+ __u32 reserved3;
+
+ union {
+ struct {
+ __u64 flags;
+ __u64 address;
+ union {
+ __u32 ptid;
+ } feat;
+ } pagefault;
+
+ struct {
+ __u32 ufd;
+ } fork;
+
+ struct {
+ __u64 from;
+ __u64 to;
+ __u64 len;
+ } remap;
+
+ struct {
+ __u64 start;
+ __u64 end;
+ } remove;
+
+ struct {
+ /* unused reserved fields */
+ __u64 reserved1;
+ __u64 reserved2;
+ __u64 reserved3;
+ } reserved;
+ } arg;
+} __attribute__((packed));
+
+/*
+ * Start at 0x12 and not at 0 to be more strict against bugs.
+ */
+#define UFFD_EVENT_PAGEFAULT 0x12
+#define UFFD_EVENT_FORK 0x13
+#define UFFD_EVENT_REMAP 0x14
+#define UFFD_EVENT_REMOVE 0x15
+#define UFFD_EVENT_UNMAP 0x16
+
+/* flags for UFFD_EVENT_PAGEFAULT */
+#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
+#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */
+#define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */
+
+struct uffdio_api {
+ /* userland asks for an API number and the features to enable */
+ __u64 api;
+ /*
+ * Kernel answers below with the all available features for
+ * the API, this notifies userland of which events and/or
+ * which flags for each event are enabled in the current
+ * kernel.
+ *
+ * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
+ * are to be considered implicitly always enabled in all kernels as
+ * long as the uffdio_api.api requested matches UFFD_API.
+ *
+ * UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER
+ * with UFFDIO_REGISTER_MODE_MISSING mode will succeed on
+ * hugetlbfs virtual memory ranges. Adding or not adding
+ * UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has
+ * no real functional effect after UFFDIO_API returns, but
+ * it's only useful for an initial feature set probe at
+ * UFFDIO_API time. There are two ways to use it:
+ *
+ * 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the
+ * uffdio_api.features before calling UFFDIO_API, an error
+ * will be returned by UFFDIO_API on a kernel without
+ * hugetlbfs missing support
+ *
+ * 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in
+ * uffdio_api.features and instead it will be set by the
+ * kernel in the uffdio_api.features if the kernel supports
+ * it, so userland can later check if the feature flag is
+ * present in uffdio_api.features after UFFDIO_API
+ * succeeded.
+ *
+ * UFFD_FEATURE_MISSING_SHMEM works the same as
+ * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem
+ * (i.e. tmpfs and other shmem based APIs).
+ *
+ * UFFD_FEATURE_SIGBUS feature means no page-fault
+ * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead
+ * a SIGBUS signal will be sent to the faulting process.
+ *
+ * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will
+ * be returned, if feature is not requested 0 will be returned.
+ *
+ * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults
+ * can be intercepted (via REGISTER_MODE_MINOR) for
+ * hugetlbfs-backed pages.
+ *
+ * UFFD_FEATURE_MINOR_SHMEM indicates the same support as
+ * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead.
+ *
+ * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page
+ * faults would be provided and the offset within the page would not be
+ * masked.
+ *
+ * UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd
+ * write-protection mode is supported on both shmem and hugetlbfs.
+ *
+ * UFFD_FEATURE_WP_UNPOPULATED indicates that userfaultfd
+ * write-protection mode will always apply to unpopulated pages
+ * (i.e. empty ptes). This will be the default behavior for shmem
+ * & hugetlbfs, so this flag only affects anonymous memory behavior
+ * when userfault write-protection mode is registered.
+ *
+ * UFFD_FEATURE_WP_ASYNC indicates that userfaultfd write-protection
+ * asynchronous mode is supported in which the write fault is
+ * automatically resolved and write-protection is un-set.
+ * It implies UFFD_FEATURE_WP_UNPOPULATED.
+ *
+ * UFFD_FEATURE_MOVE indicates that the kernel supports moving an
+ * existing page contents from userspace.
+ */
+#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
+#define UFFD_FEATURE_EVENT_FORK (1<<1)
+#define UFFD_FEATURE_EVENT_REMAP (1<<2)
+#define UFFD_FEATURE_EVENT_REMOVE (1<<3)
+#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4)
+#define UFFD_FEATURE_MISSING_SHMEM (1<<5)
+#define UFFD_FEATURE_EVENT_UNMAP (1<<6)
+#define UFFD_FEATURE_SIGBUS (1<<7)
+#define UFFD_FEATURE_THREAD_ID (1<<8)
+#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9)
+#define UFFD_FEATURE_MINOR_SHMEM (1<<10)
+#define UFFD_FEATURE_EXACT_ADDRESS (1<<11)
+#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12)
+#define UFFD_FEATURE_WP_UNPOPULATED (1<<13)
+#define UFFD_FEATURE_POISON (1<<14)
+#define UFFD_FEATURE_WP_ASYNC (1<<15)
+#define UFFD_FEATURE_MOVE (1<<16)
+ __u64 features;
+
+ __u64 ioctls;
+};
+
+struct uffdio_range {
+ __u64 start;
+ __u64 len;
+};
+
+struct uffdio_register {
+ struct uffdio_range range;
+#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0)
+#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1)
+#define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2)
+ __u64 mode;
+
+ /*
+ * kernel answers which ioctl commands are available for the
+ * range, keep at the end as the last 8 bytes aren't read.
+ */
+ __u64 ioctls;
+};
+
+struct uffdio_copy {
+ __u64 dst;
+ __u64 src;
+ __u64 len;
+#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0)
+ /*
+ * UFFDIO_COPY_MODE_WP will map the page write protected on
+ * the fly. UFFDIO_COPY_MODE_WP is available only if the
+ * write protected ioctl is implemented for the range
+ * according to the uffdio_register.ioctls.
+ */
+#define UFFDIO_COPY_MODE_WP ((__u64)1<<1)
+ __u64 mode;
+
+ /*
+ * "copy" is written by the ioctl and must be at the end: the
+ * copy_from_user will not read the last 8 bytes.
+ */
+ __s64 copy;
+};
+
+struct uffdio_zeropage {
+ struct uffdio_range range;
+#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0)
+ __u64 mode;
+
+ /*
+ * "zeropage" is written by the ioctl and must be at the end:
+ * the copy_from_user will not read the last 8 bytes.
+ */
+ __s64 zeropage;
+};
+
+struct uffdio_writeprotect {
+ struct uffdio_range range;
+/*
+ * UFFDIO_WRITEPROTECT_MODE_WP: set the flag to write protect a range,
+ * unset the flag to undo protection of a range which was previously
+ * write protected.
+ *
+ * UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up
+ * any wait thread after the operation succeeds.
+ *
+ * NOTE: Write protecting a region (WP=1) is unrelated to page faults,
+ * therefore DONTWAKE flag is meaningless with WP=1. Removing write
+ * protection (WP=0) in response to a page fault wakes the faulting
+ * task unless DONTWAKE is set.
+ */
+#define UFFDIO_WRITEPROTECT_MODE_WP ((__u64)1<<0)
+#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE ((__u64)1<<1)
+ __u64 mode;
+};
+
+struct uffdio_continue {
+ struct uffdio_range range;
+#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0)
+ /*
+ * UFFDIO_CONTINUE_MODE_WP will map the page write protected on
+ * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the
+ * write protected ioctl is implemented for the range
+ * according to the uffdio_register.ioctls.
+ */
+#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1)
+ __u64 mode;
+
+ /*
+ * Fields below here are written by the ioctl and must be at the end:
+ * the copy_from_user will not read past here.
+ */
+ __s64 mapped;
+};
+
+struct uffdio_poison {
+ struct uffdio_range range;
+#define UFFDIO_POISON_MODE_DONTWAKE ((__u64)1<<0)
+ __u64 mode;
+
+ /*
+ * Fields below here are written by the ioctl and must be at the end:
+ * the copy_from_user will not read past here.
+ */
+ __s64 updated;
+};
+
+struct uffdio_move {
+ __u64 dst;
+ __u64 src;
+ __u64 len;
+ /*
+ * Especially if used to atomically remove memory from the
+ * address space the wake on the dst range is not needed.
+ */
+#define UFFDIO_MOVE_MODE_DONTWAKE ((__u64)1<<0)
+#define UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES ((__u64)1<<1)
+ __u64 mode;
+ /*
+ * "move" is written by the ioctl and must be at the end: the
+ * copy_from_user will not read the last 8 bytes.
+ */
+ __s64 move;
+};
+
+/*
+ * Flags for the userfaultfd(2) system call itself.
+ */
+
+/*
+ * Create a userfaultfd that can handle page faults only in user mode.
+ */
+#define UFFD_USER_MODE_ONLY 1
+
+#endif /* _LINUX_USERFAULTFD_H */
diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk
index da2cade3bab0e6..1dae4a02957f9d 100644
--- a/tools/testing/selftests/lib.mk
+++ b/tools/testing/selftests/lib.mk
@@ -48,6 +48,15 @@ ifeq ($(KHDR_INCLUDES),)
KHDR_INCLUDES := -isystem $(top_srcdir)/usr/include
endif
+# In order to use newer items that haven't yet been added to the user's system
+# header files, add $(TOOLS_INCLUDES) to the compiler invocation in each
+# each selftest.
+# You may need to add files to that location, or to refresh an existing file. In
+# order to do that, run "make headers" from $(top_srcdir), then copy the
+# header file that you want from $(top_srcdir)/usr/include/... , to the matching
+# subdir in $(TOOLS_INCLUDE).
+TOOLS_INCLUDES := -isystem $(top_srcdir)/tools/include/uapi
+
# The following are built by lib.mk common compile rules.
# TEST_CUSTOM_PROGS should be used by tests that require
# custom build rule and prevent common build rule use.
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index eb5f39a2668b44..7ca9186a063943 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -32,7 +32,7 @@ endif
# LDLIBS.
MAKEFLAGS += --no-builtin-rules
-CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
+CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
LDLIBS = -lrt -lpthread -lm
TEST_GEN_FILES = cow
diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c
index d615767e396bec..db845dca8d1950 100644
--- a/tools/testing/selftests/mm/ksm_functional_tests.c
+++ b/tools/testing/selftests/mm/ksm_functional_tests.c
@@ -28,6 +28,15 @@
#define MiB (1024 * KiB)
#define FORK_EXEC_CHILD_PRG_NAME "ksm_fork_exec_child"
+#define MAP_MERGE_FAIL ((void *)-1)
+#define MAP_MERGE_SKIP ((void *)-2)
+
+enum ksm_merge_mode {
+ KSM_MERGE_PRCTL,
+ KSM_MERGE_MADVISE,
+ KSM_MERGE_NONE, /* PRCTL already set */
+};
+
static int mem_fd;
static int ksm_fd;
static int ksm_full_scans_fd;
@@ -146,33 +155,34 @@ static int ksm_unmerge(void)
return 0;
}
-static char *mmap_and_merge_range(char val, unsigned long size, int prot,
- bool use_prctl)
+static char *__mmap_and_merge_range(char val, unsigned long size, int prot,
+ enum ksm_merge_mode mode)
{
char *map;
+ char *err_map = MAP_MERGE_FAIL;
int ret;
/* Stabilize accounting by disabling KSM completely. */
if (ksm_unmerge()) {
- ksft_test_result_fail("Disabling (unmerging) KSM failed\n");
- return MAP_FAILED;
+ ksft_print_msg("Disabling (unmerging) KSM failed\n");
+ return err_map;
}
if (get_my_merging_pages() > 0) {
- ksft_test_result_fail("Still pages merged\n");
- return MAP_FAILED;
+ ksft_print_msg("Still pages merged\n");
+ return err_map;
}
map = mmap(NULL, size, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANON, -1, 0);
if (map == MAP_FAILED) {
- ksft_test_result_fail("mmap() failed\n");
- return MAP_FAILED;
+ ksft_print_msg("mmap() failed\n");
+ return err_map;
}
/* Don't use THP. Ignore if THP are not around on a kernel. */
if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) {
- ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
+ ksft_print_msg("MADV_NOHUGEPAGE failed\n");
goto unmap;
}
@@ -180,27 +190,36 @@ static char *mmap_and_merge_range(char val, unsigned long size, int prot,
memset(map, val, size);
if (mprotect(map, size, prot)) {
- ksft_test_result_skip("mprotect() failed\n");
+ ksft_print_msg("mprotect() failed\n");
+ err_map = MAP_MERGE_SKIP;
goto unmap;
}
- if (use_prctl) {
+ switch (mode) {
+ case KSM_MERGE_PRCTL:
ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0);
if (ret < 0 && errno == EINVAL) {
- ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n");
+ ksft_print_msg("PR_SET_MEMORY_MERGE not supported\n");
+ err_map = MAP_MERGE_SKIP;
goto unmap;
} else if (ret) {
- ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n");
+ ksft_print_msg("PR_SET_MEMORY_MERGE=1 failed\n");
goto unmap;
}
- } else if (madvise(map, size, MADV_MERGEABLE)) {
- ksft_test_result_fail("MADV_MERGEABLE failed\n");
- goto unmap;
+ break;
+ case KSM_MERGE_MADVISE:
+ if (madvise(map, size, MADV_MERGEABLE)) {
+ ksft_print_msg("MADV_MERGEABLE failed\n");
+ goto unmap;
+ }
+ break;
+ case KSM_MERGE_NONE:
+ break;
}
/* Run KSM to trigger merging and wait. */
if (ksm_merge()) {
- ksft_test_result_fail("Running KSM failed\n");
+ ksft_print_msg("Running KSM failed\n");
goto unmap;
}
@@ -209,14 +228,31 @@ static char *mmap_and_merge_range(char val, unsigned long size, int prot,
* accounted differently (depending on kernel support).
*/
if (val && !get_my_merging_pages()) {
- ksft_test_result_fail("No pages got merged\n");
+ ksft_print_msg("No pages got merged\n");
goto unmap;
}
return map;
unmap:
munmap(map, size);
- return MAP_FAILED;
+ return err_map;
+}
+
+static char *mmap_and_merge_range(char val, unsigned long size, int prot,
+ enum ksm_merge_mode mode)
+{
+ char *map;
+ char *ret = MAP_FAILED;
+
+ map = __mmap_and_merge_range(val, size, prot, mode);
+ if (map == MAP_MERGE_FAIL)
+ ksft_test_result_fail("Merging memory failed");
+ else if (map == MAP_MERGE_SKIP)
+ ksft_test_result_skip("Merging memory skipped");
+ else
+ ret = map;
+
+ return ret;
}
static void test_unmerge(void)
@@ -226,7 +262,7 @@ static void test_unmerge(void)
ksft_print_msg("[RUN] %s\n", __func__);
- map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false);
+ map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE);
if (map == MAP_FAILED)
return;
@@ -264,7 +300,7 @@ static void test_unmerge_zero_pages(void)
}
/* Let KSM deduplicate zero pages. */
- map = mmap_and_merge_range(0x00, size, PROT_READ | PROT_WRITE, false);
+ map = mmap_and_merge_range(0x00, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE);
if (map == MAP_FAILED)
return;
@@ -312,7 +348,7 @@ static void test_unmerge_discarded(void)
ksft_print_msg("[RUN] %s\n", __func__);
- map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false);
+ map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE);
if (map == MAP_FAILED)
return;
@@ -344,7 +380,7 @@ static void test_unmerge_uffd_wp(void)
ksft_print_msg("[RUN] %s\n", __func__);
- map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false);
+ map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE);
if (map == MAP_FAILED)
return;
@@ -439,6 +475,36 @@ static void test_prctl(void)
ksft_test_result_pass("Setting/clearing PR_SET_MEMORY_MERGE works\n");
}
+static int test_child_ksm(void)
+{
+ const unsigned int size = 2 * MiB;
+ char *map;
+
+ /* Test if KSM is enabled for the process. */
+ if (prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0) != 1)
+ return -1;
+
+ /* Test if merge could really happen. */
+ map = __mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_NONE);
+ if (map == MAP_MERGE_FAIL)
+ return -2;
+ else if (map == MAP_MERGE_SKIP)
+ return -3;
+
+ munmap(map, size);
+ return 0;
+}
+
+static void test_child_ksm_err(int status)
+{
+ if (status == -1)
+ ksft_test_result_fail("unexpected PR_GET_MEMORY_MERGE result in child\n");
+ else if (status == -2)
+ ksft_test_result_fail("Merge in child failed\n");
+ else if (status == -3)
+ ksft_test_result_skip("Merge in child skipped\n");
+}
+
/* Verify that prctl ksm flag is inherited. */
static void test_prctl_fork(void)
{
@@ -458,7 +524,7 @@ static void test_prctl_fork(void)
child_pid = fork();
if (!child_pid) {
- exit(prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0));
+ exit(test_child_ksm());
} else if (child_pid < 0) {
ksft_test_result_fail("fork() failed\n");
return;
@@ -467,8 +533,11 @@ static void test_prctl_fork(void)
if (waitpid(child_pid, &status, 0) < 0) {
ksft_test_result_fail("waitpid() failed\n");
return;
- } else if (WEXITSTATUS(status) != 1) {
- ksft_test_result_fail("unexpected PR_GET_MEMORY_MERGE result in child\n");
+ }
+
+ status = WEXITSTATUS(status);
+ if (status) {
+ test_child_ksm_err(status);
return;
}
@@ -480,12 +549,6 @@ static void test_prctl_fork(void)
ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n");
}
-static int ksm_fork_exec_child(void)
-{
- /* Test if KSM is enabled for the process. */
- return prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0) == 1;
-}
-
static void test_prctl_fork_exec(void)
{
int ret, status;
@@ -518,7 +581,7 @@ static void test_prctl_fork_exec(void)
if (WIFEXITED(status)) {
status = WEXITSTATUS(status);
if (status) {
- ksft_test_result_fail("KSM not enabled\n");
+ test_child_ksm_err(status);
return;
}
} else {
@@ -545,7 +608,7 @@ static void test_prctl_unmerge(void)
ksft_print_msg("[RUN] %s\n", __func__);
- map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, true);
+ map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_PRCTL);
if (map == MAP_FAILED)
return;
@@ -568,7 +631,7 @@ static void test_prot_none(void)
ksft_print_msg("[RUN] %s\n", __func__);
- map = mmap_and_merge_range(0x11, size, PROT_NONE, false);
+ map = mmap_and_merge_range(0x11, size, PROT_NONE, KSM_MERGE_MADVISE);
if (map == MAP_FAILED)
goto unmap;
@@ -599,7 +662,7 @@ int main(int argc, char **argv)
int err;
if (argc > 1 && !strcmp(argv[1], FORK_EXEC_CHILD_PRG_NAME)) {
- exit(ksm_fork_exec_child() == 1 ? 0 : 1);
+ exit(test_child_ksm());
}
#ifdef __NR_userfaultfd
diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c
index 9b298f6a04b371..9a0597310a7651 100644
--- a/tools/testing/selftests/mm/memfd_secret.c
+++ b/tools/testing/selftests/mm/memfd_secret.c
@@ -20,6 +20,7 @@
#include <unistd.h>
#include <errno.h>
#include <stdio.h>
+#include <fcntl.h>
#include "../kselftest.h"
@@ -83,6 +84,45 @@ static void test_mlock_limit(int fd)
pass("mlock limit is respected\n");
}
+static void test_vmsplice(int fd, const char *desc)
+{
+ ssize_t transferred;
+ struct iovec iov;
+ int pipefd[2];
+ char *mem;
+
+ if (pipe(pipefd)) {
+ fail("pipe failed: %s\n", strerror(errno));
+ return;
+ }
+
+ mem = mmap(NULL, page_size, prot, mode, fd, 0);
+ if (mem == MAP_FAILED) {
+ fail("Unable to mmap secret memory\n");
+ goto close_pipe;
+ }
+
+ /*
+ * vmsplice() may use GUP-fast, which must also fail. Prefault the
+ * page table, so GUP-fast could find it.
+ */
+ memset(mem, PATTERN, page_size);
+
+ iov.iov_base = mem;
+ iov.iov_len = page_size;
+ transferred = vmsplice(pipefd[1], &iov, 1, 0);
+
+ if (transferred < 0 && errno == EFAULT)
+ pass("vmsplice is blocked as expected with %s\n", desc);
+ else
+ fail("vmsplice: unexpected memory access with %s\n", desc);
+
+ munmap(mem, page_size);
+close_pipe:
+ close(pipefd[0]);
+ close(pipefd[1]);
+}
+
static void try_process_vm_read(int fd, int pipefd[2])
{
struct iovec liov, riov;
@@ -187,7 +227,6 @@ static void test_remote_access(int fd, const char *name,
return;
}
- ftruncate(fd, page_size);
memset(mem, PATTERN, page_size);
if (write(pipefd[1], &mem, sizeof(mem)) < 0) {
@@ -258,7 +297,7 @@ static void prepare(void)
strerror(errno));
}
-#define NUM_TESTS 4
+#define NUM_TESTS 6
int main(int argc, char *argv[])
{
@@ -277,9 +316,17 @@ int main(int argc, char *argv[])
ksft_exit_fail_msg("memfd_secret failed: %s\n",
strerror(errno));
}
+ if (ftruncate(fd, page_size))
+ ksft_exit_fail_msg("ftruncate failed: %s\n", strerror(errno));
test_mlock_limit(fd);
test_file_apis(fd);
+ /*
+ * We have to run the first vmsplice test before any secretmem page was
+ * allocated for this fd.
+ */
+ test_vmsplice(fd, "fresh page");
+ test_vmsplice(fd, "existing page");
test_process_vm_read(fd);
test_ptrace(fd);
diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c
index 26f744188ad0c8..7f0d50fa361dcf 100644
--- a/tools/testing/selftests/mm/mlock2-tests.c
+++ b/tools/testing/selftests/mm/mlock2-tests.c
@@ -20,8 +20,6 @@ static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
FILE *file;
int ret = 1;
char line[1024] = {0};
- char *end_addr;
- char *stop;
unsigned long start;
unsigned long end;
@@ -37,21 +35,10 @@ static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
memset(area, 0, sizeof(struct vm_boundaries));
while(fgets(line, 1024, file)) {
- end_addr = strchr(line, '-');
- if (!end_addr) {
+ if (sscanf(line, "%lx-%lx", &start, &end) != 2) {
ksft_print_msg("cannot parse /proc/self/maps\n");
goto out;
}
- *end_addr = '\0';
- end_addr++;
- stop = strchr(end_addr, ' ');
- if (!stop) {
- ksft_print_msg("cannot parse /proc/self/maps\n");
- goto out;
- }
-
- sscanf(line, "%lx", &start);
- sscanf(end_addr, "%lx", &end);
if (start <= addr && end > addr) {
area->start = start;
diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c
index 2f8b991f78cb4c..1b03bcfaefdfa2 100644
--- a/tools/testing/selftests/mm/mremap_test.c
+++ b/tools/testing/selftests/mm/mremap_test.c
@@ -23,6 +23,7 @@
#define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */
#define MIN(X, Y) ((X) < (Y) ? (X) : (Y))
+#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#define SIZE_MB(m) ((size_t)m * (1024 * 1024))
#define SIZE_KB(k) ((size_t)k * 1024)
@@ -69,6 +70,27 @@ enum {
.expect_failure = should_fail \
}
+/* compute square root using binary search */
+static unsigned long get_sqrt(unsigned long val)
+{
+ unsigned long low = 1;
+
+ /* assuming rand_size is less than 1TB */
+ unsigned long high = (1UL << 20);
+
+ while (low <= high) {
+ unsigned long mid = low + (high - low) / 2;
+ unsigned long temp = mid * mid;
+
+ if (temp == val)
+ return mid;
+ if (temp < val)
+ low = mid + 1;
+ high = mid - 1;
+ }
+ return low;
+}
+
/*
* Returns false if the requested remap region overlaps with an
* existing mapping (e.g text, stack) else returns true.
@@ -126,19 +148,21 @@ static unsigned long long get_mmap_min_addr(void)
* Using /proc/self/maps, assert that the specified address range is contained
* within a single mapping.
*/
-static bool is_range_mapped(FILE *maps_fp, void *start, void *end)
+static bool is_range_mapped(FILE *maps_fp, unsigned long start,
+ unsigned long end)
{
char *line = NULL;
size_t len = 0;
bool success = false;
+ unsigned long first_val, second_val;
rewind(maps_fp);
while (getline(&line, &len, maps_fp) != -1) {
- char *first = strtok(line, "- ");
- void *first_val = (void *)strtol(first, NULL, 16);
- char *second = strtok(NULL, "- ");
- void *second_val = (void *) strtol(second, NULL, 16);
+ if (sscanf(line, "%lx-%lx", &first_val, &second_val) != 2) {
+ ksft_exit_fail_msg("cannot parse /proc/self/maps\n");
+ break;
+ }
if (first_val <= start && second_val >= end) {
success = true;
@@ -233,7 +257,8 @@ static void mremap_expand_merge(FILE *maps_fp, unsigned long page_size)
goto out;
}
- success = is_range_mapped(maps_fp, start, start + 3 * page_size);
+ success = is_range_mapped(maps_fp, (unsigned long)start,
+ (unsigned long)(start + 3 * page_size));
munmap(start, 3 * page_size);
out:
@@ -272,7 +297,8 @@ static void mremap_expand_merge_offset(FILE *maps_fp, unsigned long page_size)
goto out;
}
- success = is_range_mapped(maps_fp, start, start + 3 * page_size);
+ success = is_range_mapped(maps_fp, (unsigned long)start,
+ (unsigned long)(start + 3 * page_size));
munmap(start, 3 * page_size);
out:
@@ -296,7 +322,7 @@ out:
*
* |DDDDddddSSSSssss|
*/
-static void mremap_move_within_range(char pattern_seed)
+static void mremap_move_within_range(unsigned int pattern_seed, char *rand_addr)
{
char *test_name = "mremap mremap move within range";
void *src, *dest;
@@ -316,10 +342,7 @@ static void mremap_move_within_range(char pattern_seed)
src = (void *)((unsigned long)src & ~(SIZE_MB(2) - 1));
/* Set byte pattern for source block. */
- srand(pattern_seed);
- for (i = 0; i < SIZE_MB(2); i++) {
- ((char *)src)[i] = (char) rand();
- }
+ memcpy(src, rand_addr, SIZE_MB(2));
dest = src - SIZE_MB(2);
@@ -357,14 +380,14 @@ out:
/* Returns the time taken for the remap on success else returns -1. */
static long long remap_region(struct config c, unsigned int threshold_mb,
- char pattern_seed)
+ char *rand_addr)
{
void *addr, *src_addr, *dest_addr, *dest_preamble_addr;
- int d;
- unsigned long long t;
+ unsigned long long t, d;
struct timespec t_start = {0, 0}, t_end = {0, 0};
long long start_ns, end_ns, align_mask, ret, offset;
unsigned long long threshold;
+ unsigned long num_chunks;
if (threshold_mb == VALIDATION_NO_THRESHOLD)
threshold = c.region_size;
@@ -378,9 +401,7 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
}
/* Set byte pattern for source block. */
- srand(pattern_seed);
- for (t = 0; t < threshold; t++)
- memset((char *) src_addr + t, (char) rand(), 1);
+ memcpy(src_addr, rand_addr, threshold);
/* Mask to zero out lower bits of address for alignment */
align_mask = ~(c.dest_alignment - 1);
@@ -420,9 +441,7 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
}
/* Set byte pattern for the dest preamble block. */
- srand(pattern_seed);
- for (d = 0; d < c.dest_preamble_size; d++)
- memset((char *) dest_preamble_addr + d, (char) rand(), 1);
+ memcpy(dest_preamble_addr, rand_addr, c.dest_preamble_size);
}
clock_gettime(CLOCK_MONOTONIC, &t_start);
@@ -436,15 +455,42 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
goto clean_up_dest_preamble;
}
- /* Verify byte pattern after remapping */
- srand(pattern_seed);
- for (t = 0; t < threshold; t++) {
- char c = (char) rand();
+ /*
+ * Verify byte pattern after remapping. Employ an algorithm with a
+ * square root time complexity in threshold: divide the range into
+ * chunks, if memcmp() returns non-zero, only then perform an
+ * iteration in that chunk to find the mismatch index.
+ */
+ num_chunks = get_sqrt(threshold);
+ for (unsigned long i = 0; i < num_chunks; ++i) {
+ size_t chunk_size = threshold / num_chunks;
+ unsigned long shift = i * chunk_size;
+
+ if (!memcmp(dest_addr + shift, rand_addr + shift, chunk_size))
+ continue;
+
+ /* brute force iteration only over mismatch segment */
+ for (t = shift; t < shift + chunk_size; ++t) {
+ if (((char *) dest_addr)[t] != rand_addr[t]) {
+ ksft_print_msg("Data after remap doesn't match at offset %llu\n",
+ t);
+ ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff,
+ ((char *) dest_addr)[t] & 0xff);
+ ret = -1;
+ goto clean_up_dest;
+ }
+ }
+ }
- if (((char *) dest_addr)[t] != c) {
+ /*
+ * if threshold is not divisible by num_chunks, then check the
+ * last chunk
+ */
+ for (t = num_chunks * (threshold / num_chunks); t < threshold; ++t) {
+ if (((char *) dest_addr)[t] != rand_addr[t]) {
ksft_print_msg("Data after remap doesn't match at offset %llu\n",
- t);
- ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff,
+ t);
+ ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff,
((char *) dest_addr)[t] & 0xff);
ret = -1;
goto clean_up_dest;
@@ -452,22 +498,44 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
}
/* Verify the dest preamble byte pattern after remapping */
- if (c.dest_preamble_size) {
- srand(pattern_seed);
- for (d = 0; d < c.dest_preamble_size; d++) {
- char c = (char) rand();
-
- if (((char *) dest_preamble_addr)[d] != c) {
- ksft_print_msg("Preamble data after remap doesn't match at offset %d\n",
- d);
- ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff,
- ((char *) dest_preamble_addr)[d] & 0xff);
+ if (!c.dest_preamble_size)
+ goto no_preamble;
+
+ num_chunks = get_sqrt(c.dest_preamble_size);
+
+ for (unsigned long i = 0; i < num_chunks; ++i) {
+ size_t chunk_size = c.dest_preamble_size / num_chunks;
+ unsigned long shift = i * chunk_size;
+
+ if (!memcmp(dest_preamble_addr + shift, rand_addr + shift,
+ chunk_size))
+ continue;
+
+ /* brute force iteration only over mismatched segment */
+ for (d = shift; d < shift + chunk_size; ++d) {
+ if (((char *) dest_preamble_addr)[d] != rand_addr[d]) {
+ ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n",
+ d);
+ ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff,
+ ((char *) dest_preamble_addr)[d] & 0xff);
ret = -1;
goto clean_up_dest;
}
}
}
+ for (d = num_chunks * (c.dest_preamble_size / num_chunks); d < c.dest_preamble_size; ++d) {
+ if (((char *) dest_preamble_addr)[d] != rand_addr[d]) {
+ ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n",
+ d);
+ ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff,
+ ((char *) dest_preamble_addr)[d] & 0xff);
+ ret = -1;
+ goto clean_up_dest;
+ }
+ }
+
+no_preamble:
start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec;
end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec;
ret = end_ns - start_ns;
@@ -494,7 +562,8 @@ out:
* the beginning of the mapping just because the aligned
* down address landed on a mapping that maybe does not exist.
*/
-static void mremap_move_1mb_from_start(char pattern_seed)
+static void mremap_move_1mb_from_start(unsigned int pattern_seed,
+ char *rand_addr)
{
char *test_name = "mremap move 1mb from start at 1MB+256KB aligned src";
void *src = NULL, *dest = NULL;
@@ -520,10 +589,7 @@ static void mremap_move_1mb_from_start(char pattern_seed)
}
/* Set byte pattern for source block. */
- srand(pattern_seed);
- for (i = 0; i < SIZE_MB(2); i++) {
- ((char *)src)[i] = (char) rand();
- }
+ memcpy(src, rand_addr, SIZE_MB(2));
/*
* Unmap the beginning of dest so that the aligned address
@@ -568,10 +634,10 @@ out:
static void run_mremap_test_case(struct test test_case, int *failures,
unsigned int threshold_mb,
- unsigned int pattern_seed)
+ unsigned int pattern_seed, char *rand_addr)
{
long long remap_time = remap_region(test_case.config, threshold_mb,
- pattern_seed);
+ rand_addr);
if (remap_time < 0) {
if (test_case.expect_failure)
@@ -642,7 +708,15 @@ int main(int argc, char **argv)
int failures = 0;
int i, run_perf_tests;
unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD;
+
+ /* hard-coded test configs */
+ size_t max_test_variable_region_size = _2GB;
+ size_t max_test_constant_region_size = _2MB;
+ size_t dest_preamble_size = 10 * _4MB;
+
unsigned int pattern_seed;
+ char *rand_addr;
+ size_t rand_size;
int num_expand_tests = 2;
int num_misc_tests = 2;
struct test test_cases[MAX_TEST] = {};
@@ -659,6 +733,31 @@ int main(int argc, char **argv)
ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n",
threshold_mb, pattern_seed);
+ /*
+ * set preallocated random array according to test configs; see the
+ * functions for the logic of setting the size
+ */
+ if (!threshold_mb)
+ rand_size = MAX(max_test_variable_region_size,
+ max_test_constant_region_size);
+ else
+ rand_size = MAX(MIN(threshold_mb * _1MB,
+ max_test_variable_region_size),
+ max_test_constant_region_size);
+ rand_size = MAX(dest_preamble_size, rand_size);
+
+ rand_addr = (char *)mmap(NULL, rand_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (rand_addr == MAP_FAILED) {
+ perror("mmap");
+ ksft_exit_fail_msg("cannot mmap rand_addr\n");
+ }
+
+ /* fill stream of random bytes */
+ srand(pattern_seed);
+ for (unsigned long i = 0; i < rand_size; ++i)
+ rand_addr[i] = (char) rand();
+
page_size = sysconf(_SC_PAGESIZE);
/* Expected mremap failures */
@@ -730,13 +829,13 @@ int main(int argc, char **argv)
for (i = 0; i < ARRAY_SIZE(test_cases); i++)
run_mremap_test_case(test_cases[i], &failures, threshold_mb,
- pattern_seed);
+ pattern_seed, rand_addr);
maps_fp = fopen("/proc/self/maps", "r");
if (maps_fp == NULL) {
- ksft_print_msg("Failed to read /proc/self/maps: %s\n", strerror(errno));
- exit(KSFT_FAIL);
+ munmap(rand_addr, rand_size);
+ ksft_exit_fail_msg("Failed to read /proc/self/maps: %s\n", strerror(errno));
}
mremap_expand_merge(maps_fp, page_size);
@@ -744,17 +843,20 @@ int main(int argc, char **argv)
fclose(maps_fp);
- mremap_move_within_range(pattern_seed);
- mremap_move_1mb_from_start(pattern_seed);
+ mremap_move_within_range(pattern_seed, rand_addr);
+ mremap_move_1mb_from_start(pattern_seed, rand_addr);
if (run_perf_tests) {
ksft_print_msg("\n%s\n",
"mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:");
for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++)
run_mremap_test_case(perf_test_cases[i], &failures,
- threshold_mb, pattern_seed);
+ threshold_mb, pattern_seed,
+ rand_addr);
}
+ munmap(rand_addr, rand_size);
+
if (failures > 0)
ksft_exit_fail();
else
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 4bdb3a0c7a606e..3157204b90476b 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -152,9 +152,13 @@ done < /proc/meminfo
# both of these requirements into account and attempt to increase
# number of huge pages available.
nr_cpus=$(nproc)
-hpgsize_MB=$((hpgsize_KB / 1024))
-half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128))
-needmem_KB=$((half_ufd_size_MB * 2 * 1024))
+uffd_min_KB=$((hpgsize_KB * nr_cpus * 2))
+hugetlb_min_KB=$((256 * 1024))
+if [[ $uffd_min_KB -gt $hugetlb_min_KB ]]; then
+ needmem_KB=$uffd_min_KB
+else
+ needmem_KB=$hugetlb_min_KB
+fi
# set proper nr_hugepages
if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then
@@ -294,7 +298,8 @@ CATEGORY="userfaultfd" run_test ./uffd-unit-tests
uffd_stress_bin=./uffd-stress
CATEGORY="userfaultfd" run_test ${uffd_stress_bin} anon 20 16
# Hugetlb tests require source and destination huge pages. Pass in half
-# the size ($half_ufd_size_MB), which is used for *each*.
+# the size of the free pages we have, which is used for *each*.
+half_ufd_size_MB=$((freepgs / 2))
CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb "$half_ufd_size_MB" 32
CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb-private "$half_ufd_size_MB" 32
CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem 20 16
diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c
index 7bcf8d48256a66..4e4c1e311247ff 100644
--- a/tools/testing/selftests/mm/virtual_address_range.c
+++ b/tools/testing/selftests/mm/virtual_address_range.c
@@ -12,6 +12,8 @@
#include <errno.h>
#include <sys/mman.h>
#include <sys/time.h>
+#include <fcntl.h>
+
#include "../kselftest.h"
/*
@@ -85,7 +87,7 @@ static int validate_lower_address_hint(void)
char *ptr;
ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ |
- PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (ptr == MAP_FAILED)
return 0;
@@ -93,6 +95,66 @@ static int validate_lower_address_hint(void)
return 1;
}
+static int validate_complete_va_space(void)
+{
+ unsigned long start_addr, end_addr, prev_end_addr;
+ char line[400];
+ char prot[6];
+ FILE *file;
+ int fd;
+
+ fd = open("va_dump", O_CREAT | O_WRONLY, 0600);
+ unlink("va_dump");
+ if (fd < 0) {
+ ksft_test_result_skip("cannot create or open dump file\n");
+ ksft_finished();
+ }
+
+ file = fopen("/proc/self/maps", "r");
+ if (file == NULL)
+ ksft_exit_fail_msg("cannot open /proc/self/maps\n");
+
+ prev_end_addr = 0;
+ while (fgets(line, sizeof(line), file)) {
+ unsigned long hop;
+
+ if (sscanf(line, "%lx-%lx %s[rwxp-]",
+ &start_addr, &end_addr, prot) != 3)
+ ksft_exit_fail_msg("cannot parse /proc/self/maps\n");
+
+ /* end of userspace mappings; ignore vsyscall mapping */
+ if (start_addr & (1UL << 63))
+ return 0;
+
+ /* /proc/self/maps must have gaps less than MAP_CHUNK_SIZE */
+ if (start_addr - prev_end_addr >= MAP_CHUNK_SIZE)
+ return 1;
+
+ prev_end_addr = end_addr;
+
+ if (prot[0] != 'r')
+ continue;
+
+ /*
+ * Confirm whether MAP_CHUNK_SIZE chunk can be found or not.
+ * If write succeeds, no need to check MAP_CHUNK_SIZE - 1
+ * addresses after that. If the address was not held by this
+ * process, write would fail with errno set to EFAULT.
+ * Anyways, if write returns anything apart from 1, exit the
+ * program since that would mean a bug in /proc/self/maps.
+ */
+ hop = 0;
+ while (start_addr + hop < end_addr) {
+ if (write(fd, (void *)(start_addr + hop), 1) != 1)
+ return 1;
+ lseek(fd, 0, SEEK_SET);
+
+ hop += MAP_CHUNK_SIZE;
+ }
+ }
+ return 0;
+}
+
int main(int argc, char *argv[])
{
char *ptr[NR_CHUNKS_LOW];
@@ -105,13 +167,11 @@ int main(int argc, char *argv[])
for (i = 0; i < NR_CHUNKS_LOW; i++) {
ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (ptr[i] == MAP_FAILED) {
- if (validate_lower_address_hint()) {
- ksft_test_result_skip("Memory constraint not fulfilled\n");
- ksft_finished();
- }
+ if (validate_lower_address_hint())
+ ksft_exit_fail_msg("mmap unexpectedly succeeded with hint\n");
break;
}
@@ -127,7 +187,7 @@ int main(int argc, char *argv[])
for (i = 0; i < NR_CHUNKS_HIGH; i++) {
hint = hind_addr();
hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (hptr[i] == MAP_FAILED)
break;
@@ -135,6 +195,10 @@ int main(int argc, char *argv[])
validate_addr(hptr[i], 1);
}
hchunks = i;
+ if (validate_complete_va_space()) {
+ ksft_test_result_fail("BUG in mmap() or /proc/self/maps\n");
+ ksft_finished();
+ }
for (i = 0; i < lchunks; i++)
munmap(ptr[i], MAP_CHUNK_SIZE);
diff --git a/tools/testing/selftests/x86/test_shadow_stack.c b/tools/testing/selftests/x86/test_shadow_stack.c
index 757e6527f67e2a..ee909a7927f9d9 100644
--- a/tools/testing/selftests/x86/test_shadow_stack.c
+++ b/tools/testing/selftests/x86/test_shadow_stack.c
@@ -556,7 +556,7 @@ struct node {
* looked at the shadow stack gaps.
* 5. See if it landed in the gap.
*/
-int test_guard_gap(void)
+int test_guard_gap_other_gaps(void)
{
void *free_area, *shstk, *test_map = (void *)0xFFFFFFFFFFFFFFFF;
struct node *head = NULL, *cur;
@@ -593,11 +593,64 @@ int test_guard_gap(void)
if (shstk - test_map - PAGE_SIZE != PAGE_SIZE)
return 1;
- printf("[OK]\tGuard gap test\n");
+ printf("[OK]\tGuard gap test, other mapping's gaps\n");
return 0;
}
+/* Tests respecting the guard gap of the mapping getting placed */
+int test_guard_gap_new_mappings_gaps(void)
+{
+ void *free_area, *shstk_start, *test_map = (void *)0xFFFFFFFFFFFFFFFF;
+ struct node *head = NULL, *cur;
+ int ret = 0;
+
+ free_area = mmap(0, PAGE_SIZE * 4, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ munmap(free_area, PAGE_SIZE * 4);
+
+ /* Test letting map_shadow_stack find a free space */
+ shstk_start = mmap(free_area, PAGE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (shstk_start == MAP_FAILED || shstk_start != free_area)
+ return 1;
+
+ while (test_map > shstk_start) {
+ test_map = (void *)syscall(__NR_map_shadow_stack, 0, PAGE_SIZE, 0);
+ if (test_map == MAP_FAILED) {
+ printf("[INFO]\tmap_shadow_stack MAP_FAILED\n");
+ ret = 1;
+ break;
+ }
+
+ cur = malloc(sizeof(*cur));
+ cur->mapping = test_map;
+
+ cur->next = head;
+ head = cur;
+
+ if (test_map == free_area + PAGE_SIZE) {
+ printf("[INFO]\tNew mapping has other mapping in guard gap!\n");
+ ret = 1;
+ break;
+ }
+ }
+
+ while (head) {
+ cur = head;
+ head = cur->next;
+ munmap(cur->mapping, PAGE_SIZE);
+ free(cur);
+ }
+
+ munmap(shstk_start, PAGE_SIZE);
+
+ if (!ret)
+ printf("[OK]\tGuard gap test, placement mapping's gaps\n");
+
+ return ret;
+}
+
/*
* Too complicated to pull it out of the 32 bit header, but also get the
* 64 bit one needed above. Just define a copy here.
@@ -850,9 +903,15 @@ int main(int argc, char *argv[])
goto out;
}
- if (test_guard_gap()) {
+ if (test_guard_gap_other_gaps()) {
+ ret = 1;
+ printf("[FAIL]\tGuard gap test, other mappings' gaps\n");
+ goto out;
+ }
+
+ if (test_guard_gap_new_mappings_gaps()) {
ret = 1;
- printf("[FAIL]\tGuard gap test\n");
+ printf("[FAIL]\tGuard gap test, placement mapping's gaps\n");
goto out;
}