author     Stephen Rothwell <sfr@canb.auug.org.au>  2021-09-30 16:17:50 +1000
committer  Stephen Rothwell <sfr@canb.auug.org.au>  2021-09-30 16:17:50 +1000
commit     aa09b21d0c08b8964b5f5a9b9d78f89893d1ce06 (patch)
tree       a82f46e1bafb2f04c3addf2a8daad317301c8011
parent     27d44be8c29cee0b5dc998219f36e06ca409c946 (diff)
parent     c7f6f067c98b785c46f6cc9d7d3e21080bdf505d (diff)
download   devel-aa09b21d0c08b8964b5f5a9b9d78f89893d1ce06.tar.gz
Merge branch 'akpm-current/current'
-rw-r--r--  Documentation/admin-guide/blockdev/zram.rst | 8
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 8
-rw-r--r--  Documentation/admin-guide/mm/hugetlbpage.rst | 42
-rw-r--r--  Documentation/admin-guide/mm/index.rst | 2
-rw-r--r--  Documentation/admin-guide/mm/swap_numa.rst (renamed from Documentation/vm/swap_numa.rst) | 0
-rw-r--r--  Documentation/admin-guide/mm/zswap.rst (renamed from Documentation/vm/zswap.rst) | 0
-rw-r--r--  Documentation/dev-tools/kcov.rst | 5
-rw-r--r--  Documentation/dev-tools/kfence.rst | 11
-rw-r--r--  Documentation/vm/damon/index.rst | 1
-rw-r--r--  Documentation/vm/index.rst | 26
-rw-r--r--  Documentation/vm/page_owner.rst | 23
-rw-r--r--  MAINTAINERS | 2
-rw-r--r--  Makefile | 15
-rw-r--r--  arch/Kconfig | 28
-rw-r--r--  arch/arm/mach-rpc/ecard.c | 2
-rw-r--r--  arch/powerpc/Kconfig | 1
-rw-r--r--  arch/powerpc/configs/skiroot_defconfig | 1
-rw-r--r--  arch/powerpc/kernel/smp.c | 2
-rw-r--r--  arch/powerpc/mm/book3s64/radix_tlb.c | 4
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c | 9
-rw-r--r--  drivers/block/zram/zram_drv.c | 65
-rw-r--r--  drivers/gpu/drm/drm_dp_mst_topology.c | 5
-rw-r--r--  drivers/gpu/drm/drm_mm.c | 5
-rw-r--r--  drivers/gpu/drm/i915/i915_vma.c | 5
-rw-r--r--  drivers/gpu/drm/i915/intel_runtime_pm.c | 20
-rw-r--r--  drivers/mmc/core/mmc_test.c | 1
-rw-r--r--  drivers/of/kexec.c | 1
-rw-r--r--  drivers/rapidio/devices/rio_mport_cdev.c | 9
-rw-r--r--  drivers/virtio/Kconfig | 1
-rw-r--r--  drivers/virtio/virtio_mem.c | 4
-rw-r--r--  fs/binfmt_elf.c | 31
-rw-r--r--  fs/buffer.c | 56
-rw-r--r--  fs/coda/cnode.c | 13
-rw-r--r--  fs/coda/coda_linux.c | 39
-rw-r--r--  fs/coda/coda_linux.h | 6
-rw-r--r--  fs/coda/dir.c | 20
-rw-r--r--  fs/coda/file.c | 12
-rw-r--r--  fs/coda/psdev.c | 14
-rw-r--r--  fs/coda/upcall.c | 3
-rw-r--r--  fs/exec.c | 4
-rw-r--r--  fs/hfs/inode.c | 6
-rw-r--r--  fs/hfsplus/catalog.c | 16
-rw-r--r--  fs/hfsplus/dir.c | 4
-rw-r--r--  fs/hfsplus/hfsplus_raw.h | 12
-rw-r--r--  fs/hfsplus/inode.c | 12
-rw-r--r--  fs/hfsplus/xattr.c | 18
-rw-r--r--  fs/hugetlbfs/inode.c | 14
-rw-r--r--  fs/inode.c | 46
-rw-r--r--  fs/internal.h | 1
-rw-r--r--  fs/ocfs2/alloc.c | 2
-rw-r--r--  fs/ocfs2/namei.c | 49
-rw-r--r--  fs/ocfs2/namei.h | 2
-rw-r--r--  fs/ocfs2/refcounttree.c | 15
-rw-r--r--  fs/ocfs2/xattr.c | 12
-rw-r--r--  fs/ocfs2/xattr.h | 1
-rw-r--r--  fs/posix_acl.c | 3
-rw-r--r--  fs/proc/fd.c | 23
-rw-r--r--  fs/proc/page.c | 40
-rw-r--r--  fs/proc/task_mmu.c | 28
-rw-r--r--  fs/ramfs/inode.c | 11
-rw-r--r--  fs/sysv/super.c | 6
-rw-r--r--  fs/userfaultfd.c | 12
-rw-r--r--  include/linux/compiler-gcc.h | 9
-rw-r--r--  include/linux/compiler_types.h | 5
-rw-r--r--  include/linux/cpuset.h | 17
-rw-r--r--  include/linux/damon.h | 6
-rw-r--r--  include/linux/fs.h | 1
-rw-r--r--  include/linux/gfp.h | 2
-rw-r--r--  include/linux/highmem.h | 28
-rw-r--r--  include/linux/hugetlb.h | 24
-rw-r--r--  include/linux/kasan.h | 2
-rw-r--r--  include/linux/mempolicy.h | 6
-rw-r--r--  include/linux/memremap.h | 6
-rw-r--r--  include/linux/migrate.h | 19
-rw-r--r--  include/linux/migrate_mode.h | 13
-rw-r--r--  include/linux/mm.h | 54
-rw-r--r--  include/linux/mmzone.h | 26
-rw-r--r--  include/linux/page-flags.h | 2
-rw-r--r--  include/linux/pagemap.h | 50
-rw-r--r--  include/linux/percpu.h | 3
-rw-r--r--  include/linux/rmap.h | 8
-rw-r--r--  include/linux/sched.h | 8
-rw-r--r--  include/linux/sched/mm.h | 21
-rw-r--r--  include/linux/slab.h | 120
-rw-r--r--  include/linux/stackdepot.h | 11
-rw-r--r--  include/linux/stacktrace.h | 1
-rw-r--r--  include/linux/swap.h | 1
-rw-r--r--  include/linux/vmalloc.h | 13
-rw-r--r--  init/main.c | 25
-rw-r--r--  ipc/ipc_sysctl.c | 32
-rw-r--r--  kernel/cgroup/cpuset.c | 23
-rw-r--r--  kernel/cpu.c | 2
-rw-r--r--  kernel/exit.c | 2
-rw-r--r--  kernel/fork.c | 60
-rw-r--r--  kernel/hung_task.c | 44
-rw-r--r--  kernel/kcov.c | 36
-rw-r--r--  kernel/kthread.c | 21
-rw-r--r--  kernel/resource.c | 54
-rw-r--r--  kernel/sched/core.c | 35
-rw-r--r--  kernel/sched/sched.h | 4
-rw-r--r--  kernel/sched/topology.c | 1
-rw-r--r--  kernel/stacktrace.c | 30
-rw-r--r--  kernel/sysctl.c | 8
-rw-r--r--  kernel/tsacct.c | 2
-rw-r--r--  kernel/workqueue.c | 2
-rw-r--r--  lib/Kconfig.debug | 6
-rw-r--r--  lib/stackdepot.c | 118
-rw-r--r--  lib/test_kasan.c | 18
-rw-r--r--  mm/Kconfig | 10
-rw-r--r--  mm/damon/Kconfig | 2
-rw-r--r--  mm/damon/core.c | 10
-rw-r--r--  mm/debug.c | 20
-rw-r--r--  mm/debug_vm_pgtable.c | 7
-rw-r--r--  mm/filemap.c | 20
-rw-r--r--  mm/gup.c | 5
-rw-r--r--  mm/hugetlb.c | 491
-rw-r--r--  mm/internal.h | 1
-rw-r--r--  mm/kasan/common.c | 6
-rw-r--r--  mm/kasan/generic.c | 14
-rw-r--r--  mm/kasan/kasan.h | 2
-rw-r--r--  mm/kasan/report.c | 15
-rw-r--r--  mm/kfence/core.c | 156
-rw-r--r--  mm/kfence/kfence.h | 2
-rw-r--r--  mm/kfence/kfence_test.c | 14
-rw-r--r--  mm/memcontrol.c | 31
-rw-r--r--  mm/memory-failure.c | 1
-rw-r--r--  mm/memory.c | 162
-rw-r--r--  mm/memory_hotplug.c | 8
-rw-r--r--  mm/mempolicy.c | 12
-rw-r--r--  mm/memremap.c | 20
-rw-r--r--  mm/migrate.c | 58
-rw-r--r--  mm/mmap.c | 2
-rw-r--r--  mm/mremap.c | 50
-rw-r--r--  mm/nommu.c | 6
-rw-r--r--  mm/oom_kill.c | 9
-rw-r--r--  mm/page_alloc.c | 70
-rw-r--r--  mm/page_isolation.c | 29
-rw-r--r--  mm/page_owner.c | 18
-rw-r--r--  mm/rmap.c | 14
-rw-r--r--  mm/shmem.c | 6
-rw-r--r--  mm/slab.c | 13
-rw-r--r--  mm/slab_common.c | 8
-rw-r--r--  mm/slub.c | 51
-rw-r--r--  mm/sparse-vmemmap.c | 2
-rw-r--r--  mm/swapfile.c | 2
-rw-r--r--  mm/truncate.c | 19
-rw-r--r--  mm/userfaultfd.c | 3
-rw-r--r--  mm/vmalloc.c | 47
-rw-r--r--  mm/vmscan.c | 10
-rw-r--r--  mm/vmstat.c | 73
-rw-r--r--  mm/workingset.c | 10
-rw-r--r--  mm/zsmalloc.c | 10
-rw-r--r--  net/ipv4/tcp.c | 1
-rw-r--r--  net/ipv4/udp.c | 1
-rw-r--r--  net/netfilter/ipvs/ip_vs_ctl.c | 1
-rw-r--r--  net/openvswitch/meter.c | 1
-rw-r--r--  net/sctp/protocol.c | 1
-rwxr-xr-x  scripts/checkpatch.pl | 3
-rw-r--r--  scripts/const_structs.checkpatch | 4
-rw-r--r--  scripts/spelling.txt | 16
-rw-r--r--  security/Kconfig | 14
-rw-r--r--  tools/testing/selftests/vm/ksm_tests.c | 29
-rw-r--r--  tools/testing/selftests/vm/userfaultfd.c | 234
-rw-r--r--  tools/vm/page_owner_sort.c | 94
164 files changed, 2560 insertions, 1174 deletions
diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst
index 700329d25f5799..3e11926a4df95d 100644
--- a/Documentation/admin-guide/blockdev/zram.rst
+++ b/Documentation/admin-guide/blockdev/zram.rst
@@ -328,6 +328,14 @@ as idle::
From now on, any pages on zram are idle pages. The idle mark
will be removed until someone requests access of the block.
IOW, unless there is access request, those pages are still idle pages.
+Additionally, when CONFIG_ZRAM_MEMORY_TRACKING is enabled pages can be
+marked as idle based on how long (in seconds) it's been since they were
+last accessed::
+
+ echo 86400 > /sys/block/zramX/idle
+
+In this example all pages which haven't been accessed in more than 86400
+seconds (one day) will be marked idle.
Admin can request writeback of those idle pages at right timing via::
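
For illustration only, a small userspace sketch of the age-based idle marking described
above (it assumes a zram0 device on a kernel built with CONFIG_ZRAM_MEMORY_TRACKING; the
helper name is hypothetical)::

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  /* Hypothetical helper: mark zram0 pages idle if untouched for age_sec seconds. */
  static int zram_mark_idle_older_than(unsigned long age_sec)
  {
          char buf[32];
          int fd, len, ret = 0;

          fd = open("/sys/block/zram0/idle", O_WRONLY);
          if (fd < 0)
                  return -1;
          /* A plain number is only accepted with CONFIG_ZRAM_MEMORY_TRACKING;
           * writing "all" marks every page idle regardless of age. */
          len = snprintf(buf, sizeof(buf), "%lu", age_sec);
          if (write(fd, buf, len) != len)
                  ret = -1;
          close(fd);
          return ret;
  }

  int main(void)
  {
          /* 86400s: pages not accessed within the last day become idle. */
          return zram_mark_idle_older_than(86400) ? 1 : 0;
  }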
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 316027c3aadce5..13fb04ae51263b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1599,9 +1599,11 @@
the number of pages of hugepagesz to be allocated.
If this is the first HugeTLB parameter on the command
line, it specifies the number of pages to allocate for
- the default huge page size. See also
- Documentation/admin-guide/mm/hugetlbpage.rst.
- Format: <integer>
+ the default huge page size. If using node format, the
+ number of pages to allocate per-node can be specified.
+ See also Documentation/admin-guide/mm/hugetlbpage.rst.
+ Format: <integer> or (node format)
+ <node>:<integer>[,<node>:<integer>]
hugepagesz=
[HW] The size of the HugeTLB pages. This is used in
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index 8abaeb144e44dd..d5c1c646e7447c 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -128,7 +128,9 @@ hugepages
implicitly specifies the number of huge pages of default size to
allocate. If the number of huge pages of default size is implicitly
specified, it can not be overwritten by a hugepagesz,hugepages
- parameter pair for the default size.
+ parameter pair for the default size. This parameter also has a
+ node format. The node format specifies the number of huge pages
+ to allocate on specific nodes.
For example, on an architecture with 2M default huge page size::
@@ -138,6 +140,14 @@ hugepages
indicating that the hugepages=512 parameter is ignored. If a hugepages
parameter is preceded by an invalid hugepagesz parameter, it will
be ignored.
+
+ Node format example::
+
+ hugepagesz=2M hugepages=0:1,1:2
+
+ It will allocate 1 2M hugepage on node0 and 2 2M hugepages on node1.
+ If the node number is invalid, the parameter will be ignored.
+
default_hugepagesz
Specify the default huge page size. This parameter can
only be specified once on the command line. default_hugepagesz can
@@ -234,8 +244,12 @@ will exist, of the form::
hugepages-${size}kB
-Inside each of these directories, the same set of files will exist::
+Inside each of these directories, the set of files contained in ``/proc``
+will exist. In addition, two additional interfaces for demoting huge
+pages may exist::
+ demote
+ demote_size
nr_hugepages
nr_hugepages_mempolicy
nr_overcommit_hugepages
@@ -243,7 +257,29 @@ Inside each of these directories, the same set of files will exist::
resv_hugepages
surplus_hugepages
-which function as described above for the default huge page-sized case.
+The demote interfaces provide the ability to split a huge page into
+smaller huge pages. For example, the x86 architecture supports both
+1GB and 2MB huge pages sizes. A 1GB huge page can be split into 512
+2MB huge pages. Demote interfaces are not available for the smallest
+huge page size. The demote interfaces are:
+
+demote_size
+ is the size of demoted pages. When a page is demoted a corresponding
+ number of huge pages of demote_size will be created. By default,
+ demote_size is set to the next smaller huge page size. If there are
+ multiple smaller huge page sizes, demote_size can be set to any of
+ these smaller sizes. Only huge page sizes less than the current huge
+ page size are allowed.
+
+demote
+ is used to demote a number of huge pages. A user with root privileges
+ can write to this file. It may not be possible to demote the
+ requested number of huge pages. To determine how many pages were
+ actually demoted, compare the value of nr_hugepages before and after
+ writing to the demote interface. demote is a write only interface.
+
+The interfaces which are the same as in ``/proc`` (all except demote and
+demote_size) function as described above for the default huge page-sized case.
.. _mem_policy_and_hp_alloc:
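
As a rough usage sketch of the demote interfaces described above (the paths assume the
1GB hstate on x86_64 and root privileges; compare nr_hugepages before and after the write
to see how many pages were actually demoted)::

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  #define HSTATE "/sys/kernel/mm/hugepages/hugepages-1048576kB/"

  int main(void)
  {
          char buf[64];
          ssize_t n;
          int fd;

          /* demote_size defaults to the next smaller supported huge page size. */
          fd = open(HSTATE "demote_size", O_RDONLY);
          if (fd >= 0) {
                  n = read(fd, buf, sizeof(buf) - 1);
                  if (n > 0) {
                          buf[n] = '\0';
                          printf("demote_size: %s", buf);
                  }
                  close(fd);
          }

          /* Request demotion of one 1GB page into demote_size pages. */
          fd = open(HSTATE "demote", O_WRONLY);
          if (fd < 0)
                  return 1;
          n = snprintf(buf, sizeof(buf), "1");
          if (write(fd, buf, n) != n) {
                  close(fd);
                  return 1;
          }
          close(fd);
          return 0;
  }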
diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst
index cbd19d5e625f38..c21b5823f12619 100644
--- a/Documentation/admin-guide/mm/index.rst
+++ b/Documentation/admin-guide/mm/index.rst
@@ -37,5 +37,7 @@ the Linux memory management.
numaperf
pagemap
soft-dirty
+ swap_numa
transhuge
userfaultfd
+ zswap
diff --git a/Documentation/vm/swap_numa.rst b/Documentation/admin-guide/mm/swap_numa.rst
index e0466f2db8fa05..e0466f2db8fa05 100644
--- a/Documentation/vm/swap_numa.rst
+++ b/Documentation/admin-guide/mm/swap_numa.rst
diff --git a/Documentation/vm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst
index 8edb8d578caf7e..8edb8d578caf7e 100644
--- a/Documentation/vm/zswap.rst
+++ b/Documentation/admin-guide/mm/zswap.rst
diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst
index d2c4c27e1702dc..d83c9ab494275e 100644
--- a/Documentation/dev-tools/kcov.rst
+++ b/Documentation/dev-tools/kcov.rst
@@ -50,6 +50,7 @@ program using kcov:
#include <sys/mman.h>
#include <unistd.h>
#include <fcntl.h>
+ #include <linux/types.h>
#define KCOV_INIT_TRACE _IOR('c', 1, unsigned long)
#define KCOV_ENABLE _IO('c', 100)
@@ -177,6 +178,8 @@ Comparison operands collection is similar to coverage collection:
/* Read number of comparisons collected. */
n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
for (i = 0; i < n; i++) {
+ uint64_t ip;
+
type = cover[i * KCOV_WORDS_PER_CMP + 1];
/* arg1 and arg2 - operands of the comparison. */
arg1 = cover[i * KCOV_WORDS_PER_CMP + 2];
@@ -251,6 +254,8 @@ selectively from different subsystems.
.. code-block:: c
+ /* Same includes and defines as above. */
+
struct kcov_remote_arg {
__u32 trace_mode;
__u32 area_size;
diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst
index 0fbe3308bf37ff..d45f952986ae4a 100644
--- a/Documentation/dev-tools/kfence.rst
+++ b/Documentation/dev-tools/kfence.rst
@@ -269,6 +269,17 @@ tail of KFENCE's freelist, so that the least recently freed objects are reused
first, and the chances of detecting use-after-frees of recently freed objects
is increased.
+If pool utilization reaches 75% (default) or above, to reduce the risk of the
+pool eventually being fully occupied by allocated objects yet ensure diverse
+coverage of allocations, KFENCE limits currently covered allocations of the
+same source from further filling up the pool. The "source" of an allocation is
+based on its partial allocation stack trace. A side-effect is that this also
+limits frequent long-lived allocations (e.g. pagecache) of the same source
+filling up the pool permanently, which is the most common risk for the pool
+becoming full and the sampled allocation rate dropping to zero. The threshold
+at which to start limiting currently covered allocations can be configured via
+the boot parameter ``kfence.skip_covered_thresh`` (pool usage%).
+
Interface
---------
diff --git a/Documentation/vm/damon/index.rst b/Documentation/vm/damon/index.rst
index a2858baf3bf1df..48c0bbff98b2fc 100644
--- a/Documentation/vm/damon/index.rst
+++ b/Documentation/vm/damon/index.rst
@@ -27,4 +27,3 @@ workloads and systems.
faq
design
api
- plans
diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst
index b51f0d8992f8fd..6f5ffef4b716a9 100644
--- a/Documentation/vm/index.rst
+++ b/Documentation/vm/index.rst
@@ -3,27 +3,11 @@ Linux Memory Management Documentation
=====================================
This is a collection of documents about the Linux memory management (mm)
-subsystem. If you are looking for advice on simply allocating memory,
-see the :ref:`memory_allocation`.
-
-User guides for MM features
-===========================
-
-The following documents provide guides for controlling and tuning
-various features of the Linux memory management
-
-.. toctree::
- :maxdepth: 1
-
- swap_numa
- zswap
-
-Kernel developers MM documentation
-==================================
-
-The below documents describe MM internals with different level of
-details ranging from notes and mailing list responses to elaborate
-descriptions of data structures and algorithms.
+subsystem internals, with different levels of detail ranging from notes and
+mailing list responses to elaborate descriptions of data structures and
+algorithms. If you are looking for advice on simply allocating memory, see the
+:ref:`memory_allocation`. For controlling and tuning guides, see the
+:doc:`admin guide <../admin-guide/mm/index>`.
.. toctree::
:maxdepth: 1
diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst
index 2175465c9bf2a6..9837fc8147dd6b 100644
--- a/Documentation/vm/page_owner.rst
+++ b/Documentation/vm/page_owner.rst
@@ -85,5 +85,26 @@ Usage
cat /sys/kernel/debug/page_owner > page_owner_full.txt
./page_owner_sort page_owner_full.txt sorted_page_owner.txt
+ The general output of ``page_owner_full.txt`` is as follows:
+
+ Page allocated via order XXX, ...
+ PFN XXX ...
+ // Detailed stack
+
+ Page allocated via order XXX, ...
+ PFN XXX ...
+ // Detailed stack
+
+ The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows
+ in buf, uses regexp to extract the page order value, counts the times
+ and pages of buf, and finally sorts them according to the times.
+
See the result about who allocated each page
- in the ``sorted_page_owner.txt``.
+ in the ``sorted_page_owner.txt``. General output:
+
+ XXX times, XXX pages:
+ Page allocated via order XXX, ...
+ // Detailed stack
+
+ By default, ``page_owner_sort`` is sorted according to the times of buf.
+ If you want to sort by the pages nums of buf, use the ``-m`` parameter.
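
A minimal sketch of the two sort orders described above (the struct and comparators are
illustrative only, not the tool's actual internals, and the descending order is an
assumption)::

  #include <stdlib.h>

  /* Illustrative only: one aggregated allocation stack ("buf" entry). */
  struct block {
          int times;      /* how many identical stacks were seen */
          int pages;      /* total pages allocated from that stack */
          char *txt;      /* the stack text itself */
  };

  /* Default order: by times. */
  static int cmp_times(const void *a, const void *b)
  {
          return ((const struct block *)b)->times - ((const struct block *)a)->times;
  }

  /* With -m: by total page count instead. */
  static int cmp_pages(const void *a, const void *b)
  {
          return ((const struct block *)b)->pages - ((const struct block *)a)->pages;
  }

  static void sort_blocks(struct block *list, size_t n, int by_pages)
  {
          qsort(list, n, sizeof(*list), by_pages ? cmp_pages : cmp_times);
  }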
diff --git a/MAINTAINERS b/MAINTAINERS
index bc387792e6d706..9b765cbc90d1b1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5191,7 +5191,7 @@ F: net/ax25/ax25_timer.c
F: net/ax25/sysctl_net_ax25.c
DATA ACCESS MONITOR
-M: SeongJae Park <sjpark@amazon.de>
+M: SeongJae Park <sj@kernel.org>
L: linux-mm@kvack.org
S: Maintained
F: Documentation/admin-guide/mm/damon/
diff --git a/Makefile b/Makefile
index aec4503c8bf149..248a9a0ee64e68 100644
--- a/Makefile
+++ b/Makefile
@@ -1099,6 +1099,21 @@ ifdef CONFIG_CC_IS_GCC
KBUILD_CFLAGS += -Wno-maybe-uninitialized
endif
+ifdef CONFIG_CC_IS_GCC
+# The allocators already balk at large sizes, so silence the compiler
+# warnings for bounds checks involving those possible values. While
+# -Wno-alloc-size-larger-than would normally be used here, earlier versions
+# of gcc (<9.1) weirdly don't handle the option correctly when _other_
+# warnings are produced (?!). Using -Walloc-size-larger-than=SIZE_MAX
+# doesn't work (as it is documented to), silently resolving to "0" prior to
+# version 9.1 (and producing an error more recently). Numeric values larger
+# than PTRDIFF_MAX also don't work prior to version 9.1, which are silently
+# ignored, continuing to default to PTRDIFF_MAX. So, left with no other
+# choice, we must perform a versioned check to disable this warning.
+# https://lore.kernel.org/lkml/20210824115859.187f272f@canb.auug.org.au
+KBUILD_CFLAGS += $(call cc-ifversion, -ge, 0901, -Wno-alloc-size-larger-than)
+endif
+
# disable invalid "can't wrap" optimizations for signed / pointers
KBUILD_CFLAGS += -fno-strict-overflow
diff --git a/arch/Kconfig b/arch/Kconfig
index 8df1c71026435d..cca27f1b5d0edb 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -428,6 +428,34 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
irqs disabled over activate_mm. Architectures that do IPI based TLB
shootdowns should enable this.
+# Use normal mm refcounting for MMU_LAZY_TLB kernel thread references.
+# MMU_LAZY_TLB_REFCOUNT=n can improve the scalability of context switching
+# to/from kernel threads when the same mm is running on a lot of CPUs (a large
+# multi-threaded application), by reducing contention on the mm refcount.
+#
+# This can be disabled if the architecture ensures no CPUs are using an mm as a
+# "lazy tlb" beyond its final refcount (i.e., by the time __mmdrop frees the mm
+# or its kernel page tables). This could be arranged by arch_exit_mmap(), or
+# final exit(2) TLB flush, for example. arch code must also ensure the
+# _lazy_tlb variants of mmgrab/mmdrop are used when dropping the lazy reference
+# to a kthread ->active_mm (non-arch code has been converted already).
+config MMU_LAZY_TLB_REFCOUNT
+ def_bool y
+ depends on !MMU_LAZY_TLB_SHOOTDOWN
+
+# This option allows MMU_LAZY_TLB_REFCOUNT=n. It ensures no CPUs are using an
+# mm as a lazy tlb beyond its last reference count, by shooting down these
+# users before the mm is deallocated. __mmdrop() first IPIs all CPUs that may
+# be using the mm as a lazy tlb, so that they may switch themselves to using
+# init_mm for their active mm. mm_cpumask(mm) is used to determine which CPUs
+# may be using mm as a lazy tlb mm.
+#
+# To implement this, an arch must ensure mm_cpumask(mm) contains at least all
+# possible CPUs in which the mm is lazy, and it must meet the requirements for
+# MMU_LAZY_TLB_REFCOUNT=n (see above).
+config MMU_LAZY_TLB_SHOOTDOWN
+ bool
+
config ARCH_HAVE_NMI_SAFE_CMPXCHG
bool
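
A condensed sketch of the helper pair these options gate (the helpers themselves are
added elsewhere in this merge, e.g. include/linux/sched/mm.h is touched; the bodies below
illustrate the intent rather than copy the merged code)::

  /* Illustration only -- not the merged implementation. */
  static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
  {
          if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
                  mmgrab(mm);
  }

  static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
  {
          if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
                  mmdrop(mm);
          /* else: MMU_LAZY_TLB_SHOOTDOWN guarantees no lazy users survive
           * the final reference, so there is nothing left to drop here. */
  }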
diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
index 53813f9464a242..c30df1097c524b 100644
--- a/arch/arm/mach-rpc/ecard.c
+++ b/arch/arm/mach-rpc/ecard.c
@@ -253,7 +253,7 @@ static int ecard_init_mm(void)
current->mm = mm;
current->active_mm = mm;
activate_mm(active_mm, mm);
- mmdrop(active_mm);
+ mmdrop_lazy_tlb(active_mm);
ecard_init_pgtables(mm);
return 0;
}
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index ba5b661893588d..8a584414ef67a1 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -249,6 +249,7 @@ config PPC
select IRQ_FORCED_THREADING
select MMU_GATHER_PAGE_SIZE
select MMU_GATHER_RCU_TABLE_FREE
+ select MMU_LAZY_TLB_SHOOTDOWN if PPC_BOOK3S_64
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE if PPC64 || NOT_COHERENT_CACHE
select NEED_SG_DMA_LENGTH
diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig
index b806a5d3a695d7..c3ba614c973d67 100644
--- a/arch/powerpc/configs/skiroot_defconfig
+++ b/arch/powerpc/configs/skiroot_defconfig
@@ -275,7 +275,6 @@ CONFIG_NLS_UTF8=y
CONFIG_ENCRYPTED_KEYS=y
CONFIG_SECURITY=y
CONFIG_HARDENED_USERCOPY=y
-# CONFIG_HARDENED_USERCOPY_FALLBACK is not set
CONFIG_HARDENED_USERCOPY_PAGESPAN=y
CONFIG_FORTIFY_SOURCE=y
CONFIG_SECURITY_LOCKDOWN_LSM=y
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 9cc7d3dbf4392f..3db7d1f0681dbf 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1582,7 +1582,7 @@ void start_secondary(void *unused)
if (IS_ENABLED(CONFIG_PPC32))
setup_kup();
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
current->active_mm = &init_mm;
smp_store_cpu_info(cpu);
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 7724af19ed7e68..59156c2d2ebef9 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -786,10 +786,10 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush)
if (current->active_mm == mm) {
WARN_ON_ONCE(current->mm != NULL);
/* Is a kernel thread and is using mm as the lazy tlb */
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
current->active_mm = &init_mm;
switch_mm_irqs_off(mm, &init_mm, current);
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
}
/*
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 9a75ba078e1b37..dd40ce6e7565f8 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -229,17 +229,22 @@ static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
m->hstate = hstate;
return 1;
}
+
+bool __init node_specific_alloc_support(void)
+{
+ return false;
+}
#endif
-int __init alloc_bootmem_huge_page(struct hstate *h)
+int __init alloc_bootmem_huge_page(struct hstate *h, int nid)
{
#ifdef CONFIG_PPC_BOOK3S_64
if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
return pseries_alloc_bootmem_huge_page(h);
#endif
- return __alloc_bootmem_huge_page(h);
+ return __alloc_bootmem_huge_page(h, nid);
}
#ifndef CONFIG_PPC_BOOK3S_64
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index fcaf2750f68f7c..298d54d30abe92 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -291,22 +291,16 @@ static ssize_t mem_used_max_store(struct device *dev,
return len;
}
-static ssize_t idle_store(struct device *dev,
- struct device_attribute *attr, const char *buf, size_t len)
+/*
+ * Mark all pages which are older than or equal to cutoff as IDLE.
+ * Callers should hold the zram init lock in read mode
+ */
+static void mark_idle(struct zram *zram, ktime_t cutoff)
{
- struct zram *zram = dev_to_zram(dev);
+ int is_idle = 1;
unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
int index;
- if (!sysfs_streq(buf, "all"))
- return -EINVAL;
-
- down_read(&zram->init_lock);
- if (!init_done(zram)) {
- up_read(&zram->init_lock);
- return -EINVAL;
- }
-
for (index = 0; index < nr_pages; index++) {
/*
* Do not mark ZRAM_UNDER_WB slot as ZRAM_IDLE to close race.
@@ -314,14 +308,49 @@ static ssize_t idle_store(struct device *dev,
*/
zram_slot_lock(zram, index);
if (zram_allocated(zram, index) &&
- !zram_test_flag(zram, index, ZRAM_UNDER_WB))
- zram_set_flag(zram, index, ZRAM_IDLE);
+ !zram_test_flag(zram, index, ZRAM_UNDER_WB)) {
+#ifdef CONFIG_ZRAM_MEMORY_TRACKING
+ is_idle = (!cutoff || ktime_after(cutoff, zram->table[index].ac_time));
+#endif
+ if (is_idle)
+ zram_set_flag(zram, index, ZRAM_IDLE);
+ }
zram_slot_unlock(zram, index);
}
+}
- up_read(&zram->init_lock);
+static ssize_t idle_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct zram *zram = dev_to_zram(dev);
+ ktime_t cutoff_time = 0;
+ ssize_t rv = -EINVAL;
- return len;
+ if (!sysfs_streq(buf, "all")) {
+ /*
+ * If it did not parse as 'all' try to treat it as an integer when
+ * we have memory tracking enabled.
+ */
+ u64 age_sec;
+ if (IS_ENABLED(CONFIG_ZRAM_MEMORY_TRACKING) && !kstrtoull(buf, 0, &age_sec))
+ cutoff_time = ktime_sub(ktime_get_boottime(),
+ ns_to_ktime(age_sec * NSEC_PER_SEC));
+ else
+ goto out;
+ }
+
+ down_read(&zram->init_lock);
+ if (!init_done(zram))
+ goto out_unlock;
+
+ /* An age_sec of 0 marks everything as idle; this is the "all" behavior */
+ mark_idle(zram, cutoff_time);
+ rv = len;
+
+out_unlock:
+ up_read(&zram->init_lock);
+out:
+ return rv;
}
#ifdef CONFIG_ZRAM_WRITEBACK
@@ -587,7 +616,7 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
{
struct bio *bio;
- bio = bio_alloc(GFP_ATOMIC, 1);
+ bio = bio_alloc(GFP_NOIO, 1);
if (!bio)
return -ENOMEM;
@@ -910,7 +939,7 @@ static ssize_t read_block_state(struct file *file, char __user *buf,
zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.',
zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.');
- if (count < copied) {
+ if (count <= copied) {
zram_slot_unlock(zram, index);
break;
}
diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c b/drivers/gpu/drm/drm_dp_mst_topology.c
index 86d13d6bc46313..2d1adab9e36000 100644
--- a/drivers/gpu/drm/drm_dp_mst_topology.c
+++ b/drivers/gpu/drm/drm_dp_mst_topology.c
@@ -1668,13 +1668,10 @@ __dump_topology_ref_history(struct drm_dp_mst_topology_ref_history *history,
for (i = 0; i < history->len; i++) {
const struct drm_dp_mst_topology_ref_entry *entry =
&history->entries[i];
- ulong *entries;
- uint nr_entries;
u64 ts_nsec = entry->ts_nsec;
u32 rem_nsec = do_div(ts_nsec, 1000000000);
- nr_entries = stack_depot_fetch(entry->backtrace, &entries);
- stack_trace_snprint(buf, PAGE_SIZE, entries, nr_entries, 4);
+ stack_depot_snprint(entry->backtrace, buf, PAGE_SIZE, 4);
drm_printf(&p, " %d %ss (last at %5llu.%06u):\n%s",
entry->count,
diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c
index 93d48a6f04abe2..7d1c578388d332 100644
--- a/drivers/gpu/drm/drm_mm.c
+++ b/drivers/gpu/drm/drm_mm.c
@@ -118,8 +118,6 @@ static noinline void save_stack(struct drm_mm_node *node)
static void show_leaks(struct drm_mm *mm)
{
struct drm_mm_node *node;
- unsigned long *entries;
- unsigned int nr_entries;
char *buf;
buf = kmalloc(BUFSZ, GFP_KERNEL);
@@ -133,8 +131,7 @@ static void show_leaks(struct drm_mm *mm)
continue;
}
- nr_entries = stack_depot_fetch(node->stack, &entries);
- stack_trace_snprint(buf, BUFSZ, entries, nr_entries, 0);
+ stack_depot_snprint(node->stack, buf, BUFSZ, 0);
DRM_ERROR("node [%08llx + %08llx]: inserted at\n%s",
node->start, node->size, buf);
}
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 4b7fc4647e460e..f2d9ed37510939 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -56,8 +56,6 @@ void i915_vma_free(struct i915_vma *vma)
static void vma_print_allocator(struct i915_vma *vma, const char *reason)
{
- unsigned long *entries;
- unsigned int nr_entries;
char buf[512];
if (!vma->node.stack) {
@@ -66,8 +64,7 @@ static void vma_print_allocator(struct i915_vma *vma, const char *reason)
return;
}
- nr_entries = stack_depot_fetch(vma->node.stack, &entries);
- stack_trace_snprint(buf, sizeof(buf), entries, nr_entries, 0);
+ stack_depot_snprint(vma->node.stack, buf, sizeof(buf), 0);
DRM_DEBUG_DRIVER("vma.node [%08llx + %08llx] %s: inserted at %s\n",
vma->node.start, vma->node.size, reason, buf);
}
diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c
index eaf7688f517d09..0d85f3c5c5266d 100644
--- a/drivers/gpu/drm/i915/intel_runtime_pm.c
+++ b/drivers/gpu/drm/i915/intel_runtime_pm.c
@@ -65,16 +65,6 @@ static noinline depot_stack_handle_t __save_depot_stack(void)
return stack_depot_save(entries, n, GFP_NOWAIT | __GFP_NOWARN);
}
-static void __print_depot_stack(depot_stack_handle_t stack,
- char *buf, int sz, int indent)
-{
- unsigned long *entries;
- unsigned int nr_entries;
-
- nr_entries = stack_depot_fetch(stack, &entries);
- stack_trace_snprint(buf, sz, entries, nr_entries, indent);
-}
-
static void init_intel_runtime_pm_wakeref(struct intel_runtime_pm *rpm)
{
spin_lock_init(&rpm->debug.lock);
@@ -146,12 +136,12 @@ static void untrack_intel_runtime_pm_wakeref(struct intel_runtime_pm *rpm,
if (!buf)
return;
- __print_depot_stack(stack, buf, PAGE_SIZE, 2);
+ stack_depot_snprint(stack, buf, PAGE_SIZE, 2);
DRM_DEBUG_DRIVER("wakeref %x from\n%s", stack, buf);
stack = READ_ONCE(rpm->debug.last_release);
if (stack) {
- __print_depot_stack(stack, buf, PAGE_SIZE, 2);
+ stack_depot_snprint(stack, buf, PAGE_SIZE, 2);
DRM_DEBUG_DRIVER("wakeref last released at\n%s", buf);
}
@@ -183,12 +173,12 @@ __print_intel_runtime_pm_wakeref(struct drm_printer *p,
return;
if (dbg->last_acquire) {
- __print_depot_stack(dbg->last_acquire, buf, PAGE_SIZE, 2);
+ stack_depot_snprint(dbg->last_acquire, buf, PAGE_SIZE, 2);
drm_printf(p, "Wakeref last acquired:\n%s", buf);
}
if (dbg->last_release) {
- __print_depot_stack(dbg->last_release, buf, PAGE_SIZE, 2);
+ stack_depot_snprint(dbg->last_release, buf, PAGE_SIZE, 2);
drm_printf(p, "Wakeref last released:\n%s", buf);
}
@@ -203,7 +193,7 @@ __print_intel_runtime_pm_wakeref(struct drm_printer *p,
rep = 1;
while (i + 1 < dbg->count && dbg->owners[i + 1] == stack)
rep++, i++;
- __print_depot_stack(stack, buf, PAGE_SIZE, 2);
+ stack_depot_snprint(stack, buf, PAGE_SIZE, 2);
drm_printf(p, "Wakeref x%lu taken at:\n%s", rep, buf);
}
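
The drm and i915 hunks above all follow one pattern: a saved depot handle is rendered
directly with the new stack_depot_snprint() helper instead of open-coding
stack_depot_fetch() plus stack_trace_snprint() at each call site. A kernel-side sketch of
that pattern (the wrapper names here are invented for illustration)::

  #include <linux/gfp.h>
  #include <linux/kernel.h>
  #include <linux/stackdepot.h>
  #include <linux/stacktrace.h>

  /* Record the current call chain and return a compact handle. */
  static depot_stack_handle_t record_stack(void)
  {
          unsigned long entries[16];
          unsigned int n;

          n = stack_trace_save(entries, ARRAY_SIZE(entries), 1);
          return stack_depot_save(entries, n, GFP_NOWAIT | __GFP_NOWARN);
  }

  /* Later: one call renders the saved trace into buf. */
  static void print_stack(depot_stack_handle_t handle, char *buf, size_t sz)
  {
          stack_depot_snprint(handle, buf, sz, 2);
  }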
diff --git a/drivers/mmc/core/mmc_test.c b/drivers/mmc/core/mmc_test.c
index 63524551a13a16..e6a2fd2c6d5c94 100644
--- a/drivers/mmc/core/mmc_test.c
+++ b/drivers/mmc/core/mmc_test.c
@@ -10,7 +10,6 @@
#include <linux/slab.h>
#include <linux/scatterlist.h>
-#include <linux/swap.h> /* For nr_free_buffer_pages() */
#include <linux/list.h>
#include <linux/debugfs.h>
diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c
index 761fd870d1db27..053e241f593c78 100644
--- a/drivers/of/kexec.c
+++ b/drivers/of/kexec.c
@@ -16,6 +16,7 @@
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <linux/types.h>
#define RNG_SEED_SIZE 128
diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c
index 94331d999d273e..7df466e2228203 100644
--- a/drivers/rapidio/devices/rio_mport_cdev.c
+++ b/drivers/rapidio/devices/rio_mport_cdev.c
@@ -965,6 +965,7 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg)
struct rio_transfer_io *transfer;
enum dma_data_direction dir;
int i, ret = 0;
+ size_t size;
if (unlikely(copy_from_user(&transaction, arg, sizeof(transaction))))
return -EFAULT;
@@ -976,13 +977,14 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg)
priv->md->properties.transfer_mode) == 0)
return -ENODEV;
- transfer = vmalloc(array_size(sizeof(*transfer), transaction.count));
+ size = array_size(sizeof(*transfer), transaction.count);
+ transfer = vmalloc(size);
if (!transfer)
return -ENOMEM;
if (unlikely(copy_from_user(transfer,
(void __user *)(uintptr_t)transaction.block,
- array_size(sizeof(*transfer), transaction.count)))) {
+ size))) {
ret = -EFAULT;
goto out_free;
}
@@ -994,8 +996,7 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg)
transaction.sync, dir, &transfer[i]);
if (unlikely(copy_to_user((void __user *)(uintptr_t)transaction.block,
- transfer,
- array_size(sizeof(*transfer), transaction.count))))
+ transfer, size)))
ret = -EFAULT;
out_free:
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index ce1b3f6ec325de..0d974162899d9f 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -101,6 +101,7 @@ config VIRTIO_MEM
depends on MEMORY_HOTPLUG_SPARSE
depends on MEMORY_HOTREMOVE
depends on CONTIG_ALLOC
+ depends on EXCLUSIVE_SYSTEM_RAM
help
This driver provides access to virtio-mem paravirtualized memory
devices, allowing to hotplug and hotunplug memory.
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index bef8ad6bf4661b..c619bc0e46a770 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -2525,8 +2525,10 @@ static int virtio_mem_create_resource(struct virtio_mem *vm)
if (!name)
return -ENOMEM;
+ /* Disallow mapping device memory via /dev/mem completely. */
vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
- name, IORESOURCE_SYSTEM_RAM);
+ name, IORESOURCE_SYSTEM_RAM |
+ IORESOURCE_EXCLUSIVE);
if (!vm->parent_resource) {
kfree(name);
dev_warn(&vm->vdev->dev, "could not reserve device region\n");
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 69d900a8473d4e..f3523807dbcaa5 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1074,20 +1074,26 @@ out_free_interp:
vaddr = elf_ppnt->p_vaddr;
/*
- * If we are loading ET_EXEC or we have already performed
- * the ET_DYN load_addr calculations, proceed normally.
+ * The first time through the loop, load_addr_set is false:
+ * layout will be calculated. Once set, use MAP_FIXED since
+ * we know we've already safely mapped the entire region with
+ * MAP_FIXED_NOREPLACE in the once-per-binary logic following.
*/
- if (elf_ex->e_type == ET_EXEC || load_addr_set) {
+ if (load_addr_set) {
elf_flags |= MAP_FIXED;
+ } else if (elf_ex->e_type == ET_EXEC) {
+ /*
+ * This logic is run once for the first LOAD Program
+ * Header for ET_EXEC binaries. No special handling
+ * is needed.
+ */
+ elf_flags |= MAP_FIXED_NOREPLACE;
} else if (elf_ex->e_type == ET_DYN) {
/*
* This logic is run once for the first LOAD Program
* Header for ET_DYN binaries to calculate the
* randomization (load_bias) for all the LOAD
- * Program Headers, and to calculate the entire
- * size of the ELF mapping (total_size). (Note that
- * load_addr_set is set to true later once the
- * initial mapping is performed.)
+ * Program Headers.
*
* There are effectively two types of ET_DYN
* binaries: programs (i.e. PIE: ET_DYN with INTERP)
@@ -1108,7 +1114,7 @@ out_free_interp:
* Therefore, programs are loaded offset from
* ELF_ET_DYN_BASE and loaders are loaded into the
* independently randomized mmap region (0 load_bias
- * without MAP_FIXED).
+ * without MAP_FIXED nor MAP_FIXED_NOREPLACE).
*/
if (interpreter) {
load_bias = ELF_ET_DYN_BASE;
@@ -1117,7 +1123,7 @@ out_free_interp:
alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
if (alignment)
load_bias &= ~(alignment - 1);
- elf_flags |= MAP_FIXED;
+ elf_flags |= MAP_FIXED_NOREPLACE;
} else
load_bias = 0;
@@ -1129,7 +1135,14 @@ out_free_interp:
* is then page aligned.
*/
load_bias = ELF_PAGESTART(load_bias - vaddr);
+ }
+ /*
+ * Calculate the entire size of the ELF mapping (total_size).
+ * (Note that load_addr_set is set to true later once the
+ * initial mapping is performed.)
+ */
+ if (!load_addr_set) {
total_size = total_mapping_size(elf_phdata,
elf_ex->e_phnum);
if (!total_size) {
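
The distinction the new binfmt_elf comments rely on (clobbering MAP_FIXED versus failing
MAP_FIXED_NOREPLACE) can be seen from userspace; a minimal sketch, assuming a 4.17+ kernel
and libc headers that define MAP_FIXED_NOREPLACE::

  #define _GNU_SOURCE
  #include <errno.h>
  #include <stdio.h>
  #include <sys/mman.h>

  int main(void)
  {
          size_t len = 2 * 1024 * 1024;
          void *base, *again;

          /* Grab an address range first. */
          base = mmap(NULL, len, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
          if (base == MAP_FAILED)
                  return 1;

          /* MAP_FIXED_NOREPLACE refuses to map over an existing mapping... */
          again = mmap(base, len, PROT_READ,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0);
          if (again == MAP_FAILED && errno == EEXIST)
                  printf("MAP_FIXED_NOREPLACE: range already occupied\n");

          /* ...whereas MAP_FIXED silently replaces it, which is why binfmt_elf
           * only uses it once the region is known to be safely reserved. */
          again = mmap(base, len, PROT_READ,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
          return again == MAP_FAILED;
  }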
diff --git a/fs/buffer.c b/fs/buffer.c
index c615387aedcae8..17d5f40197d1dd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -956,10 +956,20 @@ grow_dev_page(struct block_device *bdev, sector_t block,
end_block = init_page_buffers(page, bdev,
(sector_t)index << sizebits,
size);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x01;
+#endif
goto done;
}
- if (!try_to_free_buffers(page))
+ if (!try_to_free_buffers(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x02;
+#endif
goto failed;
+ }
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x04;
+#endif
}
/*
@@ -979,6 +989,9 @@ grow_dev_page(struct block_device *bdev, sector_t block,
spin_unlock(&inode->i_mapping->private_lock);
done:
ret = (block < end_block) ? 1 : -ENXIO;
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x08;
+#endif
failed:
unlock_page(page);
put_page(page);
@@ -1030,6 +1043,12 @@ __getblk_slow(struct block_device *bdev, sector_t block,
return NULL;
}
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_stamp = jiffies;
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+#endif
for (;;) {
struct buffer_head *bh;
int ret;
@@ -1041,6 +1060,24 @@ __getblk_slow(struct block_device *bdev, sector_t block,
ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
+
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ if (!time_after(jiffies, current->getblk_stamp + 3 * HZ))
+ continue;
+ printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx bdev_super_blocksize=%ld size=%u bdev_super_blocksize_bits=%d bdev_inode_blkbits=%d\n",
+ current->comm, current->pid, current->getblk_executed,
+ current->getblk_bh_count, current->getblk_bh_state,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1L :
+ bdev->bd_super->s_blocksize, size,
+ IS_ERR_OR_NULL(bdev->bd_super) ? -1 :
+ bdev->bd_super->s_blocksize_bits,
+ IS_ERR_OR_NULL(bdev->bd_inode) ? -1 :
+ bdev->bd_inode->i_blkbits);
+ current->getblk_executed = 0;
+ current->getblk_bh_count = 0;
+ current->getblk_bh_state = 0;
+ current->getblk_stamp = jiffies;
+#endif
}
}
@@ -3191,6 +3228,11 @@ EXPORT_SYMBOL(sync_dirty_buffer);
*/
static inline int buffer_busy(struct buffer_head *bh)
{
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x80;
+ current->getblk_bh_count = atomic_read(&bh->b_count);
+ current->getblk_bh_state = bh->b_state;
+#endif
return atomic_read(&bh->b_count) |
(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}
@@ -3229,11 +3271,18 @@ int try_to_free_buffers(struct page *page)
int ret = 0;
BUG_ON(!PageLocked(page));
- if (PageWriteback(page))
+ if (PageWriteback(page)) {
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x10;
+#endif
return 0;
+ }
if (mapping == NULL) { /* can this still happen? */
ret = drop_buffers(page, &buffers_to_free);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x20;
+#endif
goto out;
}
@@ -3257,6 +3306,9 @@ int try_to_free_buffers(struct page *page)
if (ret)
cancel_dirty_page(page);
spin_unlock(&mapping->private_lock);
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ current->getblk_executed |= 0x40;
+#endif
out:
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 06855f6c79020c..62a3d2565c2616 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -63,9 +63,10 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
struct inode *inode;
struct coda_inode_info *cii;
unsigned long hash = coda_f2i(fid);
+ umode_t inode_type = coda_inode_type(attr);
+retry:
inode = iget5_locked(sb, hash, coda_test_inode, coda_set_inode, fid);
-
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -75,11 +76,15 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
inode->i_ino = hash;
/* inode is locked and unique, no need to grab cii->c_lock */
cii->c_mapcount = 0;
+ coda_fill_inode(inode, attr);
unlock_new_inode(inode);
+ } else if ((inode->i_mode & S_IFMT) != inode_type) {
+ /* Inode has changed type, mark bad and grab a new one */
+ remove_inode_hash(inode);
+ coda_flag_inode(inode, C_PURGE);
+ iput(inode);
+ goto retry;
}
-
- /* always replace the attributes, type might have changed */
- coda_fill_inode(inode, attr);
return inode;
}
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 2e1a5a192074da..903ca8fa4b9b87 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -87,28 +87,27 @@ static struct coda_timespec timespec64_to_coda(struct timespec64 ts64)
}
/* utility functions below */
+umode_t coda_inode_type(struct coda_vattr *attr)
+{
+ switch (attr->va_type) {
+ case C_VREG:
+ return S_IFREG;
+ case C_VDIR:
+ return S_IFDIR;
+ case C_VLNK:
+ return S_IFLNK;
+ case C_VNON:
+ default:
+ return 0;
+ }
+}
+
void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
{
- int inode_type;
- /* inode's i_flags, i_ino are set by iget
- XXX: is this all we need ??
- */
- switch (attr->va_type) {
- case C_VNON:
- inode_type = 0;
- break;
- case C_VREG:
- inode_type = S_IFREG;
- break;
- case C_VDIR:
- inode_type = S_IFDIR;
- break;
- case C_VLNK:
- inode_type = S_IFLNK;
- break;
- default:
- inode_type = 0;
- }
+ /* inode's i_flags, i_ino are set by iget
+ * XXX: is this all we need ??
+ */
+ umode_t inode_type = coda_inode_type(attr);
inode->i_mode |= inode_type;
if (attr->va_mode != (u_short) -1)
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index e7b27754ce782a..9be281bbcc066d 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -53,10 +53,11 @@ int coda_getattr(struct user_namespace *, const struct path *, struct kstat *,
u32, unsigned int);
int coda_setattr(struct user_namespace *, struct dentry *, struct iattr *);
-/* this file: heloers */
+/* this file: helpers */
char *coda_f2s(struct CodaFid *f);
int coda_iscontrol(const char *name, size_t length);
+umode_t coda_inode_type(struct coda_vattr *attr);
void coda_vattr_to_iattr(struct inode *, struct coda_vattr *);
void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *);
unsigned short coda_flags_to_cflags(unsigned short);
@@ -83,6 +84,9 @@ static __inline__ void coda_flag_inode(struct inode *inode, int flag)
{
struct coda_inode_info *cii = ITOC(inode);
+ if (!inode)
+ return;
+
spin_lock(&cii->c_lock);
cii->c_flags |= flag;
spin_unlock(&cii->c_lock);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index d69989c1bac378..328d7a684b6347 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -317,13 +317,10 @@ static int coda_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
coda_dir_drop_nlink(old_dir);
coda_dir_inc_nlink(new_dir);
}
- coda_dir_update_mtime(old_dir);
- coda_dir_update_mtime(new_dir);
coda_flag_inode(d_inode(new_dentry), C_VATTR);
- } else {
- coda_flag_inode(old_dir, C_VATTR);
- coda_flag_inode(new_dir, C_VATTR);
}
+ coda_dir_update_mtime(old_dir);
+ coda_dir_update_mtime(new_dir);
}
return error;
}
@@ -499,15 +496,20 @@ out:
*/
static int coda_dentry_delete(const struct dentry * dentry)
{
- int flags;
+ struct inode *inode;
+ struct coda_inode_info *cii;
if (d_really_is_negative(dentry))
return 0;
- flags = (ITOC(d_inode(dentry))->c_flags) & C_PURGE;
- if (is_bad_inode(d_inode(dentry)) || flags) {
+ inode = d_inode(dentry);
+ if (!inode || is_bad_inode(inode))
return 1;
- }
+
+ cii = ITOC(inode);
+ if (cii->c_flags & C_PURGE)
+ return 1;
+
return 0;
}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ef5ca22bfb3eaa..29dd87be2fb869 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -8,6 +8,7 @@
* to the Coda project. Contact Peter Braam <coda@cs.cmu.edu>.
*/
+#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/time.h>
@@ -28,7 +29,7 @@
#include "coda_int.h"
struct coda_vm_ops {
- atomic_t refcnt;
+ refcount_t refcnt;
struct file *coda_file;
const struct vm_operations_struct *host_vm_ops;
struct vm_operations_struct vm_ops;
@@ -98,7 +99,7 @@ coda_vm_open(struct vm_area_struct *vma)
struct coda_vm_ops *cvm_ops =
container_of(vma->vm_ops, struct coda_vm_ops, vm_ops);
- atomic_inc(&cvm_ops->refcnt);
+ refcount_inc(&cvm_ops->refcnt);
if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->open)
cvm_ops->host_vm_ops->open(vma);
@@ -113,7 +114,7 @@ coda_vm_close(struct vm_area_struct *vma)
if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->close)
cvm_ops->host_vm_ops->close(vma);
- if (atomic_dec_and_test(&cvm_ops->refcnt)) {
+ if (refcount_dec_and_test(&cvm_ops->refcnt)) {
vma->vm_ops = cvm_ops->host_vm_ops;
fput(cvm_ops->coda_file);
kfree(cvm_ops);
@@ -189,7 +190,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
cvm_ops->vm_ops.open = coda_vm_open;
cvm_ops->vm_ops.close = coda_vm_close;
cvm_ops->coda_file = coda_file;
- atomic_set(&cvm_ops->refcnt, 1);
+ refcount_set(&cvm_ops->refcnt, 1);
vma->vm_ops = &cvm_ops->vm_ops;
}
@@ -238,11 +239,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
struct coda_file_info *cfi;
struct coda_inode_info *cii;
struct inode *host_inode;
- int err;
cfi = coda_ftoc(coda_file);
- err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode),
+ venus_close(coda_inode->i_sb, coda_i2f(coda_inode),
coda_flags, coda_file->f_cred->fsuid);
host_inode = file_inode(cfi->cfi_container);
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 240669f51eac36..b39580ad4ce514 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -122,14 +122,10 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
hdr.opcode, hdr.unique);
nbytes = size;
}
- dcbuf = kvmalloc(nbytes, GFP_KERNEL);
- if (!dcbuf) {
- retval = -ENOMEM;
- goto out;
- }
- if (copy_from_user(dcbuf, buf, nbytes)) {
- kvfree(dcbuf);
- retval = -EFAULT;
+
+ dcbuf = vmemdup_user(buf, nbytes);
+ if (IS_ERR(dcbuf)) {
+ retval = PTR_ERR(dcbuf);
goto out;
}
@@ -388,7 +384,7 @@ MODULE_AUTHOR("Jan Harkes, Peter J. Braam");
MODULE_DESCRIPTION("Coda Distributed File System VFS interface");
MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR);
MODULE_LICENSE("GPL");
-MODULE_VERSION("7.0");
+MODULE_VERSION("7.2");
static int __init init_coda(void)
{
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index eb3b1898da462a..59f6cfd06f96a5 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -744,7 +744,8 @@ static int coda_upcall(struct venus_comm *vcp,
list_add_tail(&req->uc_chain, &vcp->vc_pending);
wake_up_interruptible(&vcp->vc_waitq);
- if (req->uc_flags & CODA_REQ_ASYNC) {
+ /* We can return early on asynchronous requests */
+ if (outSize == NULL) {
mutex_unlock(&vcp->vc_mutex);
return 0;
}
diff --git a/fs/exec.c b/fs/exec.c
index a098c133d8d740..5a7a07dfdc8127 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1028,9 +1028,9 @@ static int exec_mmap(struct mm_struct *mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
mm_update_next_owner(old_mm);
mmput(old_mm);
- return 0;
+ } else {
+ mmdrop_lazy_tlb(active_mm);
}
- mmdrop(active_mm);
return 0;
}
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 4a95a92546a0da..2a51432462820d 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -462,8 +462,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
goto out;
if (S_ISDIR(main_inode->i_mode)) {
- if (fd.entrylength < sizeof(struct hfs_cat_dir))
- /* panic? */;
+ WARN_ON(fd.entrylength < sizeof(struct hfs_cat_dir));
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_dir));
if (rec.type != HFS_CDR_DIR ||
@@ -483,8 +482,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
} else {
- if (fd.entrylength < sizeof(struct hfs_cat_file))
- /* panic? */;
+ WARN_ON(fd.entrylength < sizeof(struct hfs_cat_file));
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
if (rec.type != HFS_CDR_FIL ||
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 35472cba750e1d..9cdc6550b468e5 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -124,7 +124,7 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry,
hfsplus_cat_set_perms(inode, &folder->permissions);
if (inode == sbi->hidden_dir)
/* invisible and namelocked */
- folder->user_info.frFlags = cpu_to_be16(0x5000);
+ folder->info.user.frFlags = cpu_to_be16(0x5000);
return sizeof(*folder);
} else {
struct hfsplus_cat_file *file;
@@ -142,14 +142,14 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry,
if (cnid == inode->i_ino) {
hfsplus_cat_set_perms(inode, &file->permissions);
if (S_ISLNK(inode->i_mode)) {
- file->user_info.fdType =
+ file->info.user.fdType =
cpu_to_be32(HFSP_SYMLINK_TYPE);
- file->user_info.fdCreator =
+ file->info.user.fdCreator =
cpu_to_be32(HFSP_SYMLINK_CREATOR);
} else {
- file->user_info.fdType =
+ file->info.user.fdType =
cpu_to_be32(sbi->type);
- file->user_info.fdCreator =
+ file->info.user.fdCreator =
cpu_to_be32(sbi->creator);
}
if (HFSPLUS_FLG_IMMUTABLE &
@@ -158,11 +158,11 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry,
file->flags |=
cpu_to_be16(HFSPLUS_FILE_LOCKED);
} else {
- file->user_info.fdType =
+ file->info.user.fdType =
cpu_to_be32(HFSP_HARDLINK_TYPE);
- file->user_info.fdCreator =
+ file->info.user.fdCreator =
cpu_to_be32(HFSP_HFSPLUS_CREATOR);
- file->user_info.fdFlags =
+ file->info.user.fdFlags =
cpu_to_be16(0x100);
file->create_date =
HFSPLUS_I(sbi->hidden_dir)->create_date;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 84714bbccc1238..135279a19b559a 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -73,9 +73,9 @@ again:
goto fail;
}
cnid = be32_to_cpu(entry.file.id);
- if (entry.file.user_info.fdType ==
+ if (entry.file.info.user.fdType ==
cpu_to_be32(HFSP_HARDLINK_TYPE) &&
- entry.file.user_info.fdCreator ==
+ entry.file.info.user.fdCreator ==
cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
HFSPLUS_SB(sb)->hidden_dir &&
(entry.file.create_date ==
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 456e87aec7fd7e..005a043bc7eed0 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -260,8 +260,10 @@ struct hfsplus_cat_folder {
__be32 access_date;
__be32 backup_date;
struct hfsplus_perm permissions;
- struct DInfo user_info;
- struct DXInfo finder_info;
+ struct {
+ struct DInfo user;
+ struct DXInfo finder;
+ } info;
__be32 text_encoding;
__be32 subfolders; /* Subfolder count in HFSX. Reserved in HFS+. */
} __packed;
@@ -294,8 +296,10 @@ struct hfsplus_cat_file {
__be32 access_date;
__be32 backup_date;
struct hfsplus_perm permissions;
- struct FInfo user_info;
- struct FXInfo finder_info;
+ struct {
+ struct FInfo user;
+ struct FXInfo finder;
+ } info;
__be32 text_encoding;
u32 reserved2;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 6fef67c2a9f090..d08a8d1d40a4cb 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -509,8 +509,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
if (type == HFSPLUS_FOLDER) {
struct hfsplus_cat_folder *folder = &entry.folder;
- if (fd->entrylength < sizeof(struct hfsplus_cat_folder))
- /* panic? */;
+ WARN_ON(fd->entrylength < sizeof(struct hfsplus_cat_folder));
hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
sizeof(struct hfsplus_cat_folder));
hfsplus_get_perms(inode, &folder->permissions, 1);
@@ -530,8 +529,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
} else if (type == HFSPLUS_FILE) {
struct hfsplus_cat_file *file = &entry.file;
- if (fd->entrylength < sizeof(struct hfsplus_cat_file))
- /* panic? */;
+ WARN_ON(fd->entrylength < sizeof(struct hfsplus_cat_file));
hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
sizeof(struct hfsplus_cat_file));
@@ -588,8 +586,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
if (S_ISDIR(main_inode->i_mode)) {
struct hfsplus_cat_folder *folder = &entry.folder;
- if (fd.entrylength < sizeof(struct hfsplus_cat_folder))
- /* panic? */;
+ WARN_ON(fd.entrylength < sizeof(struct hfsplus_cat_folder));
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_folder));
/* simple node checks? */
@@ -614,8 +611,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
} else {
struct hfsplus_cat_file *file = &entry.file;
- if (fd.entrylength < sizeof(struct hfsplus_cat_file))
- /* panic? */;
+ WARN_ON(fd.entrylength < sizeof(struct hfsplus_cat_file));
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_file));
hfsplus_inode_write_fork(inode, &file->data_fork);
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index e2855ceefd3943..b21811baab0f26 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -261,10 +261,8 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
struct hfs_find_data cat_fd;
hfsplus_cat_entry entry;
u16 cat_entry_flags, cat_entry_type;
- u16 folder_finderinfo_len = sizeof(struct DInfo) +
- sizeof(struct DXInfo);
- u16 file_finderinfo_len = sizeof(struct FInfo) +
- sizeof(struct FXInfo);
+ u16 folder_finderinfo_len = sizeof(entry.folder.info);
+ u16 file_finderinfo_len = sizeof(entry.file.info);
if ((!S_ISREG(inode->i_mode) &&
!S_ISDIR(inode->i_mode)) ||
@@ -296,7 +294,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
sizeof(hfsplus_cat_entry));
if (be16_to_cpu(entry.type) == HFSPLUS_FOLDER) {
if (size == folder_finderinfo_len) {
- memcpy(&entry.folder.user_info, value,
+ memcpy(&entry.folder.info, value,
folder_finderinfo_len);
hfs_bnode_write(cat_fd.bnode, &entry,
cat_fd.entryoffset,
@@ -309,7 +307,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
}
} else if (be16_to_cpu(entry.type) == HFSPLUS_FILE) {
if (size == file_finderinfo_len) {
- memcpy(&entry.file.user_info, value,
+ memcpy(&entry.file.info, value,
file_finderinfo_len);
hfs_bnode_write(cat_fd.bnode, &entry,
cat_fd.entryoffset,
@@ -462,14 +460,14 @@ static ssize_t hfsplus_getxattr_finder_info(struct inode *inode,
if (entry_type == HFSPLUS_FOLDER) {
hfs_bnode_read(fd.bnode, folder_finder_info,
fd.entryoffset +
- offsetof(struct hfsplus_cat_folder, user_info),
+ offsetof(struct hfsplus_cat_folder, info.user),
folder_rec_len);
memcpy(value, folder_finder_info, folder_rec_len);
res = folder_rec_len;
} else if (entry_type == HFSPLUS_FILE) {
hfs_bnode_read(fd.bnode, file_finder_info,
fd.entryoffset +
- offsetof(struct hfsplus_cat_file, user_info),
+ offsetof(struct hfsplus_cat_file, info.user),
file_rec_len);
memcpy(value, file_finder_info, file_rec_len);
res = file_rec_len;
@@ -630,14 +628,14 @@ static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry,
len = sizeof(struct DInfo) + sizeof(struct DXInfo);
hfs_bnode_read(fd.bnode, folder_finder_info,
fd.entryoffset +
- offsetof(struct hfsplus_cat_folder, user_info),
+ offsetof(struct hfsplus_cat_folder, info.user),
len);
found_bit = find_first_bit((void *)folder_finder_info, len*8);
} else if (entry_type == HFSPLUS_FILE) {
len = sizeof(struct FInfo) + sizeof(struct FXInfo);
hfs_bnode_read(fd.bnode, file_finder_info,
fd.entryoffset +
- offsetof(struct hfsplus_cat_file, user_info),
+ offsetof(struct hfsplus_cat_file, info.user),
len);
found_bit = find_first_bit((void *)file_finder_info, len*8);
} else {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index cdfb1ae78a3f84..c092c4931c3eff 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1463,18 +1463,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
if (!mnt)
return ERR_PTR(-ENOENT);
- if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
- *ucounts = current_ucounts();
- if (user_shm_lock(size, *ucounts)) {
- task_lock(current);
- pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
- current->comm, current->pid);
- task_unlock(current);
- } else {
- *ucounts = NULL;
- return ERR_PTR(-EPERM);
- }
- }
+ if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm())
+ return ERR_PTR(-EPERM);
file = ERR_PTR(-ENOSPC);
inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0);
diff --git a/fs/inode.c b/fs/inode.c
index ed0cab8a32db10..a49695f57e1eaa 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -428,11 +428,20 @@ void ihold(struct inode *inode)
}
EXPORT_SYMBOL(ihold);
-static void inode_lru_list_add(struct inode *inode)
+static void __inode_add_lru(struct inode *inode, bool rotate)
{
+ if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
+ return;
+ if (atomic_read(&inode->i_count))
+ return;
+ if (!(inode->i_sb->s_flags & SB_ACTIVE))
+ return;
+ if (!mapping_shrinkable(&inode->i_data))
+ return;
+
if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_inc(nr_unused);
- else
+ else if (rotate)
inode->i_state |= I_REFERENCED;
}
@@ -443,16 +452,11 @@ static void inode_lru_list_add(struct inode *inode)
*/
void inode_add_lru(struct inode *inode)
{
- if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
- I_FREEING | I_WILL_FREE)) &&
- !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE)
- inode_lru_list_add(inode);
+ __inode_add_lru(inode, false);
}
-
static void inode_lru_list_del(struct inode *inode)
{
-
if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_dec(nr_unused);
}
@@ -728,10 +732,6 @@ again:
/*
* Isolate the inode from the LRU in preparation for freeing it.
*
- * Any inodes which are pinned purely because of attached pagecache have their
- * pagecache removed. If the inode has metadata buffers attached to
- * mapping->private_list then try to remove them.
- *
* If the inode has the I_REFERENCED flag set, then it means that it has been
* used recently - the flag is set in iput_final(). When we encounter such an
* inode, clear the flag and move it to the back of the LRU so it gets another
@@ -747,31 +747,39 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
struct inode *inode = container_of(item, struct inode, i_lru);
/*
- * we are inverting the lru lock/inode->i_lock here, so use a trylock.
- * If we fail to get the lock, just skip it.
+ * We are inverting the lru lock/inode->i_lock here, so use a
+ * trylock. If we fail to get the lock, just skip it.
*/
if (!spin_trylock(&inode->i_lock))
return LRU_SKIP;
/*
- * Referenced or dirty inodes are still in use. Give them another pass
- * through the LRU as we canot reclaim them now.
+ * Inodes can get referenced, redirtied, or repopulated while
+ * they're already on the LRU, and this can make them
+ * unreclaimable for a while. Remove them lazily here; iput,
+ * sync, or the last page cache deletion will requeue them.
*/
if (atomic_read(&inode->i_count) ||
- (inode->i_state & ~I_REFERENCED)) {
+ (inode->i_state & ~I_REFERENCED) ||
+ !mapping_shrinkable(&inode->i_data)) {
list_lru_isolate(lru, &inode->i_lru);
spin_unlock(&inode->i_lock);
this_cpu_dec(nr_unused);
return LRU_REMOVED;
}
- /* recently referenced inodes get one more pass */
+ /* Recently referenced inodes get one more pass */
if (inode->i_state & I_REFERENCED) {
inode->i_state &= ~I_REFERENCED;
spin_unlock(&inode->i_lock);
return LRU_ROTATE;
}
+ /*
+ * On highmem systems, mapping_shrinkable() permits dropping
+ * page cache in order to free up struct inodes: lowmem might
+ * be under pressure before the cache inside the highmem zone.
+ */
if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
__iget(inode);
spin_unlock(&inode->i_lock);
@@ -1638,7 +1646,7 @@ static void iput_final(struct inode *inode)
if (!drop &&
!(inode->i_state & I_DONTCACHE) &&
(sb->s_flags & SB_ACTIVE)) {
- inode_add_lru(inode);
+ __inode_add_lru(inode, true);
spin_unlock(&inode->i_lock);
return;
}
diff --git a/fs/internal.h b/fs/internal.h
index 3cd065c8a66b4c..2854ff29f1167b 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -149,7 +149,6 @@ extern int vfs_open(const struct path *, struct file *);
* inode.c
*/
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
-extern void inode_add_lru(struct inode *inode);
extern int dentry_needs_remove_privs(struct dentry *dentry);
/*
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f1cc8258d34a4b..b05fde7edc3a50 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5940,6 +5940,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
+ ocfs2_commit_trans(osb, handle);
mlog_errno(status);
goto bail;
}
@@ -5964,6 +5965,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
data_alloc_bh, start_blk,
num_clusters);
if (status < 0) {
+ ocfs2_commit_trans(osb, handle);
mlog_errno(status);
goto bail;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2c46ff6ba4ea23..0939186f010e32 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -453,8 +453,12 @@ roll_back:
leave:
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
- if (handle)
+ if (handle) {
+ if (status < 0 && new_fe_bh != NULL)
+ ocfs2_set_links_count((struct ocfs2_dinode *)
+ new_fe_bh->b_data, 0);
ocfs2_commit_trans(osb, handle);
+ }
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
@@ -598,6 +602,8 @@ static int __ocfs2_mknod_locked(struct inode *dir,
leave:
if (status < 0) {
if (*new_fe_bh) {
+ if (fe)
+ ocfs2_set_links_count(fe, 0);
brelse(*new_fe_bh);
*new_fe_bh = NULL;
}
@@ -634,7 +640,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
parent_fe_bh, handle, inode_ac,
fe_blkno, suballoc_loc, suballoc_bit);
- if (status < 0) {
+ if (status < 0 && !(OCFS2_I(inode)->ip_inode_lockres.l_flags &
+ OCFS2_LOCK_INITIALIZED)) {
u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);
@@ -2027,8 +2034,12 @@ bail:
ocfs2_clusters_to_bytes(osb->sb, 1));
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
- if (handle)
+ if (handle) {
+ if (status < 0 && new_fe_bh != NULL)
+ ocfs2_set_links_count((struct ocfs2_dinode *)
+ new_fe_bh->b_data, 0);
ocfs2_commit_trans(osb, handle);
+ }
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
@@ -2489,6 +2500,7 @@ out:
}
int ocfs2_create_inode_in_orphan(struct inode *dir,
+ struct buffer_head **dir_bh,
int mode,
struct inode **new_inode)
{
@@ -2597,13 +2609,16 @@ leave:
brelse(new_di_bh);
- if (!status)
- *new_inode = inode;
-
ocfs2_free_dir_lookup_result(&orphan_insert);
- ocfs2_inode_unlock(dir, 1);
- brelse(parent_di_bh);
+ if (!status) {
+ *new_inode = inode;
+ *dir_bh = parent_di_bh;
+ } else {
+ ocfs2_inode_unlock(dir, 1);
+ brelse(parent_di_bh);
+ }
+
return status;
}
@@ -2760,11 +2775,11 @@ bail:
}
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+ struct buffer_head *dir_bh,
struct inode *inode,
struct dentry *dentry)
{
int status = 0;
- struct buffer_head *parent_di_bh = NULL;
handle_t *handle = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_dinode *dir_di, *di;
@@ -2778,14 +2793,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
(unsigned long long)OCFS2_I(dir)->ip_blkno,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
- if (status < 0) {
- if (status != -ENOENT)
- mlog_errno(status);
- return status;
- }
-
- dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
+ dir_di = (struct ocfs2_dinode *) dir_bh->b_data;
if (!dir_di->i_links_count) {
/* can't make a file in a deleted directory. */
status = -ENOENT;
@@ -2798,7 +2806,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
goto leave;
/* get a spot inside the dir. */
- status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
+ status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh,
dentry->d_name.name,
dentry->d_name.len, &lookup);
if (status < 0) {
@@ -2862,7 +2870,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
ocfs2_journal_dirty(handle, di_bh);
status = ocfs2_add_entry(handle, dentry, inode,
- OCFS2_I(inode)->ip_blkno, parent_di_bh,
+ OCFS2_I(inode)->ip_blkno, dir_bh,
&lookup);
if (status < 0) {
mlog_errno(status);
@@ -2886,10 +2894,7 @@ orphan_unlock:
iput(orphan_dir_inode);
leave:
- ocfs2_inode_unlock(dir, 1);
-
brelse(di_bh);
- brelse(parent_di_bh);
brelse(orphan_dir_bh);
ocfs2_free_dir_lookup_result(&lookup);
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 9cc891eb874e04..03a2c526e2c1b8 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
struct buffer_head *orphan_dir_bh,
bool dio);
int ocfs2_create_inode_in_orphan(struct inode *dir,
+ struct buffer_head **dir_bh,
int mode,
struct inode **new_inode);
int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
@@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
struct inode *inode, struct buffer_head *di_bh,
int update_isize, loff_t end);
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+ struct buffer_head *dir_bh,
struct inode *new_inode,
struct dentry *new_dentry);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7f6355cbb58759..a9a0c7c37e8ed2 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
{
int error, had_lock;
struct inode *inode = d_inode(old_dentry);
- struct buffer_head *old_bh = NULL;
+ struct buffer_head *old_bh = NULL, *dir_bh = NULL;
struct inode *new_orphan_inode = NULL;
struct ocfs2_lock_holder oh;
@@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
return -EOPNOTSUPP;
- error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
+ error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode,
&new_orphan_inode);
if (error) {
mlog_errno(error);
@@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
/* If the security isn't preserved, we need to re-initialize them. */
if (!preserve) {
- error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
+ error = ocfs2_init_security_and_acl(dir, dir_bh,
+ new_orphan_inode,
&new_dentry->d_name);
if (error)
mlog_errno(error);
}
if (!error) {
- error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
+ error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh,
+ new_orphan_inode,
new_dentry);
if (error)
mlog_errno(error);
@@ -4328,6 +4330,11 @@ out:
iput(new_orphan_inode);
}
+ if (dir_bh) {
+ ocfs2_inode_unlock(dir, 1);
+ brelse(dir_bh);
+ }
+
return error;
}
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index dd784eb0cd7c42..3f23e3a5018ce7 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7203,16 +7203,13 @@ out:
/*
* Initialize security and acl for a already created inode.
* Used for reflink a non-preserve-security file.
- *
- * It uses common api like ocfs2_xattr_set, so the caller
- * must not hold any lock expect i_mutex.
*/
int ocfs2_init_security_and_acl(struct inode *dir,
+ struct buffer_head *dir_bh,
struct inode *inode,
const struct qstr *qstr)
{
int ret = 0;
- struct buffer_head *dir_bh = NULL;
ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
if (ret) {
@@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
goto leave;
}
- ret = ocfs2_inode_lock(dir, &dir_bh, 0);
- if (ret) {
- mlog_errno(ret);
- goto leave;
- }
ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
if (ret)
mlog_errno(ret);
- ocfs2_inode_unlock(dir, 0);
- brelse(dir_bh);
leave:
return ret;
}
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 00308b57f64f18..b27fd8ba00196a 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
struct buffer_head *new_bh,
bool preserve_security);
int ocfs2_init_security_and_acl(struct inode *dir,
+ struct buffer_head *dir_bh,
struct inode *inode,
const struct qstr *qstr);
#endif /* OCFS2_XATTR_H */
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index f5c25f580dd92f..9323a854a60aeb 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -134,8 +134,7 @@ struct posix_acl *get_acl(struct inode *inode, int type)
* to just call ->get_acl to fetch the ACL ourself. (This is going to
* be an unlikely race.)
*/
- if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED)
- /* fall through */ ;
+ cmpxchg(p, ACL_NOT_CACHED, sentinel);
/*
* Normally, the ACL returned by ->get_acl will be cached.
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 172c86270b3127..913bef0d2a36c4 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -72,7 +72,7 @@ out:
return 0;
}
-static int seq_fdinfo_open(struct inode *inode, struct file *file)
+static int proc_fdinfo_access_allowed(struct inode *inode)
{
bool allowed = false;
struct task_struct *task = get_proc_task(inode);
@@ -86,6 +86,16 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file)
if (!allowed)
return -EACCES;
+ return 0;
+}
+
+static int seq_fdinfo_open(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
return single_open(file, seq_show, inode);
}
@@ -348,12 +358,23 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
proc_fdinfo_instantiate);
}
+static int proc_open_fdinfo(struct inode *inode, struct file *file)
+{
+ int ret = proc_fdinfo_access_allowed(inode);
+
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
const struct inode_operations proc_fdinfo_inode_operations = {
.lookup = proc_lookupfdinfo,
.setattr = proc_setattr,
};
const struct file_operations proc_fdinfo_operations = {
+ .open = proc_open_fdinfo,
.read = generic_read_dir,
.iterate_shared = proc_readfdinfo,
.llseek = generic_file_llseek,
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 9f1077d94cde16..4dcbcd506cb6e5 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -115,7 +115,10 @@ u64 stable_page_flags(struct page *page)
* it differentiates a memory hole from a page with no flags
*/
if (!page)
- return 1 << KPF_NOPAGE;
+ return BIT_ULL(KPF_NOPAGE);
+
+ if (pfn_zone_device_reserved(page_to_pfn(page)))
+ return BIT_ULL(KPF_RESERVED);
k = page->flags;
u = 0;
@@ -127,22 +130,22 @@ u64 stable_page_flags(struct page *page)
* simple test in page_mapped() is not enough.
*/
if (!PageSlab(page) && page_mapped(page))
- u |= 1 << KPF_MMAP;
+ u |= BIT_ULL(KPF_MMAP);
if (PageAnon(page))
- u |= 1 << KPF_ANON;
+ u |= BIT_ULL(KPF_ANON);
if (PageKsm(page))
- u |= 1 << KPF_KSM;
+ u |= BIT_ULL(KPF_KSM);
/*
* compound pages: export both head/tail info
* they together define a compound page's start/end pos and order
*/
if (PageHead(page))
- u |= 1 << KPF_COMPOUND_HEAD;
+ u |= BIT_ULL(KPF_COMPOUND_HEAD);
if (PageTail(page))
- u |= 1 << KPF_COMPOUND_TAIL;
+ u |= BIT_ULL(KPF_COMPOUND_TAIL);
if (PageHuge(page))
- u |= 1 << KPF_HUGE;
+ u |= BIT_ULL(KPF_HUGE);
/*
* PageTransCompound can be true for non-huge compound pages (slab
* pages or pages allocated by drivers with __GFP_COMP) because it
@@ -153,14 +156,13 @@ u64 stable_page_flags(struct page *page)
struct page *head = compound_head(page);
if (PageLRU(head) || PageAnon(head))
- u |= 1 << KPF_THP;
+ u |= BIT_ULL(KPF_THP);
else if (is_huge_zero_page(head)) {
- u |= 1 << KPF_ZERO_PAGE;
- u |= 1 << KPF_THP;
+ u |= BIT_ULL(KPF_ZERO_PAGE);
+ u |= BIT_ULL(KPF_THP);
}
} else if (is_zero_pfn(page_to_pfn(page)))
- u |= 1 << KPF_ZERO_PAGE;
-
+ u |= BIT_ULL(KPF_ZERO_PAGE);
/*
* Caveats on high order pages: page->_refcount will only be set
@@ -168,23 +170,23 @@ u64 stable_page_flags(struct page *page)
* SLOB won't set PG_slab at all on compound pages.
*/
if (PageBuddy(page))
- u |= 1 << KPF_BUDDY;
+ u |= BIT_ULL(KPF_BUDDY);
else if (page_count(page) == 0 && is_free_buddy_page(page))
- u |= 1 << KPF_BUDDY;
+ u |= BIT_ULL(KPF_BUDDY);
if (PageOffline(page))
- u |= 1 << KPF_OFFLINE;
+ u |= BIT_ULL(KPF_OFFLINE);
if (PageTable(page))
- u |= 1 << KPF_PGTABLE;
+ u |= BIT_ULL(KPF_PGTABLE);
if (page_is_idle(page))
- u |= 1 << KPF_IDLE;
+ u |= BIT_ULL(KPF_IDLE);
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
if (PageTail(page) && PageSlab(compound_head(page)))
- u |= 1 << KPF_SLAB;
+ u |= BIT_ULL(KPF_SLAB);
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
@@ -197,7 +199,7 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim);
if (PageSwapCache(page))
- u |= 1 << KPF_SWAPCACHE;
+ u |= BIT_ULL(KPF_SWAPCACHE);
u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked);
u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable);
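
The fs/proc/page.c hunks above switch the KPF_* flag assembly from 1 << n to BIT_ULL(n). The KPF_* indices used here appear to all be below 32, so this reads mainly as a consistency and hardening change: 1 << n is evaluated as a 32-bit signed int before being widened into the u64, which is undefined behaviour for n >= 31. A minimal userspace sketch, illustrative only, with BIT_ULL redefined locally:

#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(nr) (1ULL << (nr))

int main(void)
{
	uint64_t low  = 1 << 20;     /* fine: the shift still fits in int */
	uint64_t high = BIT_ULL(40); /* 64-bit shift, well defined */

	/* 1 << 40 would be undefined behaviour: the shift is evaluated
	 * in (32-bit) int before being widened to uint64_t. */
	printf("low=%#llx high=%#llx\n",
	       (unsigned long long)low, (unsigned long long)high);
	return 0;
}
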
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index cf25be3e032120..ad667dbc96f5cb 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -397,7 +397,6 @@ struct mem_size_stats {
u64 pss_shmem;
u64 pss_locked;
u64 swap_pss;
- bool check_shmem_swap;
};
static void smaps_page_accumulate(struct mem_size_stats *mss,
@@ -478,9 +477,11 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end,
__always_unused int depth, struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
- mss->swap += shmem_partial_swap_usage(
- walk->vma->vm_file->f_mapping, addr, end);
+ mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
+ linear_page_index(vma, addr),
+ linear_page_index(vma, end));
return 0;
}
@@ -488,6 +489,16 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end,
#define smaps_pte_hole NULL
#endif /* CONFIG_SHMEM */
+static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
+{
+#ifdef CONFIG_SHMEM
+ if (walk->ops->pte_hole) {
+ /* depth is not used */
+ smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
+ }
+#endif
+}
+
static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct mm_walk *walk)
{
@@ -516,12 +527,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
}
} else if (is_pfn_swap_entry(swpent))
page = pfn_swap_entry_to_page(swpent);
- } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
- && pte_none(*pte))) {
- page = xa_load(&vma->vm_file->f_mapping->i_pages,
- linear_page_index(vma, addr));
- if (xa_is_value(page))
- mss->swap += PAGE_SIZE;
+ } else {
+ smaps_pte_hole_lookup(addr, walk);
return;
}
@@ -735,8 +742,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
return;
#ifdef CONFIG_SHMEM
- /* In case of smaps_rollup, reset the value from previous vma */
- mss->check_shmem_swap = false;
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
/*
* For shared or readonly shmem mappings we know that all
@@ -754,7 +759,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
!(vma->vm_flags & VM_WRITE))) {
mss->swap += shmem_swapped;
} else {
- mss->check_shmem_swap = true;
ops = &smaps_shmem_walk_ops;
}
}
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index e2302342a67f47..bc66d0173e3300 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -204,17 +204,20 @@ static int ramfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
int opt;
opt = fs_parse(fc, ramfs_fs_parameters, param, &result);
- if (opt < 0) {
+ if (opt == -ENOPARAM) {
+ opt = vfs_parse_fs_param_source(fc, param);
+ if (opt != -ENOPARAM)
+ return opt;
/*
* We might like to report bad mount options here;
* but traditionally ramfs has ignored all mount options,
* and as it is used as a !CONFIG_SHMEM simple substitute
* for tmpfs, better continue to ignore other mount options.
*/
- if (opt == -ENOPARAM)
- opt = 0;
- return opt;
+ return 0;
}
+ if (opt < 0)
+ return opt;
switch (opt) {
case Opt_mode:
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index cc8e2ed155c84e..d1def0771a4083 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -474,10 +474,8 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
struct sysv_sb_info *sbi;
struct buffer_head *bh;
- if (440 != sizeof (struct v7_super_block))
- panic("V7 FS: bad super-block size");
- if (64 != sizeof (struct sysv_inode))
- panic("sysv fs: bad i-node size");
+ BUILD_BUG_ON(sizeof(struct v7_super_block) != 440);
+ BUILD_BUG_ON(sizeof(struct sysv_inode) != 64);
sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
if (!sbi)
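
The v7_fill_super() hunk replaces runtime panic() checks on the on-disk structure sizes with BUILD_BUG_ON(), turning them into build failures. A userspace analogue using _Static_assert, with a hypothetical stand-in struct where only the size matters:

#include <stdint.h>

/* Hypothetical stand-in for the on-disk layout, illustration only. */
struct v7_super_block_example {
	uint8_t raw[440];
};

/* Userspace analogue of BUILD_BUG_ON: the build fails if the size
 * assumption is ever violated, instead of panicking at mount time. */
_Static_assert(sizeof(struct v7_super_block_example) == 440,
	       "V7 FS: bad super-block size");

int main(void)
{
	return 0;
}
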
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 003f0d31743eb3..22bf14ab2d163b 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1827,9 +1827,15 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
if (mode_wp && mode_dontwake)
return -EINVAL;
- ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
- uffdio_wp.range.len, mode_wp,
- &ctx->mmap_changing);
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
+ uffdio_wp.range.len, mode_wp,
+ &ctx->mmap_changing);
+ mmput(ctx->mm);
+ } else {
+ return -ESRCH;
+ }
+
if (ret)
return ret;
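
userfaultfd_writeprotect() now pins the target mm with mmget_not_zero() before calling mwriteprotect_range(), so the ioctl fails with -ESRCH instead of operating on an mm whose users have already gone away. A sketch of the pattern only, with a hypothetical do_range_op() standing in for the real work:

#include <linux/errno.h>
#include <linux/sched/mm.h>

/* do_range_op() is a hypothetical stand-in for the real operation. */
static int do_range_op(struct mm_struct *mm)
{
	return 0;
}

static int with_live_mm(struct mm_struct *mm)
{
	int ret;

	/* Only pin the address space if it has not been torn down yet,
	 * i.e. mm_users has not already dropped to zero. */
	if (!mmget_not_zero(mm))
		return -ESRCH;

	ret = do_range_op(mm);
	mmput(mm);
	return ret;
}
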
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 9957085b8148f2..61b0d4e33fc765 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -142,3 +142,12 @@
#else
#define __diag_GCC_8(s)
#endif
+
+/*
+ * https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-alloc_005fsize-function-attribute
+ * However, prior to 9.1, -Wno-alloc-size-larger-than does not work,
+ * making this attribute unusable.
+ */
+#if GCC_VERSION < 90100
+#define __alloc_size(x, ...) /**/
+#endif
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 05ceb2e92b0e72..505591d0d4560a 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -254,6 +254,11 @@ struct ftrace_likely_data {
#define asm_volatile_goto(x...) asm goto(x)
#endif
+/* If not specifically disabled, allow the use of __alloc_size attribute. */
+#ifndef __alloc_size
+# define __alloc_size(x, ...) __attribute__((__alloc_size__(x, ## __VA_ARGS__)))
+#endif
+
#ifdef CONFIG_CC_HAS_ASM_INLINE
#define asm_inline asm __inline
#else
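
Together with the GCC version guard in compiler-gcc.h above, this defines __alloc_size() as the compiler's alloc_size attribute, which tells it which parameter(s) carry the allocation size so object-size tracking (FORTIFY-style checks, -Wstringop-overflow) can see through allocator wrappers. A hedged userspace sketch, with a hypothetical my_alloc() and the macro redefined locally:

#include <stdio.h>
#include <stdlib.h>

#define __alloc_size(x, ...) __attribute__((__alloc_size__(x, ## __VA_ARGS__)))

/* my_alloc() is a hypothetical allocator, for illustration only. */
__alloc_size(1)
static void *my_alloc(size_t size)
{
	return malloc(size);
}

int main(void)
{
	char *p = my_alloc(8);

	if (!p)
		return 1;
	/* With the attribute, the compiler can see the allocation size
	 * through the wrapper; on a recent GCC at -O2 this should print
	 * 8, and (size_t)-1 (unknown) if the attribute is removed. */
	printf("known object size: %zu\n", __builtin_object_size(p, 0));
	free(p);
	return 0;
}
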
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index d2b9c41c8edf5e..d58e0476ee8e3b 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -34,6 +34,8 @@
*/
extern struct static_key_false cpusets_pre_enable_key;
extern struct static_key_false cpusets_enabled_key;
+extern struct static_key_false cpusets_insane_config_key;
+
static inline bool cpusets_enabled(void)
{
return static_branch_unlikely(&cpusets_enabled_key);
@@ -51,6 +53,19 @@ static inline void cpuset_dec(void)
static_branch_dec_cpuslocked(&cpusets_pre_enable_key);
}
+/*
+ * This will get enabled whenever a cpuset configuration is considered
+ * unsupportable in general, e.g. a movable-only node which cannot satisfy
+ * any non-movable allocations (see update_nodemask). The page allocator
+ * needs to make additional checks for those configurations, and this
+ * check is meant to guard those checks without any overhead for sane
+ * configurations.
+ */
+static inline bool cpusets_insane_config(void)
+{
+ return static_branch_unlikely(&cpusets_insane_config_key);
+}
+
extern int cpuset_init(void);
extern void cpuset_init_smp(void);
extern void cpuset_force_rebuild(void);
@@ -167,6 +182,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
static inline bool cpusets_enabled(void) { return false; }
+static inline bool cpusets_insane_config(void) { return false; }
+
static inline int cpuset_init(void) { return 0; }
static inline void cpuset_init_smp(void) {}
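
cpusets_insane_config() is a static-branch guard: it stays false, and essentially free, for sane configurations and only enables extra checks once an unsupportable cpuset layout has been detected. A hedged sketch of the intended caller pattern, assuming this series is applied and using a hypothetical expensive check:

#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/types.h>

/* Hypothetical expensive check, used only to show the guard pattern. */
static bool nodemask_cannot_satisfy(gfp_t gfp_mask)
{
	return false;
}

static bool warn_if_insane_config(gfp_t gfp_mask)
{
	/* Static branch: effectively free for sane configurations. */
	if (!cpusets_insane_config())
		return false;

	return nodemask_cannot_satisfy(gfp_mask);
}
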
diff --git a/include/linux/damon.h b/include/linux/damon.h
index d68b67b8d458d7..755d70804705b3 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -62,7 +62,7 @@ struct damon_target {
struct damon_ctx;
/**
- * struct damon_primitive Monitoring primitives for given use cases.
+ * struct damon_primitive - Monitoring primitives for given use cases.
*
* @init: Initialize primitive-internal data structures.
* @update: Update primitive-internal data structures.
@@ -108,8 +108,8 @@ struct damon_primitive {
void (*cleanup)(struct damon_ctx *context);
};
-/*
- * struct damon_callback Monitoring events notification callbacks.
+/**
+ * struct damon_callback - Monitoring events notification callbacks.
*
* @before_start: Called before starting the monitoring.
* @after_sampling: Called after each sampling.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e7a633353fd20c..be0afaea2eae1f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3192,6 +3192,7 @@ static inline void remove_inode_hash(struct inode *inode)
}
extern void inode_sb_list_add(struct inode *inode);
+extern void inode_add_lru(struct inode *inode);
extern int sb_set_blocksize(struct super_block *, int);
extern int sb_min_blocksize(struct super_block *, int);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 3745efd21cf6df..897538d5ffd202 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -618,8 +618,10 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
extern unsigned long get_zeroed_page(gfp_t gfp_mask);
+__alloc_size(1)
void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
void free_pages_exact(void *virt, size_t size);
+__alloc_size(1)
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
#define __get_free_page(gfp_mask) \
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 27cdd715c5f94f..25aff0f2ed0b00 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -180,9 +180,9 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size)
#ifndef clear_user_highpage
static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
{
- void *addr = kmap_atomic(page);
+ void *addr = kmap_local_page(page);
clear_user_page(addr, vaddr, page);
- kunmap_atomic(addr);
+ kunmap_local(addr);
}
#endif
@@ -214,9 +214,9 @@ alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
static inline void clear_highpage(struct page *page)
{
- void *kaddr = kmap_atomic(page);
+ void *kaddr = kmap_local_page(page);
clear_page(kaddr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
}
#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
@@ -239,7 +239,7 @@ static inline void zero_user_segments(struct page *page,
unsigned start1, unsigned end1,
unsigned start2, unsigned end2)
{
- void *kaddr = kmap_atomic(page);
+ void *kaddr = kmap_local_page(page);
unsigned int i;
BUG_ON(end1 > page_size(page) || end2 > page_size(page));
@@ -250,7 +250,7 @@ static inline void zero_user_segments(struct page *page,
if (end2 > start2)
memset(kaddr + start2, 0, end2 - start2);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
for (i = 0; i < compound_nr(page); i++)
flush_dcache_page(page + i);
}
@@ -275,11 +275,11 @@ static inline void copy_user_highpage(struct page *to, struct page *from,
{
char *vfrom, *vto;
- vfrom = kmap_atomic(from);
- vto = kmap_atomic(to);
+ vfrom = kmap_local_page(from);
+ vto = kmap_local_page(to);
copy_user_page(vto, vfrom, vaddr, to);
- kunmap_atomic(vto);
- kunmap_atomic(vfrom);
+ kunmap_local(vto);
+ kunmap_local(vfrom);
}
#endif
@@ -290,11 +290,11 @@ static inline void copy_highpage(struct page *to, struct page *from)
{
char *vfrom, *vto;
- vfrom = kmap_atomic(from);
- vto = kmap_atomic(to);
+ vfrom = kmap_local_page(from);
+ vto = kmap_local_page(to);
copy_page(vto, vfrom);
- kunmap_atomic(vto);
- kunmap_atomic(vfrom);
+ kunmap_local(vto);
+ kunmap_local(vfrom);
}
#endif
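
The highmem.h hunks convert these helpers from kmap_atomic()/kunmap_atomic() to kmap_local_page()/kunmap_local(). Local kmaps are per-thread, may nest, and are expected to be released in reverse order of mapping, as copy_user_highpage() above does; unlike kmap_atomic() they do not disable pagefaults or preemption. A minimal kernel-style sketch, assuming a kernel build context and a made-up helper name:

#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/string.h>

/* copy_one_page() is a hypothetical helper for illustration only. */
static void copy_one_page(struct page *dst, struct page *src)
{
	void *vsrc = kmap_local_page(src);
	void *vdst = kmap_local_page(dst);

	memcpy(vdst, vsrc, PAGE_SIZE);

	kunmap_local(vdst);	/* mapped last, unmapped first */
	kunmap_local(vsrc);
}
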
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 1faebe1cd0ed5b..85e9a9423fe3a3 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -143,9 +143,6 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct page *ref_page);
-void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct page *ref_page);
void hugetlb_report_meminfo(struct seq_file *);
int hugetlb_report_node_meminfo(char *buf, int len, int nid);
void hugetlb_show_meminfo(void);
@@ -385,13 +382,6 @@ static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
BUG();
}
-static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
- struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page)
-{
- BUG();
-}
-
static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
@@ -533,6 +523,11 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
* HPG_freed - Set when page is on the free lists.
* Synchronization: hugetlb_lock held for examination and modification.
* HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed.
+ * HPG_cma - Set if huge page was directly allocated from CMA area via
+ * cma_alloc. Initially set for gigantic page cma allocations, but can
+ * be set in non-gigantic pages if gigantic pages are demoted.
+ * Synchronization: Only accessed or modified when there is only one
+ * reference to the page at allocation, free or demote time.
*/
enum hugetlb_page_flags {
HPG_restore_reserve = 0,
@@ -540,6 +535,7 @@ enum hugetlb_page_flags {
HPG_temporary,
HPG_freed,
HPG_vmemmap_optimized,
+ HPG_cma,
__NR_HPAGEFLAGS,
};
@@ -586,6 +582,7 @@ HPAGEFLAG(Migratable, migratable)
HPAGEFLAG(Temporary, temporary)
HPAGEFLAG(Freed, freed)
HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
+HPAGEFLAG(Cma, cma)
#ifdef CONFIG_HUGETLB_PAGE
@@ -596,6 +593,7 @@ struct hstate {
int next_nid_to_alloc;
int next_nid_to_free;
unsigned int order;
+ unsigned int demote_order;
unsigned long mask;
unsigned long max_huge_pages;
unsigned long nr_huge_pages;
@@ -605,6 +603,7 @@ struct hstate {
unsigned long nr_overcommit_huge_pages;
struct list_head hugepage_activelist;
struct list_head hugepage_freelists[MAX_NUMNODES];
+ unsigned int max_huge_pages_node[MAX_NUMNODES];
unsigned int nr_huge_pages_node[MAX_NUMNODES];
unsigned int free_huge_pages_node[MAX_NUMNODES];
unsigned int surplus_huge_pages_node[MAX_NUMNODES];
@@ -637,8 +636,9 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
unsigned long address, struct page *page);
/* arch callback */
-int __init __alloc_bootmem_huge_page(struct hstate *h);
-int __init alloc_bootmem_huge_page(struct hstate *h);
+int __init __alloc_bootmem_huge_page(struct hstate *h, int nid);
+int __init alloc_bootmem_huge_page(struct hstate *h, int nid);
+bool __init node_specific_alloc_support(void);
void __init hugetlb_add_hstate(unsigned order);
bool __init arch_hugetlb_valid_size(unsigned long size);
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index de5f5913374de3..3946d7ccbd24a2 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -375,12 +375,14 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
void kasan_cache_shrink(struct kmem_cache *cache);
void kasan_cache_shutdown(struct kmem_cache *cache);
void kasan_record_aux_stack(void *ptr);
+void kasan_record_aux_stack_noalloc(void *ptr);
#else /* CONFIG_KASAN_GENERIC */
static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
static inline void kasan_record_aux_stack(void *ptr) {}
+static inline void kasan_record_aux_stack_noalloc(void *ptr) {}
#endif /* CONFIG_KASAN_GENERIC */
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 4091692bed8c00..f9edbbddd35491 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -6,9 +6,9 @@
#ifndef _LINUX_MEMPOLICY_H
#define _LINUX_MEMPOLICY_H 1
+#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
-#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
@@ -43,7 +43,7 @@ struct mm_struct;
* to 1, representing the caller of mpol_dup().
*/
struct mempolicy {
- atomic_t refcnt;
+ refcount_t refcnt;
unsigned short mode; /* See MPOL_* above */
unsigned short flags; /* See set_mempolicy() MPOL_F_* above */
nodemask_t nodes; /* interleave/bind/perfer */
@@ -94,7 +94,7 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
static inline void mpol_get(struct mempolicy *pol)
{
if (pol)
- atomic_inc(&pol->refcnt);
+ refcount_inc(&pol->refcnt);
}
extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index c0e9d35889e8db..119f130ef8f103 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -131,6 +131,7 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
}
#ifdef CONFIG_ZONE_DEVICE
+bool pfn_zone_device_reserved(unsigned long pfn);
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
@@ -143,6 +144,11 @@ unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
unsigned long memremap_compat_align(void);
#else
+static inline bool pfn_zone_device_reserved(unsigned long pfn)
+{
+ return false;
+}
+
static inline void *devm_memremap_pages(struct device *dev,
struct dev_pagemap *pgmap)
{
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 0d2aeb9b0f66a7..6e31d63c34d085 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -19,24 +19,7 @@ struct migration_target_control;
*/
#define MIGRATEPAGE_SUCCESS 0
-/*
- * Keep sync with:
- * - macro MIGRATE_REASON in include/trace/events/migrate.h
- * - migrate_reason_names[MR_TYPES] in mm/debug.c
- */
-enum migrate_reason {
- MR_COMPACTION,
- MR_MEMORY_FAILURE,
- MR_MEMORY_HOTPLUG,
- MR_SYSCALL, /* also applies to cpusets */
- MR_MEMPOLICY_MBIND,
- MR_NUMA_MISPLACED,
- MR_CONTIG_RANGE,
- MR_LONGTERM_PIN,
- MR_DEMOTION,
- MR_TYPES
-};
-
+/* Defined in mm/debug.c: */
extern const char *migrate_reason_names[MR_TYPES];
#ifdef CONFIG_MIGRATION
diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h
index 883c992490334b..f37cc03f9369ed 100644
--- a/include/linux/migrate_mode.h
+++ b/include/linux/migrate_mode.h
@@ -19,4 +19,17 @@ enum migrate_mode {
MIGRATE_SYNC_NO_COPY,
};
+enum migrate_reason {
+ MR_COMPACTION,
+ MR_MEMORY_FAILURE,
+ MR_MEMORY_HOTPLUG,
+ MR_SYSCALL, /* also applies to cpusets */
+ MR_MEMPOLICY_MBIND,
+ MR_NUMA_MISPLACED,
+ MR_CONTIG_RANGE,
+ MR_LONGTERM_PIN,
+ MR_DEMOTION,
+ MR_TYPES
+};
+
#endif /* MIGRATE_MODE_H_INCLUDED */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 40ff114aaf9e61..00bb2d938df4ad 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -794,40 +794,6 @@ static inline int is_vmalloc_or_module_addr(const void *x)
}
#endif
-extern void *kvmalloc_node(size_t size, gfp_t flags, int node);
-static inline void *kvmalloc(size_t size, gfp_t flags)
-{
- return kvmalloc_node(size, flags, NUMA_NO_NODE);
-}
-static inline void *kvzalloc_node(size_t size, gfp_t flags, int node)
-{
- return kvmalloc_node(size, flags | __GFP_ZERO, node);
-}
-static inline void *kvzalloc(size_t size, gfp_t flags)
-{
- return kvmalloc(size, flags | __GFP_ZERO);
-}
-
-static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
-{
- size_t bytes;
-
- if (unlikely(check_mul_overflow(n, size, &bytes)))
- return NULL;
-
- return kvmalloc(bytes, flags);
-}
-
-static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
-{
- return kvmalloc_array(n, size, flags | __GFP_ZERO);
-}
-
-extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize,
- gfp_t flags);
-extern void kvfree(const void *addr);
-extern void kvfree_sensitive(const void *addr, size_t len);
-
static inline int head_compound_mapcount(struct page *head)
{
return atomic_read(compound_mapcount_ptr(head)) + 1;
@@ -904,6 +870,8 @@ void put_pages_list(struct list_head *pages);
void split_page(struct page *page, unsigned int order);
void folio_copy(struct folio *dst, struct folio *src);
+unsigned long nr_free_buffer_pages(void);
+
/*
* Compound pages have a destructor function. Provide a
* prototype for that function and accessor functions.
@@ -1861,12 +1829,24 @@ extern void user_shm_unlock(size_t, struct ucounts *);
* Parameter block passed down to zap_pte_range in exceptional cases.
*/
struct zap_details {
- struct address_space *check_mapping; /* Check page->mapping if set */
- pgoff_t first_index; /* Lowest page->index to unmap */
- pgoff_t last_index; /* Highest page->index to unmap */
+ struct address_space *zap_mapping; /* Check page->mapping if set */
struct page *single_page; /* Locked page to be unmapped */
};
+/*
+ * We set details->zap_mapping when we want to unmap shared but keep private
+ * pages. Return true if we should skip zapping this page, false otherwise.
+ */
+static inline bool
+zap_skip_check_mapping(struct zap_details *details, struct page *page)
+{
+ if (!details || !page)
+ return false;
+
+ return details->zap_mapping &&
+ (details->zap_mapping != page_rmapping(page));
+}
+
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6a1d79d84675a7..fb36a29e3aaeb6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1220,6 +1220,28 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
#define for_each_zone_zonelist(zone, z, zlist, highidx) \
for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
+/* Whether the 'nodes' are all movable nodes */
+static inline bool movable_only_nodes(nodemask_t *nodes)
+{
+ struct zonelist *zonelist;
+ struct zoneref *z;
+ int nid;
+
+ if (nodes_empty(*nodes))
+ return false;
+
+ /*
+ * We can choose an arbitrary node from the nodemask to get a
+ * zonelist as they are interlinked. We just need to find
+ * at least one zone that can satisfy kernel allocations.
+ */
+ nid = first_node(*nodes);
+ zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
+ z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes);
+ return (!z->zone) ? true : false;
+}
+
+
#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
#endif
@@ -1481,7 +1503,7 @@ static inline int pfn_valid(unsigned long pfn)
if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
return 0;
- ms = __nr_to_section(pfn_to_section_nr(pfn));
+ ms = __pfn_to_section(pfn);
if (!valid_section(ms))
return 0;
/*
@@ -1496,7 +1518,7 @@ static inline int pfn_in_present_section(unsigned long pfn)
{
if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
return 0;
- return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
+ return present_section(__pfn_to_section(pfn));
}
static inline unsigned long next_present_section_nr(unsigned long section_nr)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a68af80649a403..70bf0ec29ee3ab 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -236,7 +236,7 @@ static __always_inline int PageCompound(struct page *page)
#define PAGE_POISON_PATTERN -1l
static inline int PagePoisoned(const struct page *page)
{
- return page->flags == PAGE_POISON_PATTERN;
+ return READ_ONCE(page->flags) == PAGE_POISON_PATTERN;
}
#ifdef CONFIG_DEBUG_VM
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 013cdc90f5fdb2..3004c242be3ac2 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -24,6 +24,56 @@ static inline bool mapping_empty(struct address_space *mapping)
}
/*
+ * mapping_shrinkable - test if page cache state allows inode reclaim
+ * @mapping: the page cache mapping
+ *
+ * This checks the mapping's cache state for the purpose of inode
+ * reclaim and LRU management.
+ *
+ * The caller is expected to hold the i_lock, but is not required to
+ * hold the i_pages lock, which usually protects cache state. That's
+ * because the i_lock and the list_lru lock that protect the inode and
+ * its LRU state don't nest inside the irq-safe i_pages lock.
+ *
+ * Cache deletions are performed under the i_lock, which ensures that
+ * when an inode goes empty, it will reliably get queued on the LRU.
+ *
+ * Cache additions do not acquire the i_lock and may race with this
+ * check, in which case we'll report the inode as shrinkable when it
+ * has cache pages. This is okay: the shrinker also checks the
+ * refcount and the referenced bit, which will be elevated or set in
+ * the process of adding new cache pages to an inode.
+ */
+static inline bool mapping_shrinkable(struct address_space *mapping)
+{
+ void *head;
+
+ /*
+ * On highmem systems, there could be lowmem pressure from the
+ * inodes before there is highmem pressure from the page
+ * cache. Make inodes shrinkable regardless of cache state.
+ */
+ if (IS_ENABLED(CONFIG_HIGHMEM))
+ return true;
+
+ /* Cache completely empty? Shrink away. */
+ head = rcu_access_pointer(mapping->i_pages.xa_head);
+ if (!head)
+ return true;
+
+ /*
+ * The xarray stores single offset-0 entries directly in the
+ * head pointer, which allows non-resident page cache entries
+ * to escape the shadow shrinker's list of xarray nodes. The
+ * inode shrinker needs to pick them up under memory pressure.
+ */
+ if (!xa_is_node(head) && xa_is_value(head))
+ return true;
+
+ return false;
+}
+
+/*
* Bits in mapping->flags.
*/
enum mapping_flags {
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 5e76af742c807a..119f41815b322f 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -123,6 +123,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size,
pcpu_fc_populate_pte_fn_t populate_pte_fn);
#endif
+__alloc_size(1)
extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align);
extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr);
extern bool is_kernel_percpu_address(unsigned long addr);
@@ -131,7 +132,9 @@ extern bool is_kernel_percpu_address(unsigned long addr);
extern void __init setup_per_cpu_areas(void);
#endif
+__alloc_size(1)
extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp);
+__alloc_size(1)
extern void __percpu *__alloc_percpu(size_t size, size_t align);
extern void free_percpu(void __percpu *__pdata);
extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index e704b1a4c06c09..221c3c6438a784 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -12,6 +12,8 @@
#include <linux/memcontrol.h>
#include <linux/highmem.h>
+#include <linux/refcount.h>
+
/*
* The anon_vma heads a list of private "related" vmas, to scan if
* an anonymous page pointing to this anon_vma needs to be unmapped:
@@ -36,7 +38,7 @@ struct anon_vma {
* the reference is responsible for clearing up the
* anon_vma if they are the last user on release
*/
- atomic_t refcount;
+ refcount_t refcount;
/*
* Count of child anon_vmas and VMAs which points to this anon_vma.
@@ -100,14 +102,14 @@ enum ttu_flags {
#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
- atomic_inc(&anon_vma->refcount);
+ refcount_inc(&anon_vma->refcount);
}
void __put_anon_vma(struct anon_vma *anon_vma);
static inline void put_anon_vma(struct anon_vma *anon_vma)
{
- if (atomic_dec_and_test(&anon_vma->refcount))
+ if (refcount_dec_and_test(&anon_vma->refcount))
__put_anon_vma(anon_vma);
}
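
The anon_vma refcount moves from atomic_t to refcount_t, whose API saturates and warns on overflow and underflow instead of silently wrapping; atomic_inc() maps to refcount_inc() and atomic_dec_and_test() to refcount_dec_and_test(). A minimal sketch of the same pattern on a hypothetical object:

#include <linux/refcount.h>
#include <linux/slab.h>

/* "thing" is a hypothetical object used only to show the pattern. */
struct thing {
	refcount_t ref;
};

static struct thing *thing_get(struct thing *t)
{
	refcount_inc(&t->ref);		/* saturates and warns on overflow */
	return t;
}

static void thing_put(struct thing *t)
{
	if (refcount_dec_and_test(&t->ref))
		kfree(t);
}
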
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dbe17d9253f3b0..ca69638134036b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1064,6 +1064,7 @@ struct task_struct {
#ifdef CONFIG_DETECT_HUNG_TASK
unsigned long last_switch_count;
unsigned long last_switch_time;
+ unsigned long killed_time;
#endif
/* Filesystem information: */
struct fs_struct *fs;
@@ -1490,6 +1491,13 @@ struct task_struct {
struct callback_head l1d_flush_kill;
#endif
+#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT
+ unsigned long getblk_stamp;
+ unsigned int getblk_executed;
+ unsigned int getblk_bh_count;
+ unsigned long getblk_bh_state;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 5561486fddef7a..fd6e4d14f477da 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,27 @@ static inline void mmdrop(struct mm_struct *mm)
__mmdrop(mm);
}
+/* Helpers for lazy TLB mm refcounting */
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+ mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
+ mmdrop(mm);
+ } else {
+ /*
+ * mmdrop_lazy_tlb must provide a full memory barrier, see the
+ * membarrier comment in finish_task_switch which relies on
+ * this.
+ */
+ smp_mb();
+ }
+}
+
/**
* mmget() - Pin the address space associated with a &struct mm_struct.
* @mm: The address space to pin.
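
mmgrab_lazy_tlb()/mmdrop_lazy_tlb() make the lazy TLB mm reference conditional on CONFIG_MMU_LAZY_TLB_REFCOUNT, with mmdrop_lazy_tlb() still providing the full barrier that finish_task_switch() relies on. A hedged sketch of how a caller might pair them; these are hypothetical functions, not the scheduler code from this series:

#include <linux/sched/mm.h>

static void adopt_lazy_mm(struct mm_struct *mm)
{
	/* Take a reference only when lazy TLB mms are refcounted. */
	mmgrab_lazy_tlb(mm);
}

static void release_lazy_mm(struct mm_struct *mm)
{
	/* Drops the reference, or just issues the required full memory
	 * barrier when CONFIG_MMU_LAZY_TLB_REFCOUNT is not set. */
	mmdrop_lazy_tlb(mm);
}
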
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 083f3ce550bcae..f77a8fbc06d423 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -142,8 +142,6 @@ struct mem_cgroup;
void __init kmem_cache_init(void);
bool slab_is_available(void);
-extern bool usercopy_fallback;
-
struct kmem_cache *kmem_cache_create(const char *name, unsigned int size,
unsigned int align, slab_flags_t flags,
void (*ctor)(void *));
@@ -152,8 +150,8 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
slab_flags_t flags,
unsigned int useroffset, unsigned int usersize,
void (*ctor)(void *));
-void kmem_cache_destroy(struct kmem_cache *);
-int kmem_cache_shrink(struct kmem_cache *);
+void kmem_cache_destroy(struct kmem_cache *s);
+int kmem_cache_shrink(struct kmem_cache *s);
/*
* Please use this macro to create slab caches. Simply specify the
@@ -181,11 +179,12 @@ int kmem_cache_shrink(struct kmem_cache *);
/*
* Common kmalloc functions provided by all allocators
*/
-void * __must_check krealloc(const void *, size_t, gfp_t);
-void kfree(const void *);
-void kfree_sensitive(const void *);
-size_t __ksize(const void *);
-size_t ksize(const void *);
+__must_check __alloc_size(2)
+void *krealloc(const void *objp, size_t new_size, gfp_t flags);
+void kfree(const void *objp);
+void kfree_sensitive(const void *objp);
+size_t __ksize(const void *objp);
+size_t ksize(const void *objp);
#ifdef CONFIG_PRINTK
bool kmem_valid_obj(void *object);
void kmem_dump_obj(void *object);
@@ -425,9 +424,10 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
#define kmalloc_index(s) __kmalloc_index(s, true)
#endif /* !CONFIG_SLOB */
+__alloc_size(1)
void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc;
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc;
-void kmem_cache_free(struct kmem_cache *, void *);
+void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags) __assume_kmalloc_alignment __malloc;
+void kmem_cache_free(struct kmem_cache *s, void *objp);
/*
* Bulk allocation and freeing operations. These are accelerated in an
@@ -436,8 +436,8 @@ void kmem_cache_free(struct kmem_cache *, void *);
*
* Note that interrupts must be enabled when calling these functions.
*/
-void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
-int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p);
+int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p);
/*
* Caller must not use kfree_bulk() on memory not originally allocated
@@ -449,8 +449,10 @@ static __always_inline void kfree_bulk(size_t size, void **p)
}
#ifdef CONFIG_NUMA
-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc;
-void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc;
+__alloc_size(1)
+void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_slab_alignment __malloc;
+void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node)
+ __assume_slab_alignment __malloc;
#else
static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
@@ -464,17 +466,15 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f
#endif
#ifdef CONFIG_TRACING
-extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc;
+extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size)
+ __assume_slab_alignment __malloc;
#ifdef CONFIG_NUMA
-extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
- gfp_t gfpflags,
- int node, size_t size) __assume_slab_alignment __malloc;
+extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
+ int node, size_t size) __assume_slab_alignment __malloc;
#else
-static __always_inline void *
-kmem_cache_alloc_node_trace(struct kmem_cache *s,
- gfp_t gfpflags,
- int node, size_t size)
+static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
+ gfp_t gfpflags, int node, size_t size)
{
return kmem_cache_alloc_trace(s, gfpflags, size);
}
@@ -490,10 +490,8 @@ static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s,
return ret;
}
-static __always_inline void *
-kmem_cache_alloc_node_trace(struct kmem_cache *s,
- gfp_t gfpflags,
- int node, size_t size)
+static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
+ gfp_t gfpflags, int node, size_t size)
{
void *ret = kmem_cache_alloc_node(s, gfpflags, node);
@@ -502,13 +500,15 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
}
#endif /* CONFIG_TRACING */
-extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
+extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
+ __assume_page_alignment __malloc;
#ifdef CONFIG_TRACING
-extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
+extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+ __assume_page_alignment __malloc;
#else
-static __always_inline void *
-kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+static __always_inline void *kmalloc_order_trace(size_t size, gfp_t flags,
+ unsigned int order)
{
return kmalloc_order(size, flags, order);
}
@@ -574,6 +574,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
* Try really hard to succeed the allocation but fail
* eventually.
*/
+__alloc_size(1)
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
if (__builtin_constant_p(size)) {
@@ -596,6 +597,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
return __kmalloc(size, flags);
}
+__alloc_size(1)
static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
{
#ifndef CONFIG_SLOB
@@ -620,6 +622,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
* @size: element size.
* @flags: the type of memory to allocate (see kmalloc).
*/
+__alloc_size(1, 2)
static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
{
size_t bytes;
@@ -638,8 +641,9 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
* @new_size: new size of a single member of the array
* @flags: the type of memory to allocate (see kmalloc)
*/
-static __must_check inline void *
-krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t flags)
+__must_check __alloc_size(2, 3)
+static inline void *krealloc_array(void *p, size_t new_n, size_t new_size,
+ gfp_t flags)
{
size_t bytes;
@@ -655,6 +659,7 @@ krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t flags)
* @size: element size.
* @flags: the type of memory to allocate (see kmalloc).
*/
+__alloc_size(1, 2)
static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
{
return kmalloc_array(n, size, flags | __GFP_ZERO);
@@ -668,7 +673,7 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
* allocator where we care about the real place the memory allocation
* request comes from.
*/
-extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
+extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller);
#define kmalloc_track_caller(size, flags) \
__kmalloc_track_caller(size, flags, _RET_IP_)
@@ -684,6 +689,7 @@ static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags,
return __kmalloc_node(bytes, flags, node);
}
+__alloc_size(1, 2)
static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node)
{
return kmalloc_array_node(n, size, flags | __GFP_ZERO, node);
@@ -691,7 +697,8 @@ static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node)
#ifdef CONFIG_NUMA
-extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long);
+extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
+ unsigned long caller);
#define kmalloc_node_track_caller(size, flags, node) \
__kmalloc_node_track_caller(size, flags, node, \
_RET_IP_)
@@ -716,6 +723,7 @@ static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate (see kmalloc).
*/
+__alloc_size(1)
static inline void *kzalloc(size_t size, gfp_t flags)
{
return kmalloc(size, flags | __GFP_ZERO);
@@ -727,11 +735,53 @@ static inline void *kzalloc(size_t size, gfp_t flags)
* @flags: the type of memory to allocate (see kmalloc).
* @node: memory node from which to allocate
*/
+__alloc_size(1)
static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
{
return kmalloc_node(size, flags | __GFP_ZERO, node);
}
+__alloc_size(1)
+extern void *kvmalloc_node(size_t size, gfp_t flags, int node);
+__alloc_size(1)
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+ return kvmalloc_node(size, flags, NUMA_NO_NODE);
+}
+__alloc_size(1)
+static inline void *kvzalloc_node(size_t size, gfp_t flags, int node)
+{
+ return kvmalloc_node(size, flags | __GFP_ZERO, node);
+}
+__alloc_size(1)
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+ return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+__alloc_size(1, 2)
+static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
+{
+ size_t bytes;
+
+ if (unlikely(check_mul_overflow(n, size, &bytes)))
+ return NULL;
+
+ return kvmalloc(bytes, flags);
+}
+
+__alloc_size(1, 2)
+static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
+{
+ return kvmalloc_array(n, size, flags | __GFP_ZERO);
+}
+
+__alloc_size(3)
+extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize,
+ gfp_t flags);
+extern void kvfree(const void *addr);
+extern void kvfree_sensitive(const void *addr, size_t len);
+
unsigned int kmem_cache_size(struct kmem_cache *s);
void __init kmem_cache_init_late(void);
diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index 6bb4bc1a5f5459..c34b55a6e5540c 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -11,15 +11,24 @@
#ifndef _LINUX_STACKDEPOT_H
#define _LINUX_STACKDEPOT_H
+#include <linux/gfp.h>
+
typedef u32 depot_stack_handle_t;
+depot_stack_handle_t __stack_depot_save(unsigned long *entries,
+ unsigned int nr_entries,
+ gfp_t gfp_flags, bool can_alloc);
+
depot_stack_handle_t stack_depot_save(unsigned long *entries,
unsigned int nr_entries, gfp_t gfp_flags);
unsigned int stack_depot_fetch(depot_stack_handle_t handle,
unsigned long **entries);
-unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
+int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
+ int spaces);
+
+void stack_depot_print(depot_stack_handle_t stack);
#ifdef CONFIG_STACKDEPOT
int stack_depot_init(void);
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index 9edecb494e9e2d..bef158815e83d6 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -21,6 +21,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task,
unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
unsigned int size, unsigned int skipnr);
unsigned int stack_trace_save_user(unsigned long *store, unsigned int size);
+unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
/* Internal interfaces. Do not use in generic code */
#ifdef CONFIG_ARCH_STACKWALK
diff --git a/include/linux/swap.h b/include/linux/swap.h
index cdf0957a88a49a..d1ea44b31f19f5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -341,7 +341,6 @@ void workingset_update_node(struct xa_node *node);
/* linux/mm/page_alloc.c */
extern unsigned long totalreserve_pages;
-extern unsigned long nr_free_buffer_pages(void);
/* Definition of global_zone_page_state not available yet */
#define nr_free_pages() global_zone_page_state(NR_FREE_PAGES)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 671d402c3778fa..7a5265a0acdcf9 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -22,7 +22,7 @@ struct notifier_block; /* in notifier.h */
#define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */
#define VM_DMA_COHERENT 0x00000010 /* dma_alloc_coherent */
#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */
-#define VM_NO_GUARD 0x00000040 /* don't add guard page */
+#define VM_NO_GUARD 0x00000040 /* ***DANGEROUS*** don't add guard page */
#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
#define VM_FLUSH_RESET_PERMS 0x00000100 /* reset direct map and flush TLB on unmap, can't be freed in atomic context */
#define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */
@@ -136,20 +136,31 @@ static inline void vmalloc_init(void)
static inline unsigned long vmalloc_nr_pages(void) { return 0; }
#endif
+__alloc_size(1)
extern void *vmalloc(unsigned long size);
+__alloc_size(1)
extern void *vzalloc(unsigned long size);
+__alloc_size(1)
extern void *vmalloc_user(unsigned long size);
+__alloc_size(1)
extern void *vmalloc_node(unsigned long size, int node);
+__alloc_size(1)
extern void *vzalloc_node(unsigned long size, int node);
+__alloc_size(1)
extern void *vmalloc_32(unsigned long size);
+__alloc_size(1)
extern void *vmalloc_32_user(unsigned long size);
+__alloc_size(1)
extern void *__vmalloc(unsigned long size, gfp_t gfp_mask);
+__alloc_size(1)
extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
const void *caller);
+__alloc_size(1)
void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
int node, const void *caller);
+__alloc_size(1)
void *vmalloc_no_huge(unsigned long size);
extern void vfree(const void *addr);
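
For readers unfamiliar with the annotation: __alloc_size(1) tells the compiler which parameter carries the allocation size, so it can flag obvious out-of-bounds accesses at the call site. Below is a minimal userspace illustration of the underlying attribute; the kernel macro is assumed to wrap the compiler's alloc_size attribute, and the demo_alloc() name is invented.

#include <stdlib.h>
#include <string.h>

/* Roughly what the kernel's __alloc_size(1) annotation is assumed to mean. */
__attribute__((alloc_size(1)))
static void *demo_alloc(unsigned long size)
{
	return malloc(size);
}

int main(void)
{
	char *p = demo_alloc(8);

	if (!p)
		return 1;
	memset(p, 0, 8);	/* fine: within the 8-byte bound */
	/* Changing the length above to 16 lets GCC/Clang warn at build time
	 * (e.g. via -Warray-bounds or FORTIFY checks), because alloc_size(1)
	 * tells the compiler the object is only 8 bytes. */
	free(p);
	return 0;
}
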
diff --git a/init/main.c b/init/main.c
index f24c62ee329052..eb32a3cf683974 100644
--- a/init/main.c
+++ b/init/main.c
@@ -180,7 +180,7 @@ EXPORT_SYMBOL_GPL(static_key_initialized);
unsigned int reset_devices;
EXPORT_SYMBOL(reset_devices);
-static int __init set_reset_devices(char *str)
+static int __init set_reset_devices(char *str __always_unused)
{
reset_devices = 1;
return 1;
@@ -230,13 +230,13 @@ static bool __init obsolete_checksetup(char *line)
unsigned long loops_per_jiffy = (1<<12);
EXPORT_SYMBOL(loops_per_jiffy);
-static int __init debug_kernel(char *str)
+static int __init debug_kernel(char *str __always_unused)
{
console_loglevel = CONSOLE_LOGLEVEL_DEBUG;
return 0;
}
-static int __init quiet_kernel(char *str)
+static int __init quiet_kernel(char *str __always_unused)
{
console_loglevel = CONSOLE_LOGLEVEL_QUIET;
return 0;
@@ -480,7 +480,7 @@ static void __init setup_boot_config(void)
get_boot_config_from_initrd(NULL, NULL);
}
-static int __init warn_bootconfig(char *str)
+static int __init warn_bootconfig(char *str __always_unused)
{
pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOT_CONFIG is not set.\n");
return 0;
@@ -509,7 +509,8 @@ static void __init repair_env_string(char *param, char *val)
/* Anything after -- gets handed straight to init. */
static int __init set_init_arg(char *param, char *val,
- const char *unused, void *arg)
+ const char *unused __always_unused,
+ void *arg __always_unused)
{
unsigned int i;
@@ -534,7 +535,8 @@ static int __init set_init_arg(char *param, char *val,
* unused parameters (modprobe will find them in /proc/cmdline).
*/
static int __init unknown_bootoption(char *param, char *val,
- const char *unused, void *arg)
+ const char *unused __always_unused,
+ void *arg __always_unused)
{
size_t len = strlen(param);
@@ -734,7 +736,8 @@ noinline void __ref rest_init(void)
/* Check for early params. */
static int __init do_early_param(char *param, char *val,
- const char *unused, void *arg)
+ const char *unused __always_unused,
+ void *arg __always_unused)
{
const struct obs_kernel_param *p;
@@ -1354,8 +1357,10 @@ static const char *initcall_level_names[] __initdata = {
"late",
};
-static int __init ignore_unknown_bootoption(char *param, char *val,
- const char *unused, void *arg)
+static int __init ignore_unknown_bootoption(char *param __always_unused,
+ char *val __always_unused,
+ const char *unused __always_unused,
+ void *arg __always_unused)
{
return 0;
}
@@ -1492,7 +1497,7 @@ void __weak free_initmem(void)
free_initmem_default(POISON_FREE_INITMEM);
}
-static int __ref kernel_init(void *unused)
+static int __ref kernel_init(void *unused __always_unused)
{
int ret;
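
All of these annotations silence "unused parameter" diagnostics for setup callbacks whose signatures are fixed by the boot-parameter machinery. A small userspace illustration of the underlying attribute follows; the kernel's __always_unused is assumed to wrap the compiler's unused attribute, and the demo names are invented.

#include <stdio.h>

/* Stand-in for the kernel macro, for this demo only. */
#define demo_always_unused __attribute__((__unused__))

/* The callback type dictates the parameter even though it is ignored. */
static int set_reset_devices_demo(char *str demo_always_unused)
{
	return 1;
}

int main(void)
{
	/* Builds cleanly even with -Wunused-parameter thanks to the attribute. */
	printf("%d\n", set_reset_devices_demo("ignored"));
	return 0;
}
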
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 3f312bf2b11638..f101c171753f6f 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -10,6 +10,7 @@
#include <linux/nsproxy.h>
#include <linux/sysctl.h>
#include <linux/uaccess.h>
+#include <linux/capability.h>
#include <linux/ipc_namespace.h>
#include <linux/msg.h>
#include "util.h"
@@ -22,7 +23,6 @@ static void *get_ipc(struct ctl_table *table)
return which;
}
-#ifdef CONFIG_PROC_SYSCTL
static int proc_ipc_dointvec(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -104,13 +104,17 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
return ret;
}
-#else
-#define proc_ipc_doulongvec_minmax NULL
-#define proc_ipc_dointvec NULL
-#define proc_ipc_dointvec_minmax NULL
-#define proc_ipc_dointvec_minmax_orphans NULL
-#define proc_ipc_auto_msgmni NULL
-#define proc_ipc_sem_dointvec NULL
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static int proc_ipc_dointvec_minmax_checkpoint_restore(struct ctl_table *table,
+ int write, void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct user_namespace *user_ns = current->nsproxy->ipc_ns->user_ns;
+
+ if (write && !checkpoint_restore_ns_capable(user_ns))
+ return -EPERM;
+
+ return proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
#endif
int ipc_mni = IPCMNI;
@@ -198,8 +202,8 @@ static struct ctl_table ipc_kern_table[] = {
.procname = "sem_next_id",
.data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id,
.maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
- .mode = 0644,
- .proc_handler = proc_ipc_dointvec_minmax,
+ .mode = 0666,
+ .proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_INT_MAX,
},
@@ -207,8 +211,8 @@ static struct ctl_table ipc_kern_table[] = {
.procname = "msg_next_id",
.data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id,
.maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
- .mode = 0644,
- .proc_handler = proc_ipc_dointvec_minmax,
+ .mode = 0666,
+ .proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_INT_MAX,
},
@@ -216,8 +220,8 @@ static struct ctl_table ipc_kern_table[] = {
.procname = "shm_next_id",
.data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id,
.maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
- .mode = 0644,
- .proc_handler = proc_ipc_dointvec_minmax,
+ .mode = 0666,
+ .proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_INT_MAX,
},
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2a9695ccb65f53..d0e163a0209979 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -69,6 +69,13 @@
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
+/*
+ * There could be abnormal cpuset configurations for cpu or memory
+ * node binding; this key provides a quick, low-cost check for whether
+ * such a configuration has been detected.
+ */
+DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
+
/* See "Frequency meter" comments, below. */
struct fmeter {
@@ -372,6 +379,17 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
+static inline void check_insane_mems_config(nodemask_t *nodes)
+{
+ if (!cpusets_insane_config() &&
+ movable_only_nodes(nodes)) {
+ static_branch_enable(&cpusets_insane_config_key);
+ pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
+ "Cpuset allocations might fail even with a lot of memory available.\n",
+ nodemask_pr_args(nodes));
+ }
+}
+
/*
* Cgroup v2 behavior is used on the "cpus" and "mems" control files when
* on default hierarchy or when the cpuset_v2_mode flag is set by mounting
@@ -1870,6 +1888,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
+ check_insane_mems_config(&trialcs->mems_allowed);
+
spin_lock_irq(&callback_lock);
cs->mems_allowed = trialcs->mems_allowed;
spin_unlock_irq(&callback_lock);
@@ -3173,6 +3193,9 @@ update_tasks:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
+ if (mems_updated)
+ check_insane_mems_config(&new_mems);
+
if (is_in_v2_mode())
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 192e43a874076d..fffe8b7382012b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -613,7 +613,7 @@ static int finish_cpu(unsigned int cpu)
*/
if (mm != &init_mm)
idle->active_mm = &init_mm;
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
return 0;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index a53863dafb3d75..11fafe1ad8b142 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -474,7 +474,7 @@ static void exit_mm(void)
__set_current_state(TASK_RUNNING);
mmap_read_lock(mm);
}
- mmgrab(mm);
+ mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
diff --git a/kernel/fork.c b/kernel/fork.c
index 67679e3933f2fc..cf65223c776b5a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -685,6 +685,53 @@ static void check_mm(struct mm_struct *mm)
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+static void do_shoot_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ if (current->active_mm == mm) {
+ WARN_ON_ONCE(current->mm);
+ current->active_mm = &init_mm;
+ switch_mm(mm, &init_mm, current);
+ }
+}
+
+static void do_check_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void shoot_lazy_tlbs(struct mm_struct *mm)
+{
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ /*
+ * IPI overheads have not been found to be expensive, but they could
+ * be reduced in a number of possible ways, for example (in
+ * roughly increasing order of complexity):
+ * - A batch of mms requiring IPIs could be gathered and freed
+ * at once.
+ * - CPUs could store their active mm somewhere that can be
+ * remotely checked without a lock, to filter out
+ * false-positives in the cpumask.
+ * - After mm_users or mm_count reaches zero, switching away
+ * from the mm could clear mm_cpumask to reduce some IPIs
+ * (some batching or delaying would help).
+ * - A delayed freeing and RCU-like quiescing sequence based on
+ * mm switching to avoid IPIs completely.
+ */
+ on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+ if (IS_ENABLED(CONFIG_DEBUG_VM))
+ on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+ } else {
+ /*
+ * In this case, lazy tlb mms are refcounted and would not reach
+ * __mmdrop until all CPUs have switched away and mmdrop()ed.
+ */
+ }
+}
+
/*
* Called when the last reference to the mm
* is dropped: either by a lazy thread or by
@@ -694,6 +741,10 @@ void __mmdrop(struct mm_struct *mm)
{
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
+
+ /* Ensure no CPUs are using this as their lazy tlb mm */
+ shoot_lazy_tlbs(mm);
+
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
@@ -3026,7 +3077,7 @@ int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
int ksys_unshare(unsigned long unshare_flags)
{
struct fs_struct *fs, *new_fs = NULL;
- struct files_struct *fd, *new_fd = NULL;
+ struct files_struct *new_fd = NULL;
struct cred *new_cred = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
@@ -3113,11 +3164,8 @@ int ksys_unshare(unsigned long unshare_flags)
spin_unlock(&fs->lock);
}
- if (new_fd) {
- fd = current->files;
- current->files = new_fd;
- new_fd = fd;
- }
+ if (new_fd)
+ swap(current->files, new_fd);
task_unlock(current);
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 9888e2bc8c7671..8cc07e7f29aa19 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -145,6 +145,47 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
touch_nmi_watchdog();
}
+static void check_killed_task(struct task_struct *t, unsigned long timeout)
+{
+ unsigned long stamp = t->killed_time;
+
+ /*
+ * Ensure the task is not frozen.
+ * Also, skip vfork and any other user process that the freezer should skip.
+ */
+ if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
+ return;
+ /*
+ * Skip threads which are already inside do_exit(), for exit_mm() etc.
+ * might take many seconds.
+ */
+ if (t->flags & PF_EXITING)
+ return;
+ if (!stamp) {
+ stamp = jiffies;
+ if (!stamp)
+ stamp++;
+ t->killed_time = stamp;
+ return;
+ }
+ if (time_is_after_jiffies(stamp + timeout * HZ))
+ return;
+ trace_sched_process_hang(t);
+ if (sysctl_hung_task_panic) {
+ console_verbose();
+ hung_task_call_panic = true;
+ }
+ /*
+ * This thread failed to terminate for more than
+ * sysctl_hung_task_timeout_secs seconds, complain:
+ */
+ pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n",
+ t->comm, t->pid, (jiffies - stamp) / HZ);
+ sched_show_task(t);
+ hung_task_show_lock = true;
+ touch_nmi_watchdog();
+}
+
/*
* To avoid extending the RCU grace period for an unbounded amount of time,
* periodically exit the critical section and enter a new one.
@@ -196,6 +237,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
goto unlock;
last_break = jiffies;
}
+ /* Check threads which are about to terminate. */
+ if (unlikely(fatal_signal_pending(t)))
+ check_killed_task(t, timeout);
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE)
check_hung_task(t, timeout);
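
One subtle detail in check_killed_task() above is that killed_time == 0 doubles as "no timestamp recorded yet", so a jiffies value that happens to be 0 is bumped to 1. A tiny userspace illustration of that sentinel trick (names invented for the demo):

#include <stdio.h>

/* Record the first time a task was seen with a fatal signal pending; 0 is
 * reserved to mean "not recorded yet", mirroring t->killed_time above. */
static unsigned long record_first_seen(unsigned long *slot, unsigned long now)
{
	if (!*slot) {
		if (!now)
			now++;		/* keep 0 available as the sentinel */
		*slot = now;
	}
	return *slot;
}

int main(void)
{
	unsigned long stamp = 0;

	record_first_seen(&stamp, 0);	/* counter wrapped exactly to 0 */
	printf("stored stamp = %lu (never 0)\n", stamp);
	return 0;
}
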
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 80bfe71bbe13ea..36ca640c4f8e76 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -88,6 +88,7 @@ static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas);
struct kcov_percpu_data {
void *irq_area;
+ local_lock_t lock;
unsigned int saved_mode;
unsigned int saved_size;
@@ -96,7 +97,9 @@ struct kcov_percpu_data {
int saved_sequence;
};
-static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
+static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
/* Must be called with kcov_remote_lock locked. */
static struct kcov_remote *kcov_remote_find(u64 handle)
@@ -824,7 +827,7 @@ void kcov_remote_start(u64 handle)
if (!in_task() && !in_serving_softirq())
return;
- local_irq_save(flags);
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
/*
* Check that kcov_remote_start() is not called twice in background
@@ -832,7 +835,7 @@ void kcov_remote_start(u64 handle)
*/
mode = READ_ONCE(t->kcov_mode);
if (WARN_ON(in_task() && kcov_mode_enabled(mode))) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/*
@@ -841,14 +844,15 @@ void kcov_remote_start(u64 handle)
* happened while collecting coverage from a background thread.
*/
if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
spin_lock(&kcov_remote_lock);
remote = kcov_remote_find(handle);
if (!remote) {
- spin_unlock_irqrestore(&kcov_remote_lock, flags);
+ spin_unlock(&kcov_remote_lock);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
kcov_debug("handle = %llx, context: %s\n", handle,
@@ -869,19 +873,19 @@ void kcov_remote_start(u64 handle)
size = CONFIG_KCOV_IRQ_AREA_SIZE;
area = this_cpu_ptr(&kcov_percpu_data)->irq_area;
}
- spin_unlock_irqrestore(&kcov_remote_lock, flags);
+ spin_unlock(&kcov_remote_lock);
/* Can only happen when in_task(). */
if (!area) {
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
area = vmalloc(size * sizeof(unsigned long));
if (!area) {
kcov_put(kcov);
return;
}
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
}
- local_irq_save(flags);
-
/* Reset coverage size. */
*(u64 *)area = 0;
@@ -891,7 +895,7 @@ void kcov_remote_start(u64 handle)
}
kcov_start(t, kcov, size, area, mode, sequence);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
}
EXPORT_SYMBOL(kcov_remote_start);
@@ -965,12 +969,12 @@ void kcov_remote_stop(void)
if (!in_task() && !in_serving_softirq())
return;
- local_irq_save(flags);
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
mode = READ_ONCE(t->kcov_mode);
barrier();
if (!kcov_mode_enabled(mode)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/*
@@ -978,12 +982,12 @@ void kcov_remote_stop(void)
* actually found the remote handle and started collecting coverage.
*/
if (in_serving_softirq() && !t->kcov_softirq) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/* Make sure that kcov_softirq is only set when in softirq. */
if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
@@ -1013,7 +1017,7 @@ void kcov_remote_stop(void)
spin_unlock(&kcov_remote_lock);
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
/* Get in kcov_remote_start(). */
kcov_put(kcov);
@@ -1034,8 +1038,8 @@ static int __init kcov_init(void)
int cpu;
for_each_possible_cpu(cpu) {
- void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE *
- sizeof(unsigned long));
+ void *area = vmalloc_node(CONFIG_KCOV_IRQ_AREA_SIZE *
+ sizeof(unsigned long), cpu_to_node(cpu));
if (!area)
return -ENOMEM;
per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4a4d7092a2d828..4d7ab4da2d633b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1350,14 +1350,19 @@ void kthread_use_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(tsk->mm);
+ /*
+ * It's possible that tsk->active_mm == mm here, but we must
+ * still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), because a lazy
+ * mm may not have its own refcount (see mmgrab/drop_lazy_tlb()).
+ */
+ mmgrab(mm);
+
task_lock(tsk);
/* Hold off tlb flush IPIs while switching mm's */
local_irq_disable();
active_mm = tsk->active_mm;
- if (active_mm != mm) {
- mmgrab(mm);
+ if (active_mm != mm)
tsk->active_mm = mm;
- }
tsk->mm = mm;
membarrier_update_current_mm(mm);
switch_mm_irqs_off(active_mm, mm, tsk);
@@ -1374,12 +1379,9 @@ void kthread_use_mm(struct mm_struct *mm)
* memory barrier after storing to tsk->mm, before accessing
* user-space memory. A full memory barrier for membarrier
* {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
- * mmdrop(), or explicitly with smp_mb().
+ * mmdrop_lazy_tlb().
*/
- if (active_mm != mm)
- mmdrop(active_mm);
- else
- smp_mb();
+ mmdrop_lazy_tlb(active_mm);
to_kthread(tsk)->oldfs = force_uaccess_begin();
}
@@ -1411,10 +1413,13 @@ void kthread_unuse_mm(struct mm_struct *mm)
local_irq_disable();
tsk->mm = NULL;
membarrier_update_current_mm(NULL);
+ mmgrab_lazy_tlb(mm);
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
local_irq_enable();
task_unlock(tsk);
+
+ mmdrop(mm);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);
diff --git a/kernel/resource.c b/kernel/resource.c
index ca9f5198a01ff9..5ad3eba619ba75 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -73,6 +73,18 @@ static struct resource *next_resource(struct resource *p)
return p->sibling;
}
+static struct resource *next_resource_skip_children(struct resource *p)
+{
+ while (!p->sibling && p->parent)
+ p = p->parent;
+ return p->sibling;
+}
+
+#define for_each_resource(_root, _p, _skip_children) \
+ for ((_p) = (_root)->child; (_p); \
+ (_p) = (_skip_children) ? next_resource_skip_children(_p) : \
+ next_resource(_p))
+
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
struct resource *p = v;
@@ -1707,37 +1719,49 @@ static int strict_iomem_checks;
#endif
/*
- * check if an address is reserved in the iomem resource tree
- * returns true if reserved, false if not reserved.
+ * Check if an address is exclusive to the kernel and must not be mapped to
+ * user space, for example, via /dev/mem.
+ *
+ * Returns true if exclusive to the kernel, otherwise returns false.
*/
bool iomem_is_exclusive(u64 addr)
{
- struct resource *p = &iomem_resource;
- bool err = false;
- loff_t l;
+ const unsigned int exclusive_system_ram = IORESOURCE_SYSTEM_RAM |
+ IORESOURCE_EXCLUSIVE;
+ bool skip_children = false, err = false;
int size = PAGE_SIZE;
-
- if (!strict_iomem_checks)
- return false;
+ struct resource *p;
addr = addr & PAGE_MASK;
read_lock(&resource_lock);
- for (p = p->child; p ; p = r_next(NULL, p, &l)) {
- /*
- * We can probably skip the resources without
- * IORESOURCE_IO attribute?
- */
+ for_each_resource(&iomem_resource, p, skip_children) {
if (p->start >= addr + size)
break;
- if (p->end < addr)
+ if (p->end < addr) {
+ skip_children = true;
continue;
+ }
+ skip_children = false;
+
+ /*
+ * IORESOURCE_SYSTEM_RAM resources are exclusive if
+ * IORESOURCE_EXCLUSIVE is set, even if they
+ * are not busy and even if "iomem=relaxed" is set. The
+ * responsible driver dynamically adds/removes system RAM within
+ * such an area and uncontrolled access is dangerous.
+ */
+ if ((p->flags & exclusive_system_ram) == exclusive_system_ram) {
+ err = true;
+ break;
+ }
+
/*
* A resource is exclusive if IORESOURCE_EXCLUSIVE is set
* or CONFIG_IO_STRICT_DEVMEM is enabled and the
* resource is busy.
*/
- if ((p->flags & IORESOURCE_BUSY) == 0)
+ if (!strict_iomem_checks || !(p->flags & IORESOURCE_BUSY))
continue;
if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
|| p->flags & IORESOURCE_EXCLUSIVE) {
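
The new for_each_resource() walk is a pre-order traversal of the resource tree in which the caller can ask to skip an entry's children (done above once a resource ends below the address of interest). A standalone illustration of the same traversal pattern on a toy tree; all names are invented for the demo.

#include <stdio.h>
#include <stddef.h>

struct node {
	const char *name;
	struct node *parent, *sibling, *child;
};

/* Mirror of next_resource_skip_children(): climb until a sibling exists. */
static struct node *next_skip_children(struct node *p)
{
	while (!p->sibling && p->parent)
		p = p->parent;
	return p->sibling;
}

static struct node *next_node(struct node *p, int skip_children)
{
	if (!skip_children && p->child)
		return p->child;
	return next_skip_children(p);
}

int main(void)
{
	struct node a2 = { "a2", NULL, NULL, NULL };
	struct node a1 = { "a1", NULL, &a2, NULL };
	struct node b  = { "b",  NULL, NULL, NULL };
	struct node a  = { "a",  NULL, &b, &a1 };
	struct node root = { "root", NULL, NULL, &a };
	struct node *p;

	a1.parent = a2.parent = &a;
	a.parent = b.parent = &root;

	/* Pre-order walk that skips the children of "a": prints "a", "b". */
	for (p = root.child; p; p = next_node(p, p == &a))
		printf("%s\n", p->name);
	return 0;
}
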
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d87b9abee37a5c..b2cb1737e1ce9f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4772,7 +4772,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
__releases(rq->lock)
{
struct rq *rq = this_rq();
- struct mm_struct *mm = rq->prev_mm;
+ struct mm_struct *mm = NULL;
long prev_state;
/*
@@ -4791,7 +4791,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
current->comm, current->pid, preempt_count()))
preempt_count_set(FORK_PREEMPT_COUNT);
- rq->prev_mm = NULL;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+ mm = rq->prev_lazy_mm;
+ rq->prev_lazy_mm = NULL;
+#endif
/*
* A task struct has one reference for the use as "current".
@@ -4831,13 +4834,14 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* rq->curr, before returning to userspace, so provide them here:
*
* - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
- * provided by mmdrop(),
+ * provided by mmdrop_lazy_tlb(),
* - a sync_core for SYNC_CORE.
*/
if (mm) {
membarrier_mm_sync_core_before_usermode(mm);
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
}
+
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
@@ -4900,9 +4904,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
/*
* kernel -> kernel lazy + transfer active
- * user -> kernel lazy + mmgrab() active
+ * user -> kernel lazy + mmgrab_lazy_tlb() active
*
- * kernel -> user switch + mmdrop() active
+ * kernel -> user switch + mmdrop_lazy_tlb() active
* user -> user switch
*/
if (!next->mm) { // to kernel
@@ -4910,7 +4914,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
next->active_mm = prev->active_mm;
if (prev->mm) // from user
- mmgrab(prev->active_mm);
+ mmgrab_lazy_tlb(prev->active_mm);
else
prev->active_mm = NULL;
} else { // to user
@@ -4926,9 +4930,20 @@ context_switch(struct rq *rq, struct task_struct *prev,
switch_mm_irqs_off(prev->active_mm, next->mm, next);
if (!prev->mm) { // from kernel
- /* will mmdrop() in finish_task_switch(). */
- rq->prev_mm = prev->active_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+ /* Will mmdrop_lazy_tlb() in finish_task_switch(). */
+ rq->prev_lazy_mm = prev->active_mm;
prev->active_mm = NULL;
+#else
+ /*
+ * Without MMU_LAZY_TLB_REFCOUNT, finish_task_switch() does
+ * no lazy-mm tracking (there is no rq->prev_lazy_mm) and
+ * hence no mmdrop_lazy_tlb(), so the full memory barrier
+ * that membarrier relies on (see the membarrier comment in
+ * finish_task_switch()) must be issued here instead.
+ */
+ smp_mb();
+#endif
}
}
@@ -9396,7 +9411,7 @@ void __init sched_init(void)
/*
* The boot idle thread does lazy MMU switching as well:
*/
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
enter_lazy_tlb(&init_mm, current);
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b7a38797d921ef..009daebf9248e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -980,7 +980,9 @@ struct rq {
struct task_struct *idle;
struct task_struct *stop;
unsigned long next_balance;
- struct mm_struct *prev_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+ struct mm_struct *prev_lazy_mm;
+#endif
unsigned int clock_update_flags;
u64 clock;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index c56faae461d940..b2f0b912af4351 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1496,7 +1496,6 @@ static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
-int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
static unsigned long __read_mostly *sched_numa_onlined_nodes;
#endif
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9f8117c7cfddee..9c625257023d29 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -13,6 +13,7 @@
#include <linux/export.h>
#include <linux/kallsyms.h>
#include <linux/stacktrace.h>
+#include <linux/interrupt.h>
/**
* stack_trace_print - Print the entries in the stack trace
@@ -373,3 +374,32 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
#endif /* CONFIG_USER_STACKTRACE_SUPPORT */
#endif /* !CONFIG_ARCH_STACKWALK */
+
+static inline bool in_irqentry_text(unsigned long ptr)
+{
+ return (ptr >= (unsigned long)&__irqentry_text_start &&
+ ptr < (unsigned long)&__irqentry_text_end) ||
+ (ptr >= (unsigned long)&__softirqentry_text_start &&
+ ptr < (unsigned long)&__softirqentry_text_end);
+}
+
+/**
+ * filter_irq_stacks - Find first IRQ stack entry in trace
+ * @entries: Pointer to stack trace array
+ * @nr_entries: Number of entries in the storage array
+ *
+ * Return: Number of trace entries until IRQ stack starts.
+ */
+unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries)
+{
+ unsigned int i;
+
+ for (i = 0; i < nr_entries; i++) {
+ if (in_irqentry_text(entries[i])) {
+ /* Include the irqentry function into the stack. */
+ return i + 1;
+ }
+ }
+ return nr_entries;
+}
+EXPORT_SYMBOL_GPL(filter_irq_stacks);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 083be6af29d705..857c1ccad9e864 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -3316,7 +3316,7 @@ static struct ctl_table fs_table[] = {
.procname = "protected_symlinks",
.data = &sysctl_protected_symlinks,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
@@ -3325,7 +3325,7 @@ static struct ctl_table fs_table[] = {
.procname = "protected_hardlinks",
.data = &sysctl_protected_hardlinks,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
@@ -3334,7 +3334,7 @@ static struct ctl_table fs_table[] = {
.procname = "protected_fifos",
.data = &sysctl_protected_fifos,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &two,
@@ -3343,7 +3343,7 @@ static struct ctl_table fs_table[] = {
.procname = "protected_regular",
.data = &sysctl_protected_regular,
.maxlen = sizeof(int),
- .mode = 0600,
+ .mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &two,
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 257ffb993ea235..f00de83d024627 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -137,7 +137,7 @@ static void __acct_update_integrals(struct task_struct *tsk,
* the rest of the math is done in xacct_add_tsk.
*/
tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
- tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
+ tsk->acct_vm_mem1 += delta * READ_ONCE(tsk->mm->total_vm) >> 10;
}
/**
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 33a6b4a2443d27..9a042a44900252 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1350,7 +1350,7 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
struct worker_pool *pool = pwq->pool;
/* record the work call stack in order to print it in KASAN reports */
- kasan_record_aux_stack(work);
+ kasan_record_aux_stack_noalloc(work);
/* we own @work, set data and link */
set_work_pwq(work, pwq, extra_flags);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ab070231f5511f..dbff322207ad23 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1782,6 +1782,12 @@ config IO_STRICT_DEVMEM
menu "$(SRCARCH) Debugging"
+config DEBUG_AID_FOR_SYZBOT
+ bool "Additional debug code for syzbot"
+ default n
+ help
+ This option is intended for testing by syzbot.
+
source "arch/$(SRCARCH)/Kconfig.debug"
endmenu
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 0a2e417f83cbae..b437ae79aca143 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -20,7 +20,6 @@
*/
#include <linux/gfp.h>
-#include <linux/interrupt.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/mm.h>
@@ -102,8 +101,8 @@ static bool init_stack_slab(void **prealloc)
}
/* Allocation of a new stack in raw storage */
-static struct stack_record *depot_alloc_stack(unsigned long *entries, int size,
- u32 hash, void **prealloc, gfp_t alloc_flags)
+static struct stack_record *
+depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
{
struct stack_record *stack;
size_t required_size = struct_size(stack, entries, size);
@@ -215,6 +214,49 @@ static inline struct stack_record *find_stack(struct stack_record *bucket,
}
/**
+ * stack_depot_snprint - print stack entries from a depot into a buffer
+ *
+ * @handle: Stack depot handle which was returned from
+ * stack_depot_save().
+ * @buf: Pointer to the print buffer
+ * @size: Size of the print buffer
+ * @spaces: Number of leading spaces to print
+ *
+ * Prints the stack trace for @handle into @buf with @spaces leading spaces.
+ *
+ * Return: Number of bytes printed.
+ */
+int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
+ int spaces)
+{
+ unsigned long *entries;
+ unsigned int nr_entries;
+
+ nr_entries = stack_depot_fetch(handle, &entries);
+ return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries,
+ spaces) : 0;
+}
+EXPORT_SYMBOL_GPL(stack_depot_snprint);
+
+/**
+ * stack_depot_print - print stack entries from a depot
+ *
+ * @stack: Stack depot handle which was returned from
+ * stack_depot_save().
+ * Prints the stack trace to the kernel log via stack_trace_print().
+ */
+void stack_depot_print(depot_stack_handle_t stack)
+{
+ unsigned long *entries;
+ unsigned int nr_entries;
+
+ nr_entries = stack_depot_fetch(stack, &entries);
+ if (nr_entries > 0)
+ stack_trace_print(entries, nr_entries, 0);
+}
+EXPORT_SYMBOL_GPL(stack_depot_print);
+
+/**
* stack_depot_fetch - Fetch stack entries from a depot
*
* @handle: Stack depot handle which was returned from
@@ -232,6 +274,9 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
struct stack_record *stack;
*entries = NULL;
+ if (!handle)
+ return 0;
+
if (parts.slabindex > depot_index) {
WARN(1, "slab index %d out of bounds (%d) for stack id %08x\n",
parts.slabindex, depot_index, handle);
@@ -248,17 +293,28 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
EXPORT_SYMBOL_GPL(stack_depot_fetch);
/**
- * stack_depot_save - Save a stack trace from an array
+ * __stack_depot_save - Save a stack trace from an array
*
* @entries: Pointer to storage array
* @nr_entries: Size of the storage array
* @alloc_flags: Allocation gfp flags
+ * @can_alloc: Allocate stack slabs (increased chance of failure if false)
+ *
+ * Saves a stack trace from the @entries array of size @nr_entries. If
+ * @can_alloc is %true, the depot may replenish its stack slab pool (allocating
+ * with the GFP flags in @alloc_flags) when space runs out. If @can_alloc is
+ * %false, no allocation is attempted and the save fails if no space is left.
+ *
+ * Context: Any context, but setting @can_alloc to %false is required if
+ * alloc_pages() cannot be used from the current context. Currently
+ * this is the case from contexts where neither %GFP_ATOMIC nor
+ * %GFP_NOWAIT can be used (NMI, raw_spin_lock).
*
- * Return: The handle of the stack struct stored in depot
+ * Return: The handle of the stack struct stored in depot, 0 on failure.
*/
-depot_stack_handle_t stack_depot_save(unsigned long *entries,
- unsigned int nr_entries,
- gfp_t alloc_flags)
+depot_stack_handle_t __stack_depot_save(unsigned long *entries,
+ unsigned int nr_entries,
+ gfp_t alloc_flags, bool can_alloc)
{
struct stack_record *found = NULL, **bucket;
depot_stack_handle_t retval = 0;
@@ -291,7 +347,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
* The smp_load_acquire() here pairs with smp_store_release() to
* |next_slab_inited| in depot_alloc_stack() and init_stack_slab().
*/
- if (unlikely(!smp_load_acquire(&next_slab_inited))) {
+ if (unlikely(can_alloc && !smp_load_acquire(&next_slab_inited))) {
/*
* Zero out zone modifiers, as we don't have specific zone
* requirements. Keep the flags related to allocation in atomic
@@ -309,9 +365,8 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
found = find_stack(*bucket, entries, nr_entries, hash);
if (!found) {
- struct stack_record *new =
- depot_alloc_stack(entries, nr_entries,
- hash, &prealloc, alloc_flags);
+ struct stack_record *new = depot_alloc_stack(entries, nr_entries, hash, &prealloc);
+
if (new) {
new->next = *bucket;
/*
@@ -340,27 +395,24 @@ exit:
fast_exit:
return retval;
}
-EXPORT_SYMBOL_GPL(stack_depot_save);
-
-static inline int in_irqentry_text(unsigned long ptr)
-{
- return (ptr >= (unsigned long)&__irqentry_text_start &&
- ptr < (unsigned long)&__irqentry_text_end) ||
- (ptr >= (unsigned long)&__softirqentry_text_start &&
- ptr < (unsigned long)&__softirqentry_text_end);
-}
+EXPORT_SYMBOL_GPL(__stack_depot_save);
-unsigned int filter_irq_stacks(unsigned long *entries,
- unsigned int nr_entries)
+/**
+ * stack_depot_save - Save a stack trace from an array
+ *
+ * @entries: Pointer to storage array
+ * @nr_entries: Size of the storage array
+ * @alloc_flags: Allocation gfp flags
+ *
+ * Context: Contexts where allocations via alloc_pages() are allowed.
+ * See __stack_depot_save() for more details.
+ *
+ * Return: The handle of the stack struct stored in depot, 0 on failure.
+ */
+depot_stack_handle_t stack_depot_save(unsigned long *entries,
+ unsigned int nr_entries,
+ gfp_t alloc_flags)
{
- unsigned int i;
-
- for (i = 0; i < nr_entries; i++) {
- if (in_irqentry_text(entries[i])) {
- /* Include the irqentry function into the stack. */
- return i + 1;
- }
- }
- return nr_entries;
+ return __stack_depot_save(entries, nr_entries, alloc_flags, true);
}
-EXPORT_SYMBOL_GPL(filter_irq_stacks);
+EXPORT_SYMBOL_GPL(stack_depot_save);
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index 8835e078457858..aa8e4225021978 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -493,7 +493,7 @@ static void kmalloc_oob_in_memset(struct kunit *test)
kfree(ptr);
}
-static void kmalloc_memmove_invalid_size(struct kunit *test)
+static void kmalloc_memmove_negative_size(struct kunit *test)
{
char *ptr;
size_t size = 64;
@@ -515,6 +515,21 @@ static void kmalloc_memmove_invalid_size(struct kunit *test)
kfree(ptr);
}
+static void kmalloc_memmove_invalid_size(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 64;
+ volatile size_t invalid_size = size;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ memset((char *)ptr, 0, 64);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ memmove((char *)ptr, (char *)ptr + 4, invalid_size));
+ kfree(ptr);
+}
+
static void kmalloc_uaf(struct kunit *test)
{
char *ptr;
@@ -1129,6 +1144,7 @@ static struct kunit_case kasan_kunit_test_cases[] = {
KUNIT_CASE(kmalloc_oob_memset_4),
KUNIT_CASE(kmalloc_oob_memset_8),
KUNIT_CASE(kmalloc_oob_memset_16),
+ KUNIT_CASE(kmalloc_memmove_negative_size),
KUNIT_CASE(kmalloc_memmove_invalid_size),
KUNIT_CASE(kmalloc_uaf),
KUNIT_CASE(kmalloc_uaf_memset),
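
The split makes the intent of the two tests clearer: the "negative size" case relies on a value such as -2 stored in a size_t wrapping to an enormous length rather than a small negative one. A userspace illustration of that wrap (it only prints the value and deliberately does not perform the out-of-bounds memmove):

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	size_t invalid_size = (size_t)-2;

	/* On a 64-bit system this prints 18446744073709551614: the "negative"
	 * size becomes a huge copy length, which KASAN catches in the test. */
	printf("memmove length would be %zu bytes\n", invalid_size);
	return 0;
}
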
diff --git a/mm/Kconfig b/mm/Kconfig
index d16ba9249bc531..7d815181560a40 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -109,6 +109,13 @@ config NUMA_KEEP_MEMINFO
config MEMORY_ISOLATION
bool
+# IORESOURCE_SYSTEM_RAM regions in the kernel resource tree that are marked
+# IORESOURCE_EXCLUSIVE cannot be mapped to user space, for example, via
+# /dev/mem.
+config EXCLUSIVE_SYSTEM_RAM
+ def_bool y
+ depends on !DEVMEM || STRICT_DEVMEM
+
#
# Only be set on architectures that have completely implemented memory hotplug
# feature. If you are not sure, don't touch it.
@@ -640,6 +647,7 @@ config ZSWAP_ZPOOL_DEFAULT_Z3FOLD
config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
bool "zsmalloc"
+ depends on !PREEMPT_RT
select ZSMALLOC
help
Use the zsmalloc allocator as the default allocator.
@@ -690,7 +698,7 @@ config Z3FOLD
config ZSMALLOC
tristate "Memory allocator for compressed pages"
- depends on MMU
+ depends on MMU && !PREEMPT_RT
help
zsmalloc is a slab-based memory allocator designed to store
compressed RAM pages. zsmalloc uses virtual memory mapping
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 37024798a97caf..ba8898c7eb8eb3 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -30,7 +30,7 @@ config DAMON_VADDR
select PAGE_IDLE_FLAG
help
This builds the default data access monitoring primitives for DAMON
- that works for virtual address spaces.
+ that work for virtual address spaces.
config DAMON_VADDR_KUNIT_TEST
bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 30e9211f494a78..d993db50280cb3 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -314,7 +314,7 @@ static int __damon_start(struct damon_ctx *ctx)
nr_running_ctxs);
if (IS_ERR(ctx->kdamond)) {
err = PTR_ERR(ctx->kdamond);
- ctx->kdamond = 0;
+ ctx->kdamond = NULL;
}
}
mutex_unlock(&ctx->kdamond_lock);
@@ -652,9 +652,7 @@ static int kdamond_fn(void *data)
unsigned int max_nr_accesses = 0;
unsigned long sz_limit = 0;
- mutex_lock(&ctx->kdamond_lock);
- pr_info("kdamond (%d) starts\n", ctx->kdamond->pid);
- mutex_unlock(&ctx->kdamond_lock);
+ pr_debug("kdamond (%d) starts\n", current->pid);
if (ctx->primitive.init)
ctx->primitive.init(ctx);
@@ -705,7 +703,7 @@ static int kdamond_fn(void *data)
if (ctx->primitive.cleanup)
ctx->primitive.cleanup(ctx);
- pr_debug("kdamond (%d) finishes\n", ctx->kdamond->pid);
+ pr_debug("kdamond (%d) finishes\n", current->pid);
mutex_lock(&ctx->kdamond_lock);
ctx->kdamond = NULL;
mutex_unlock(&ctx->kdamond_lock);
@@ -714,7 +712,7 @@ static int kdamond_fn(void *data)
nr_running_ctxs--;
mutex_unlock(&damon_lock);
- do_exit(0);
+ return 0;
}
#include "core-test.h"
diff --git a/mm/debug.c b/mm/debug.c
index fae0f81ad83130..4333b6784a20c7 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -16,17 +16,19 @@
#include <linux/ctype.h>
#include "internal.h"
+#include <trace/events/migrate.h>
+
+/*
+ * Define EM() and EMe() so that MIGRATE_REASON from trace/events/migrate.h can
+ * be used to populate migrate_reason_names[].
+ */
+#undef EM
+#undef EMe
+#define EM(a, b) b,
+#define EMe(a, b) b
const char *migrate_reason_names[MR_TYPES] = {
- "compaction",
- "memory_failure",
- "memory_hotplug",
- "syscall_or_cpuset",
- "mempolicy_mbind",
- "numa_misplaced",
- "contig_range",
- "longterm_pin",
- "demotion",
+ MIGRATE_REASON
};
const struct trace_print_flags pageflag_names[] = {
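
The EM()/EMe() pair is the classic X-macro pattern: the same list macro expands once into an enum and once into the matching string table, so the two can never drift out of sync the way a hand-written array can. A standalone illustration follows; the names are invented and only sketch the idea, the real list lives in trace/events/migrate.h.

#include <stdio.h>

#define DEMO_REASONS				\
	EM(DEMO_COMPACTION, "compaction")	\
	EM(DEMO_NUMA,       "numa_misplaced")	\
	EMe(DEMO_DEMOTION,  "demotion")

/* First expansion: the enum values. */
#define EM(a, b)	a,
#define EMe(a, b)	a
enum demo_reason { DEMO_REASONS, DEMO_NR };
#undef EM
#undef EMe

/* Second expansion: the matching name table. */
#define EM(a, b)	b,
#define EMe(a, b)	b
static const char *demo_reason_names[DEMO_NR] = { DEMO_REASONS };
#undef EM
#undef EMe

int main(void)
{
	int i;

	for (i = 0; i < DEMO_NR; i++)
		printf("%d -> %s\n", i, demo_reason_names[i]);
	return 0;
}
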
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 1403639302e482..228e3954b90c15 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1104,13 +1104,14 @@ static int __init init_args(struct pgtable_debug_args *args)
/*
* Initialize the debugging data.
*
- * __P000 (or even __S000) will help create page table entries with
- * PROT_NONE permission as required for pxx_protnone_tests().
+ * protection_map[0] (or even protection_map[8]) will help create
+ * page table entries with PROT_NONE permission as required for
+ * pxx_protnone_tests().
*/
memset(args, 0, sizeof(*args));
args->vaddr = get_random_vaddr();
args->page_prot = vm_get_page_prot(VMFLAGS);
- args->page_prot_none = __P000;
+ args->page_prot_none = protection_map[0];
args->is_contiguous_page = false;
args->pud_pfn = ULONG_MAX;
args->pmd_pfn = ULONG_MAX;
diff --git a/mm/filemap.c b/mm/filemap.c
index 5e206a429b5724..30e4e9bf5ca0d2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -261,9 +261,13 @@ void delete_from_page_cache(struct page *page)
struct address_space *mapping = page_mapping(page);
BUG_ON(!PageLocked(page));
+ spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
__delete_from_page_cache(page, NULL);
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
page_cache_free_page(mapping, page);
}
@@ -339,6 +343,7 @@ void delete_from_page_cache_batch(struct address_space *mapping,
if (!pagevec_count(pvec))
return;
+ spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
for (i = 0; i < pagevec_count(pvec); i++) {
trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
@@ -347,6 +352,9 @@ void delete_from_page_cache_batch(struct address_space *mapping,
}
page_cache_delete_batch(mapping, pvec);
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
for (i = 0; i < pagevec_count(pvec); i++)
page_cache_free_page(mapping, pvec->pages[i]);
@@ -2088,7 +2096,6 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
if (!xa_is_value(page)) {
if (page->index < start)
goto put;
- VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
if (page->index + thp_nr_pages(page) - 1 > end)
goto put;
if (!trylock_page(page))
@@ -3201,15 +3208,8 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
}
}
- if (pmd_none(*vmf->pmd)) {
- vmf->ptl = pmd_lock(mm, vmf->pmd);
- if (likely(pmd_none(*vmf->pmd))) {
- mm_inc_nr_ptes(mm);
- pmd_populate(mm, vmf->pmd, vmf->prealloc_pte);
- vmf->prealloc_pte = NULL;
- }
- spin_unlock(vmf->ptl);
- }
+ if (pmd_none(*vmf->pmd))
+ pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
/* See comment in handle_pte_fault() */
if (pmd_devmap_trans_unstable(vmf->pmd)) {
diff --git a/mm/gup.c b/mm/gup.c
index 886d6148d3d03a..48c7a5a1e85ba0 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2228,7 +2228,6 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
{
int nr_start = *nr;
struct dev_pagemap *pgmap = NULL;
- int ret = 1;
do {
struct page *page = pfn_to_page(pfn);
@@ -2236,14 +2235,12 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
pgmap = get_dev_pagemap(pfn, pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, flags, pages);
- ret = 0;
break;
}
SetPageReferenced(page);
pages[*nr] = page;
if (unlikely(!try_grab_page(page, flags))) {
undo_dev_pagemap(nr, nr_start, flags, pages);
- ret = 0;
break;
}
(*nr)++;
@@ -2251,7 +2248,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
} while (addr += PAGE_SIZE, addr != end);
put_dev_pagemap(pgmap);
- return ret;
+ return addr == end;
}
static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6378c10664599f..4e2d91ff28d424 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -66,6 +66,7 @@ static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;
+static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -1260,9 +1261,9 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static void destroy_compound_gigantic_page(struct page *page,
- unsigned int order)
+/* also used to demote non-gigantic huge pages */
+static void __destroy_compound_gigantic_page(struct page *page,
+ unsigned int order, bool demote)
{
int i;
int nr_pages = 1 << order;
@@ -1272,8 +1273,10 @@ static void destroy_compound_gigantic_page(struct page *page,
atomic_set(compound_pincount_ptr(page), 0);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ p->mapping = NULL;
clear_compound_head(p);
- set_page_refcounted(p);
+ if (!demote)
+ set_page_refcounted(p);
}
set_compound_order(page, 0);
@@ -1281,18 +1284,27 @@ static void destroy_compound_gigantic_page(struct page *page,
__ClearPageHead(page);
}
+static void destroy_compound_gigantic_page_for_demote(struct page *page,
+ unsigned int order)
+{
+ __destroy_compound_gigantic_page(page, order, true);
+}
+
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
+static void destroy_compound_gigantic_page(struct page *page,
+ unsigned int order)
+{
+ __destroy_compound_gigantic_page(page, order, false);
+}
+
static void free_gigantic_page(struct page *page, unsigned int order)
{
- /*
- * If the page isn't allocated using the cma allocator,
- * cma_release() returns false.
- */
#ifdef CONFIG_CMA
- if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
- return;
+ if (HPageCma(page))
+ cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order);
+ else
#endif
-
- free_contig_range(page_to_pfn(page), 1 << order);
+ free_contig_range(page_to_pfn(page), 1 << order);
}
#ifdef CONFIG_CONTIG_ALLOC
@@ -1311,8 +1323,10 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
if (hugetlb_cma[nid]) {
page = cma_alloc(hugetlb_cma[nid], nr_pages,
huge_page_order(h), true);
- if (page)
+ if (page) {
+ SetHPageCma(page);
return page;
+ }
}
if (!(gfp_mask & __GFP_THISNODE)) {
@@ -1322,8 +1336,10 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
page = cma_alloc(hugetlb_cma[node], nr_pages,
huge_page_order(h), true);
- if (page)
+ if (page) {
+ SetHPageCma(page);
return page;
+ }
}
}
}
@@ -1353,12 +1369,15 @@ static inline void destroy_compound_gigantic_page(struct page *page,
/*
* Remove hugetlb page from lists, and update dtor so that page appears
- * as just a compound page. A reference is held on the page.
+ * as just a compound page.
+ *
+ * A reference is held on the page, except in the case of demote.
*
* Must be called with hugetlb lock held.
*/
-static void remove_hugetlb_page(struct hstate *h, struct page *page,
- bool adjust_surplus)
+static void __remove_hugetlb_page(struct hstate *h, struct page *page,
+ bool adjust_surplus,
+ bool demote)
{
int nid = page_to_nid(page);
@@ -1396,8 +1415,12 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
*
* This handles the case where more than one ref is held when and
* after update_and_free_page is called.
+ *
+ * In the case of demote we do not ref count the page as it will soon
+ * be turned into a page of smaller size.
*/
- set_page_refcounted(page);
+ if (!demote)
+ set_page_refcounted(page);
if (hstate_is_gigantic(h))
set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
else
@@ -1407,6 +1430,18 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
h->nr_huge_pages_node[nid]--;
}
+static void remove_hugetlb_page(struct hstate *h, struct page *page,
+ bool adjust_surplus)
+{
+ __remove_hugetlb_page(h, page, adjust_surplus, false);
+}
+
+static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page,
+ bool adjust_surplus)
+{
+ __remove_hugetlb_page(h, page, adjust_surplus, true);
+}
+
static void add_hugetlb_page(struct hstate *h, struct page *page,
bool adjust_surplus)
{
@@ -1480,6 +1515,20 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
destroy_compound_gigantic_page(page, huge_page_order(h));
free_gigantic_page(page, huge_page_order(h));
} else {
+#ifdef CONFIG_CMA
+ /*
+ * Could be a page that was demoted from a gigantic page
+ * which was allocated in a CMA area.
+ */
+ if (HPageCma(page)) {
+ destroy_compound_gigantic_page(page,
+ huge_page_order(h));
+ if (!cma_release(hugetlb_cma[page_to_nid(page)], page,
+ 1 << huge_page_order(h)))
+ VM_BUG_ON_PAGE(1, page);
+ return;
+ }
+#endif
__free_pages(page, huge_page_order(h));
}
}
@@ -1664,7 +1713,8 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
spin_unlock_irq(&hugetlb_lock);
}
-static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
+static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
+ bool demote)
{
int i, j;
int nr_pages = 1 << order;
@@ -1702,10 +1752,16 @@ static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
* the set of pages can not be converted to a gigantic page.
* The caller who allocated the pages should then discard the
* pages using the appropriate free interface.
+ *
+ * In the case of demote, the ref count will be zero.
*/
- if (!page_ref_freeze(p, 1)) {
- pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
- goto out_error;
+ if (!demote) {
+ if (!page_ref_freeze(p, 1)) {
+ pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
+ goto out_error;
+ }
+ } else {
+ VM_BUG_ON_PAGE(page_count(p), p);
}
set_page_count(p, 0);
set_compound_head(p, page);
@@ -1730,6 +1786,17 @@ out_error:
return false;
}
+static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
+{
+ return __prep_compound_gigantic_page(page, order, false);
+}
+
+static bool prep_compound_gigantic_page_for_demote(struct page *page,
+ unsigned int order)
+{
+ return __prep_compound_gigantic_page(page, order, true);
+}
+
/*
* PageHuge() only returns true for hugetlbfs pages, but not for normal or
* transparent huge pages. See the PageTransHuge() documentation for more
@@ -2868,33 +2935,41 @@ out_subpool_put:
return ERR_PTR(-ENOSPC);
}
-int alloc_bootmem_huge_page(struct hstate *h)
+int alloc_bootmem_huge_page(struct hstate *h, int nid)
__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
-int __alloc_bootmem_huge_page(struct hstate *h)
+int __alloc_bootmem_huge_page(struct hstate *h, int nid)
{
struct huge_bootmem_page *m;
int nr_nodes, node;
+ if (nid >= nr_online_nodes)
+ return 0;
+ /* do node specific alloc */
+ if (nid != NUMA_NO_NODE) {
+ m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
+ 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ if (m)
+ goto found;
+ else
+ return 0;
+ }
+ /* do all node balanced alloc */
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
- void *addr;
-
- addr = memblock_alloc_try_nid_raw(
+ m = memblock_alloc_try_nid_raw(
huge_page_size(h), huge_page_size(h),
0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
- if (addr) {
- /*
- * Use the beginning of the huge page to store the
- * huge_bootmem_page struct (until gather_bootmem
- * puts them into the mem_map).
- */
- m = addr;
+ /*
+ * Use the beginning of the huge page to store the
+ * huge_bootmem_page struct (until gather_bootmem
+ * puts them into the mem_map).
+ */
+ if (m)
goto found;
- }
+ else
+ return 0;
}
- return 0;
found:
- BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
/* Put them into a private list first because mem_map is not up yet */
INIT_LIST_HEAD(&m->list);
list_add(&m->list, &huge_boot_pages);
@@ -2934,12 +3009,61 @@ static void __init gather_bootmem_prealloc(void)
cond_resched();
}
}
+static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
+{
+ unsigned long i;
+ char buf[32];
+
+ for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
+ if (hstate_is_gigantic(h)) {
+ if (!alloc_bootmem_huge_page(h, nid))
+ break;
+ } else {
+ struct page *page;
+ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+
+ page = alloc_fresh_huge_page(h, gfp_mask, nid,
+ &node_states[N_MEMORY], NULL);
+ if (!page)
+ break;
+ put_page(page); /* free it into the hugepage allocator */
+ }
+ cond_resched();
+ }
+ if (i == h->max_huge_pages_node[nid])
+ return;
+
+ string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
+ pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n",
+ h->max_huge_pages_node[nid], buf, nid, i);
+ h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
+ h->max_huge_pages_node[nid] = i;
+}
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
unsigned long i;
nodemask_t *node_alloc_noretry;
+ bool node_specific_alloc = false;
+
+ /* skip gigantic hugepages allocation if hugetlb_cma enabled */
+ if (hstate_is_gigantic(h) && hugetlb_cma_size) {
+ pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
+ return;
+ }
+
+ /* do node specific alloc */
+ for (i = 0; i < nr_online_nodes; i++) {
+ if (h->max_huge_pages_node[i] > 0) {
+ hugetlb_hstate_alloc_pages_onenode(h, i);
+ node_specific_alloc = true;
+ }
+ }
+
+ if (node_specific_alloc)
+ return;
+ /* below: do balanced allocation across all nodes */
if (!hstate_is_gigantic(h)) {
/*
* Bit mask controlling how hard we retry per-node allocations.
@@ -2960,11 +3084,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
for (i = 0; i < h->max_huge_pages; ++i) {
if (hstate_is_gigantic(h)) {
- if (hugetlb_cma_size) {
- pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
- goto free;
- }
- if (!alloc_bootmem_huge_page(h))
+ if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
break;
} else if (!alloc_pool_huge_page(h,
&node_states[N_MEMORY],
@@ -2980,13 +3100,12 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
h->max_huge_pages, buf, i);
h->max_huge_pages = i;
}
-free:
kfree(node_alloc_noretry);
}
static void __init hugetlb_init_hstates(void)
{
- struct hstate *h;
+ struct hstate *h, *h2;
for_each_hstate(h) {
if (minimum_order > huge_page_order(h))
@@ -2995,6 +3114,22 @@ static void __init hugetlb_init_hstates(void)
/* oversize hugepages were init'ed in early boot */
if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
+
+ /*
+ * Set demote order for each hstate. hstates are not ordered,
+ * so this is brute force. Note that h->demote_order is
+ * initially 0. If cma is used for gigantic pages, the smallest
+ * demote order is HUGETLB_PAGE_ORDER.
+ */
+ if (!hugetlb_cma_size || !(h->order <= HUGETLB_PAGE_ORDER)) {
+ for_each_hstate(h2) {
+ if (h2 == h)
+ continue;
+ if (h2->order < h->order &&
+ h2->order > h->demote_order)
+ h->demote_order = h2->order;
+ }
+ }
}
VM_BUG_ON(minimum_order == UINT_MAX);
}
@@ -3235,9 +3370,81 @@ out:
return 0;
}
+static int demote_free_huge_page(struct hstate *h, struct page *page)
+{
+ int i, nid = page_to_nid(page);
+ struct hstate *target_hstate;
+ bool cma_page = HPageCma(page);
+
+ target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
+
+ remove_hugetlb_page_for_demote(h, page, false);
+ spin_unlock_irq(&hugetlb_lock);
+
+ if (alloc_huge_page_vmemmap(h, page)) {
+ /* Allocation of vmemmap failed, we cannot demote the page */
+ spin_lock_irq(&hugetlb_lock);
+ set_page_refcounted(page);
+ add_hugetlb_page(h, page, false);
+ return 1;
+ }
+
+ /*
+ * Use destroy_compound_gigantic_page_for_demote for all huge page
+ * sizes as it will not ref count pages.
+ */
+ destroy_compound_gigantic_page_for_demote(page, huge_page_order(h));
+
+ for (i = 0; i < pages_per_huge_page(h);
+ i += pages_per_huge_page(target_hstate)) {
+ if (hstate_is_gigantic(target_hstate))
+ prep_compound_gigantic_page_for_demote(page + i,
+ target_hstate->order);
+ else
+ prep_compound_page(page + i, target_hstate->order);
+ set_page_private(page + i, 0);
+ set_page_refcounted(page + i);
+ prep_new_huge_page(target_hstate, page + i, nid);
+ if (cma_page)
+ SetHPageCma(page + i);
+ put_page(page + i);
+ }
+
+ spin_lock_irq(&hugetlb_lock);
+ return 0;
+}
+
+static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+ __must_hold(&hugetlb_lock)
+{
+ int nr_nodes, node;
+ struct page *page;
+ int rc = 0;
+
+ lockdep_assert_held(&hugetlb_lock);
+
+ /* We should never get here if no demote order */
+ if (!h->demote_order)
+ return rc;
+
+ for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
+ if (!list_empty(&h->hugepage_freelists[node])) {
+ page = list_entry(h->hugepage_freelists[node].next,
+ struct page, lru);
+ rc = !demote_free_huge_page(h, page);
+ break;
+ }
+ }
+
+ return rc;
+}
+
#define HSTATE_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+#define HSTATE_ATTR_WO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
+
#define HSTATE_ATTR(_name) \
static struct kobj_attribute _name##_attr = \
__ATTR(_name, 0644, _name##_show, _name##_store)
@@ -3433,6 +3640,114 @@ static ssize_t surplus_hugepages_show(struct kobject *kobj,
}
HSTATE_ATTR_RO(surplus_hugepages);
+static ssize_t demote_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ unsigned long nr_demote;
+ unsigned long nr_available;
+ nodemask_t nodes_allowed, *n_mask;
+ struct hstate *h;
+ int err;
+ int nid;
+
+ err = kstrtoul(buf, 10, &nr_demote);
+ if (err)
+ return err;
+ h = kobj_to_hstate(kobj, &nid);
+
+ /* Synchronize with other sysfs operations modifying huge pages */
+ mutex_lock(&h->resize_lock);
+
+ spin_lock_irq(&hugetlb_lock);
+ if (nid != NUMA_NO_NODE) {
+ nr_available = h->free_huge_pages_node[nid];
+ init_nodemask_of_node(&nodes_allowed, nid);
+ n_mask = &nodes_allowed;
+ } else {
+ nr_available = h->free_huge_pages;
+ n_mask = &node_states[N_MEMORY];
+ }
+ nr_available -= h->resv_huge_pages;
+ if (nr_available <= 0)
+ goto out;
+ nr_demote = min(nr_available, nr_demote);
+
+ while (nr_demote) {
+ if (!demote_pool_huge_page(h, n_mask))
+ break;
+
+ /*
+ * We may have dropped the lock in the routines that demote/free a
+ * page. Recompute the counts, as they could have changed, and make
+ * sure we do not demote a reserved huge page.
+ */
+ nr_demote--;
+ if (nid != NUMA_NO_NODE)
+ nr_available = h->free_huge_pages_node[nid];
+ else
+ nr_available = h->free_huge_pages;
+ nr_available -= h->resv_huge_pages;
+ if (nr_available <= 0)
+ nr_demote = 0;
+ else
+ nr_demote = min(nr_available, nr_demote);
+ }
+
+out:
+ spin_unlock_irq(&hugetlb_lock);
+ mutex_unlock(&h->resize_lock);
+
+ return len;
+}
+HSTATE_ATTR_WO(demote);
+
+static ssize_t demote_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h;
+ unsigned long demote_size;
+ int nid;
+
+ h = kobj_to_hstate(kobj, &nid);
+ demote_size = (unsigned long)(PAGE_SIZE << h->demote_order) / SZ_1K;
+
+ return sysfs_emit(buf, "%lukB\n", demote_size);
+}
+
+static ssize_t demote_size_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct hstate *h, *t_hstate;
+ unsigned long demote_size;
+ unsigned int demote_order;
+ int nid;
+
+ demote_size = (unsigned long)memparse(buf, NULL);
+
+ t_hstate = size_to_hstate(demote_size);
+ if (!t_hstate)
+ return -EINVAL;
+ demote_order = t_hstate->order;
+ if (demote_order < HUGETLB_PAGE_ORDER)
+ return -EINVAL;
+
+ /* demote order must be smaller than the hstate order */
+ h = kobj_to_hstate(kobj, &nid);
+ if (demote_order >= h->order)
+ return -EINVAL;
+
+ /* resize_lock synchronizes demote_order updates with other resize operations */
+ mutex_lock(&h->resize_lock);
+ h->demote_order = demote_order;
+ mutex_unlock(&h->resize_lock);
+
+ return count;
+}
+HSTATE_ATTR(demote_size);
+
static struct attribute *hstate_attrs[] = {
&nr_hugepages_attr.attr,
&nr_overcommit_hugepages_attr.attr,
@@ -3449,6 +3764,16 @@ static const struct attribute_group hstate_attr_group = {
.attrs = hstate_attrs,
};
+static struct attribute *hstate_demote_attrs[] = {
+ &demote_size_attr.attr,
+ &demote_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group hstate_demote_attr_group = {
+ .attrs = hstate_demote_attrs,
+};
+
static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
struct kobject **hstate_kobjs,
const struct attribute_group *hstate_attr_group)
@@ -3466,6 +3791,12 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
hstate_kobjs[hi] = NULL;
}
+ if (h->demote_order) {
+ if (sysfs_create_group(hstate_kobjs[hi],
+ &hstate_demote_attr_group))
+ pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
+ }
+
return retval;
}
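For context, a minimal userspace sketch of driving the demote_size/demote files created above. The /sys/kernel/mm/hugepages/hugepages-<size>kB paths and the 1 GB-to-2 MB example are assumptions based on the usual hstate sysfs layout, not part of this patch:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	const char *base = "/sys/kernel/mm/hugepages/hugepages-1048576kB";
	char path[256];

	/* Select the target size; it must correspond to a smaller hstate. */
	snprintf(path, sizeof(path), "%s/demote_size", base);
	if (write_str(path, "2048kB"))
		perror("demote_size");

	/* Ask for up to 4 free 1 GB pages to be demoted into 2 MB pages. */
	snprintf(path, sizeof(path), "%s/demote", base);
	if (write_str(path, "4"))
		perror("demote");

	return 0;
}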
@@ -3671,6 +4002,11 @@ static int __init hugetlb_init(void)
}
default_hstate.max_huge_pages =
default_hstate_max_huge_pages;
+
+ for (i = 0; i < nr_online_nodes; i++)
+ if (default_hugepages_in_node[i] > 0)
+ default_hstate.max_huge_pages_node[i] =
+ default_hugepages_in_node[i];
}
}
@@ -3731,6 +4067,10 @@ void __init hugetlb_add_hstate(unsigned int order)
parsed_hstate = h;
}
+bool __init __weak node_specific_alloc_support(void)
+{
+ return true;
+}
/*
* hugepages command line processing
* hugepages normally follows a valid hugepagesz or default_hugepagesz
@@ -3742,6 +4082,10 @@ static int __init hugepages_setup(char *s)
{
unsigned long *mhp;
static unsigned long *last_mhp;
+ int node = NUMA_NO_NODE;
+ int count;
+ unsigned long tmp;
+ char *p = s;
if (!parsed_valid_hugepagesz) {
pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
@@ -3765,8 +4109,40 @@ static int __init hugepages_setup(char *s)
return 0;
}
- if (sscanf(s, "%lu", mhp) <= 0)
- *mhp = 0;
+ while (*p) {
+ count = 0;
+ if (sscanf(p, "%lu%n", &tmp, &count) != 1)
+ goto invalid;
+ /* Parameter is in node format */
+ if (p[count] == ':') {
+ if (!node_specific_alloc_support()) {
+ pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
+ return 0;
+ }
+ node = tmp;
+ p += count + 1;
+ if (node < 0 || node >= nr_online_nodes)
+ goto invalid;
+ /* Parse hugepages */
+ if (sscanf(p, "%lu%n", &tmp, &count) != 1)
+ goto invalid;
+ if (!hugetlb_max_hstate)
+ default_hugepages_in_node[node] = tmp;
+ else
+ parsed_hstate->max_huge_pages_node[node] = tmp;
+ *mhp += tmp;
+ /* Move on to parse the next node */
+ if (p[count] == ',')
+ p += count + 1;
+ else
+ break;
+ } else {
+ if (p != s)
+ goto invalid;
+ *mhp = tmp;
+ break;
+ }
+ }
/*
* Global state is always initialized later in hugetlb_init.
@@ -3779,6 +4155,10 @@ static int __init hugepages_setup(char *s)
last_mhp = mhp;
return 1;
+
+invalid:
+ pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
+ return 0;
}
__setup("hugepages=", hugepages_setup);
@@ -3840,6 +4220,7 @@ __setup("hugepagesz=", hugepagesz_setup);
static int __init default_hugepagesz_setup(char *s)
{
unsigned long size;
+ int i;
parsed_valid_hugepagesz = false;
if (parsed_default_hugepagesz) {
@@ -3868,6 +4249,10 @@ static int __init default_hugepagesz_setup(char *s)
*/
if (default_hstate_max_huge_pages) {
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+ for (i = 0; i < nr_online_nodes; i++)
+ if (default_hugepages_in_node[i] > 0)
+ default_hstate.max_huge_pages_node[i] =
+ default_hugepages_in_node[i];
if (hstate_is_gigantic(&default_hstate))
hugetlb_hstate_alloc_pages(&default_hstate);
default_hstate_max_huge_pages = 0;
@@ -4426,9 +4811,9 @@ again:
return ret;
}
-void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct page *ref_page)
+static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct page *ref_page)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
@@ -6410,7 +6795,13 @@ void __init hugetlb_cma_reserve(int order)
size = round_up(size, PAGE_SIZE << order);
snprintf(name, sizeof(name), "hugetlb%d", nid);
- res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
+ /*
+ * Note that 'order per bit' is based on the smallest size that
+ * may be returned to the CMA allocator in the case of
+ * huge page demotion.
+ */
+ res = cma_declare_contiguous_nid(0, size, 0,
+ PAGE_SIZE << HUGETLB_PAGE_ORDER,
0, false, name,
&hugetlb_cma[nid], nid);
if (res) {
diff --git a/mm/internal.h b/mm/internal.h
index b1001ebeb286b7..18256e32a14c6a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -47,6 +47,7 @@ bool __folio_end_writeback(struct folio *folio);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
+void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 2baf121fb8c500..3e0999892c368f 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -30,20 +30,20 @@
#include "kasan.h"
#include "../slab.h"
-depot_stack_handle_t kasan_save_stack(gfp_t flags)
+depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc)
{
unsigned long entries[KASAN_STACK_DEPTH];
unsigned int nr_entries;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
nr_entries = filter_irq_stacks(entries, nr_entries);
- return stack_depot_save(entries, nr_entries, flags);
+ return __stack_depot_save(entries, nr_entries, flags, can_alloc);
}
void kasan_set_track(struct kasan_track *track, gfp_t flags)
{
track->pid = current->pid;
- track->stack = kasan_save_stack(flags);
+ track->stack = kasan_save_stack(flags, true);
}
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index c3f5ba7a294a8a..84a038b07c6fe0 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -328,7 +328,7 @@ DEFINE_ASAN_SET_SHADOW(f3);
DEFINE_ASAN_SET_SHADOW(f5);
DEFINE_ASAN_SET_SHADOW(f8);
-void kasan_record_aux_stack(void *addr)
+static void __kasan_record_aux_stack(void *addr, bool can_alloc)
{
struct page *page = kasan_addr_to_page(addr);
struct kmem_cache *cache;
@@ -345,7 +345,17 @@ void kasan_record_aux_stack(void *addr)
return;
alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0];
- alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT);
+ alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT, can_alloc);
+}
+
+void kasan_record_aux_stack(void *addr)
+{
+ return __kasan_record_aux_stack(addr, true);
+}
+
+void kasan_record_aux_stack_noalloc(void *addr)
+{
+ return __kasan_record_aux_stack(addr, false);
}
void kasan_set_free_info(struct kmem_cache *cache,
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 8bf568a80eb853..fa6b48d0851348 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -251,7 +251,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip);
struct page *kasan_addr_to_page(const void *addr);
-depot_stack_handle_t kasan_save_stack(gfp_t flags);
+depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc);
void kasan_set_track(struct kasan_track *track, gfp_t flags);
void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag);
struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 884a950c702658..3239fd8f87475e 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -132,20 +132,11 @@ static void end_report(unsigned long *flags, unsigned long addr)
kasan_enable_current();
}
-static void print_stack(depot_stack_handle_t stack)
-{
- unsigned long *entries;
- unsigned int nr_entries;
-
- nr_entries = stack_depot_fetch(stack, &entries);
- stack_trace_print(entries, nr_entries, 0);
-}
-
static void print_track(struct kasan_track *track, const char *prefix)
{
pr_err("%s by task %u:\n", prefix, track->pid);
if (track->stack) {
- print_stack(track->stack);
+ stack_depot_print(track->stack);
} else {
pr_err("(stack is not available)\n");
}
@@ -214,12 +205,12 @@ static void describe_object_stacks(struct kmem_cache *cache, void *object,
return;
if (alloc_meta->aux_stack[0]) {
pr_err("Last potentially related work creation:\n");
- print_stack(alloc_meta->aux_stack[0]);
+ stack_depot_print(alloc_meta->aux_stack[0]);
pr_err("\n");
}
if (alloc_meta->aux_stack[1]) {
pr_err("Second to last potentially related work creation:\n");
- print_stack(alloc_meta->aux_stack[1]);
+ stack_depot_print(alloc_meta->aux_stack[1]);
pr_err("\n");
}
#endif
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 7a97db8bc8e75b..8767307a58381f 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -10,12 +10,15 @@
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/debugfs.h>
+#include <linux/hash.h>
#include <linux/irq_work.h>
+#include <linux/jhash.h>
#include <linux/kcsan-checks.h>
#include <linux/kfence.h>
#include <linux/kmemleak.h>
#include <linux/list.h>
#include <linux/lockdep.h>
+#include <linux/log2.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
#include <linux/random.h>
@@ -82,6 +85,10 @@ static const struct kernel_param_ops sample_interval_param_ops = {
};
module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
+/* Pool usage% threshold above which currently covered allocations are skipped. */
+static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
+module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
+
/* The pool of pages used for guard pages and objects. */
char *__kfence_pool __ro_after_init;
EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
@@ -105,6 +112,32 @@ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
/* Gates the allocation, ensuring only one succeeds in a given period. */
atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
+/*
+ * A Counting Bloom filter of allocation coverage: prevents currently covered
+ * allocations of the same source from filling up the pool.
+ *
+ * Assuming a range of 15%-85% unique allocations in the pool at any point in
+ * time, the below parameters provide a probability of 0.02-0.33 for false
+ * positive hits respectively:
+ *
+ * P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)))^HNUM
+ */
+#define ALLOC_COVERED_HNUM 2
+#define ALLOC_COVERED_ORDER (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2)
+#define ALLOC_COVERED_SIZE (1 << ALLOC_COVERED_ORDER)
+#define ALLOC_COVERED_HNEXT(h) hash_32(h, ALLOC_COVERED_ORDER)
+#define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1)
+static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
+
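To make the quoted 0.02-0.33 range concrete, a small sketch that plugs the 15% and 85% cases into the formula above, assuming the default CONFIG_KFENCE_NUM_OBJECTS=255 (so ALLOC_COVERED_ORDER = 9 and ALLOC_COVERED_SIZE = 512); build with -lm:

#include <math.h>
#include <stdio.h>

int main(void)
{
	const double hnum = 2.0, size = 512.0, objects = 255.0;
	const double unique[] = { 0.15, 0.85 };

	for (int i = 0; i < 2; i++) {
		double traces = unique[i] * objects;
		double p = pow(1.0 - exp(-hnum * traces / size), hnum);

		printf("%2.0f%% unique -> P ~= %.2f\n", unique[i] * 100, p);
	}
	return 0;	/* prints ~0.02 and ~0.33, matching the comment */
}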
+/* Stack depth used to determine uniqueness of an allocation. */
+#define UNIQUE_ALLOC_STACK_DEPTH 8UL
+
+/*
+ * Randomness for stack hashes, making it less likely that the same
+ * collisions occur across reboots and on different machines.
+ */
+static u32 stack_hash_seed __ro_after_init;
+
/* Statistics counters for debugfs. */
enum kfence_counter_id {
KFENCE_COUNTER_ALLOCATED,
@@ -112,6 +145,9 @@ enum kfence_counter_id {
KFENCE_COUNTER_FREES,
KFENCE_COUNTER_ZOMBIES,
KFENCE_COUNTER_BUGS,
+ KFENCE_COUNTER_SKIP_INCOMPAT,
+ KFENCE_COUNTER_SKIP_CAPACITY,
+ KFENCE_COUNTER_SKIP_COVERED,
KFENCE_COUNTER_COUNT,
};
static atomic_long_t counters[KFENCE_COUNTER_COUNT];
@@ -121,11 +157,59 @@ static const char *const counter_names[] = {
[KFENCE_COUNTER_FREES] = "total frees",
[KFENCE_COUNTER_ZOMBIES] = "zombie allocations",
[KFENCE_COUNTER_BUGS] = "total bugs",
+ [KFENCE_COUNTER_SKIP_INCOMPAT] = "skipped allocations (incompatible)",
+ [KFENCE_COUNTER_SKIP_CAPACITY] = "skipped allocations (capacity)",
+ [KFENCE_COUNTER_SKIP_COVERED] = "skipped allocations (covered)",
};
static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
/* === Internals ============================================================ */
+static inline bool should_skip_covered(void)
+{
+ unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;
+
+ return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
+}
+
+static u32 get_alloc_stack_hash(unsigned long *stack_entries, unsigned long num_entries)
+{
+ num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
+ num_entries = filter_irq_stacks(stack_entries, num_entries);
+ return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed);
+}
+
+/*
+ * Adds (or subtracts) count @val for the allocation stack trace hash
+ * @alloc_stack_hash to/from the Counting Bloom filter.
+ */
+static void alloc_covered_add(u32 alloc_stack_hash, int val)
+{
+ int i;
+
+ for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+ atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
+ alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+ }
+}
+
+/*
+ * Returns true if the allocation stack trace hash @alloc_stack_hash is
+ * currently contained (non-zero count) in the Counting Bloom filter.
+ */
+static bool alloc_covered_contains(u32 alloc_stack_hash)
+{
+ int i;
+
+ for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+ if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
+ return false;
+ alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+ }
+
+ return true;
+}
+
static bool kfence_protect(unsigned long addr)
{
return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
@@ -183,19 +267,26 @@ static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *m
* Update the object's metadata state, including updating the alloc/free stacks
* depending on the state transition.
*/
-static noinline void metadata_update_state(struct kfence_metadata *meta,
- enum kfence_object_state next)
+static noinline void
+metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next,
+ unsigned long *stack_entries, size_t num_stack_entries)
{
struct kfence_track *track =
next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
lockdep_assert_held(&meta->lock);
- /*
- * Skip over 1 (this) functions; noinline ensures we do not accidentally
- * skip over the caller by never inlining.
- */
- track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
+ if (stack_entries) {
+ memcpy(track->stack_entries, stack_entries,
+ num_stack_entries * sizeof(stack_entries[0]));
+ } else {
+ /*
+ * Skip over 1 (this) function; noinline ensures we do not
+ * accidentally skip over the caller by never inlining.
+ */
+ num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
+ }
+ track->num_stack_entries = num_stack_entries;
track->pid = task_pid_nr(current);
track->cpu = raw_smp_processor_id();
track->ts_nsec = local_clock(); /* Same source as printk timestamps. */
@@ -257,7 +348,9 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
}
}
-static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
+static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
+ unsigned long *stack_entries, size_t num_stack_entries,
+ u32 alloc_stack_hash)
{
struct kfence_metadata *meta = NULL;
unsigned long flags;
@@ -271,8 +364,10 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
list_del_init(&meta->list);
}
raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
- if (!meta)
+ if (!meta) {
+ atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
return NULL;
+ }
if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
/*
@@ -314,10 +409,12 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
addr = (void *)meta->addr;
/* Update remaining metadata. */
- metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED);
+ metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries);
/* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
WRITE_ONCE(meta->cache, cache);
meta->size = size;
+ meta->alloc_stack_hash = alloc_stack_hash;
+
for_each_canary(meta, set_canary_byte);
/* Set required struct page fields. */
@@ -330,6 +427,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
raw_spin_unlock_irqrestore(&meta->lock, flags);
+ alloc_covered_add(alloc_stack_hash, 1);
+
/* Memory initialization. */
/*
@@ -394,10 +493,12 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
memzero_explicit(addr, meta->size);
/* Mark the object as freed. */
- metadata_update_state(meta, KFENCE_OBJECT_FREED);
+ metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0);
raw_spin_unlock_irqrestore(&meta->lock, flags);
+ alloc_covered_add(meta->alloc_stack_hash, -1);
+
/* Protect to detect use-after-frees. */
kfence_protect((unsigned long)addr);
@@ -663,6 +764,7 @@ void __init kfence_init(void)
if (!kfence_sample_interval)
return;
+ stack_hash_seed = (u32)random_get_entropy();
if (!kfence_init_pool()) {
pr_err("%s failed\n", __func__);
return;
@@ -736,12 +838,18 @@ void kfence_shutdown_cache(struct kmem_cache *s)
void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
{
+ unsigned long stack_entries[KFENCE_STACK_DEPTH];
+ unsigned long num_stack_entries;
+ u32 alloc_stack_hash;
+
/*
* Perform size check before switching kfence_allocation_gate, so that
* we don't disable KFENCE without making an allocation.
*/
- if (size > PAGE_SIZE)
+ if (size > PAGE_SIZE) {
+ atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
return NULL;
+ }
/*
* Skip allocations from non-default zones, including DMA. We cannot
@@ -749,8 +857,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
* properties (e.g. reside in DMAable memory).
*/
if ((flags & GFP_ZONEMASK) ||
- (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
+ (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
+ atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
return NULL;
+ }
/*
* allocation_gate only needs to become non-zero, so it doesn't make
@@ -776,7 +886,25 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
if (!READ_ONCE(kfence_enabled))
return NULL;
- return kfence_guarded_alloc(s, size, flags);
+ num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
+
+ /*
+ * Do the expensive coverage check for this allocation in the slow
+ * path, after allocation_gate has already become non-zero, even
+ * though it might mean not making any allocation within a given
+ * sample interval.
+ *
+ * This ensures reasonable allocation coverage when the pool is almost
+ * full, including avoiding long-lived allocations of the same source
+ * filling up the pool (e.g. pagecache allocations).
+ */
+ alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
+ if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
+ atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
+ return NULL;
+ }
+
+ return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
+ alloc_stack_hash);
}
size_t kfence_ksize(const void *addr)
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index c1f23c61e5f911..2a2d5de9d37916 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -87,6 +87,8 @@ struct kfence_metadata {
/* Allocation and free stack information. */
struct kfence_track alloc_track;
struct kfence_track free_track;
+ /* For updating alloc_covered on frees. */
+ u32 alloc_stack_hash;
};
extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index f1690cf541998d..695030c1fff8bb 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -32,6 +32,11 @@
#define arch_kfence_test_address(addr) (addr)
#endif
+#define KFENCE_TEST_REQUIRES(test, cond) do { \
+ if (!(cond)) \
+ kunit_skip((test), "Test requires: " #cond); \
+} while (0)
+
/* Report as observed from console. */
static struct {
spinlock_t lock;
@@ -555,8 +560,7 @@ static void test_init_on_free(struct kunit *test)
};
int i;
- if (!IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON))
- return;
+ KFENCE_TEST_REQUIRES(test, IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON));
/* Assume it hasn't been disabled on command line. */
setup_test_cache(test, size, 0, NULL);
@@ -603,10 +607,8 @@ static void test_gfpzero(struct kunit *test)
char *buf1, *buf2;
int i;
- if (CONFIG_KFENCE_SAMPLE_INTERVAL > 100) {
- kunit_warn(test, "skipping ... would take too long\n");
- return;
- }
+ /* Skip if we think it'd take too long. */
+ KFENCE_TEST_REQUIRES(test, CONFIG_KFENCE_SAMPLE_INTERVAL <= 100);
setup_test_cache(test, size, 0, NULL);
buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8dab23a71fc4f6..568f8e713603d3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -239,7 +239,7 @@ enum res_type {
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-static inline bool should_force_charge(void)
+static inline bool task_is_dying(void)
{
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
(current->flags & PF_EXITING);
@@ -1576,7 +1576,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
* A few threads which were not waiting at mutex_lock_killable() can
* fail to bail out. Therefore, check again after holding oom_lock.
*/
- ret = should_force_charge() || out_of_memory(&oc);
+ ret = task_is_dying() || out_of_memory(&oc);
unlock:
mutex_unlock(&oom_lock);
@@ -2544,6 +2544,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
struct page_counter *counter;
enum oom_status oom_status;
unsigned long nr_reclaimed;
+ bool passed_oom = false;
bool may_swap = true;
bool drained = false;
unsigned long pflags;
@@ -2579,15 +2580,6 @@ retry:
goto force;
/*
- * Unlike in global OOM situations, memcg is not in a physical
- * memory shortage. Allow dying and OOM-killed tasks to
- * bypass the last charges so that they can exit quickly and
- * free their memory.
- */
- if (unlikely(should_force_charge()))
- goto force;
-
- /*
* Prevent unbounded recursion when reclaim operations need to
* allocate memory. This might exceed the limits temporarily,
* but we prefer facilitating memory reclaim and getting back
@@ -2644,8 +2636,9 @@ retry:
if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto nomem;
- if (fatal_signal_pending(current))
- goto force;
+ /* Avoid endless loop for tasks bypassed by the oom killer */
+ if (passed_oom && task_is_dying())
+ goto nomem;
/*
* keep retrying as long as the memcg oom killer is able to make
@@ -2654,14 +2647,10 @@ retry:
*/
oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
get_order(nr_pages * PAGE_SIZE));
- switch (oom_status) {
- case OOM_SUCCESS:
+ if (oom_status == OOM_SUCCESS) {
+ passed_oom = true;
nr_retries = MAX_RECLAIM_RETRIES;
goto retry;
- case OOM_FAILED:
- goto force;
- default:
- goto nomem;
}
nomem:
if (!(gfp_mask & __GFP_NOFAIL))
@@ -5561,7 +5550,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
#endif
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
- unsigned long addr, pte_t ptent, swp_entry_t *entry)
+ unsigned long addr, pte_t ptent)
{
if (!vma->vm_file) /* anonymous vma */
return NULL;
@@ -5736,7 +5725,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, ptent, &ent);
else if (pte_none(ptent))
- page = mc_handle_file_pte(vma, addr, ptent, &ent);
+ page = mc_handle_file_pte(vma, addr, ptent);
if (!page && !ent.val)
return ret;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fffe4afaff437f..92eeb317adf4ad 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -39,6 +39,7 @@
#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
+#include <linux/dax.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
diff --git a/mm/memory.c b/mm/memory.c
index 6308eeaed13669..12a7b2094434f3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -433,35 +433,39 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
}
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
+void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
{
- spinlock_t *ptl;
- pgtable_t new = pte_alloc_one(mm);
- if (!new)
- return -ENOMEM;
+ spinlock_t *ptl = pmd_lock(mm, pmd);
- /*
- * Ensure all pte setup (eg. pte page lock and page clearing) are
- * visible before the pte is made visible to other CPUs by being
- * put into page tables.
- *
- * The other side of the story is the pointer chasing in the page
- * table walking code (when walking the page table without locking;
- * ie. most of the time). Fortunately, these data accesses consist
- * of a chain of data-dependent loads, meaning most CPUs (alpha
- * being the notable exception) will already guarantee loads are
- * seen in-order. See the alpha page table accessors for the
- * smp_rmb() barriers in page table walking code.
- */
- smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
-
- ptl = pmd_lock(mm, pmd);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
mm_inc_nr_ptes(mm);
- pmd_populate(mm, pmd, new);
- new = NULL;
+ /*
+ * Ensure all pte setup (eg. pte page lock and page clearing) are
+ * visible before the pte is made visible to other CPUs by being
+ * put into page tables.
+ *
+ * The other side of the story is the pointer chasing in the page
+ * table walking code (when walking the page table without locking;
+ * ie. most of the time). Fortunately, these data accesses consist
+ * of a chain of data-dependent loads, meaning most CPUs (alpha
+ * being the notable exception) will already guarantee loads are
+ * seen in-order. See the alpha page table accessors for the
+ * smp_rmb() barriers in page table walking code.
+ */
+ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
+ pmd_populate(mm, pmd, *pte);
+ *pte = NULL;
}
spin_unlock(ptl);
+}
+
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
+{
+ pgtable_t new = pte_alloc_one(mm);
+ if (!new)
+ return -ENOMEM;
+
+ pmd_install(mm, pmd, &new);
if (new)
pte_free(mm, new);
return 0;
@@ -473,10 +477,9 @@ int __pte_alloc_kernel(pmd_t *pmd)
if (!new)
return -ENOMEM;
- smp_wmb(); /* See comment in __pte_alloc */
-
spin_lock(&init_mm.page_table_lock);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
+ smp_wmb(); /* See comment in pmd_install() */
pmd_populate_kernel(&init_mm, pmd, new);
new = NULL;
}
@@ -1333,16 +1336,8 @@ again:
struct page *page;
page = vm_normal_page(vma, addr, ptent);
- if (unlikely(details) && page) {
- /*
- * unmap_shared_mapping_pages() wants to
- * invalidate cache without truncating:
- * unmap shared but keep private pages.
- */
- if (details->check_mapping &&
- details->check_mapping != page_rmapping(page))
- continue;
- }
+ if (unlikely(zap_skip_check_mapping(details, page)))
+ continue;
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
@@ -1375,17 +1370,8 @@ again:
is_device_exclusive_entry(entry)) {
struct page *page = pfn_swap_entry_to_page(entry);
- if (unlikely(details && details->check_mapping)) {
- /*
- * unmap_shared_mapping_pages() wants to
- * invalidate cache without truncating:
- * unmap shared but keep private pages.
- */
- if (details->check_mapping !=
- page_rmapping(page))
- continue;
- }
-
+ if (unlikely(zap_skip_check_mapping(details, page)))
+ continue;
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
rss[mm_counter(page)]--;
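The zap_skip_check_mapping() helper used above is introduced elsewhere in this series; for readability, a plausible reconstruction from the open-coded checks being removed (an assumption, not the actual hunk):

/*
 * Skip the pte if it does not belong to the mapping being zapped:
 * unmap_shared_mapping_pages() wants to invalidate the cache without
 * truncating, i.e. unmap pages shared with @zap_mapping but keep
 * private (COWed) pages.
 */
static inline bool zap_skip_check_mapping(struct zap_details *details,
					  struct page *page)
{
	if (!details || !page)
		return false;

	return details->zap_mapping &&
		(details->zap_mapping != page_rmapping(page));
}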
@@ -2724,19 +2710,19 @@ EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
* proceeding (but do_wp_page is only called after already making such a check;
* and do_anonymous_page can safely check later on).
*/
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
- pte_t *page_table, pte_t orig_pte)
+static inline int pte_unmap_same(struct vm_fault *vmf)
{
int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
if (sizeof(pte_t) > sizeof(unsigned long)) {
- spinlock_t *ptl = pte_lockptr(mm, pmd);
+ spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(ptl);
- same = pte_same(*page_table, orig_pte);
+ same = pte_same(*vmf->pte, vmf->orig_pte);
spin_unlock(ptl);
}
#endif
- pte_unmap(page_table);
+ pte_unmap(vmf->pte);
+ vmf->pte = NULL;
return same;
}
@@ -3321,20 +3307,20 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
}
static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
+ pgoff_t first_index,
+ pgoff_t last_index,
struct zap_details *details)
{
struct vm_area_struct *vma;
pgoff_t vba, vea, zba, zea;
- vma_interval_tree_foreach(vma, root,
- details->first_index, details->last_index) {
-
+ vma_interval_tree_foreach(vma, root, first_index, last_index) {
vba = vma->vm_pgoff;
vea = vba + vma_pages(vma) - 1;
- zba = details->first_index;
+ zba = first_index;
if (zba < vba)
zba = vba;
- zea = details->last_index;
+ zea = last_index;
if (zea > vea)
zea = vea;
@@ -3360,18 +3346,22 @@ void unmap_mapping_page(struct page *page)
{
struct address_space *mapping = page->mapping;
struct zap_details details = { };
+ pgoff_t first_index;
+ pgoff_t last_index;
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(PageTail(page));
- details.check_mapping = mapping;
- details.first_index = page->index;
- details.last_index = page->index + thp_nr_pages(page) - 1;
+ first_index = page->index;
+ last_index = page->index + thp_nr_pages(page) - 1;
+
+ details.zap_mapping = mapping;
details.single_page = page;
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
- unmap_mapping_range_tree(&mapping->i_mmap, &details);
+ unmap_mapping_range_tree(&mapping->i_mmap, first_index,
+ last_index, &details);
i_mmap_unlock_write(mapping);
}
@@ -3391,16 +3381,17 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
pgoff_t nr, bool even_cows)
{
struct zap_details details = { };
+ pgoff_t first_index = start;
+ pgoff_t last_index = start + nr - 1;
- details.check_mapping = even_cows ? NULL : mapping;
- details.first_index = start;
- details.last_index = start + nr - 1;
- if (details.last_index < details.first_index)
- details.last_index = ULONG_MAX;
+ details.zap_mapping = even_cows ? NULL : mapping;
+ if (last_index < first_index)
+ last_index = ULONG_MAX;
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
- unmap_mapping_range_tree(&mapping->i_mmap, &details);
+ unmap_mapping_range_tree(&mapping->i_mmap, first_index,
+ last_index, &details);
i_mmap_unlock_write(mapping);
}
EXPORT_SYMBOL_GPL(unmap_mapping_pages);
@@ -3488,7 +3479,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vm_fault_t ret = 0;
void *shadow = NULL;
- if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
+ if (!pte_unmap_same(vmf))
goto out;
entry = pte_to_swp_entry(vmf->orig_pte);
@@ -3853,7 +3844,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
- smp_wmb(); /* See comment in __pte_alloc() */
}
ret = vma->vm_ops->fault(vmf);
@@ -3915,7 +3905,6 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
- smp_wmb(); /* See comment in __pte_alloc() */
}
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
@@ -4028,17 +4017,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
return ret;
}
- if (vmf->prealloc_pte) {
- vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- if (likely(pmd_none(*vmf->pmd))) {
- mm_inc_nr_ptes(vma->vm_mm);
- pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
- vmf->prealloc_pte = NULL;
- }
- spin_unlock(vmf->ptl);
- } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
+ if (vmf->prealloc_pte)
+ pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
+ else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
return VM_FAULT_OOM;
- }
}
/* See comment in handle_pte_fault() */
@@ -4147,7 +4129,6 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
- smp_wmb(); /* See comment in __pte_alloc() */
}
return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
@@ -4822,13 +4803,13 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
if (!new)
return -ENOMEM;
- smp_wmb(); /* See comment in __pte_alloc */
-
spin_lock(&mm->page_table_lock);
- if (pgd_present(*pgd)) /* Another has populated it */
+ if (pgd_present(*pgd)) { /* Another has populated it */
p4d_free(mm, new);
- else
+ } else {
+ smp_wmb(); /* See comment in pmd_install() */
pgd_populate(mm, pgd, new);
+ }
spin_unlock(&mm->page_table_lock);
return 0;
}
@@ -4845,11 +4826,10 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
if (!new)
return -ENOMEM;
- smp_wmb(); /* See comment in __pte_alloc */
-
spin_lock(&mm->page_table_lock);
if (!p4d_present(*p4d)) {
mm_inc_nr_puds(mm);
+ smp_wmb(); /* See comment in pmd_install() */
p4d_populate(mm, p4d, new);
} else /* Another has populated it */
pud_free(mm, new);
@@ -4870,14 +4850,14 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
if (!new)
return -ENOMEM;
- smp_wmb(); /* See comment in __pte_alloc */
-
ptl = pud_lock(mm, pud);
if (!pud_present(*pud)) {
mm_inc_nr_pmds(mm);
+ smp_wmb(); /* See comment in pmd_install() */
pud_populate(mm, pud, new);
- } else /* Another has populated it */
+ } else { /* Another has populated it */
pmd_free(mm, new);
+ }
spin_unlock(ptl);
return 0;
}
@@ -5414,7 +5394,6 @@ long copy_huge_page_from_user(struct page *dst_page,
unsigned int pages_per_huge_page,
bool allow_pagefault)
{
- void *src = (void *)usr_src;
void *page_kaddr;
unsigned long i, rc = 0;
unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
@@ -5427,8 +5406,7 @@ long copy_huge_page_from_user(struct page *dst_page,
else
page_kaddr = kmap_atomic(subpage);
rc = copy_from_user(page_kaddr,
- (const void __user *)(src + i * PAGE_SIZE),
- PAGE_SIZE);
+ usr_src + i * PAGE_SIZE, PAGE_SIZE);
if (allow_pagefault)
kunmap(subpage);
else
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9fd0be32a281e2..4ea91c3ff768fb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -57,7 +57,7 @@ enum {
ONLINE_POLICY_AUTO_MOVABLE,
};
-const char *online_policy_to_str[] = {
+static const char * const online_policy_to_str[] = {
[ONLINE_POLICY_CONTIG_ZONES] = "contig-zones",
[ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable",
};
@@ -1664,6 +1664,12 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
*/
if (PageOffline(page) && page_count(page))
return -EBUSY;
+ /*
+ * HWPoisoned dirty swapcache pages are definitely unmovable
+ * because they are kept around so that the owning processes can be killed.
+ */
+ if (PageHWPoison(page) && PageSwapCache(page))
+ return -EBUSY;
if (!PageHuge(page))
continue;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 251df91ddc80ab..e64a0ce85a1acd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -123,7 +123,7 @@ enum zone_type policy_zone = 0;
* run-time system-wide default policy => local allocation
*/
static struct mempolicy default_policy = {
- .refcnt = ATOMIC_INIT(1), /* never free it */
+ .refcnt = { ATOMIC_INIT(1), }, /* never free it */
.mode = MPOL_LOCAL,
};
@@ -293,7 +293,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!policy)
return ERR_PTR(-ENOMEM);
- atomic_set(&policy->refcnt, 1);
+ refcount_set(&policy->refcnt, 1);
policy->mode = mode;
policy->flags = flags;
@@ -303,7 +303,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
- if (!atomic_dec_and_test(&p->refcnt))
+ if (!refcount_dec_and_test(&p->refcnt))
return;
kmem_cache_free(policy_cache, p);
}
@@ -2253,7 +2253,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
nodemask_t mems = cpuset_mems_allowed(current);
mpol_rebind_policy(new, &mems);
}
- atomic_set(&new->refcnt, 1);
+ refcount_set(&new->refcnt, 1);
return new;
}
@@ -2548,7 +2548,7 @@ restart:
goto alloc_new;
*mpol_new = *n->policy;
- atomic_set(&mpol_new->refcnt, 1);
+ refcount_set(&mpol_new->refcnt, 1);
sp_node_init(n_new, end, n->end, mpol_new);
n->end = start;
sp_insert(sp, n_new);
@@ -2742,7 +2742,7 @@ void __init numa_policy_init(void)
for_each_node(nid) {
preferred_node_policy[nid] = (struct mempolicy) {
- .refcnt = ATOMIC_INIT(1),
+ .refcnt = { ATOMIC_INIT(1), },
.mode = MPOL_PREFERRED,
.flags = MPOL_F_MOF | MPOL_F_MORON,
.nodes = nodemask_of_node(nid),
diff --git a/mm/memremap.c b/mm/memremap.c
index 5a66a71ab5911a..84de22c14567e9 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -109,6 +109,26 @@ static unsigned long pfn_next(unsigned long pfn)
return pfn + 1;
}
+/*
+ * Returns true if the page is reserved by a ZONE_DEVICE driver.
+ */
+bool pfn_zone_device_reserved(unsigned long pfn)
+{
+ struct dev_pagemap *pgmap;
+ struct vmem_altmap *altmap;
+ bool ret = false;
+
+ pgmap = get_dev_pagemap(pfn, NULL);
+ if (!pgmap)
+ return ret;
+ altmap = pgmap_altmap(pgmap);
+ if (altmap && pfn < (altmap->base_pfn + altmap->reserve))
+ ret = true;
+ put_dev_pagemap(pgmap);
+
+ return ret;
+}
+
#define for_each_device_pfn(pfn, map, i) \
for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
diff --git a/mm/migrate.c b/mm/migrate.c
index 433c453b47f9f9..4985d6a26ec463 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -3065,7 +3065,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
EXPORT_SYMBOL(migrate_vma_finalize);
#endif /* CONFIG_DEVICE_PRIVATE */
-#if defined(CONFIG_MEMORY_HOTPLUG)
+#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_HOTPLUG_CPU)
/* Disable reclaim-based migration. */
static void __disable_all_migrate_targets(void)
{
@@ -3207,25 +3207,7 @@ static void set_migration_target_nodes(void)
put_online_mems();
}
-/*
- * React to hotplug events that might affect the migration targets
- * like events that online or offline NUMA nodes.
- *
- * The ordering is also currently dependent on which nodes have
- * CPUs. That means we need CPU on/offline notification too.
- */
-static int migration_online_cpu(unsigned int cpu)
-{
- set_migration_target_nodes();
- return 0;
-}
-
-static int migration_offline_cpu(unsigned int cpu)
-{
- set_migration_target_nodes();
- return 0;
-}
-
+#if defined(CONFIG_MEMORY_HOTPLUG)
/*
* This leaves migrate-on-reclaim transiently disabled between
* the MEM_GOING_OFFLINE and MEM_OFFLINE events. This runs
@@ -3238,8 +3220,18 @@ static int migration_offline_cpu(unsigned int cpu)
* set_migration_target_nodes().
*/
static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
- unsigned long action, void *arg)
+ unsigned long action, void *_arg)
{
+ struct memory_notify *arg = _arg;
+
+ /*
+ * Only update the node migration order when a node is
+ * changing status, like online->offline. This avoids
+ * the overhead of synchronize_rcu() in most cases.
+ */
+ if (arg->status_change_nid < 0)
+ return notifier_from_errno(0);
+
switch (action) {
case MEM_GOING_OFFLINE:
/*
@@ -3272,6 +3264,27 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
return notifier_from_errno(0);
}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * React to hotplug events that might affect the migration targets
+ * like events that online or offline NUMA nodes.
+ *
+ * The ordering is also currently dependent on which nodes have
+ * CPUs. That means we need CPU on/offline notification too.
+ */
+static int migration_online_cpu(unsigned int cpu)
+{
+ set_migration_target_nodes();
+ return 0;
+}
+
+static int migration_offline_cpu(unsigned int cpu)
+{
+ set_migration_target_nodes();
+ return 0;
+}
static int __init migrate_on_reclaim_init(void)
{
@@ -3292,4 +3305,5 @@ static int __init migrate_on_reclaim_init(void)
return 0;
}
late_initcall(migrate_on_reclaim_init);
-#endif /* CONFIG_MEMORY_HOTPLUG */
+#endif /* CONFIG_HOTPLUG_CPU */
+#endif /* CONFIG_MEMORY_HOTPLUG || CONFIG_HOTPLUG_CPU */
diff --git a/mm/mmap.c b/mm/mmap.c
index 88dcc5c252255f..b22a07f5e76145 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3332,7 +3332,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
{
- mm->total_vm += npages;
+ WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages);
if (is_exec_mapping(flags))
mm->exec_vm += npages;
diff --git a/mm/mremap.c b/mm/mremap.c
index badfe17ade1f06..c0b6c41b7b78f6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -565,6 +565,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
bool *locked, unsigned long flags,
struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
+ long to_account = new_len - old_len;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
unsigned long vm_flags = vma->vm_flags;
@@ -583,6 +584,9 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (mm->map_count >= sysctl_max_map_count - 3)
return -ENOMEM;
+ if (unlikely(flags & MREMAP_DONTUNMAP))
+ to_account = new_len;
+
if (vma->vm_ops && vma->vm_ops->may_split) {
if (vma->vm_start != old_addr)
err = vma->vm_ops->may_split(vma, old_addr);
@@ -604,8 +608,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (err)
return err;
- if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) {
- if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT))
+ if (vm_flags & VM_ACCOUNT) {
+ if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
return -ENOMEM;
}
@@ -613,8 +617,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
&need_rmap_locks);
if (!new_vma) {
- if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT))
- vm_unacct_memory(new_len >> PAGE_SHIFT);
+ if (vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(to_account >> PAGE_SHIFT);
return -ENOMEM;
}
@@ -708,8 +712,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
}
static struct vm_area_struct *vma_to_resize(unsigned long addr,
- unsigned long old_len, unsigned long new_len, unsigned long flags,
- unsigned long *p)
+ unsigned long old_len, unsigned long new_len, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
@@ -768,13 +771,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
(new_len - old_len) >> PAGE_SHIFT))
return ERR_PTR(-ENOMEM);
- if (vma->vm_flags & VM_ACCOUNT) {
- unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
- if (security_vm_enough_memory_mm(mm, charged))
- return ERR_PTR(-ENOMEM);
- *p = charged;
- }
-
return vma;
}
@@ -787,7 +783,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
- unsigned long charged = 0;
unsigned long map_flags = 0;
if (offset_in_page(new_addr))
@@ -830,7 +825,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
old_len = new_len;
}
- vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
+ vma = vma_to_resize(addr, old_len, new_len, flags);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
@@ -853,7 +848,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
((addr - vma->vm_start) >> PAGE_SHIFT),
map_flags);
if (IS_ERR_VALUE(ret))
- goto out1;
+ goto out;
/* We got a new mapping */
if (!(flags & MREMAP_FIXED))
@@ -862,12 +857,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
uf_unmap);
- if (!(offset_in_page(ret)))
- goto out;
-
-out1:
- vm_unacct_memory(charged);
-
out:
return ret;
}
@@ -899,7 +888,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
- unsigned long charged = 0;
bool locked = false;
bool downgraded = false;
struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
@@ -981,7 +969,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Ok, we need to grow..
*/
- vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
+ vma = vma_to_resize(addr, old_len, new_len, flags);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
@@ -992,10 +980,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (old_len == vma->vm_end - addr) {
/* can we just expand the current mapping? */
if (vma_expandable(vma, new_len - old_len)) {
- int pages = (new_len - old_len) >> PAGE_SHIFT;
+ long pages = (new_len - old_len) >> PAGE_SHIFT;
+
+ if (vma->vm_flags & VM_ACCOUNT) {
+ if (security_vm_enough_memory_mm(mm, pages)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
if (vma_adjust(vma, vma->vm_start, addr + new_len,
vma->vm_pgoff, NULL)) {
+ vm_unacct_memory(pages);
ret = -ENOMEM;
goto out;
}
@@ -1034,10 +1030,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
&locked, flags, &uf, &uf_unmap);
}
out:
- if (offset_in_page(ret)) {
- vm_unacct_memory(charged);
+ if (offset_in_page(ret))
locked = false;
- }
if (downgraded)
mmap_read_unlock(current->mm);
else
diff --git a/mm/nommu.c b/mm/nommu.c
index 41ef204e748202..55a9e48a7a0268 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1638,12 +1638,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
-unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
-{
- return -ENOMEM;
-}
-
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
BUG();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 831340e7ad8b47..ea2525a76a4a32 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -234,8 +234,11 @@ long oom_badness(struct task_struct *p, unsigned long totalpages)
mm_pgtables_bytes(p->mm) / PAGE_SIZE;
task_unlock(p);
- /* Normalize to oom_score_adj units */
- adj *= totalpages / 1000;
+ /*
+ * Normalize to oom_score_adj units. Never multiply adj by zero here
+ * (totalpages may be below 1000), or oom_score_adj would have no effect.
+ */
+ adj *= (totalpages + 1000) / 1000;
points += adj;
return points;
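A tiny sketch (not part of the patch) showing why the +1000 matters on machines with fewer than 1000 pages of usable memory:

#include <stdio.h>

int main(void)
{
	unsigned long totalpages = 600;		/* e.g. a tiny VM */
	long adj = 500;				/* oom_score_adj */

	printf("old factor: %lu -> adj %ld\n",
	       totalpages / 1000, adj * (long)(totalpages / 1000));
	printf("new factor: %lu -> adj %ld\n",
	       (totalpages + 1000) / 1000,
	       adj * (long)((totalpages + 1000) / 1000));
	return 0;	/* old factor is 0 (adj wiped out), new factor is 1 */
}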
@@ -641,6 +644,8 @@ done:
static int oom_reaper(void *unused)
{
+ set_freezable();
+
while (true) {
struct task_struct *tsk = NULL;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 544ff5a11cb896..668edb16446a4c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -677,10 +677,8 @@ static inline int pindex_to_order(unsigned int pindex)
int order = pindex / MIGRATE_PCPTYPES;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (order > PAGE_ALLOC_COSTLY_ORDER) {
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
order = pageblock_order;
- VM_BUG_ON(order != pageblock_order);
- }
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif
@@ -1428,14 +1426,8 @@ static inline void prefetch_buddy(struct page *page)
/*
* Frees a number of pages from the PCP lists
- * Assumes all pages on list are in same zone, and of same order.
+ * Assumes all pages on list are in same zone.
* count is the number of pages to free.
- *
- * If the zone was previously in an "all pages pinned" state then look to
- * see if this freeing clears that state.
- *
- * And clear the zone's pages_scanned counter, to hold off the "all pages are
- * pinned" detection logic.
*/
static void free_pcppages_bulk(struct zone *zone, int count,
struct per_cpu_pages *pcp)
@@ -1589,7 +1581,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
struct zone *zone = &pgdat->node_zones[zid];
- if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
+ if (zone_spans_pfn(zone, pfn))
break;
}
__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
@@ -3966,6 +3958,8 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
}
#ifdef CONFIG_NUMA
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
+
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
@@ -4914,6 +4908,19 @@ retry_cpuset:
if (!ac->preferred_zoneref->zone)
goto nopage;
+ /*
+ * Check for insane configurations where the cpuset doesn't contain
+ * any suitable zone to satisfy the request - e.g. non-movable
+ * GFP_HIGHUSER allocations from MOVABLE nodes only.
+ */
+ if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
+ struct zoneref *z = first_zones_zonelist(ac->zonelist,
+ ac->highest_zoneidx,
+ &cpuset_current_mems_allowed);
+ if (!z->zone)
+ goto nopage;
+ }
+
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
@@ -5624,8 +5631,8 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
unsigned int order = get_order(size);
unsigned long addr;
- if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
- gfp_mask &= ~__GFP_COMP;
+ if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
+ gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
addr = __get_free_pages(gfp_mask, order);
return make_alloc_exact(addr, order, size);
@@ -5649,8 +5656,8 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
unsigned int order = get_order(size);
struct page *p;
- if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
- gfp_mask &= ~__GFP_COMP;
+ if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
+ gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
p = alloc_pages_node(nid, gfp_mask, order);
if (!p)
@@ -5992,6 +5999,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
printk(KERN_CONT
"%s"
" free:%lukB"
+ " boost:%lukB"
" min:%lukB"
" low:%lukB"
" high:%lukB"
@@ -6012,6 +6020,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
"\n",
zone->name,
K(zone_page_state(zone, NR_FREE_PAGES)),
+ K(zone->watermark_boost),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
@@ -6267,7 +6276,7 @@ static void build_zonelists(pg_data_t *pgdat)
*/
if (node_distance(local_node, node) !=
node_distance(local_node, prev_node))
- node_load[node] = load;
+ node_load[node] += load;
node_order[nr_nodes++] = node;
prev_node = node;
@@ -6276,6 +6285,10 @@ static void build_zonelists(pg_data_t *pgdat)
build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
build_thisnode_zonelists(pgdat);
+ pr_info("Fallback order for Node %d: ", local_node);
+ for (node = 0; node < nr_nodes; node++)
+ pr_cont("%d ", node_order[node]);
+ pr_cont("\n");
}
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
@@ -8138,8 +8151,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
}
if (pages && s)
- pr_info("Freeing %s memory: %ldK\n",
- s, pages << (PAGE_SHIFT - 10));
+ pr_info("Freeing %s memory: %ldK\n", s, K(pages));
return pages;
}
@@ -8184,14 +8196,13 @@ void __init mem_init_print_info(void)
", %luK highmem"
#endif
")\n",
- nr_free_pages() << (PAGE_SHIFT - 10),
- physpages << (PAGE_SHIFT - 10),
+ K(nr_free_pages()), K(physpages),
codesize >> 10, datasize >> 10, rosize >> 10,
(init_data_size + init_code_size) >> 10, bss_size >> 10,
- (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
- totalcma_pages << (PAGE_SHIFT - 10)
+ K(physpages - totalram_pages() - totalcma_pages),
+ K(totalcma_pages)
#ifdef CONFIG_HIGHMEM
- , totalhigh_pages() << (PAGE_SHIFT - 10)
+ , K(totalhigh_pages())
#endif
);
}
@@ -8768,7 +8779,8 @@ void *__init alloc_large_system_hash(const char *tablename,
} else if (get_order(size) >= MAX_ORDER || hashdist) {
table = __vmalloc(size, gfp_flags);
virt = true;
- huge = is_vm_area_hugepages(table);
+ if (table)
+ huge = is_vm_area_hugepages(table);
} else {
/*
* If bucketsize is not a power-of-two, we may free
@@ -9365,21 +9377,21 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
}
#endif
+/*
+ * This function returns a stable result only if called under zone lock.
+ */
bool is_free_buddy_page(struct page *page)
{
- struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
- unsigned long flags;
unsigned int order;
- spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
- if (PageBuddy(page_head) && buddy_order(page_head) >= order)
+ if (PageBuddy(page_head) &&
+ buddy_order_unsafe(page_head) >= order)
break;
}
- spin_unlock_irqrestore(&zone->lock, flags);
return order < MAX_ORDER;
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index a95c2c6562d0f0..f67c4c70f17f66 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -94,8 +94,13 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
buddy = page + (buddy_pfn - pfn);
if (!is_migrate_isolate_page(buddy)) {
- __isolate_free_page(page, order);
- isolated_page = true;
+ isolated_page = !!__isolate_free_page(page, order);
+ /*
+ * Isolating a free page in an isolated pageblock
+ * is expected to always work as watermarks don't
+ * apply here.
+ */
+ VM_WARN_ON(!isolated_page);
}
}
}
@@ -183,7 +188,6 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned migratetype, int flags)
{
unsigned long pfn;
- unsigned long undo_pfn;
struct page *page;
BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
@@ -193,25 +197,12 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
pfn < end_pfn;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
- if (page) {
- if (set_migratetype_isolate(page, migratetype, flags)) {
- undo_pfn = pfn;
- goto undo;
- }
+ if (page && set_migratetype_isolate(page, migratetype, flags)) {
+ undo_isolate_page_range(start_pfn, pfn, migratetype);
+ return -EBUSY;
}
}
return 0;
-undo:
- for (pfn = start_pfn;
- pfn < undo_pfn;
- pfn += pageblock_nr_pages) {
- struct page *page = pfn_to_online_page(pfn);
- if (!page)
- continue;
- unset_migratetype_isolate(page, migratetype);
- }
-
- return -EBUSY;
}
/*
diff --git a/mm/page_owner.c b/mm/page_owner.c
index d24ed221357c93..a83f546c06b597 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -329,8 +329,6 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
depot_stack_handle_t handle)
{
int ret, pageblock_mt, page_mt;
- unsigned long *entries;
- unsigned int nr_entries;
char *kbuf;
count = min_t(size_t, count, PAGE_SIZE);
@@ -361,8 +359,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
if (ret >= count)
goto err;
- nr_entries = stack_depot_fetch(handle, &entries);
- ret += stack_trace_snprint(kbuf + ret, count - ret, entries, nr_entries, 0);
+ ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
if (ret >= count)
goto err;
@@ -394,8 +391,6 @@ void __dump_page_owner(const struct page *page)
struct page_ext *page_ext = lookup_page_ext(page);
struct page_owner *page_owner;
depot_stack_handle_t handle;
- unsigned long *entries;
- unsigned int nr_entries;
gfp_t gfp_mask;
int mt;
@@ -423,20 +418,17 @@ void __dump_page_owner(const struct page *page)
page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec);
handle = READ_ONCE(page_owner->handle);
- if (!handle) {
+ if (!handle)
pr_alert("page_owner allocation stack trace missing\n");
- } else {
- nr_entries = stack_depot_fetch(handle, &entries);
- stack_trace_print(entries, nr_entries, 0);
- }
+ else
+ stack_depot_print(handle);
handle = READ_ONCE(page_owner->free_handle);
if (!handle) {
pr_alert("page_owner free stack trace missing\n");
} else {
- nr_entries = stack_depot_fetch(handle, &entries);
pr_alert("page last free stack trace:\n");
- stack_trace_print(entries, nr_entries, 0);
+ stack_depot_print(handle);
}
if (page_owner->last_migrate_reason != -1)
diff --git a/mm/rmap.c b/mm/rmap.c
index 3a1059c284c379..25545c3dfb2f2b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -89,7 +89,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
if (anon_vma) {
- atomic_set(&anon_vma->refcount, 1);
+ refcount_set(&anon_vma->refcount, 1);
anon_vma->degree = 1; /* Reference for first vma */
anon_vma->parent = anon_vma;
/*
@@ -104,7 +104,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
static inline void anon_vma_free(struct anon_vma *anon_vma)
{
- VM_BUG_ON(atomic_read(&anon_vma->refcount));
+ VM_BUG_ON(refcount_read(&anon_vma->refcount));
/*
* Synchronize against page_lock_anon_vma_read() such that
@@ -446,7 +446,7 @@ static void anon_vma_ctor(void *data)
struct anon_vma *anon_vma = data;
init_rwsem(&anon_vma->rwsem);
- atomic_set(&anon_vma->refcount, 0);
+ refcount_set(&anon_vma->refcount, 0);
anon_vma->rb_root = RB_ROOT_CACHED;
}
@@ -496,7 +496,7 @@ struct anon_vma *page_get_anon_vma(struct page *page)
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
- if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+ if (!refcount_inc_not_zero(&anon_vma->refcount)) {
anon_vma = NULL;
goto out;
}
@@ -555,7 +555,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
}
/* trylock failed, we got to sleep */
- if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+ if (!refcount_inc_not_zero(&anon_vma->refcount)) {
anon_vma = NULL;
goto out;
}
@@ -570,7 +570,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
rcu_read_unlock();
anon_vma_lock_read(anon_vma);
- if (atomic_dec_and_test(&anon_vma->refcount)) {
+ if (refcount_dec_and_test(&anon_vma->refcount)) {
/*
* Oops, we held the last refcount, release the lock
* and bail -- can't simply use put_anon_vma() because
@@ -2224,7 +2224,7 @@ void __put_anon_vma(struct anon_vma *anon_vma)
struct anon_vma *root = anon_vma->root;
anon_vma_free(anon_vma);
- if (root != anon_vma && atomic_dec_and_test(&root->refcount))
+ if (root != anon_vma && refcount_dec_and_test(&root->refcount))
anon_vma_free(root);
}
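The hunks above are a mechanical atomic_t to refcount_t conversion of anon_vma->refcount; each atomic_* call maps directly onto its refcount_* counterpart. As a reminder of what the switch buys, here is a small stand-alone sketch (struct and function names are invented for illustration): refcount_t saturates and warns on over/underflow instead of silently wrapping into a use-after-free.

#include <linux/refcount.h>
#include <linux/slab.h>

struct demo_obj {
	refcount_t ref;
};

static void demo_obj_get(struct demo_obj *obj)
{
	/* WARNs and saturates if the count was already zero or would overflow. */
	refcount_inc(&obj->ref);
}

static void demo_obj_put(struct demo_obj *obj)
{
	/* True only for the final reference, mirroring __put_anon_vma() above. */
	if (refcount_dec_and_test(&obj->ref))
		kfree(obj);
}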
diff --git a/mm/shmem.c b/mm/shmem.c
index 17e344e26e7360..31ab952377bb3b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -855,9 +855,8 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma)
return swapped << PAGE_SHIFT;
/* Here comes the more involved part */
- return shmem_partial_swap_usage(mapping,
- linear_page_index(vma, vma->vm_start),
- linear_page_index(vma, vma->vm_end));
+ return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
+ vma->vm_pgoff + vma_pages(vma));
}
/*
@@ -2426,7 +2425,6 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
- SetPageDirty(page);
unlock_page(page);
return 0;
out_delete_from_cache:
diff --git a/mm/slab.c b/mm/slab.c
index d0f72563766300..4d826394ffcbff 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4207,19 +4207,6 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
n <= cachep->useroffset - offset + cachep->usersize)
return;
- /*
- * If the copy is still within the allocated object, produce
- * a warning instead of rejecting the copy. This is intended
- * to be a temporary method to find any missing usercopy
- * whitelists.
- */
- if (usercopy_fallback &&
- offset <= cachep->object_size &&
- n <= cachep->object_size - offset) {
- usercopy_warn("SLAB object", cachep->name, to_user, offset, n);
- return;
- }
-
usercopy_abort("SLAB object", cachep->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index ec2bb0beed757c..e5d080a9300933 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -37,14 +37,6 @@ LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
-#ifdef CONFIG_HARDENED_USERCOPY
-bool usercopy_fallback __ro_after_init =
- IS_ENABLED(CONFIG_HARDENED_USERCOPY_FALLBACK);
-module_param(usercopy_fallback, bool, 0400);
-MODULE_PARM_DESC(usercopy_fallback,
- "WARN instead of reject usercopy whitelist violations");
-#endif
-
static LIST_HEAD(slab_caches_to_rcu_destroy);
static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
diff --git a/mm/slub.c b/mm/slub.c
index 3d2025f7163b29..157973e22fafd7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1701,7 +1701,8 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s,
}
static inline bool slab_free_freelist_hook(struct kmem_cache *s,
- void **head, void **tail)
+ void **head, void **tail,
+ int *cnt)
{
void *object;
@@ -1728,6 +1729,12 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
*head = object;
if (!*tail)
*tail = object;
+ } else {
+ /*
+ * Adjust the reconstructed freelist depth
+ * accordingly if the object's reuse is delayed.
+ */
+ --(*cnt);
}
} while (object != old_tail);
@@ -3413,7 +3420,9 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
struct kmem_cache_cpu *c;
unsigned long tid;
- memcg_slab_free_hook(s, &head, 1);
+ /* memcg_slab_free_hook() is already called for bulk free. */
+ if (!tail)
+ memcg_slab_free_hook(s, &head, 1);
redo:
/*
* Determine the current cpu's per-cpu slab.
@@ -3480,7 +3489,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
* With KASAN enabled slab_free_freelist_hook modifies the freelist
* to remove objects, whose reuse must be delayed.
*/
- if (slab_free_freelist_hook(s, &head, &tail))
+ if (slab_free_freelist_hook(s, &head, &tail, &cnt))
do_slab_free(s, page, head, tail, cnt, addr);
}
@@ -3513,7 +3522,11 @@ static inline void free_nonslab_page(struct page *page, void *object)
{
unsigned int order = compound_order(page);
- VM_BUG_ON_PAGE(!PageCompound(page), page);
+ if (WARN_ON(!PageCompound(page))) {
+ dump_page(page, "invalid free nonslab page");
+ pr_warn("object pointer: 0x%p\n", object);
+ }
+
kfree_hook(object);
mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order));
__free_pages(page, order);
@@ -4203,8 +4216,8 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
if (alloc_kmem_cache_cpus(s))
return 0;
- free_kmem_cache_nodes(s);
error:
+ __kmem_cache_release(s);
return -EINVAL;
}
@@ -4457,7 +4470,6 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
{
struct kmem_cache *s;
unsigned int offset;
- size_t object_size;
bool is_kfence = is_kfence_address(ptr);
ptr = kasan_reset_tag(ptr);
@@ -4490,19 +4502,6 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
n <= s->useroffset - offset + s->usersize)
return;
- /*
- * If the copy is still within the allocated object, produce
- * a warning instead of rejecting the copy. This is intended
- * to be a temporary method to find any missing usercopy
- * whitelists.
- */
- object_size = slab_ksize(s);
- if (usercopy_fallback &&
- offset <= object_size && n <= object_size - offset) {
- usercopy_warn("SLUB object", s->name, to_user, offset, n);
- return;
- }
-
usercopy_abort("SLUB object", s->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
@@ -4880,13 +4879,15 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
return 0;
err = sysfs_slab_add(s);
- if (err)
+ if (err) {
__kmem_cache_release(s);
+ return err;
+ }
if (s->flags & SLAB_STORE_USER)
debugfs_slab_add(s);
- return err;
+ return 0;
}
void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
@@ -6108,9 +6109,14 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep)
struct kmem_cache *s = file_inode(filep)->i_private;
unsigned long *obj_map;
+ if (!t)
+ return -ENOMEM;
+
obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
- if (!obj_map)
+ if (!obj_map) {
+ seq_release_private(inode, filep);
return -ENOMEM;
+ }
if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
alloc = TRACK_ALLOC;
@@ -6119,6 +6125,7 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep)
if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
bitmap_free(obj_map);
+ seq_release_private(inode, filep);
return -ENOMEM;
}
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index bdce883f92863e..db6df27c852a7e 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -76,7 +76,7 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
set_pte_at(&init_mm, addr, pte, entry);
}
- /* Make pte visible before pmd. See comment in __pte_alloc(). */
+ /* Make pte visible before pmd. See comment in pmd_install(). */
smp_wmb();
pmd_populate_kernel(&init_mm, pmd, pgtable);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 41c9e92f1f004f..ca90d7fc894781 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3118,7 +3118,7 @@ static bool swap_discardable(struct swap_info_struct *si)
{
struct request_queue *q = bdev_get_queue(si->bdev);
- if (!q || !blk_queue_discard(q))
+ if (!blk_queue_discard(q))
return false;
return true;
diff --git a/mm/truncate.c b/mm/truncate.c
index 714eaf19821d7f..cc83a3f7c1ad34 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -45,9 +45,13 @@ static inline void __clear_shadow_entry(struct address_space *mapping,
static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
void *entry)
{
+ spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
__clear_shadow_entry(mapping, index, entry);
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
}
/*
@@ -73,8 +77,10 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
return;
dax = dax_mapping(mapping);
- if (!dax)
+ if (!dax) {
+ spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
+ }
for (i = j; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
@@ -93,8 +99,12 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
__clear_shadow_entry(mapping, index, page);
}
- if (!dax)
+ if (!dax) {
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
+ }
pvec->nr = j;
}
@@ -567,6 +577,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
+ spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
if (PageDirty(page))
goto failed;
@@ -574,6 +585,9 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
BUG_ON(page_has_private(page));
__delete_from_page_cache(page, NULL);
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
@@ -582,6 +596,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
return 1;
failed:
xa_unlock_irq(&mapping->i_pages);
+ spin_unlock(&mapping->host->i_lock);
return 0;
}
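Every hunk in this file follows the same locking pattern: take the host inode's i_lock before the i_pages xarray lock, and after shrinking the page cache put the inode back on the LRU so the inode shrinker can find it again. A hedged restatement of that recurring pattern, where drop_cache_entry() is a made-up name and the body mirrors what clear_shadow_entry() above now does:

static void drop_cache_entry(struct address_space *mapping, pgoff_t index,
			     void *entry)
{
	spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);

	__clear_shadow_entry(mapping, index, entry);

	xa_unlock_irq(&mapping->i_pages);
	if (mapping_shrinkable(mapping))	/* cache small enough to matter */
		inode_add_lru(mapping->host);	/* let the inode shrinker see it */
	spin_unlock(&mapping->host->i_lock);
}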
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 36e5f6ab976f0d..ac6f036298cd6f 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -69,10 +69,9 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
pgoff_t offset, max_off;
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+ _dst_pte = pte_mkdirty(_dst_pte);
if (page_in_cache && !vm_shared)
writable = false;
- if (writable || !page_in_cache)
- _dst_pte = pte_mkdirty(_dst_pte);
if (writable) {
if (wp_copy)
_dst_pte = pte_mkuffd_wp(_dst_pte);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d77830ff604cab..48e717626e94fe 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2743,6 +2743,13 @@ void *vmap(struct page **pages, unsigned int count,
might_sleep();
+ /*
+ * Your top guard is someone else's bottom guard. Not having a top
+ * guard compromises someone else's mappings too.
+ */
+ if (WARN_ON_ONCE(flags & VM_NO_GUARD))
+ flags &= ~VM_NO_GUARD;
+
if (count > totalram_pages())
return NULL;
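With this hunk, a vmap() caller can no longer opt out of the trailing guard page: VM_NO_GUARD triggers a one-time warning and is stripped. A hypothetical caller (function name invented for illustration) therefore always ends up with a guarded mapping:

static void *map_pages_guarded(struct page **pages, unsigned int count)
{
	/*
	 * Even if VM_NO_GUARD were ORed into the flags here, vmap() would now
	 * WARN once and clear it, so the area keeps its guard page.
	 */
	return vmap(pages, count, VM_MAP, PAGE_KERNEL);
}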
@@ -2816,6 +2823,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
unsigned int order, unsigned int nr_pages, struct page **pages)
{
unsigned int nr_allocated = 0;
+ struct page *page;
+ int i;
/*
* For order-0 pages we make use of bulk allocator, if
@@ -2826,6 +2835,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
if (!order) {
while (nr_allocated < nr_pages) {
unsigned int nr, nr_pages_request;
+ page = NULL;
/*
* A maximum allowed request is hard-coded and is 100
@@ -2835,9 +2845,23 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
*/
nr_pages_request = min(100U, nr_pages - nr_allocated);
- nr = alloc_pages_bulk_array_node(gfp, nid,
- nr_pages_request, pages + nr_allocated);
-
+ if (nid == NUMA_NO_NODE) {
+ for (i = 0; i < nr_pages_request; i++) {
+ page = alloc_page(gfp);
+ if (page)
+ pages[nr_allocated + i] = page;
+ else {
+ nr = i;
+ break;
+ }
+ }
+ if (i >= nr_pages_request)
+ nr = nr_pages_request;
+ } else {
+ nr = alloc_pages_bulk_array_node(gfp, nid,
+ nr_pages_request,
+ pages + nr_allocated);
+ }
nr_allocated += nr;
cond_resched();
@@ -2856,11 +2880,13 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
gfp |= __GFP_COMP;
/* High-order pages or fallback path if "bulk" fails. */
- while (nr_allocated < nr_pages) {
- struct page *page;
- int i;
- page = alloc_pages_node(nid, gfp, order);
+ page = NULL;
+ while (nr_allocated < nr_pages) {
+ if (nid == NUMA_NO_NODE)
+ page = alloc_pages(gfp, order);
+ else
+ page = alloc_pages_node(nid, gfp, order);
if (unlikely(!page))
break;
@@ -2884,6 +2910,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
int node)
{
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
+ const gfp_t orig_gfp_mask = gfp_mask;
unsigned long addr = (unsigned long)area->addr;
unsigned long size = get_vm_area_size(area);
unsigned long array_size;
@@ -2904,7 +2931,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
}
if (!area->pages) {
- warn_alloc(gfp_mask, NULL,
+ warn_alloc(orig_gfp_mask, NULL,
"vmalloc error: size %lu, failed to allocated page array size %lu",
nr_small_pages * PAGE_SIZE, array_size);
free_vm_area(area);
@@ -2924,7 +2951,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
* allocation request, free them via __vfree() if any.
*/
if (area->nr_pages != nr_small_pages) {
- warn_alloc(gfp_mask, NULL,
+ warn_alloc(orig_gfp_mask, NULL,
"vmalloc error: size %lu, page order %u, failed to allocate pages",
area->nr_pages * PAGE_SIZE, page_order);
goto fail;
@@ -2932,7 +2959,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
if (vmap_pages_range(addr, addr + size, prot, area->pages,
page_shift) < 0) {
- warn_alloc(gfp_mask, NULL,
+ warn_alloc(orig_gfp_mask, NULL,
"vmalloc error: size %lu, failed to map pages",
area->nr_pages * PAGE_SIZE);
goto fail;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 71f178f85f5b0b..7cf702bc195977 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1120,6 +1120,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
+ if (!PageSwapCache(page))
+ spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
/*
* The non racy check for a busy page.
@@ -1188,6 +1190,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
shadow = workingset_eviction(page, target_memcg);
__delete_from_page_cache(page, shadow);
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
if (freepage != NULL)
freepage(page);
@@ -1197,6 +1202,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
cannot_free:
xa_unlock_irq(&mapping->i_pages);
+ if (!PageSwapCache(page))
+ spin_unlock(&mapping->host->i_lock);
return 0;
}
@@ -1352,7 +1359,6 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
{
int target_nid = next_demotion_node(pgdat->node_id);
unsigned int nr_succeeded;
- int err;
if (list_empty(demote_pages))
return 0;
@@ -1361,7 +1367,7 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
return 0;
/* Demotion ignores all cpuset and mempolicy settings */
- err = migrate_pages(demote_pages, alloc_demote_page, NULL,
+ migrate_pages(demote_pages, alloc_demote_page, NULL,
target_nid, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8ce2620344b2f8..354bc732efd962 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -165,6 +165,34 @@ atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_zone_stat);
EXPORT_SYMBOL(vm_node_stat);
+#ifdef CONFIG_NUMA
+static void fold_vm_zone_numa_events(struct zone *zone)
+{
+ unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
+ int cpu;
+ enum numa_stat_item item;
+
+ for_each_online_cpu(cpu) {
+ struct per_cpu_zonestat *pzstats;
+
+ pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+ for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+ zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
+ }
+
+ for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+ zone_numa_event_add(zone_numa_events[item], zone, item);
+}
+
+void fold_vm_numa_events(void)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone)
+ fold_vm_zone_numa_events(zone);
+}
+#endif
+
#ifdef CONFIG_SMP
int calculate_pressure_threshold(struct zone *zone)
@@ -771,34 +799,6 @@ static int fold_diff(int *zone_diff, int *node_diff)
return changes;
}
-#ifdef CONFIG_NUMA
-static void fold_vm_zone_numa_events(struct zone *zone)
-{
- unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
- int cpu;
- enum numa_stat_item item;
-
- for_each_online_cpu(cpu) {
- struct per_cpu_zonestat *pzstats;
-
- pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
- for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
- zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
- }
-
- for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
- zone_numa_event_add(zone_numa_events[item], zone, item);
-}
-
-void fold_vm_numa_events(void)
-{
- struct zone *zone;
-
- for_each_populated_zone(zone)
- fold_vm_zone_numa_events(zone);
-}
-#endif
-
/*
* Update the zone counters for the current cpu.
*
@@ -1070,8 +1070,13 @@ static void fill_contig_page_info(struct zone *zone,
for (order = 0; order < MAX_ORDER; order++) {
unsigned long blocks;
- /* Count number of free blocks */
- blocks = zone->free_area[order].nr_free;
+ /*
+ * Count number of free blocks.
+ *
+ * Access to nr_free is lockless as nr_free is used only for
+ * diagnostic purposes. Use data_race to avoid KCSAN warning.
+ */
+ blocks = data_race(zone->free_area[order].nr_free);
info->free_blocks_total += blocks;
/* Count free base pages */
@@ -1445,7 +1450,11 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
for (order = 0; order < MAX_ORDER; ++order)
- seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+ /*
+ * Access to nr_free is lockless as nr_free is used only for
+ * printing purposes. Use data_race to avoid KCSAN warning.
+ */
+ seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
seq_putc(m, '\n');
}
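The two nr_free hunks above avoid taking zone->lock and instead annotate the read with data_race(), since the value is only reported, never acted on. A minimal sketch of the same idiom, assuming an illustrative reporting function that is not part of the patch:

static void report_free_blocks(struct zone *zone)
{
	unsigned int order;

	for (order = 0; order < MAX_ORDER; order++)
		/* Lockless read, acceptable for diagnostics; silences KCSAN. */
		pr_info("order %u: %lu free blocks\n", order,
			data_race(zone->free_area[order].nr_free));
}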
@@ -1656,6 +1665,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
}
seq_printf(m,
"\n pages free %lu"
+ "\n boost %lu"
"\n min %lu"
"\n low %lu"
"\n high %lu"
@@ -1664,6 +1674,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n managed %lu"
"\n cma %lu",
zone_page_state(zone, NR_FREE_PAGES),
+ zone->watermark_boost,
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
diff --git a/mm/workingset.c b/mm/workingset.c
index 109ab978251ac1..8c03afe1d67cb9 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -543,6 +543,13 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
goto out;
}
+ if (!spin_trylock(&mapping->host->i_lock)) {
+ xa_unlock(&mapping->i_pages);
+ spin_unlock_irq(lru_lock);
+ ret = LRU_RETRY;
+ goto out;
+ }
+
list_lru_isolate(lru, item);
__dec_lruvec_kmem_state(node, WORKINGSET_NODES);
@@ -562,6 +569,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
out_invalid:
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
ret = LRU_REMOVED_RETRY;
out:
cond_resched();
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 68e8831068f4b5..3e713ff6261eea 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1828,13 +1828,13 @@ static void putback_zspage_deferred(struct zs_pool *pool,
static inline void zs_pool_dec_isolated(struct zs_pool *pool)
{
VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
- atomic_long_dec(&pool->isolated_pages);
/*
- * There's no possibility of racing, since wait_for_isolated_drain()
- * checks the isolated count under &class->lock after enqueuing
- * on migration_wait.
+ * Checking pool->destroying must happen after atomic_long_dec()
+ * for pool->isolated_pages above. Paired with the smp_mb() in
+ * zs_unregister_migration().
*/
- if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
+ smp_mb__after_atomic();
+ if (atomic_long_dec_and_test(&pool->isolated_pages) && pool->destroying)
wake_up_all(&pool->migration_wait);
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 414c179c28e0dd..080dc5fd6deab3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -260,7 +260,6 @@
#include <linux/random.h>
#include <linux/memblock.h>
#include <linux/highmem.h>
-#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/time.h>
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2a7825a5b84254..ed6ebc3a1a8293 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -78,7 +78,6 @@
#include <asm/ioctls.h>
#include <linux/memblock.h>
#include <linux/highmem.h>
-#include <linux/swap.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index c25097092a060b..034e537bb55c96 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -24,7 +24,6 @@
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
-#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c
index 896b8f5bc88535..04a060ac7fdf6e 100644
--- a/net/openvswitch/meter.c
+++ b/net/openvswitch/meter.c
@@ -12,7 +12,6 @@
#include <linux/openvswitch.h>
#include <linux/netlink.h>
#include <linux/rculist.h>
-#include <linux/swap.h>
#include <net/netlink.h>
#include <net/genetlink.h>
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index ec0f52567c16ff..35928fefae3327 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -33,7 +33,6 @@
#include <linux/seq_file.h>
#include <linux/memblock.h>
#include <linux/highmem.h>
-#include <linux/swap.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index c27d2312cfc307..88cb294dc4472e 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -489,7 +489,8 @@ our $Attribute = qr{
____cacheline_aligned|
____cacheline_aligned_in_smp|
____cacheline_internodealigned_in_smp|
- __weak
+ __weak|
+ __alloc_size\s*\(\s*\d+\s*(?:,\s*\d+\s*)?\)
}x;
our $Modifier;
our $Inline = qr{inline|__always_inline|noinline|__inline|__inline__};
diff --git a/scripts/const_structs.checkpatch b/scripts/const_structs.checkpatch
index 1aae4f4fdacce4..3980985205a060 100644
--- a/scripts/const_structs.checkpatch
+++ b/scripts/const_structs.checkpatch
@@ -54,7 +54,11 @@ sd_desc
seq_operations
sirfsoc_padmux
snd_ac97_build_ops
+snd_pcm_ops
+snd_rawmidi_ops
snd_soc_component_driver
+snd_soc_dai_ops
+snd_soc_ops
soc_pcmcia_socket_ops
stacktrace_ops
sysfs_ops
diff --git a/scripts/spelling.txt b/scripts/spelling.txt
index 17fdc620d5488c..acf6ea71129921 100644
--- a/scripts/spelling.txt
+++ b/scripts/spelling.txt
@@ -178,6 +178,7 @@ assum||assume
assumtpion||assumption
asuming||assuming
asycronous||asynchronous
+asychronous||asynchronous
asynchnous||asynchronous
asynchromous||asynchronous
asymetric||asymmetric
@@ -241,6 +242,7 @@ beter||better
betweeen||between
bianries||binaries
bitmast||bitmask
+bitwiedh||bitwidth
boardcast||broadcast
borad||board
boundry||boundary
@@ -265,7 +267,10 @@ calucate||calculate
calulate||calculate
cancelation||cancellation
cancle||cancel
+cant||can't
+cant'||can't
canot||cannot
+cann't||can't
capabilites||capabilities
capabilties||capabilities
capabilty||capability
@@ -501,6 +506,7 @@ disble||disable
disgest||digest
disired||desired
dispalying||displaying
+dissable||disable
diplay||display
directon||direction
direcly||directly
@@ -595,6 +601,7 @@ exceded||exceeded
exceds||exceeds
exceeed||exceed
excellant||excellent
+exchnage||exchange
execeeded||exceeded
execeeds||exceeds
exeed||exceed
@@ -938,6 +945,7 @@ migrateable||migratable
milliseonds||milliseconds
minium||minimum
minimam||minimum
+minimun||minimum
miniumum||minimum
minumum||minimum
misalinged||misaligned
@@ -956,6 +964,7 @@ mmnemonic||mnemonic
mnay||many
modfiy||modify
modifer||modifier
+modul||module
modulues||modules
momery||memory
memomry||memory
@@ -1154,6 +1163,7 @@ programable||programmable
programers||programmers
programm||program
programms||programs
+progres||progress
progresss||progress
prohibitted||prohibited
prohibitting||prohibiting
@@ -1328,6 +1338,7 @@ servive||service
setts||sets
settting||setting
shapshot||snapshot
+shoft||shift
shotdown||shutdown
shoud||should
shouldnt||shouldn't
@@ -1439,6 +1450,7 @@ syfs||sysfs
symetric||symmetric
synax||syntax
synchonized||synchronized
+sychronization||synchronization
synchronuously||synchronously
syncronize||synchronize
syncronized||synchronized
@@ -1521,6 +1533,7 @@ unexpexted||unexpected
unfortunatelly||unfortunately
unifiy||unify
uniterrupted||uninterrupted
+uninterruptable||uninterruptible
unintialized||uninitialized
unitialized||uninitialized
unkmown||unknown
@@ -1553,6 +1566,7 @@ unuseful||useless
unvalid||invalid
upate||update
upsupported||unsupported
+useable||usable
usefule||useful
usefull||useful
usege||usage
@@ -1574,6 +1588,7 @@ varient||variant
vaule||value
verbse||verbose
veify||verify
+verfication||verification
veriosn||version
verisons||versions
verison||version
@@ -1586,6 +1601,7 @@ visiters||visitors
vitual||virtual
vunerable||vulnerable
wakeus||wakeups
+was't||wasn't
wathdog||watchdog
wating||waiting
wiat||wait
diff --git a/security/Kconfig b/security/Kconfig
index fe6c0395fa0251..0b847f435beb5a 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -163,20 +163,6 @@ config HARDENED_USERCOPY
or are part of the kernel text. This kills entire classes
of heap overflow exploits and similar kernel memory exposures.
-config HARDENED_USERCOPY_FALLBACK
- bool "Allow usercopy whitelist violations to fallback to object size"
- depends on HARDENED_USERCOPY
- default y
- help
- This is a temporary option that allows missing usercopy whitelists
- to be discovered via a WARN() to the kernel log, instead of
- rejecting the copy, falling back to non-whitelisted hardened
- usercopy that checks the slab allocation size instead of the
- whitelist size. This option will be removed once it seems like
- all missing usercopy whitelists have been identified and fixed.
- Booting with "slab_common.usercopy_fallback=Y/N" can change
- this setting.
-
config HARDENED_USERCOPY_PAGESPAN
bool "Refuse to copy allocations that span multiple pages"
depends on HARDENED_USERCOPY
diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c
index b61dcdb44c5bec..0e142d25b0316f 100644
--- a/tools/testing/selftests/vm/ksm_tests.c
+++ b/tools/testing/selftests/vm/ksm_tests.c
@@ -354,12 +354,34 @@ err_out:
return KSFT_FAIL;
}
+static int get_next_mem_node(int node)
+{
+
+ long node_size;
+ int mem_node = 0;
+ int i, max_node = numa_max_node();
+
+ for (i = node + 1; i <= max_node + node; i++) {
+ mem_node = i % (max_node + 1);
+ node_size = numa_node_size(mem_node, NULL);
+ if (node_size > 0)
+ break;
+ }
+ return mem_node;
+}
+
+static int get_first_mem_node(void)
+{
+ return get_next_mem_node(numa_max_node());
+}
+
static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes,
size_t page_size)
{
void *numa1_map_ptr, *numa2_map_ptr;
struct timespec start_time;
int page_count = 2;
+ int first_node;
if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
perror("clock_gettime");
@@ -370,7 +392,7 @@ static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_a
perror("NUMA support not enabled");
return KSFT_SKIP;
}
- if (numa_max_node() < 1) {
+ if (numa_num_configured_nodes() <= 1) {
printf("At least 2 NUMA nodes must be available\n");
return KSFT_SKIP;
}
@@ -378,8 +400,9 @@ static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_a
return KSFT_FAIL;
/* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */
- numa1_map_ptr = numa_alloc_onnode(page_size, 0);
- numa2_map_ptr = numa_alloc_onnode(page_size, 1);
+ first_node = get_first_mem_node();
+ numa1_map_ptr = numa_alloc_onnode(page_size, first_node);
+ numa2_map_ptr = numa_alloc_onnode(page_size, get_next_mem_node(first_node));
if (!numa1_map_ptr || !numa2_map_ptr) {
perror("numa_alloc_onnode");
return KSFT_FAIL;
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 10ab56c2484ae6..946e10c6312bcc 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -57,6 +57,7 @@
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
+#include <sys/random.h>
#include "../kselftest.h"
@@ -79,10 +80,6 @@ static int test_type;
#define ALARM_INTERVAL_SECS 10
static volatile bool test_uffdio_copy_eexist = true;
static volatile bool test_uffdio_zeropage_eexist = true;
-/* Whether to test uffd write-protection */
-static bool test_uffdio_wp = false;
-/* Whether to test uffd minor faults */
-static bool test_uffdio_minor = false;
static bool map_shared;
static int shm_fd;
@@ -90,6 +87,7 @@ static int huge_fd;
static char *huge_fd_off0;
static unsigned long long *count_verify;
static int uffd = -1;
+static uint64_t uffd_features;
static int uffd_flags, finished, *pipefd;
static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
static char *zeropage;
@@ -307,37 +305,24 @@ static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
}
struct uffd_test_ops {
- unsigned long expected_ioctls;
void (*allocate_area)(void **alloc_area);
void (*release_pages)(char *rel_area);
void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
};
-#define SHMEM_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \
- (1 << _UFFDIO_COPY) | \
- (1 << _UFFDIO_ZEROPAGE))
-
-#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \
- (1 << _UFFDIO_COPY) | \
- (1 << _UFFDIO_ZEROPAGE) | \
- (1 << _UFFDIO_WRITEPROTECT))
-
static struct uffd_test_ops anon_uffd_test_ops = {
- .expected_ioctls = ANON_EXPECTED_IOCTLS,
.allocate_area = anon_allocate_area,
.release_pages = anon_release_pages,
.alias_mapping = noop_alias_mapping,
};
static struct uffd_test_ops shmem_uffd_test_ops = {
- .expected_ioctls = SHMEM_EXPECTED_IOCTLS,
.allocate_area = shmem_allocate_area,
.release_pages = shmem_release_pages,
.alias_mapping = shmem_alias_mapping,
};
static struct uffd_test_ops hugetlb_uffd_test_ops = {
- .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC & ~(1 << _UFFDIO_CONTINUE),
.allocate_area = hugetlb_allocate_area,
.release_pages = hugetlb_release_pages,
.alias_mapping = hugetlb_alias_mapping,
@@ -345,7 +330,7 @@ static struct uffd_test_ops hugetlb_uffd_test_ops = {
static struct uffd_test_ops *uffd_test_ops;
-static void userfaultfd_open(uint64_t *features)
+static void userfaultfd_open(uint64_t features)
{
struct uffdio_api uffdio_api;
@@ -355,14 +340,61 @@ static void userfaultfd_open(uint64_t *features)
uffd_flags = fcntl(uffd, F_GETFD, NULL);
uffdio_api.api = UFFD_API;
- uffdio_api.features = *features;
+ uffdio_api.features = features;
if (ioctl(uffd, UFFDIO_API, &uffdio_api))
err("UFFDIO_API failed.\nPlease make sure to "
"run with either root or ptrace capability.");
if (uffdio_api.api != UFFD_API)
err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
- *features = uffdio_api.features;
+ uffd_features = uffdio_api.features;
+}
+
+static inline bool uffd_wp_supported(void)
+{
+ return test_type == TEST_ANON &&
+ (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
+}
+
+static inline uint64_t uffd_minor_feature(void)
+{
+ if (test_type == TEST_HUGETLB && map_shared)
+ return UFFD_FEATURE_MINOR_HUGETLBFS;
+ else if (test_type == TEST_SHMEM)
+ return UFFD_FEATURE_MINOR_SHMEM;
+ else
+ return 0;
+}
+
+static inline bool uffd_minor_supported(void)
+{
+ return uffd_features & uffd_minor_feature();
+}
+
+static unsigned long get_expected_ioctls(uint64_t mode)
+{
+ unsigned long ioctls = UFFD_API_RANGE_IOCTLS;
+
+ if (test_type == TEST_HUGETLB)
+ ioctls &= ~(1 << _UFFDIO_ZEROPAGE);
+
+ if (!((mode & UFFDIO_REGISTER_MODE_WP) && uffd_wp_supported()))
+ ioctls &= ~(1 << _UFFDIO_WRITEPROTECT);
+
+ if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && uffd_minor_supported()))
+ ioctls &= ~(1 << _UFFDIO_CONTINUE);
+
+ return ioctls;
+}
+
+static inline void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
+{
+ uint64_t expected = get_expected_ioctls(mode);
+ uint64_t actual = ioctls & expected;
+
+ if (actual != expected)
+ err("missing ioctl(s); expected: %"PRIx64" actual: %"PRIx64,
+ expected, actual);
}
static inline void munmap_area(void **area)
@@ -397,6 +429,7 @@ static void uffd_test_ctx_clear(void)
err("close uffd");
uffd = -1;
}
+ uffd_features = 0;
huge_fd_off0 = NULL;
munmap_area((void **)&area_src);
@@ -405,7 +438,7 @@ static void uffd_test_ctx_clear(void)
munmap_area((void **)&area_dst_alias);
}
-static void uffd_test_ctx_init_ext(uint64_t *features)
+static void uffd_test_ctx_init(uint64_t features)
{
unsigned long nr, cpu;
@@ -414,9 +447,6 @@ static void uffd_test_ctx_init_ext(uint64_t *features)
uffd_test_ops->allocate_area((void **)&area_src);
uffd_test_ops->allocate_area((void **)&area_dst);
- uffd_test_ops->release_pages(area_src);
- uffd_test_ops->release_pages(area_dst);
-
userfaultfd_open(features);
count_verify = malloc(nr_pages * sizeof(unsigned long long));
@@ -437,6 +467,26 @@ static void uffd_test_ctx_init_ext(uint64_t *features)
*(area_count(area_src, nr) + 1) = 1;
}
+ /*
+ * After initialization of area_src, we must explicitly release pages
+ * for area_dst to make sure it's fully empty. Otherwise we could have
+ * some area_dst pages erroneously initialized with zero pages,
+ * hence we could hit memory corruption later in the test.
+ *
+ * One example is when THP is globally enabled, above allocate_area()
+ * calls could have the two areas merged into a single VMA (as they
+ * will have the same VMA flags so they're mergeable). When we
+ * initialize the area_src above, it's possible that some part of
+ * area_dst could have been faulted in via one huge THP that will be
+ * shared between area_src and area_dst. It could cause accesses to parts
+ * of area_dst not to be trapped by missing userfaults.
+ *
+ * This release_pages() guarantees that even if that happened, we'll
+ * proactively split the thp and drop any accidentally initialized
+ * pages within area_dst.
+ */
+ uffd_test_ops->release_pages(area_dst);
+
pipefd = malloc(sizeof(int) * nr_cpus * 2);
if (!pipefd)
err("pipefd");
@@ -445,11 +495,6 @@ static void uffd_test_ctx_init_ext(uint64_t *features)
err("pipe");
}
-static inline void uffd_test_ctx_init(uint64_t features)
-{
- uffd_test_ctx_init_ext(&features);
-}
-
static int my_bcmp(char *str1, char *str2, size_t n)
{
unsigned long i;
@@ -501,22 +546,10 @@ static void continue_range(int ufd, __u64 start, __u64 len)
static void *locking_thread(void *arg)
{
unsigned long cpu = (unsigned long) arg;
- struct random_data rand;
unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
- int32_t rand_nr;
unsigned long long count;
- char randstate[64];
- unsigned int seed;
- if (bounces & BOUNCE_RANDOM) {
- seed = (unsigned int) time(NULL) - bounces;
- if (!(bounces & BOUNCE_RACINGFAULTS))
- seed += cpu;
- bzero(&rand, sizeof(rand));
- bzero(&randstate, sizeof(randstate));
- if (initstate_r(seed, randstate, sizeof(randstate), &rand))
- err("initstate_r failed");
- } else {
+ if (!(bounces & BOUNCE_RANDOM)) {
page_nr = -bounces;
if (!(bounces & BOUNCE_RACINGFAULTS))
page_nr += cpu * nr_pages_per_cpu;
@@ -524,15 +557,8 @@ static void *locking_thread(void *arg)
while (!finished) {
if (bounces & BOUNCE_RANDOM) {
- if (random_r(&rand, &rand_nr))
- err("random_r failed");
- page_nr = rand_nr;
- if (sizeof(page_nr) > sizeof(rand_nr)) {
- if (random_r(&rand, &rand_nr))
- err("random_r failed");
- page_nr |= (((unsigned long) rand_nr) << 16) <<
- 16;
- }
+ if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr))
+ err("getrandom failed");
} else
page_nr += 1;
page_nr %= nr_pages;
@@ -587,7 +613,7 @@ static int __copy_page(int ufd, unsigned long offset, bool retry)
uffdio_copy.dst = (unsigned long) area_dst + offset;
uffdio_copy.src = (unsigned long) area_src + offset;
uffdio_copy.len = page_size;
- if (test_uffdio_wp)
+ if (uffd_wp_supported())
uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
else
uffdio_copy.mode = 0;
@@ -778,7 +804,7 @@ static void *background_thread(void *arg)
* at least the first half of the pages mapped already which
* can be write-protected for testing
*/
- if (test_uffdio_wp)
+ if (uffd_wp_supported())
wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
nr_pages_per_cpu * page_size, true);
@@ -1013,11 +1039,9 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
{
struct uffdio_zeropage uffdio_zeropage;
int ret;
- unsigned long has_zeropage;
+ bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE);
__s64 res;
- has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);
-
if (offset >= nr_pages * page_size)
err("unexpected offset %lu", offset);
uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
@@ -1057,24 +1081,22 @@ static int uffdio_zeropage(int ufd, unsigned long offset)
static int userfaultfd_zeropage_test(void)
{
struct uffdio_register uffdio_register;
- unsigned long expected_ioctls;
printf("testing UFFDIO_ZEROPAGE: ");
fflush(stdout);
- uffd_test_ctx_init(0);
+ uffd_test_ctx_init(UFFD_FEATURE_PAGEFAULT_FLAG_WP);
uffdio_register.range.start = (unsigned long) area_dst;
uffdio_register.range.len = nr_pages * page_size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
- if (test_uffdio_wp)
+ if (uffd_wp_supported())
uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
err("register failure");
- expected_ioctls = uffd_test_ops->expected_ioctls;
- if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
- err("unexpected missing ioctl for anon memory");
+ assert_expected_ioctls_present(
+ uffdio_register.mode, uffdio_register.ioctls);
if (uffdio_zeropage(uffd, 0))
if (my_bcmp(area_dst, zeropage, page_size))
@@ -1087,9 +1109,8 @@ static int userfaultfd_zeropage_test(void)
static int userfaultfd_events_test(void)
{
struct uffdio_register uffdio_register;
- unsigned long expected_ioctls;
pthread_t uffd_mon;
- int err, features;
+ int err;
pid_t pid;
char c;
struct uffd_stats stats = { 0 };
@@ -1097,23 +1118,21 @@ static int userfaultfd_events_test(void)
printf("testing events (fork, remap, remove): ");
fflush(stdout);
- features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
- UFFD_FEATURE_EVENT_REMOVE;
- uffd_test_ctx_init(features);
+ uffd_test_ctx_init(UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
+ UFFD_FEATURE_EVENT_REMOVE | UFFD_FEATURE_PAGEFAULT_FLAG_WP);
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
uffdio_register.range.start = (unsigned long) area_dst;
uffdio_register.range.len = nr_pages * page_size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
- if (test_uffdio_wp)
+ if (uffd_wp_supported())
uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
err("register failure");
- expected_ioctls = uffd_test_ops->expected_ioctls;
- if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
- err("unexpected missing ioctl for anon memory");
+ assert_expected_ioctls_present(
+ uffdio_register.mode, uffdio_register.ioctls);
if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
err("uffd_poll_thread create");
@@ -1141,10 +1160,9 @@ static int userfaultfd_events_test(void)
static int userfaultfd_sig_test(void)
{
struct uffdio_register uffdio_register;
- unsigned long expected_ioctls;
unsigned long userfaults;
pthread_t uffd_mon;
- int err, features;
+ int err;
pid_t pid;
char c;
struct uffd_stats stats = { 0 };
@@ -1152,22 +1170,21 @@ static int userfaultfd_sig_test(void)
printf("testing signal delivery: ");
fflush(stdout);
- features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
- uffd_test_ctx_init(features);
+ uffd_test_ctx_init(UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_SIGBUS |
+ UFFD_FEATURE_PAGEFAULT_FLAG_WP);
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
uffdio_register.range.start = (unsigned long) area_dst;
uffdio_register.range.len = nr_pages * page_size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
- if (test_uffdio_wp)
+ if (uffd_wp_supported())
uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
err("register failure");
- expected_ioctls = uffd_test_ops->expected_ioctls;
- if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
- err("unexpected missing ioctl for anon memory");
+ assert_expected_ioctls_present(
+ uffdio_register.mode, uffdio_register.ioctls);
if (faulting_process(1))
err("faulting process failed");
@@ -1202,32 +1219,25 @@ static int userfaultfd_sig_test(void)
static int userfaultfd_minor_test(void)
{
struct uffdio_register uffdio_register;
- unsigned long expected_ioctls;
unsigned long p;
pthread_t uffd_mon;
uint8_t expected_byte;
void *expected_page;
char c;
struct uffd_stats stats = { 0 };
- uint64_t req_features, features_out;
-
- if (!test_uffdio_minor)
- return 0;
+ uint64_t features = uffd_minor_feature();
printf("testing minor faults: ");
fflush(stdout);
- if (test_type == TEST_HUGETLB)
- req_features = UFFD_FEATURE_MINOR_HUGETLBFS;
- else if (test_type == TEST_SHMEM)
- req_features = UFFD_FEATURE_MINOR_SHMEM;
- else
- return 1;
+ if (!features) {
+ printf("skipping test due to unsupported memory type\n");
+ fflush(stdout);
+ return 0;
+ }
- features_out = req_features;
- uffd_test_ctx_init_ext(&features_out);
- /* If kernel reports required features aren't supported, skip test. */
- if ((features_out & req_features) != req_features) {
+ uffd_test_ctx_init(features);
+ if (!uffd_minor_supported()) {
printf("skipping test due to lack of feature support\n");
fflush(stdout);
return 0;
@@ -1239,10 +1249,8 @@ static int userfaultfd_minor_test(void)
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
err("register failure");
- expected_ioctls = uffd_test_ops->expected_ioctls;
- expected_ioctls |= 1 << _UFFDIO_CONTINUE;
- if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
- err("unexpected missing ioctl(s)");
+ assert_expected_ioctls_present(
+ uffdio_register.mode, uffdio_register.ioctls);
/*
* After registering with UFFD, populate the non-UFFD-registered side of
@@ -1349,10 +1357,6 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize)
int pagemap_fd;
uint64_t value;
- /* Pagemap tests uffd-wp only */
- if (!test_uffdio_wp)
- return;
-
/* Not enough memory to test this page size */
if (test_pgsize > nr_pages * page_size)
return;
@@ -1361,7 +1365,12 @@ static void userfaultfd_pagemap_test(unsigned int test_pgsize)
/* Flush so it doesn't flush twice in parent/child later */
fflush(stdout);
- uffd_test_ctx_init(0);
+ uffd_test_ctx_init(UFFD_FEATURE_PAGEFAULT_FLAG_WP);
+ /* Pagemap tests uffd-wp only */
+ if (!uffd_wp_supported()) {
+ printf("skipping test due to lack of feature support\n");
+ return;
+ }
if (test_pgsize > page_size) {
/* This is a thp test */
@@ -1426,7 +1435,7 @@ static int userfaultfd_stress(void)
struct uffdio_register uffdio_register;
struct uffd_stats uffd_stats[nr_cpus];
- uffd_test_ctx_init(0);
+ uffd_test_ctx_init(UFFD_FEATURE_PAGEFAULT_FLAG_WP);
if (posix_memalign(&area, page_size, page_size))
err("out of memory");
@@ -1439,8 +1448,6 @@ static int userfaultfd_stress(void)
pthread_attr_setstacksize(&attr, 16*1024*1024);
while (bounces--) {
- unsigned long expected_ioctls;
-
printf("bounces: %d, mode:", bounces);
if (bounces & BOUNCE_RANDOM)
printf(" rnd");
@@ -1464,14 +1471,12 @@ static int userfaultfd_stress(void)
uffdio_register.range.start = (unsigned long) area_dst;
uffdio_register.range.len = nr_pages * page_size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
- if (test_uffdio_wp)
+ if (uffd_wp_supported())
uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
err("register failure");
- expected_ioctls = uffd_test_ops->expected_ioctls;
- if ((uffdio_register.ioctls & expected_ioctls) !=
- expected_ioctls)
- err("unexpected missing ioctl for anon memory");
+ assert_expected_ioctls_present(
+ uffdio_register.mode, uffdio_register.ioctls);
if (area_dst_alias) {
uffdio_register.range.start = (unsigned long)
@@ -1513,7 +1518,7 @@ static int userfaultfd_stress(void)
return 1;
/* Clear all the write protections if there is any */
- if (test_uffdio_wp)
+ if (uffd_wp_supported())
wp_range(uffd, (unsigned long)area_dst,
nr_pages * page_size, false);
@@ -1595,8 +1600,6 @@ static void set_test_type(const char *type)
if (!strcmp(type, "anon")) {
test_type = TEST_ANON;
uffd_test_ops = &anon_uffd_test_ops;
- /* Only enable write-protect test for anonymous test */
- test_uffdio_wp = true;
} else if (!strcmp(type, "hugetlb")) {
test_type = TEST_HUGETLB;
uffd_test_ops = &hugetlb_uffd_test_ops;
@@ -1604,13 +1607,10 @@ static void set_test_type(const char *type)
map_shared = true;
test_type = TEST_HUGETLB;
uffd_test_ops = &hugetlb_uffd_test_ops;
- /* Minor faults require shared hugetlb; only enable here. */
- test_uffdio_minor = true;
} else if (!strcmp(type, "shmem")) {
map_shared = true;
test_type = TEST_SHMEM;
uffd_test_ops = &shmem_uffd_test_ops;
- test_uffdio_minor = true;
} else {
err("Unknown test type: %s", type);
}
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c
index 0e75f22c94750b..9ebb84a9c73103 100644
--- a/tools/vm/page_owner_sort.c
+++ b/tools/vm/page_owner_sort.c
@@ -5,6 +5,8 @@
* Example use:
* cat /sys/kernel/debug/page_owner > page_owner_full.txt
* ./page_owner_sort page_owner_full.txt sorted_page_owner.txt
+ * Or sort by total memory:
+ * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt
*
* See Documentation/vm/page_owner.rst
*/
@@ -16,14 +18,18 @@
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
+#include <regex.h>
+#include <errno.h>
struct block_list {
char *txt;
int len;
int num;
+ int page_num;
};
-
+static int sort_by_memory;
+static regex_t order_pattern;
static struct block_list *list;
static int list_size;
static int max_size;
@@ -59,12 +65,50 @@ static int compare_num(const void *p1, const void *p2)
return l2->num - l1->num;
}
+static int compare_page_num(const void *p1, const void *p2)
+{
+ const struct block_list *l1 = p1, *l2 = p2;
+
+ return l2->page_num - l1->page_num;
+}
+
+static int get_page_num(char *buf)
+{
+ int err, val_len, order_val;
+ char order_str[4] = {0};
+ char *endptr;
+ regmatch_t pmatch[2];
+
+ err = regexec(&order_pattern, buf, 2, pmatch, REG_NOTBOL);
+ if (err != 0 || pmatch[1].rm_so == -1) {
+ printf("no order pattern in %s\n", buf);
+ return 0;
+ }
+ val_len = pmatch[1].rm_eo - pmatch[1].rm_so;
+ if (val_len > 2) /* max_order should not exceed 2 digits */
+ goto wrong_order;
+
+ memcpy(order_str, buf + pmatch[1].rm_so, val_len);
+
+ errno = 0;
+ order_val = strtol(order_str, &endptr, 10);
+ if (errno != 0 || endptr == order_str || *endptr != '\0')
+ goto wrong_order;
+
+ return 1 << order_val;
+
+wrong_order:
+ printf("wrong order in follow buf:\n%s\n", buf);
+ return 0;
+}
+
static void add_list(char *buf, int len)
{
if (list_size != 0 &&
len == list[list_size-1].len &&
memcmp(buf, list[list_size-1].txt, len) == 0) {
list[list_size-1].num++;
+ list[list_size-1].page_num += get_page_num(buf);
return;
}
if (list_size == max_size) {
@@ -74,6 +118,7 @@ static void add_list(char *buf, int len)
list[list_size].txt = malloc(len+1);
list[list_size].len = len;
list[list_size].num = 1;
+ list[list_size].page_num = get_page_num(buf);
memcpy(list[list_size].txt, buf, len);
list[list_size].txt[len] = 0;
list_size++;
@@ -85,6 +130,13 @@ static void add_list(char *buf, int len)
#define BUF_SIZE (128 * 1024)
+static void usage(void)
+{
+ printf("Usage: ./page_owner_sort [-m] <input> <output>\n"
+ "-m Sort by total memory. If this option is unset, sort by times\n"
+ );
+}
+
int main(int argc, char **argv)
{
FILE *fin, *fout;
@@ -92,21 +144,39 @@ int main(int argc, char **argv)
int ret, i, count;
struct block_list *list2;
struct stat st;
+ int err;
+ int opt;
- if (argc < 3) {
- printf("Usage: ./program <input> <output>\n");
- perror("open: ");
+ while ((opt = getopt(argc, argv, "m")) != -1)
+ switch (opt) {
+ case 'm':
+ sort_by_memory = 1;
+ break;
+ default:
+ usage();
+ exit(1);
+ }
+
+ if (optind >= (argc - 1)) {
+ usage();
exit(1);
}
- fin = fopen(argv[1], "r");
- fout = fopen(argv[2], "w");
+ fin = fopen(argv[optind], "r");
+ fout = fopen(argv[optind + 1], "w");
if (!fin || !fout) {
- printf("Usage: ./program <input> <output>\n");
+ usage();
perror("open: ");
exit(1);
}
+ err = regcomp(&order_pattern, "order\\s*([0-9]*),", REG_EXTENDED|REG_NEWLINE);
+ if (err != 0 || order_pattern.re_nsub != 1) {
+ printf("%s: Invalid pattern 'order\\s*([0-9]*),' code %d\n",
+ argv[0], err);
+ exit(1);
+ }
+
fstat(fileno(fin), &st);
max_size = st.st_size / 100; /* hack ... */
@@ -145,13 +215,19 @@ int main(int argc, char **argv)
list2[count++] = list[i];
} else {
list2[count-1].num += list[i].num;
+ list2[count-1].page_num += list[i].page_num;
}
}
- qsort(list2, count, sizeof(list[0]), compare_num);
+ if (sort_by_memory)
+ qsort(list2, count, sizeof(list[0]), compare_page_num);
+ else
+ qsort(list2, count, sizeof(list[0]), compare_num);
for (i = 0; i < count; i++)
- fprintf(fout, "%d times:\n%s\n", list2[i].num, list2[i].txt);
+ fprintf(fout, "%d times, %d pages:\n%s\n",
+ list2[i].num, list2[i].page_num, list2[i].txt);
+ regfree(&order_pattern);
return 0;
}
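For reference, the order parsing added in get_page_num() boils down to a single POSIX regex capture. A tiny stand-alone user-space illustration (the sample line is made up to resemble page_owner output, and the portable [[:space:]] class is used here instead of the patch's \s escape):

#include <regex.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	static const char line[] = "Page allocated via order 3, mask 0x1052c0";
	regex_t re;
	regmatch_t m[2];
	char digits[4] = {0};

	if (regcomp(&re, "order[[:space:]]*([0-9]*),", REG_EXTENDED | REG_NEWLINE))
		return 1;
	if (regexec(&re, line, 2, m, 0) == 0 && m[1].rm_so != -1) {
		/* Copy the captured digits and convert order -> page count. */
		snprintf(digits, sizeof(digits), "%.*s",
			 (int)(m[1].rm_eo - m[1].rm_so), line + m[1].rm_so);
		printf("order %s -> %d pages\n", digits, 1 << atoi(digits));
	}
	regfree(&re);
	return 0;
}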