diff options

author:    Andrew Morton <akpm@linux-foundation.org>  2024-04-04 14:35:15 -0700
committer: Andrew Morton <akpm@linux-foundation.org>  2024-04-04 14:35:15 -0700
commit:    bca1a38142b0fdc6fa41a32a72e1e0e86ab7b8c5 (patch)
tree:      243c90a940649bff869556428fa0cb23d23dda8e
parent:    3cc903845731f4b389d53bfd4b54038cf1eec0cb (diff)
download:  25-new-bca1a38142b0fdc6fa41a32a72e1e0e86ab7b8c5.tar.gz

foo
17 files changed, 750 insertions, 4 deletions
diff --git a/patches/mmpage_owner-fix-accounting-of-pages-when-migrating.patch b/patches/mmpage_owner-fix-accounting-of-pages-when-migrating.patch
new file mode 100644
index 000000000..1a0122f7d
--- /dev/null
+++ b/patches/mmpage_owner-fix-accounting-of-pages-when-migrating.patch
@@ -0,0 +1,76 @@
+From: Oscar Salvador <osalvador@suse.de>
+Subject: mm,page_owner: fix accounting of pages when migrating
+Date: Thu, 4 Apr 2024 09:07:01 +0200
+
+Upon migration, newly allocated pages are given the handle of the old
+pages.  This is problematic because it means that for the stack which
+allocated the old page, we will be subtracting the old page + the new one
+when that page is freed, creating an accounting imbalance.
+
+There is an interest in keeping it that way, as otherwise the output
+would be biased towards migration stacks should those operations occur
+often, but that is not really helpful.
+
+The link from the new page to the old stack is performed by calling
+__update_page_owner_handle() in __folio_copy_owner().  The only thing
+that is left is to link the migrate stack to the old page, so the old
+page will be subtracted from the migrate stack, thereby avoiding any
+possible imbalance.
+
+Link: https://lkml.kernel.org/r/20240404070702.2744-4-osalvador@suse.de
+Fixes: 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count")
+Signed-off-by: Oscar Salvador <osalvador@suse.de>
+Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
+Cc: Andrey Konovalov <andreyknvl@gmail.com>
+Cc: Marco Elver <elver@google.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ mm/page_owner.c |   15 +++++++++++++++
+ 1 file changed, 15 insertions(+)
+
+--- a/mm/page_owner.c~mmpage_owner-fix-accounting-of-pages-when-migrating
++++ a/mm/page_owner.c
+@@ -366,9 +366,12 @@ void __split_page_owner(struct page *pag
+ 
+ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
+ {
++	int i;
+ 	struct page_ext *old_ext;
+ 	struct page_ext *new_ext;
+ 	struct page_owner *old_page_owner;
++	struct page_owner *new_page_owner;
++	depot_stack_handle_t migrate_handle;
+ 
+ 	old_ext = page_ext_get(&old->page);
+ 	if (unlikely(!old_ext))
+@@ -381,6 +384,8 @@ void __folio_copy_owner(struct folio *ne
+ 	}
+ 
+ 	old_page_owner = get_page_owner(old_ext);
++	new_page_owner = get_page_owner(new_ext);
++	migrate_handle = new_page_owner->handle;
+ 	__update_page_owner_handle(new_ext, old_page_owner->handle,
+ 			old_page_owner->order, old_page_owner->gfp_mask,
+ 			old_page_owner->last_migrate_reason,
+@@ -395,6 +400,16 @@ void __folio_copy_owner(struct folio *ne
+ 			old_page_owner->free_pid,
+ 			old_page_owner->free_tgid,
+ 			old_page_owner->free_ts_nsec);
++	/*
++	 * We linked the original stack to the new folio, we need to do the
++	 * same for the new one and the old folio, otherwise there will be
++	 * an imbalance when subtracting those pages from the stack.
++	 */
++	for (i = 0; i < (1 << new_page_owner->order); i++) {
++		old_page_owner->handle = migrate_handle;
++		old_ext = page_ext_next(old_ext);
++		old_page_owner = get_page_owner(old_ext);
++	}
+ 
+ 	page_ext_put(new_ext);
+ 	page_ext_put(old_ext);
+_
diff --git a/patches/mmpage_owner-fix-printing-of-stack-records.patch b/patches/mmpage_owner-fix-printing-of-stack-records.patch
new file mode 100644
index 000000000..81ffb34ee
--- /dev/null
+++ b/patches/mmpage_owner-fix-printing-of-stack-records.patch
@@ -0,0 +1,48 @@
+From: Oscar Salvador <osalvador@suse.de>
+Subject: mm,page_owner: fix printing of stack records
+Date: Thu, 4 Apr 2024 09:07:02 +0200
+
+When seq_* code sees that its buffer overflowed, it re-allocates a bigger
+one and calls the seq_operations->start() callback again.  stack_start()
+naively thought that if it got called again, it meant that the old record
+had already been printed, so it returned the next object, but that is not
+true.
+
+The consequence of that is that every time stack_stop() -> stack_start()
+are called because we needed a bigger buffer, stack_start() will skip
+entries, and those will not be printed.
+
+Fix it by not advancing to the next object in stack_start().
+
+Link: https://lkml.kernel.org/r/20240404070702.2744-5-osalvador@suse.de
+Fixes: 765973a09803 ("mm,page_owner: display all stacks and their count")
+Signed-off-by: Oscar Salvador <osalvador@suse.de>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
+Cc: Andrey Konovalov <andreyknvl@gmail.com>
+Cc: Marco Elver <elver@google.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ mm/page_owner.c |    4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+--- a/mm/page_owner.c~mmpage_owner-fix-printing-of-stack-records
++++ a/mm/page_owner.c
+@@ -872,13 +872,11 @@ static void *stack_start(struct seq_file
+ 		 * value of stack_list.
+ 		 */
+ 		stack = smp_load_acquire(&stack_list);
++		m->private = stack;
+ 	} else {
+ 		stack = m->private;
+-		stack = stack->next;
+ 	}
+ 
+-	m->private = stack;
+-
+ 	return stack;
+ }
+ 
+_
diff --git a/patches/mmpage_owner-fix-refcount-imbalance.patch b/patches/mmpage_owner-fix-refcount-imbalance.patch
new file mode 100644
index 000000000..7780a1956
--- /dev/null
+++ b/patches/mmpage_owner-fix-refcount-imbalance.patch
@@ -0,0 +1,233 @@
+From: Oscar Salvador <osalvador@suse.de>
+Subject: mm,page_owner: fix refcount imbalance
+Date: Thu, 4 Apr 2024 09:07:00 +0200
+
+Current code does not contemplate scenarios where an allocation and a
+free operation on the same pages do not handle the same number of pages
+at once.  To give an example, page_alloc_exact(), where we will allocate
+a page of an order large enough to satisfy the size request, but we will
+free the remainder right away.
+
+In the above example, we will increment the stack_record refcount only
+once, but we will decrease it the same number of times as the number of
+unused pages we have to free.  This will lead to a warning because of
+refcount imbalance.
+
+Fix this by recording the number of base pages in the refcount field.
+
+Link: https://lkml.kernel.org/r/20240404070702.2744-3-osalvador@suse.de
+Reported-by: syzbot+41bbfdb8d41003d12c0f@syzkaller.appspotmail.com
+Closes: https://lore.kernel.org/linux-mm/00000000000090e8ff0613eda0e5@google.com
+Fixes: 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count")
+Signed-off-by: Oscar Salvador <osalvador@suse.de>
+Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
+Tested-by: Alexandre Ghiti <alexghiti@rivosinc.com>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Andrey Konovalov <andreyknvl@gmail.com>
+Cc: Marco Elver <elver@google.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ Documentation/mm/page_owner.rst |   73 +++++++++++++++---------------
+ mm/page_owner.c                 |   34 ++++++++-----
+ 2 files changed, 58 insertions(+), 49 deletions(-)
+
+--- a/Documentation/mm/page_owner.rst~mmpage_owner-fix-refcount-imbalance
++++ a/Documentation/mm/page_owner.rst
+@@ -24,10 +24,10 @@ fragmentation statistics can be obtained
+ each page.  It is already implemented and activated if page owner is
+ enabled.  Other usages are more than welcome.
+ 
+-It can also be used to show all the stacks and their outstanding
+-allocations, which gives us a quick overview of where the memory is going
+-without the need to screen through all the pages and match the allocation
+-and free operation.
++It can also be used to show all the stacks and their current number of
++allocated base pages, which gives us a quick overview of where the memory
++is going without the need to screen through all the pages and match the
++allocation and free operation.
+ 
+ page owner is disabled by default. So, if you'd like to use it, you need
+ to add "page_owner=on" to your boot cmdline. If the kernel is built
+@@ -75,42 +75,45 @@ Usage
+ 
+    cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt
+    cat stacks.txt
+-     prep_new_page+0xa9/0x120
+-     get_page_from_freelist+0x7e6/0x2140
+-     __alloc_pages+0x18a/0x370
+-     new_slab+0xc8/0x580
+-     ___slab_alloc+0x1f2/0xaf0
+-     __slab_alloc.isra.86+0x22/0x40
+-     kmem_cache_alloc+0x31b/0x350
+-     __khugepaged_enter+0x39/0x100
+-     dup_mmap+0x1c7/0x5ce
+-     copy_process+0x1afe/0x1c90
+-     kernel_clone+0x9a/0x3c0
+-     __do_sys_clone+0x66/0x90
+-     do_syscall_64+0x7f/0x160
+-     entry_SYSCALL_64_after_hwframe+0x6c/0x74
+-     stack_count: 234
++     post_alloc_hook+0x177/0x1a0
++     get_page_from_freelist+0xd01/0xd80
++     __alloc_pages+0x39e/0x7e0
++     allocate_slab+0xbc/0x3f0
++     ___slab_alloc+0x528/0x8a0
++     kmem_cache_alloc+0x224/0x3b0
++     sk_prot_alloc+0x58/0x1a0
++     sk_alloc+0x32/0x4f0
++     inet_create+0x427/0xb50
++     __sock_create+0x2e4/0x650
++     inet_ctl_sock_create+0x30/0x180
++     igmp_net_init+0xc1/0x130
++     ops_init+0x167/0x410
++     setup_net+0x304/0xa60
++     copy_net_ns+0x29b/0x4a0
++     create_new_namespaces+0x4a1/0x820
++     nr_base_pages: 16
+ ...
+ ...
+    echo 7000 > /sys/kernel/debug/page_owner_stacks/count_threshold
+    cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks_7000.txt
+    cat stacks_7000.txt
+-     prep_new_page+0xa9/0x120
+-     get_page_from_freelist+0x7e6/0x2140
+-     __alloc_pages+0x18a/0x370
+-     alloc_pages_mpol+0xdf/0x1e0
+-     folio_alloc+0x14/0x50
+-     filemap_alloc_folio+0xb0/0x100
+-     page_cache_ra_unbounded+0x97/0x180
+-     filemap_fault+0x4b4/0x1200
+-     __do_fault+0x2d/0x110
+-     do_pte_missing+0x4b0/0xa30
+-     __handle_mm_fault+0x7fa/0xb70
+-     handle_mm_fault+0x125/0x300
+-     do_user_addr_fault+0x3c9/0x840
+-     exc_page_fault+0x68/0x150
+-     asm_exc_page_fault+0x22/0x30
+-     stack_count: 8248
++     post_alloc_hook+0x177/0x1a0
++     get_page_from_freelist+0xd01/0xd80
++     __alloc_pages+0x39e/0x7e0
++     alloc_pages_mpol+0x22e/0x490
++     folio_alloc+0xd5/0x110
++     filemap_alloc_folio+0x78/0x230
++     page_cache_ra_order+0x287/0x6f0
++     filemap_get_pages+0x517/0x1160
++     filemap_read+0x304/0x9f0
++     xfs_file_buffered_read+0xe6/0x1d0 [xfs]
++     xfs_file_read_iter+0x1f0/0x380 [xfs]
++     __kernel_read+0x3b9/0x730
++     kernel_read_file+0x309/0x4d0
++     __do_sys_finit_module+0x381/0x730
++     do_syscall_64+0x8d/0x150
++     entry_SYSCALL_64_after_hwframe+0x62/0x6a
++     nr_base_pages: 20824
+ ...
+ 
+    cat /sys/kernel/debug/page_owner > page_owner_full.txt
+--- a/mm/page_owner.c~mmpage_owner-fix-refcount-imbalance
++++ a/mm/page_owner.c
+@@ -196,7 +196,8 @@ static void add_stack_record_to_list(str
+ 	spin_unlock_irqrestore(&stack_list_lock, flags);
+ }
+ 
+-static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask)
++static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask,
++				   int nr_base_pages)
+ {
+ 	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
+ 
+@@ -217,15 +218,20 @@ static void inc_stack_record_count(depot
+ 		/* Add the new stack_record to our list */
+ 		add_stack_record_to_list(stack_record, gfp_mask);
+ 	}
+-	refcount_inc(&stack_record->count);
++	refcount_add(nr_base_pages, &stack_record->count);
+ }
+ 
+-static void dec_stack_record_count(depot_stack_handle_t handle)
++static void dec_stack_record_count(depot_stack_handle_t handle,
++				   int nr_base_pages)
+ {
+ 	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
+ 
+-	if (stack_record)
+-		refcount_dec(&stack_record->count);
++	if (!stack_record)
++		return;
++
++	if (refcount_sub_and_test(nr_base_pages, &stack_record->count))
++		pr_warn("%s: refcount went to 0 for %u handle\n", __func__,
++			handle);
+ }
+ 
+ static inline void __update_page_owner_handle(struct page_ext *page_ext,
+@@ -306,7 +312,7 @@ void __reset_page_owner(struct page *pag
+ 	 * the machinery is not ready yet, we cannot decrement
+ 	 * their refcount either.
+ 	 */
+-	dec_stack_record_count(alloc_handle);
++	dec_stack_record_count(alloc_handle, 1 << order);
+ }
+ 
+ noinline void __set_page_owner(struct page *page, unsigned short order,
+@@ -325,7 +331,7 @@ noinline void __set_page_owner(struct pa
+ 			       current->pid, current->tgid, ts_nsec,
+ 			       current->comm);
+ 	page_ext_put(page_ext);
+-	inc_stack_record_count(handle, gfp_mask);
++	inc_stack_record_count(handle, gfp_mask, 1 << order);
+ }
+ 
+ void __set_page_owner_migrate_reason(struct page *page, int reason)
+@@ -872,11 +878,11 @@ static void *stack_next(struct seq_file
+ 	return stack;
+ }
+ 
+-static unsigned long page_owner_stack_threshold;
++static unsigned long page_owner_pages_threshold;
+ 
+ static int stack_print(struct seq_file *m, void *v)
+ {
+-	int i, stack_count;
++	int i, nr_base_pages;
+ 	struct stack *stack = v;
+ 	unsigned long *entries;
+ 	unsigned long nr_entries;
+@@ -887,14 +893,14 @@ static int stack_print(struct seq_file *
+ 
+ 	nr_entries = stack_record->size;
+ 	entries = stack_record->entries;
+-	stack_count = refcount_read(&stack_record->count) - 1;
++	nr_base_pages = refcount_read(&stack_record->count) - 1;
+ 
+-	if (stack_count < 1 || stack_count < page_owner_stack_threshold)
++	if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold)
+ 		return 0;
+ 
+ 	for (i = 0; i < nr_entries; i++)
+ 		seq_printf(m, " %pS\n", (void *)entries[i]);
+-	seq_printf(m, "stack_count: %d\n\n", stack_count);
++	seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages);
+ 
+ 	return 0;
+ }
+@@ -924,13 +930,13 @@ static const struct file_operations page
+ 
+ static int page_owner_threshold_get(void *data, u64 *val)
+ {
+-	*val = READ_ONCE(page_owner_stack_threshold);
++	*val = READ_ONCE(page_owner_pages_threshold);
+ 	return 0;
+ }
+ 
+ static int page_owner_threshold_set(void *data, u64 val)
+ {
+-	WRITE_ONCE(page_owner_stack_threshold, val);
++	WRITE_ONCE(page_owner_pages_threshold, val);
+ 	return 0;
+ }
+ 
+_
diff --git a/patches/mmpage_owner-update-metadata-for-tail-pages.patch b/patches/mmpage_owner-update-metadata-for-tail-pages.patch
new file mode 100644
index 000000000..1df730a5c
--- /dev/null
+++ b/patches/mmpage_owner-update-metadata-for-tail-pages.patch
@@ -0,0 +1,253 @@
+From: Oscar Salvador <osalvador@suse.de>
+Subject: mm,page_owner: update metadata for tail pages
+Date: Thu, 4 Apr 2024 09:06:59 +0200
+
+Patch series "page_owner: Fix refcount imbalance and print fixup", v4.
+
+This series consists of a refactoring/correctness fix for updating the
+metadata of tail pages, a couple of fixups for the refcounting part, and
+a fixup for the stack_start() function.
+
+From this series on, instead of counting the stacks, we count the
+outstanding nr_base_pages each stack has, which gives us a much better
+memory overview.  The other fixup is for the migration part.
+
+A more detailed explanation can be found in the changelog of the
+respective patches.
+
+
+This patch (of 4):
+
+__set_page_owner_handle() and __reset_page_owner() update the metadata of
+all pages when the page is of a higher order, but we fail to do the same
+when the pages are migrated.  __folio_copy_owner() only updates the
+metadata of the head page, meaning that the information stored in the
+first page and the tail pages will not match.
+
+Strictly speaking that is not a big problem because 1) we do not print
+tail pages and 2) upon splitting all tail pages will inherit the metadata
+of the head page, but it is better to keep all the metadata consistent
+should there be any problem, as it can ease debugging.
+
+For that purpose, a couple of helpers are created:
+__update_page_owner_handle(), which updates the metadata on allocation,
+and __update_page_owner_free_handle(), which does the same when the page
+is freed.
+
+__folio_copy_owner() will make use of both, as it needs to entirely
+replace the page_owner metadata for the new page.
+
+Link: https://lkml.kernel.org/r/20240404070702.2744-1-osalvador@suse.de
+Link: https://lkml.kernel.org/r/20240404070702.2744-2-osalvador@suse.de
+Signed-off-by: Oscar Salvador <osalvador@suse.de>
+Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
+Cc: Andrey Konovalov <andreyknvl@gmail.com>
+Cc: Marco Elver <elver@google.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ mm/page_owner.c |  137 ++++++++++++++++++++++++----------------------
+ 1 file changed, 74 insertions(+), 63 deletions(-)
+
+--- a/mm/page_owner.c~mmpage_owner-update-metadata-for-tail-pages
++++ a/mm/page_owner.c
+@@ -228,9 +228,58 @@ static void dec_stack_record_count(depot
+ 		refcount_dec(&stack_record->count);
+ }
+ 
+-void __reset_page_owner(struct page *page, unsigned short order)
++static inline void __update_page_owner_handle(struct page_ext *page_ext,
++					      depot_stack_handle_t handle,
++					      unsigned short order,
++					      gfp_t gfp_mask,
++					      short last_migrate_reason, u64 ts_nsec,
++					      pid_t pid, pid_t tgid, char *comm)
+ {
+ 	int i;
++	struct page_owner *page_owner;
++
++	for (i = 0; i < (1 << order); i++) {
++		page_owner = get_page_owner(page_ext);
++		page_owner->handle = handle;
++		page_owner->order = order;
++		page_owner->gfp_mask = gfp_mask;
++		page_owner->last_migrate_reason = last_migrate_reason;
++		page_owner->pid = pid;
++		page_owner->tgid = tgid;
++		page_owner->ts_nsec = ts_nsec;
++		strscpy(page_owner->comm, comm,
++			sizeof(page_owner->comm));
++		__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
++		__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
++		page_ext = page_ext_next(page_ext);
++	}
++}
++
++static inline void __update_page_owner_free_handle(struct page_ext *page_ext,
++						   depot_stack_handle_t handle,
++						   unsigned short order,
++						   pid_t pid, pid_t tgid,
++						   u64 free_ts_nsec)
++{
++	int i;
++	struct page_owner *page_owner;
++
++	for (i = 0; i < (1 << order); i++) {
++		page_owner = get_page_owner(page_ext);
++		/* Only __reset_page_owner() wants to clear the bit */
++		if (handle) {
++			__clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
++			page_owner->free_handle = handle;
++		}
++		page_owner->free_ts_nsec = free_ts_nsec;
++		page_owner->free_pid = current->pid;
++		page_owner->free_tgid = current->tgid;
++		page_ext = page_ext_next(page_ext);
++	}
++}
++
++void __reset_page_owner(struct page *page, unsigned short order)
++{
+ 	struct page_ext *page_ext;
+ 	depot_stack_handle_t handle;
+ 	depot_stack_handle_t alloc_handle;
+@@ -245,16 +294,10 @@ void __reset_page_owner(struct page *pag
+ 	alloc_handle = page_owner->handle;
+ 
+ 	handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
+-	for (i = 0; i < (1 << order); i++) {
+-		__clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+-		page_owner->free_handle = handle;
+-		page_owner->free_ts_nsec = free_ts_nsec;
+-		page_owner->free_pid = current->pid;
+-		page_owner->free_tgid = current->tgid;
+-		page_ext = page_ext_next(page_ext);
+-		page_owner = get_page_owner(page_ext);
+-	}
++	__update_page_owner_free_handle(page_ext, handle, order, current->pid,
++					current->tgid, free_ts_nsec);
+ 	page_ext_put(page_ext);
++
+ 	if (alloc_handle != early_handle)
+ 		/*
+ 		 * early_handle is being set as a handle for all those
+@@ -266,36 +309,11 @@ void __reset_page_owner(struct page *pag
+ 	dec_stack_record_count(alloc_handle);
+ }
+ 
+-static inline void __set_page_owner_handle(struct page_ext *page_ext,
+-					   depot_stack_handle_t handle,
+-					   unsigned short order, gfp_t gfp_mask)
+-{
+-	struct page_owner *page_owner;
+-	int i;
+-	u64 ts_nsec = local_clock();
+-
+-	for (i = 0; i < (1 << order); i++) {
+-		page_owner = get_page_owner(page_ext);
+-		page_owner->handle = handle;
+-		page_owner->order = order;
+-		page_owner->gfp_mask = gfp_mask;
+-		page_owner->last_migrate_reason = -1;
+-		page_owner->pid = current->pid;
+-		page_owner->tgid = current->tgid;
+-		page_owner->ts_nsec = ts_nsec;
+-		strscpy(page_owner->comm, current->comm,
+-			sizeof(page_owner->comm));
+-		__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+-		__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+-
+-		page_ext = page_ext_next(page_ext);
+-	}
+-}
+-
+ noinline void __set_page_owner(struct page *page, unsigned short order,
+ 			       gfp_t gfp_mask)
+ {
+ 	struct page_ext *page_ext;
++	u64 ts_nsec = local_clock();
+ 	depot_stack_handle_t handle;
+ 
+ 	handle = save_stack(gfp_mask);
+@@ -303,7 +321,9 @@ noinline void __set_page_owner(struct pa
+ 	page_ext = page_ext_get(page);
+ 	if (unlikely(!page_ext))
+ 		return;
+-	__set_page_owner_handle(page_ext, handle, order, gfp_mask);
++	__update_page_owner_handle(page_ext, handle, order, gfp_mask, -1,
++				   current->pid, current->tgid, ts_nsec,
++				   current->comm);
+ 	page_ext_put(page_ext);
+ 	inc_stack_record_count(handle, gfp_mask);
+ }
+@@ -342,7 +362,7 @@ void __folio_copy_owner(struct folio *ne
+ {
+ 	struct page_ext *old_ext;
+ 	struct page_ext *new_ext;
+-	struct page_owner *old_page_owner, *new_page_owner;
++	struct page_owner *old_page_owner;
+ 
+ 	old_ext = page_ext_get(&old->page);
+ 	if (unlikely(!old_ext))
+@@ -355,31 +375,21 @@ void __folio_copy_owner(struct folio *ne
+ 	}
+ 
+ 	old_page_owner = get_page_owner(old_ext);
+-	new_page_owner = get_page_owner(new_ext);
+-	new_page_owner->order = old_page_owner->order;
+-	new_page_owner->gfp_mask = old_page_owner->gfp_mask;
+-	new_page_owner->last_migrate_reason =
+-		old_page_owner->last_migrate_reason;
+-	new_page_owner->handle = old_page_owner->handle;
+-	new_page_owner->pid = old_page_owner->pid;
+-	new_page_owner->tgid = old_page_owner->tgid;
+-	new_page_owner->free_pid = old_page_owner->free_pid;
+-	new_page_owner->free_tgid = old_page_owner->free_tgid;
+-	new_page_owner->ts_nsec = old_page_owner->ts_nsec;
+-	new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
+-	strcpy(new_page_owner->comm, old_page_owner->comm);
+-
++	__update_page_owner_handle(new_ext, old_page_owner->handle,
++				   old_page_owner->order, old_page_owner->gfp_mask,
++				   old_page_owner->last_migrate_reason,
++				   old_page_owner->ts_nsec, old_page_owner->pid,
++				   old_page_owner->tgid, old_page_owner->comm);
+ 	/*
+-	 * We don't clear the bit on the old folio as it's going to be freed
+-	 * after migration. Until then, the info can be useful in case of
+-	 * a bug, and the overall stats will be off a bit only temporarily.
+-	 * Also, migrate_misplaced_transhuge_page() can still fail the
+-	 * migration and then we want the old folio to retain the info. But
+-	 * in that case we also don't need to explicitly clear the info from
+-	 * the new page, which will be freed.
++	 * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio
++	 * will be freed after migration. Keep them until then as they may be
++	 * useful.
+ 	 */
+-	__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
+-	__set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
++	__update_page_owner_free_handle(new_ext, 0, old_page_owner->order,
++					old_page_owner->free_pid,
++					old_page_owner->free_tgid,
++					old_page_owner->free_ts_nsec);
++
+ 	page_ext_put(new_ext);
+ 	page_ext_put(old_ext);
+ }
+@@ -787,8 +797,9 @@ static void init_pages_in_zone(pg_data_t
+ 			goto ext_put_continue;
+ 
+ 		/* Found early allocated page */
+-		__set_page_owner_handle(page_ext, early_handle,
+-					0, 0);
++		__update_page_owner_handle(page_ext, early_handle, 0, 0,
++					   -1, local_clock(), current->pid,
++					   current->tgid, current->comm);
+ 		count++;
+ ext_put_continue:
+ 		page_ext_put(page_ext);
+_
diff --git a/patches/dax-busc-replace-warn_on_once-with-lockdep-asserts.patch b/patches/old/dax-busc-replace-warn_on_once-with-lockdep-asserts.patch
index 7fa13e59d..7fa13e59d 100644
--- a/patches/dax-busc-replace-warn_on_once-with-lockdep-asserts.patch
+++ b/patches/old/dax-busc-replace-warn_on_once-with-lockdep-asserts.patch
diff --git a/pc/dax-busc-replace-warn_on_once-with-lockdep-asserts.pc b/pc/dax-busc-replace-warn_on_once-with-lockdep-asserts.pc
deleted file mode 100644
index cb1dd57cb..000000000
--- a/pc/dax-busc-replace-warn_on_once-with-lockdep-asserts.pc
+++ /dev/null
@@ -1 +0,0 @@
-drivers/dax/bus.c
diff --git a/pc/devel-series b/pc/devel-series
index 8c811a172..aef4e3b16 100644
--- a/pc/devel-series
+++ b/pc/devel-series
@@ -72,6 +72,11 @@ stackdepot-rename-pool_index-to-pool_index_plus_1.patch
 #userfaultfd-change-src_folio-after-ensuring-its-unpinned-in-uffdio_move.patch: acks?
 userfaultfd-change-src_folio-after-ensuring-its-unpinned-in-uffdio_move.patch
 #
+mmpage_owner-update-metadata-for-tail-pages.patch
+mmpage_owner-fix-refcount-imbalance.patch
+mmpage_owner-fix-accounting-of-pages-when-migrating.patch
+mmpage_owner-fix-printing-of-stack-records.patch
+#
 ### hfe
 #
 #ENDBRANCH mm-hotfixes-unstable
@@ -455,7 +460,6 @@ mm-gup-consistently-name-gup-fast-functions.patch
 mm-treewide-rename-config_have_fast_gup-to-config_have_gup_fast.patch
 mm-use-gup-fast-instead-fast-gup-in-remaining-comments.patch
 #
-dax-busc-replace-warn_on_once-with-lockdep-asserts.patch
 #
 #mm-ksm-remove-redundant-code-in-ksm_fork.patch: Stefan?
 mm-ksm-remove-redundant-code-in-ksm_fork.patch
@@ -468,7 +472,6 @@ hugetlb-convert-hugetlb_wp-to-use-struct-vm_fault.patch
 selftests-break-the-dependency-upon-local-header-files.patch
 selftests-mm-fix-additional-build-errors-for-selftests.patch
 #
-#mm-cma-drop-incorrect-alignment-check-in-cma_init_reserved_mem.patch+1: effects? -stable?
 mm-cma-drop-incorrect-alignment-check-in-cma_init_reserved_mem.patch
 #mm-hugetlb-pass-correct-order_per_bit-to-cma_declare_contiguous_nid.patch: https://lkml.kernel.org/r/e74cfee3-565f-4c69-bb7b-bdd40d01d212@redhat.com
 mm-hugetlb-pass-correct-order_per_bit-to-cma_declare_contiguous_nid.patch
diff --git a/pc/mmpage_owner-fix-accounting-of-pages-when-migrating.pc b/pc/mmpage_owner-fix-accounting-of-pages-when-migrating.pc
new file mode 100644
index 000000000..89fe6a5de
--- /dev/null
+++ b/pc/mmpage_owner-fix-accounting-of-pages-when-migrating.pc
@@ -0,0 +1 @@
+mm/page_owner.c
diff --git a/pc/mmpage_owner-fix-printing-of-stack-records.pc b/pc/mmpage_owner-fix-printing-of-stack-records.pc
new file mode 100644
index 000000000..89fe6a5de
--- /dev/null
+++ b/pc/mmpage_owner-fix-printing-of-stack-records.pc
@@ -0,0 +1 @@
+mm/page_owner.c
diff --git a/pc/mmpage_owner-fix-refcount-imbalance.pc b/pc/mmpage_owner-fix-refcount-imbalance.pc
new file mode 100644
index 000000000..5a9efe536
--- /dev/null
+++ b/pc/mmpage_owner-fix-refcount-imbalance.pc
@@ -0,0 +1,2 @@
+Documentation/mm/page_owner.rst
+mm/page_owner.c
diff --git a/pc/mmpage_owner-update-metadata-for-tail-pages.pc b/pc/mmpage_owner-update-metadata-for-tail-pages.pc
new file mode 100644
index 000000000..89fe6a5de
--- /dev/null
+++ b/pc/mmpage_owner-update-metadata-for-tail-pages.pc
@@ -0,0 +1 @@
+mm/page_owner.c
diff --git a/txt/mm-hugetlb-pass-correct-order_per_bit-to-cma_declare_contiguous_nid.txt b/txt/mm-hugetlb-pass-correct-order_per_bit-to-cma_declare_contiguous_nid.txt
index 3491f5271..abfb1269a 100644
--- a/txt/mm-hugetlb-pass-correct-order_per_bit-to-cma_declare_contiguous_nid.txt
+++ b/txt/mm-hugetlb-pass-correct-order_per_bit-to-cma_declare_contiguous_nid.txt
@@ -14,7 +14,7 @@ So, correctly pass in the order instead.
 Link: https://lkml.kernel.org/r/20240404162515.527802-2-fvdl@google.com
 Fixes: cf11e85fc08c ("mm: hugetlb: optionally allocate gigantic hugepages using cma")
 Signed-off-by: Frank van der Linden <fvdl@google.com>
-Cc: Roman Gushchin <roman.gushchin@linux.dev>
+Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
 Cc: David Hildenbrand <david@redhat.com>
 Cc: Marek Szyprowski <m.szyprowski@samsung.com>
 Cc: Muchun Song <muchun.song@linux.dev>
diff --git a/txt/mmpage_owner-fix-accounting-of-pages-when-migrating.txt b/txt/mmpage_owner-fix-accounting-of-pages-when-migrating.txt
new file mode 100644
index 000000000..0d313e4eb
--- /dev/null
+++ b/txt/mmpage_owner-fix-accounting-of-pages-when-migrating.txt
@@ -0,0 +1,28 @@
+From: Oscar Salvador <osalvador@suse.de>
+Subject: mm,page_owner: fix accounting of pages when migrating
+Date: Thu, 4 Apr 2024 09:07:01 +0200
+
+Upon migration, newly allocated pages are given the handle of the old
+pages.  This is problematic because it means that for the stack which
+allocated the old page, we will be subtracting the old page + the new one
+when that page is freed, creating an accounting imbalance.
+
+There is an interest in keeping it that way, as otherwise the output
+would be biased towards migration stacks should those operations occur
+often, but that is not really helpful.
+
+The link from the new page to the old stack is performed by calling
+__update_page_owner_handle() in __folio_copy_owner().  The only thing
+that is left is to link the migrate stack to the old page, so the old
+page will be subtracted from the migrate stack, thereby avoiding any
+possible imbalance.
+
+Link: https://lkml.kernel.org/r/20240404070702.2744-4-osalvador@suse.de
+Fixes: 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count")
+Signed-off-by: Oscar Salvador <osalvador@suse.de>
+Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
+Cc: Andrey Konovalov <andreyknvl@gmail.com>
+Cc: Marco Elver <elver@google.com>
+Cc: Michal Hocko <mhocko@suse.com>
diff --git a/txt/mmpage_owner-fix-printing-of-stack-records.txt b/txt/mmpage_owner-fix-printing-of-stack-records.txt
new file mode 100644
index 000000000..1ef4874da
--- /dev/null
+++ b/txt/mmpage_owner-fix-printing-of-stack-records.txt
@@ -0,0 +1,24 @@
+From: Oscar Salvador <osalvador@suse.de>
+Subject: mm,page_owner: fix printing of stack records
+Date: Thu, 4 Apr 2024 09:07:02 +0200
+
+When seq_* code sees that its buffer overflowed, it re-allocates a bigger
+one and calls the seq_operations->start() callback again.  stack_start()
+naively thought that if it got called again, it meant that the old record
+had already been printed, so it returned the next object, but that is not
+true.
+
+The consequence of that is that every time stack_stop() -> stack_start()
+are called because we needed a bigger buffer, stack_start() will skip
+entries, and those will not be printed.
+
+Fix it by not advancing to the next object in stack_start().
+
+Link: https://lkml.kernel.org/r/20240404070702.2744-5-osalvador@suse.de
+Fixes: 765973a09803 ("mm,page_owner: display all stacks and their count")
+Signed-off-by: Oscar Salvador <osalvador@suse.de>
+Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
+Cc: Andrey Konovalov <andreyknvl@gmail.com>
+Cc: Marco Elver <elver@google.com>
+Cc: Michal Hocko <mhocko@suse.com>
diff --git a/txt/mmpage_owner-fix-refcount-imbalance.txt b/txt/mmpage_owner-fix-refcount-imbalance.txt
new file mode 100644
index 000000000..2f18c29bb
--- /dev/null
+++ b/txt/mmpage_owner-fix-refcount-imbalance.txt
@@ -0,0 +1,28 @@
+From: Oscar Salvador <osalvador@suse.de>
+Subject: mm,page_owner: fix refcount imbalance
+Date: Thu, 4 Apr 2024 09:07:00 +0200
+
+Current code does not contemplate scenarios where an allocation and a
+free operation on the same pages do not handle the same number of pages
+at once.  To give an example, page_alloc_exact(), where we will allocate
+a page of an order large enough to satisfy the size request, but we will
+free the remainder right away.
+
+In the above example, we will increment the stack_record refcount only
+once, but we will decrease it the same number of times as the number of
+unused pages we have to free.  This will lead to a warning because of
+refcount imbalance.
+
+Fix this by recording the number of base pages in the refcount field.
+
+Link: https://lkml.kernel.org/r/20240404070702.2744-3-osalvador@suse.de
+Reported-by: syzbot+41bbfdb8d41003d12c0f@syzkaller.appspotmail.com
+Closes: https://lore.kernel.org/linux-mm/00000000000090e8ff0613eda0e5@google.com
+Fixes: 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count")
+Signed-off-by: Oscar Salvador <osalvador@suse.de>
+Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
+Tested-by: Alexandre Ghiti <alexghiti@rivosinc.com>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Andrey Konovalov <andreyknvl@gmail.com>
+Cc: Marco Elver <elver@google.com>
+Cc: Michal Hocko <mhocko@suse.com>
diff --git a/txt/mmpage_owner-update-metadata-for-tail-pages.txt b/txt/mmpage_owner-update-metadata-for-tail-pages.txt
new file mode 100644
index 000000000..ae89e83fc
--- /dev/null
+++ b/txt/mmpage_owner-update-metadata-for-tail-pages.txt
@@ -0,0 +1,49 @@
+From: Oscar Salvador <osalvador@suse.de>
+Subject: mm,page_owner: update metadata for tail pages
+Date: Thu, 4 Apr 2024 09:06:59 +0200
+
+Patch series "page_owner: Fix refcount imbalance and print fixup", v4.
+
+This series consists of a refactoring/correctness fix for updating the
+metadata of tail pages, a couple of fixups for the refcounting part, and
+a fixup for the stack_start() function.
+
+From this series on, instead of counting the stacks, we count the
+outstanding nr_base_pages each stack has, which gives us a much better
+memory overview.  The other fixup is for the migration part.
+
+A more detailed explanation can be found in the changelog of the
+respective patches.
+
+
+This patch (of 4):
+
+__set_page_owner_handle() and __reset_page_owner() update the metadata of
+all pages when the page is of a higher order, but we fail to do the same
+when the pages are migrated.  __folio_copy_owner() only updates the
+metadata of the head page, meaning that the information stored in the
+first page and the tail pages will not match.
+
+Strictly speaking that is not a big problem because 1) we do not print
+tail pages and 2) upon splitting all tail pages will inherit the metadata
+of the head page, but it is better to keep all the metadata consistent
+should there be any problem, as it can ease debugging.
+
+For that purpose, a couple of helpers are created:
+__update_page_owner_handle(), which updates the metadata on allocation,
+and __update_page_owner_free_handle(), which does the same when the page
+is freed.
+
+__folio_copy_owner() will make use of both, as it needs to entirely
+replace the page_owner metadata for the new page.
+
+Link: https://lkml.kernel.org/r/20240404070702.2744-1-osalvador@suse.de
+Link: https://lkml.kernel.org/r/20240404070702.2744-2-osalvador@suse.de
+Signed-off-by: Oscar Salvador <osalvador@suse.de>
+Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
+Cc: Andrey Konovalov <andreyknvl@gmail.com>
+Cc: Marco Elver <elver@google.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Oscar Salvador <osalvador@suse.de>
diff --git a/txt/dax-busc-replace-warn_on_once-with-lockdep-asserts.txt b/txt/old/dax-busc-replace-warn_on_once-with-lockdep-asserts.txt
index 080af4bfa..080af4bfa 100644
--- a/txt/dax-busc-replace-warn_on_once-with-lockdep-asserts.txt
+++ b/txt/old/dax-busc-replace-warn_on_once-with-lockdep-asserts.txt