author    Andrea Arcangeli <aarcange@redhat.com>  2019-12-03 12:32:01 -0500
committer Andrea Arcangeli <aarcange@redhat.com>  2023-11-11 22:03:38 -0500
commit    78e8089e340ec80006b98c291d98dc8ac48f8bf6
tree      8b91c218cb6f4574356f049ade4f32cac7b4c030
parent    7ac6822587f2d182c178b4caac8739c15c919c0f
download  aa-78e8089e340ec80006b98c291d98dc8ac48f8bf6.tar.gz
arm64: tlb: skip tlbi broadcast
With multiple NUMA nodes and multiple sockets, the tlbi broadcast has to be
delivered through the interconnects, which in turn increases the CPU
interconnect traffic and the latency of the tlbi broadcast instruction. To
avoid the synchronous delivery of the tlbi broadcast before the tlbi
instruction can be retired, the hardware would need to implement a
replicated mm_cpumask bitflag for each ASID, and every CPU would need to
tell every other CPU which ASID it is loading: exactly what x86 does with
mm_cpumask in software.

Even within a single NUMA node, the latency of the tlbi broadcast
instruction increases almost linearly with the number of CPUs trying to
send tlbi broadcasts at the same time.

If only a single thread of the process is running and it runs on the CPU
issuing the TLB flush, or if no threads of the process are running at all,
we can achieve full SMP scalability in the arm64 TLB flushing by skipping
the tlbi broadcast.

After the local TLB flush, the ASID context goes out of sync on all CPUs
except the local one. This can be tracked in the per-mm cpumask: if the bit
is set, the ASID context is stale for that CPU. This results in an extra
local ASID TLB flush only when threads run on new CPUs after a TLB flush.

Skipping the tlbi broadcast is already implemented in
local_flush_tlb_all(); this patch only extends it to flush_tlb_mm(),
flush_tlb_range() and flush_tlb_page() too.

The benchmarks below were measured on a non-NUMA 32-CPU system (ARMv8
Ampere), so they should be far from a worst-case scenario: the enterprise
kernel config allows multiple NUMA nodes with NR_CPUS set by default to
4096.

=== stock ===

# cat for-each-cpu.sh
#!/bin/bash

for i in $(seq `nproc`); do
	"$@" &>/dev/null &
done
wait

# perf stat -r 10 -e dummy ./for-each-cpu.sh ./mprotect-threaded 10000
[..]
      2.1696 +- 0.0122 seconds time elapsed  ( +-  0.56% )

# perf stat -r 10 -e dummy ./for-each-cpu.sh ./gperftools/tcmalloc_large_heap_fragmentation_unittest
[..]
      0.99018 +- 0.00360 seconds time elapsed  ( +-  0.36% )

# cat sort-compute
#!/bin/bash

for x in `seq 256`; do
	for i in `seq 32`; do /usr/bin/sort </usr/bin/sort >/dev/null; done &
done
wait

# perf stat -r 10 -e dummy ./sort-compute
[..]
      1.8094 +- 0.0139 seconds time elapsed  ( +-  0.77% )
[..]

=== patch applied ===

# perf stat -r 10 -e dummy ./for-each-cpu.sh ./mprotect-threaded 10000
[..]
      0.13941 +- 0.00449 seconds time elapsed  ( +-  3.22% )

# perf stat -r 10 -e dummy ./for-each-cpu.sh ./gperftools/tcmalloc_large_heap_fragmentation_unittest
[..]
      0.90510 +- 0.00262 seconds time elapsed  ( +-  0.29% )

# perf stat -r 10 -e dummy ./sort-compute
[..]
      1.64025 +- 0.00618 seconds time elapsed  ( +-  0.38% )

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
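For readers skimming the diff, the new flush selection boils down to the
following condensed sketch (illustration only: the name flush_tlb_mm_sketch
is made up here, and the __tlbi_user() calls and the page/range variants
are omitted; the real code is in tlb_flush_check() and flush_tlb_mm() in
the diff below):

static inline void flush_tlb_mm_sketch(struct mm_struct *mm)
{
	unsigned long asid = __TLBI_VADDR(0, ASID(mm));

	switch (tlb_flush_check(mm, get_cpu())) {
	case TLB_FLUSH_LOCAL:
		/* Only this CPU runs the mm: a local (non-shareable)
		 * flush of the ASID is enough, no tlbi broadcast. */
		dsb(nshst);
		__tlbi(aside1, asid);
		dsb(nsh);
		fallthrough;
	case TLB_FLUSH_NO:
		/* No CPU runs the mm: nothing to flush now; the stale
		 * bits set in mm_cpumask() defer the flush until the mm
		 * is loaded again in check_and_switch_context(). */
		put_cpu();
		break;
	case TLB_FLUSH_BROADCAST:
		/* More than one CPU runs the mm: keep the old behaviour,
		 * an inner-shareable tlbi broadcast. */
		put_cpu();
		dsb(ishst);
		__tlbi(aside1is, asid);
		dsb(ish);
		break;
	}
}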
-rw-r--r--  arch/arm64/include/asm/efi.h            2
-rw-r--r--  arch/arm64/include/asm/mmu.h            4
-rw-r--r--  arch/arm64/include/asm/mmu_context.h   43
-rw-r--r--  arch/arm64/include/asm/tlbflush.h     224
-rw-r--r--  arch/arm64/mm/context.c                57
5 files changed, 269 insertions(+), 61 deletions(-)
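The mprotect-threaded program used in the numbers above is not included in
this patch. Purely as a hypothetical reconstruction of what such a
microbenchmark could look like (an extra, mostly-sleeping thread plus a
tight mprotect() loop, i.e. the single-running-thread case this patch
optimizes), it might be along these lines:

/*
 * Hypothetical sketch, not the actual benchmark source.
 * Build: gcc -O2 -pthread mprotect-threaded.c -o mprotect-threaded
 * Run:   ./mprotect-threaded 10000
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

static void *idle_thread(void *arg)
{
	(void)arg;
	pause();	/* keep a second, non-running thread in the mm */
	return NULL;
}

int main(int argc, char **argv)
{
	long i, iterations = argc > 1 ? atol(argv[1]) : 10000;
	long pagesz = sysconf(_SC_PAGESIZE);
	pthread_t tid;
	char *map;

	map = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	map[0] = 1;	/* fault the page in */

	pthread_create(&tid, NULL, idle_thread, NULL);

	for (i = 0; i < iterations; i++) {
		/* each permission change forces a TLB invalidation */
		mprotect(map, pagesz, PROT_READ);
		mprotect(map, pagesz, PROT_READ | PROT_WRITE);
	}
	return 0;
}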
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index 53cbbb96f7ebf..038ed1ec5d4ff 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -105,7 +105,7 @@ static inline void free_screen_info(struct screen_info *si)
static inline void efi_set_pgd(struct mm_struct *mm)
{
- __switch_mm(mm);
+ __switch_mm(mm, smp_processor_id());
if (system_uses_ttbr0_pan()) {
if (mm != current->active_mm) {
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 45d20ca986117..29a510c837472 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -24,6 +24,7 @@ typedef struct {
refcount_t pinned;
void *vdso;
unsigned long flags;
+ atomic_t nr_active_mm;
} mm_context_t;
/*
@@ -52,7 +53,8 @@ typedef struct {
* ensure that the page-table walker on CPU 1 *must* see the invalid PTE
* written by CPU 0.
*/
-#define ASID(mm) (atomic64_read(&(mm)->context.id) & 0xffff)
+#define __ASID(asid) ((asid) & 0xffff)
+#define ASID(mm) __ASID(atomic64_read(&(mm)->context.id))
static inline bool arm64_kernel_unmapped_at_el0(void)
{
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index f4ba93d4ffebe..9ecc6e2266c2e 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -164,6 +164,7 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
{
atomic64_set(&mm->context.id, 0);
refcount_set(&mm->context.pinned, 0);
+ atomic_set(&mm->context.nr_active_mm, 0);
return 0;
}
@@ -194,6 +195,23 @@ static inline void update_saved_ttbr0(struct task_struct *tsk,
static inline void
enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
+ unsigned int cpu = smp_processor_id();
+ if (per_cpu(cpu_not_lazy_tlb, cpu) &&
+ is_idle_task(tsk)) {
+ per_cpu(cpu_not_lazy_tlb, cpu) = false;
+ if (!system_uses_ttbr0_pan())
+ cpu_set_reserved_ttbr0();
+ /*
+ * DSB will flush the speculative pagetable walks on
+ * the old asid. It's required before decreasing
+ * nr_active_mm because after decreasing nr_active_mm
+ * the tlbi broadcast may not happen on the unloaded
+ * asid before the pagetables are freed.
+ */
+ dsb(ish);
+ atomic_dec(&mm->context.nr_active_mm);
+ }
+ VM_WARN_ON(atomic_read(&mm->context.nr_active_mm) < 0);
/*
* We don't actually care about the ttbr0 mapping, so point it at the
* zero page.
@@ -201,7 +219,7 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
update_saved_ttbr0(tsk, &init_mm);
}
-static inline void __switch_mm(struct mm_struct *next)
+static inline void __switch_mm(struct mm_struct *next, unsigned int cpu)
{
/*
* init_mm.pgd does not contain any user mappings and it is always
@@ -219,8 +237,27 @@ static inline void
switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
- if (prev != next)
- __switch_mm(next);
+ unsigned int cpu = smp_processor_id();
+
+ if (!per_cpu(cpu_not_lazy_tlb, cpu)) {
+ per_cpu(cpu_not_lazy_tlb, cpu) = true;
+ atomic_inc(&next->context.nr_active_mm);
+ __switch_mm(next, cpu);
+ } else if (prev != next) {
+ atomic_inc(&next->context.nr_active_mm);
+ __switch_mm(next, cpu);
+ /*
+ * DSB will flush the speculative pagetable walks on
+ * the old asid. It's required before decreasing
+ * nr_active_mm because after decreasing nr_active_mm
+ * the tlbi broadcast may not happen on the unloaded
+ * asid before the pagetables are freed.
+ */
+ dsb(ish);
+ atomic_dec(&prev->context.nr_active_mm);
+ }
+ VM_WARN_ON(!atomic_read(&next->context.nr_active_mm));
+ VM_WARN_ON(atomic_read(&prev->context.nr_active_mm) < 0);
/*
* Update the saved TTBR0_EL1 of the scheduled-in task as the previous
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 412a3b9a3c25d..e86f7f6e2169a 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -235,6 +235,15 @@ static inline void local_flush_tlb_all(void)
isb();
}
+static inline void local_flush_tlb_asid(unsigned long asid)
+{
+ asid = __TLBI_VADDR(0, __ASID(asid));
+ dsb(nshst);
+ __tlbi(aside1, asid);
+ __tlbi_user(aside1, asid);
+ dsb(nsh);
+}
+
static inline void flush_tlb_all(void)
{
dsb(ishst);
@@ -243,15 +252,45 @@ static inline void flush_tlb_all(void)
isb();
}
+DECLARE_PER_CPU(bool, cpu_not_lazy_tlb);
+
+enum tlb_flush_types {
+ TLB_FLUSH_NO,
+ TLB_FLUSH_LOCAL,
+ TLB_FLUSH_BROADCAST,
+};
+extern enum tlb_flush_types tlb_flush_check(struct mm_struct *mm,
+ unsigned int cpu);
+
static inline void flush_tlb_mm(struct mm_struct *mm)
{
unsigned long asid;
+ enum tlb_flush_types flush;
- dsb(ishst);
+ flush = tlb_flush_check(mm, get_cpu());
asid = __TLBI_VADDR(0, ASID(mm));
- __tlbi(aside1is, asid);
- __tlbi_user(aside1is, asid);
- dsb(ish);
+ switch (flush) {
+ case TLB_FLUSH_LOCAL:
+
+ dsb(nshst);
+ __tlbi(aside1, asid);
+ __tlbi_user(aside1, asid);
+ dsb(nsh);
+
+ fallthrough;
+ case TLB_FLUSH_NO:
+ put_cpu();
+ break;
+ case TLB_FLUSH_BROADCAST:
+ put_cpu();
+
+ dsb(ishst);
+ __tlbi(aside1is, asid);
+ __tlbi_user(aside1is, asid);
+ dsb(ish);
+
+ break;
+ }
}
static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
@@ -268,8 +307,33 @@ static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
static inline void flush_tlb_page(struct vm_area_struct *vma,
unsigned long uaddr)
{
- flush_tlb_page_nosync(vma, uaddr);
- dsb(ish);
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = __TLBI_VADDR(uaddr, ASID(mm));
+ enum tlb_flush_types flush;
+
+ flush = tlb_flush_check(mm, get_cpu());
+ switch (flush) {
+ case TLB_FLUSH_LOCAL:
+
+ dsb(nshst);
+ __tlbi(vale1, addr);
+ __tlbi_user(vale1, addr);
+ dsb(nsh);
+
+ fallthrough;
+ case TLB_FLUSH_NO:
+ put_cpu();
+ break;
+ case TLB_FLUSH_BROADCAST:
+ put_cpu();
+
+ dsb(ishst);
+ __tlbi(vale1is, addr);
+ __tlbi_user(vale1is, addr);
+ dsb(ish);
+
+ break;
+ }
}
/*
@@ -285,7 +349,9 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
{
int num = 0;
int scale = 0;
+ struct mm_struct *mm = vma->vm_mm;
unsigned long asid, addr, pages;
+ enum tlb_flush_types flush;
start = round_down(start, stride);
end = round_up(end, stride);
@@ -300,64 +366,114 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
if ((!system_supports_tlb_range() &&
(end - start) >= (MAX_TLBI_OPS * stride)) ||
pages >= MAX_TLBI_RANGE_PAGES) {
- flush_tlb_mm(vma->vm_mm);
+ flush_tlb_mm(mm);
return;
}
- dsb(ishst);
- asid = ASID(vma->vm_mm);
+ flush = tlb_flush_check(mm, get_cpu());
+ asid = ASID(mm);
+ switch (flush) {
+ case TLB_FLUSH_LOCAL:
+
+ dsb(nshst);
+
+ while (pages > 0) {
+ if (!system_supports_tlb_range() ||
+ pages % 2 == 1) {
+ addr = __TLBI_VADDR(start, asid);
+ if (last_level) {
+ __tlbi_level(vale1, addr, tlb_level);
+ __tlbi_user_level(vale1, addr, tlb_level);
+ } else {
+ __tlbi_level(vae1, addr, tlb_level);
+ __tlbi_user_level(vae1, addr, tlb_level);
+ }
+ start += stride;
+ pages -= stride >> PAGE_SHIFT;
+ continue;
+ }
- /*
- * When the CPU does not support TLB range operations, flush the TLB
- * entries one by one at the granularity of 'stride'. If the TLB
- * range ops are supported, then:
- *
- * 1. If 'pages' is odd, flush the first page through non-range
- * operations;
- *
- * 2. For remaining pages: the minimum range granularity is decided
- * by 'scale', so multiple range TLBI operations may be required.
- * Start from scale = 0, flush the corresponding number of pages
- * ((num+1)*2^(5*scale+1) starting from 'addr'), then increase it
- * until no pages left.
- *
- * Note that certain ranges can be represented by either num = 31 and
- * scale or num = 0 and scale + 1. The loop below favours the latter
- * since num is limited to 30 by the __TLBI_RANGE_NUM() macro.
- */
- while (pages > 0) {
- if (!system_supports_tlb_range() ||
- pages % 2 == 1) {
- addr = __TLBI_VADDR(start, asid);
- if (last_level) {
- __tlbi_level(vale1is, addr, tlb_level);
- __tlbi_user_level(vale1is, addr, tlb_level);
- } else {
- __tlbi_level(vae1is, addr, tlb_level);
- __tlbi_user_level(vae1is, addr, tlb_level);
+ num = __TLBI_RANGE_NUM(pages, scale);
+ if (num >= 0) {
+ addr = __TLBI_VADDR_RANGE(start, asid, scale,
+ num, tlb_level);
+ if (last_level) {
+ __tlbi(rvale1, addr);
+ __tlbi_user(rvale1, addr);
+ } else {
+ __tlbi(rvae1, addr);
+ __tlbi_user(rvae1, addr);
+ }
+ start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT;
+ pages -= __TLBI_RANGE_PAGES(num, scale);
}
- start += stride;
- pages -= stride >> PAGE_SHIFT;
- continue;
+ scale++;
}
+ dsb(nsh);
+
+ fallthrough;
+ case TLB_FLUSH_NO:
+ put_cpu();
+ break;
+ case TLB_FLUSH_BROADCAST:
+ put_cpu();
+
+ dsb(ishst);
+
+ /*
+ * When the CPU does not support TLB range operations, flush the TLB
+ * entries one by one at the granularity of 'stride'. If the TLB
+ * range ops are supported, then:
+ *
+ * 1. If 'pages' is odd, flush the first page through non-range
+ * operations;
+ *
+ * 2. For remaining pages: the minimum range granularity is decided
+ * by 'scale', so multiple range TLBI operations may be required.
+ * Start from scale = 0, flush the corresponding number of pages
+ * ((num+1)*2^(5*scale+1) starting from 'addr'), then increase it
+ * until no pages left.
+ *
+ * Note that certain ranges can be represented by either num = 31 and
+ * scale or num = 0 and scale + 1. The loop below favours the latter
+ * since num is limited to 30 by the __TLBI_RANGE_NUM() macro.
+ */
+ while (pages > 0) {
+ if (!system_supports_tlb_range() ||
+ pages % 2 == 1) {
+ addr = __TLBI_VADDR(start, asid);
+ if (last_level) {
+ __tlbi_level(vale1is, addr, tlb_level);
+ __tlbi_user_level(vale1is, addr, tlb_level);
+ } else {
+ __tlbi_level(vae1is, addr, tlb_level);
+ __tlbi_user_level(vae1is, addr, tlb_level);
+ }
+ start += stride;
+ pages -= stride >> PAGE_SHIFT;
+ continue;
+ }
- num = __TLBI_RANGE_NUM(pages, scale);
- if (num >= 0) {
- addr = __TLBI_VADDR_RANGE(start, asid, scale,
- num, tlb_level);
- if (last_level) {
- __tlbi(rvale1is, addr);
- __tlbi_user(rvale1is, addr);
- } else {
- __tlbi(rvae1is, addr);
- __tlbi_user(rvae1is, addr);
+ num = __TLBI_RANGE_NUM(pages, scale);
+ if (num >= 0) {
+ addr = __TLBI_VADDR_RANGE(start, asid, scale,
+ num, tlb_level);
+ if (last_level) {
+ __tlbi(rvale1is, addr);
+ __tlbi_user(rvale1is, addr);
+ } else {
+ __tlbi(rvae1is, addr);
+ __tlbi_user(rvae1is, addr);
+ }
+ start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT;
+ pages -= __TLBI_RANGE_PAGES(num, scale);
}
- start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT;
- pages -= __TLBI_RANGE_PAGES(num, scale);
+ scale++;
}
- scale++;
+ dsb(ish);
+
+ break;
}
- dsb(ish);
}
static inline void flush_tlb_range(struct vm_area_struct *vma,
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index cd72576ae2b76..f0004bb306d74 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -26,6 +26,7 @@ static unsigned long *asid_map;
static DEFINE_PER_CPU(atomic64_t, active_asids);
static DEFINE_PER_CPU(u64, reserved_asids);
static cpumask_t tlb_flush_pending;
+DEFINE_PER_CPU(bool, cpu_not_lazy_tlb);
static unsigned long max_pinned_asids;
static unsigned long nr_pinned_asids;
@@ -209,13 +210,19 @@ static u64 new_context(struct mm_struct *mm)
set_asid:
__set_bit(asid, asid_map);
cur_idx = asid;
+ /*
+ * check_and_switch_context() will change the ASID of this mm
+ * so no need of extra ASID local TLB flushes: the new ASID
+ * isn't stale anymore after the tlb_flush_pending was set.
+ */
+ cpumask_clear(mm_cpumask(mm));
return idx2asid(asid) | generation;
}
void check_and_switch_context(struct mm_struct *mm)
{
unsigned long flags;
- unsigned int cpu;
+ unsigned int cpu = smp_processor_id();
u64 asid, old_active_asid;
if (system_supports_cnp())
@@ -251,7 +258,6 @@ void check_and_switch_context(struct mm_struct *mm)
atomic64_set(&mm->context.id, asid);
}
- cpu = smp_processor_id();
if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
local_flush_tlb_all();
@@ -259,6 +265,15 @@ void check_and_switch_context(struct mm_struct *mm)
raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
switch_mm_fastpath:
+ /*
+ * Enforce CPU ordering between the atomic_inc(nr_active_mm)
+ * in switch_mm() and the below cpumask_test_cpu(mm_cpumask).
+ */
+ smp_mb();
+ if (cpumask_test_cpu(cpu, mm_cpumask(mm))) {
+ cpumask_clear_cpu(cpu, mm_cpumask(mm));
+ local_flush_tlb_asid(asid);
+ }
arm64_apply_bp_hardening();
@@ -270,6 +285,44 @@ switch_mm_fastpath:
cpu_switch_mm(mm->pgd, mm);
}
+enum tlb_flush_types tlb_flush_check(struct mm_struct *mm, unsigned int cpu)
+{
+ if (atomic_read(&mm->context.nr_active_mm) <= 1) {
+ bool is_local = current->active_mm == mm &&
+ per_cpu(cpu_not_lazy_tlb, cpu);
+ cpumask_t *stale_cpumask = mm_cpumask(mm);
+ unsigned int next_zero = cpumask_next_zero(-1, stale_cpumask);
+ bool local_is_clear = false;
+ if (next_zero < nr_cpu_ids &&
+ (is_local && next_zero == cpu)) {
+ next_zero = cpumask_next_zero(next_zero, stale_cpumask);
+ local_is_clear = true;
+ }
+ if (next_zero < nr_cpu_ids) {
+ cpumask_setall(stale_cpumask);
+ local_is_clear = false;
+ }
+
+ /*
+ * Enforce CPU ordering between the above
+ * cpumask_setall(mm_cpumask) and the below
+ * atomic_read(nr_active_mm).
+ */
+ smp_mb();
+
+ if (likely(atomic_read(&mm->context.nr_active_mm) <= 1)) {
+ if (is_local) {
+ if (!local_is_clear)
+ cpumask_clear_cpu(cpu, stale_cpumask);
+ return TLB_FLUSH_LOCAL;
+ }
+ if (atomic_read(&mm->context.nr_active_mm) == 0)
+ return TLB_FLUSH_NO;
+ }
+ }
+ return TLB_FLUSH_BROADCAST;
+}
+
unsigned long arm64_mm_context_get(struct mm_struct *mm)
{
unsigned long flags;