author     Palmer Dabbelt <palmer@rivosinc.com>    2024-04-29 10:49:39 -0700
committer  Palmer Dabbelt <palmer@rivosinc.com>    2024-04-30 10:35:48 -0700
commit     4f16345d92006110a9e686ee0d68a5d64a10fe83
tree       ff067eb46739e7cba218b788887e66d7981a3fd0 /arch/riscv/mm/context.c
parent     48b4fc66939d5ed49da305cad9788dcd953b5cd2
parent     daef19263fc102551d734f39d9ad417098ffb360
Merge patch series "riscv: ASID-related and UP-related TLB flush enhancements"
Samuel Holland <samuel.holland@sifive.com> says:
This series converts uniprocessor kernel builds to use the same TLB
flushing code as SMP builds, to take advantage of batching and existing
range- and ASID-based TLB flush optimizations. It optimizes out IPIs and
SBI calls based on the online CPU count, which also covers the scenario
where SMP was enabled at build time but only one CPU is present/online.
A final optimization is to use single-ASID flushes wherever possible, to
avoid unnecessary TLB misses for kernel mappings.
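(For context: an ASID-scoped sfence.vma only invalidates non-global entries tagged
with that ASID, so global kernel mappings survive the flush, whereas a bare
sfence.vma drops them too. Below is a minimal sketch of such a flush; the helper
name is illustrative, the series' real helper is local_flush_tlb_all_asid(), which
shows up in the diff further down.)

static inline void local_flush_asid_sketch(unsigned long asid)
{
	/* sfence.vma rs1=x0, rs2=asid: flush non-global entries for this ASID only */
	__asm__ __volatile__ ("sfence.vma x0, %0" : : "r" (asid) : "memory");
}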
This series has a semantic conflict with the AIA patches that are in
linux-next due to the removal of the third parameter of
riscv_ipi_set_virq_range(), which is called from imsic_ipi_domain_init()
in drivers/irqchip/irq-riscv-imsic-early.c. The resolution is to remove
the extra argument from the call site.
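(Concretely, the fixup amounts to dropping the boolean third argument at that call
site, along the lines of the sketch below; the first two arguments are placeholders
for whatever imsic_ipi_domain_init() actually passes, not the AIA tree's exact code.)

-	riscv_ipi_set_virq_range(virq, nr_ipis, true);
+	riscv_ipi_set_virq_range(virq, nr_ipis);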
Here are some numbers from D1 which show the performance impact:
v6.9-rc1:
System Benchmarks Partial Index              BASELINE       RESULT    INDEX
Execl Throughput                                 43.0        198.5     46.2
File Copy 1024 bufsize 2000 maxblocks          3960.0      73934.4    186.7
File Copy 256 bufsize 500 maxblocks            1655.0      20242.6    122.3
File Copy 4096 bufsize 8000 maxblocks          5800.0     197706.4    340.9
Pipe Throughput                               12440.0     176974.2    142.3
Pipe-based Context Switching                   4000.0      23626.8     59.1
Process Creation                                126.0        449.9     35.7
Shell Scripts (1 concurrent)                     42.4        544.4    128.4
Shell Scripts (16 concurrent)                     ---         35.3      ---
Shell Scripts (8 concurrent)                      6.0         71.6    119.3
System Call Overhead                          15000.0     248072.6    165.4
                                                                    ========
System Benchmarks Index Score (Partial Only)                           110.6
v6.9-rc1 + this patch series:
System Benchmarks Partial Index              BASELINE       RESULT    INDEX
Execl Throughput                                 43.0        196.8     45.8
File Copy 1024 bufsize 2000 maxblocks          3960.0      71782.2    181.3
File Copy 256 bufsize 500 maxblocks            1655.0      21269.4    128.5
File Copy 4096 bufsize 8000 maxblocks          5800.0     199424.0    343.8
Pipe Throughput                               12440.0     196468.6    157.9
Pipe-based Context Switching                   4000.0      24261.8     60.7
Process Creation                                126.0        459.0     36.4
Shell Scripts (1 concurrent)                     42.4        543.8    128.2
Shell Scripts (16 concurrent)                     ---         35.5      ---
Shell Scripts (8 concurrent)                      6.0         71.7    119.6
System Call Overhead                          15000.0     259415.2    172.9
                                                                    ========
System Benchmarks Index Score (Partial Only)                           113.0
* b4-shazam-lts:
riscv: mm: Always use an ASID to flush mm contexts
riscv: mm: Preserve global TLB entries when switching contexts
riscv: mm: Make asid_bits a local variable
riscv: mm: Use a fixed layout for the MM context ID
riscv: mm: Introduce cntx2asid/cntx2version helper macros
riscv: Avoid TLB flush loops when affected by SiFive CIP-1200
riscv: Apply SiFive CIP-1200 workaround to single-ASID sfence.vma
riscv: mm: Combine the SMP and UP TLB flush code
riscv: Only send remote fences when some other CPU is online
riscv: mm: Broadcast kernel TLB flushes only when needed
riscv: Use IPIs for remote cache/TLB flushes by default
riscv: Factor out page table TLB synchronization
riscv: Flush the instruction cache during SMP bringup
Link: https://lore.kernel.org/r/20240327045035.368512-1-samuel.holland@sifive.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
Diffstat (limited to 'arch/riscv/mm/context.c')
 arch/riscv/mm/context.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)
diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
index 9b6db024103b..4abe3de23225 100644
--- a/arch/riscv/mm/context.c
+++ b/arch/riscv/mm/context.c
@@ -21,9 +21,7 @@
 DEFINE_STATIC_KEY_FALSE(use_asid_allocator);

-static unsigned long asid_bits;
 static unsigned long num_asids;
-unsigned long asid_mask;

 static atomic_long_t current_version;
@@ -82,7 +80,7 @@ static void __flush_context(void)
 		if (cntx == 0)
 			cntx = per_cpu(reserved_context, i);

-		__set_bit(cntx & asid_mask, context_asid_map);
+		__set_bit(cntx2asid(cntx), context_asid_map);
 		per_cpu(reserved_context, i) = cntx;
 	}
@@ -103,7 +101,7 @@ static unsigned long __new_context(struct mm_struct *mm)
 	lockdep_assert_held(&context_lock);

 	if (cntx != 0) {
-		unsigned long newcntx = ver | (cntx & asid_mask);
+		unsigned long newcntx = ver | cntx2asid(cntx);

 		/*
 		 * If our current CONTEXT was active during a rollover, we
@@ -116,7 +114,7 @@ static unsigned long __new_context(struct mm_struct *mm)
 		 * We had a valid CONTEXT in a previous life, so try to
 		 * re-use it if possible.
 		 */
-		if (!__test_and_set_bit(cntx & asid_mask, context_asid_map))
+		if (!__test_and_set_bit(cntx2asid(cntx), context_asid_map))
 			return newcntx;
 	}
@@ -129,7 +127,7 @@ static unsigned long __new_context(struct mm_struct *mm)
 		goto set_asid;

 	/* We're out of ASIDs, so increment current_version */
-	ver = atomic_long_add_return_relaxed(num_asids, &current_version);
+	ver = atomic_long_add_return_relaxed(BIT(SATP_ASID_BITS), &current_version);

 	/* Flush everything */
 	__flush_context();
@@ -169,7 +167,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
 	 */
 	old_active_cntx = atomic_long_read(&per_cpu(active_context, cpu));
 	if (old_active_cntx &&
-	    ((cntx & ~asid_mask) == atomic_long_read(&current_version)) &&
+	    (cntx2version(cntx) == atomic_long_read(&current_version)) &&
 	    atomic_long_cmpxchg_relaxed(&per_cpu(active_context, cpu),
 					old_active_cntx, cntx))
 		goto switch_mm_fast;
@@ -178,7 +176,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
 	/* Check that our ASID belongs to the current_version. */
 	cntx = atomic_long_read(&mm->context.id);
-	if ((cntx & ~asid_mask) != atomic_long_read(&current_version)) {
+	if (cntx2version(cntx) != atomic_long_read(&current_version)) {
 		cntx = __new_context(mm);
 		atomic_long_set(&mm->context.id, cntx);
 	}
@@ -192,7 +190,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
 switch_mm_fast:
 	csr_write(CSR_SATP, virt_to_pfn(mm->pgd) |
-		  ((cntx & asid_mask) << SATP_ASID_SHIFT) |
+		  (cntx2asid(cntx) << SATP_ASID_SHIFT) |
 		  satp_mode);

 	if (need_flush_tlb)
@@ -203,7 +201,7 @@ static void set_mm_noasid(struct mm_struct *mm)
 {
 	/* Switch the page table and blindly nuke entire local TLB */
 	csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode);
-	local_flush_tlb_all();
+	local_flush_tlb_all_asid(0);
 }

 static inline void set_mm(struct mm_struct *prev,
@@ -228,7 +226,7 @@ static inline void set_mm(struct mm_struct *prev,
 static int __init asids_init(void)
 {
-	unsigned long old;
+	unsigned long asid_bits, old;

 	/* Figure-out number of ASID bits in HW */
 	old = csr_read(CSR_SATP);
@@ -248,7 +246,6 @@ static int __init asids_init(void)
 	/* Pre-compute ASID details */
 	if (asid_bits) {
 		num_asids = 1 << asid_bits;
-		asid_mask = num_asids - 1;
 	}

 	/*
@@ -256,7 +253,7 @@ static int __init asids_init(void)
 	 * Use ASID allocator only if number of HW ASIDs are
 	 * at-least twice more than CPUs
 	 */
 	if (num_asids > (2 * num_possible_cpus())) {
-		atomic_long_set(&current_version, num_asids);
+		atomic_long_set(&current_version, BIT(SATP_ASID_BITS));

 		context_asid_map = bitmap_zalloc(num_asids, GFP_KERNEL);
 		if (!context_asid_map)
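Note for readers of the hunks above: cntx2asid() and cntx2version() are added
elsewhere in the series (by the "Introduce cntx2asid/cntx2version helper macros"
patch), so their definitions do not appear in this diff. Assuming SATP_ASID_MASK
names the low ASID field of the context ID, they presumably reduce to something
like the sketch below:

/* Sketch only: split mm->context.id into its ASID and version halves. */
#define cntx2asid(cntx)		((cntx) & SATP_ASID_MASK)
#define cntx2version(cntx)	((cntx) & ~SATP_ASID_MASK)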