Merge tag 'x86_urgent_for_6.5_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 CFI fixes from Peter Zijlstra: "Fix kCFI/FineIBT weaknesses The primary bug Alyssa noticed was that with FineIBT enabled function prologues have a spurious ENDBR instruction: __cfi_foo: endbr64 subl $hash, %r10d jz 1f ud2 nop 1: foo: endbr64 <--- *sadface* This means that any indirect call that fails to target the __cfi symbol and instead targets (the regular old) foo+0, will succeed due to that second ENDBR. Fixing this led to the discovery of a single indirect call that was still doing this: ret_from_fork(). Since that's an assembly stub the compiler would not generate the proper kCFI indirect call magic and it would not get patched. Brian came up with the most comprehensive fix -- convert the thing to C with only a very thin asm wrapper. This ensures the kernel thread boostrap is a proper kCFI call. While discussing all this, Kees noted that kCFI hashes could/should be poisoned to seal all functions whose address is never taken, further limiting the valid kCFI targets -- much like we already do for IBT. So what was a 'simple' observation and fix cascaded into a bunch of inter-related CFI infrastructure fixes" * tag 'x86_urgent_for_6.5_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/cfi: Only define poison_cfi() if CONFIG_X86_KERNEL_IBT=y x86/fineibt: Poison ENDBR at +0 x86: Rewrite ret_from_fork() in C x86/32: Remove schedule_tail_wrapper() x86/cfi: Extend ENDBR sealing to kCFI x86/alternative: Rename apply_ibt_endbr() x86/cfi: Extend {JMP,CAKK}_NOSPEC comment
author: Linus Torvalds <torvalds@linux-foundation.org> 2023-07-14 20:19:25 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2023-07-14 20:19:25 -0700
commit: b6e6cc1f78c772e952495b7416c9ac9029f9390c (patch)
tree: f43d33a19e988dcec55b8ce4597e165deb1459d7 /arch/x86
parent: be522ac7cdcc1b7dd19fa348205363041ab65a98 (diff)
parent: 535d0ae39185a266536a1e97ff9a8956d7fbb9df (diff)
9 files changed, 119 insertions, 74 deletions
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 91397f58ac30..6e6af42e044a 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -720,26 +720,6 @@ SYM_CODE_END(__switch_to_asm)
 .popsection
 
 /*
- * The unwinder expects the last frame on the stack to always be at the same
- * offset from the end of the page, which allows it to validate the stack.
- * Calling schedule_tail() directly would break that convention because its an
- * asmlinkage function so its argument has to be pushed on the stack.  This
- * wrapper creates a proper "end of stack" frame header before the call.
- */
-.pushsection .text, "ax"
-SYM_FUNC_START(schedule_tail_wrapper)
-	FRAME_BEGIN
-
-	pushl	%eax
-	call	schedule_tail
-	popl	%eax
-
-	FRAME_END
-	RET
-SYM_FUNC_END(schedule_tail_wrapper)
-.popsection
-
-/*
  * A newly forked process directly context switches into this address.
  *
  * eax: prev task we switched from
@@ -747,29 +727,22 @@ SYM_FUNC_END(schedule_tail_wrapper)
  * edi: kernel thread arg
  */
 .pushsection .text, "ax"
-SYM_CODE_START(ret_from_fork)
-	call	schedule_tail_wrapper
+SYM_CODE_START(ret_from_fork_asm)
+	movl	%esp, %edx	/* regs */
 
-	testl	%ebx, %ebx
-	jnz	1f		/* kernel threads are uncommon */
+	/* return address for the stack unwinder */
+	pushl	$.Lsyscall_32_done
 
-2:
-	/* When we fork, we trace the syscall return in the child, too. */
-	movl    %esp, %eax
-	call    syscall_exit_to_user_mode
-	jmp     .Lsyscall_32_done
+	FRAME_BEGIN
+	/* prev already in EAX */
+	movl	%ebx, %ecx	/* fn */
+	pushl	%edi		/* fn_arg */
+	call	ret_from_fork
+	addl	$4, %esp
+	FRAME_END
 
-	/* kernel thread */
-1:	movl	%edi, %eax
-	CALL_NOSPEC ebx
-	/*
-	 * A kernel thread is allowed to return here after successfully
-	 * calling kernel_execve().  Exit to userspace to complete the execve()
-	 * syscall.
-	 */
-	movl	$0, PT_EAX(%esp)
-	jmp	2b
-SYM_CODE_END(ret_from_fork)
+	RET
+SYM_CODE_END(ret_from_fork_asm)
 .popsection
 
 SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f31e286c2977..91f6818884fa 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -284,36 +284,19 @@ SYM_FUNC_END(__switch_to_asm)
  * r12: kernel thread arg
  */
 .pushsection .text, "ax"
-	__FUNC_ALIGN
-SYM_CODE_START_NOALIGN(ret_from_fork)
-	UNWIND_HINT_END_OF_STACK
+SYM_CODE_START(ret_from_fork_asm)
+	UNWIND_HINT_REGS
 	ANNOTATE_NOENDBR // copy_thread
 	CALL_DEPTH_ACCOUNT
-	movq	%rax, %rdi
-	call	schedule_tail			/* rdi: 'prev' task parameter */
 
-	testq	%rbx, %rbx			/* from kernel_thread? */
-	jnz	1f				/* kernel threads are uncommon */
+	movq	%rax, %rdi		/* prev */
+	movq	%rsp, %rsi		/* regs */
+	movq	%rbx, %rdx		/* fn */
+	movq	%r12, %rcx		/* fn_arg */
+	call	ret_from_fork
 
-2:
-	UNWIND_HINT_REGS
-	movq	%rsp, %rdi
-	call	syscall_exit_to_user_mode	/* returns with IRQs disabled */
 	jmp	swapgs_restore_regs_and_return_to_usermode
-
-1:
-	/* kernel thread */
-	UNWIND_HINT_END_OF_STACK
-	movq	%r12, %rdi
-	CALL_NOSPEC rbx
-	/*
-	 * A kernel thread is allowed to return here after successfully
-	 * calling kernel_execve().  Exit to userspace to complete the execve()
-	 * syscall.
-	 */
-	movq	$0, RAX(%rsp)
-	jmp	2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_END(ret_from_fork_asm)
 .popsection
 
 .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 6c15a622ad60..9c4da699e11a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -96,7 +96,7 @@ extern void alternative_instructions(void);
 extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
 extern void apply_retpolines(s32 *start, s32 *end);
 extern void apply_returns(s32 *start, s32 *end);
-extern void apply_ibt_endbr(s32 *start, s32 *end);
+extern void apply_seal_endbr(s32 *start, s32 *end);
 extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine,
 			  s32 *start_cfi, s32 *end_cfi);
 
diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h
index baae6b4fea23..1e59581d500c 100644
--- a/arch/x86/include/asm/ibt.h
+++ b/arch/x86/include/asm/ibt.h
@@ -34,7 +34,7 @@
 /*
  * Create a dummy function pointer reference to prevent objtool from marking
  * the function as needing to be "sealed" (i.e. ENDBR converted to NOP by
- * apply_ibt_endbr()).
+ * apply_seal_endbr()).
  */
 #define IBT_NOSEAL(fname)				\
 	".pushsection .discard.ibt_endbr_noseal\n\t"	\
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 55388c9f7601..1a65cf4acb2b 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -234,6 +234,10 @@
  * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
  * indirect jmp/call which may be susceptible to the Spectre variant 2
  * attack.
+ *
+ * NOTE: these do not take kCFI into account and are thus not comparable to C
+ * indirect calls, take care when using. The target of these should be an ENDBR
+ * instruction irrespective of kCFI.
  */
 .macro JMP_NOSPEC reg:req
 #ifdef CONFIG_RETPOLINE
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 5c91305d09d2..f42dbf17f52b 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -12,7 +12,9 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
 __visible struct task_struct *__switch_to(struct task_struct *prev,
 					  struct task_struct *next);
 
-asmlinkage void ret_from_fork(void);
+asmlinkage void ret_from_fork_asm(void);
+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+			     int (*fn)(void *), void *fn_arg);
 
 /*
  * This is the structure pointed to by thread.sp for an inactive task.  The
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 72646d75b6ff..2dcf3a06af09 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -778,6 +778,8 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
 
 #ifdef CONFIG_X86_KERNEL_IBT
 
+static void poison_cfi(void *addr);
+
 static void __init_or_module poison_endbr(void *addr, bool warn)
 {
 	u32 endbr, poison = gen_endbr_poison();
@@ -802,8 +804,11 @@ static void __init_or_module poison_endbr(void *addr, bool warn)
 
 /*
  * Generated by: objtool --ibt
+ *
+ * Seal the functions for indirect calls by clobbering the ENDBR instructions
+ * and the kCFI hash value.
  */
-void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
+void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
 {
 	s32 *s;
 
@@ -812,13 +817,13 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
 
 		poison_endbr(addr, true);
 		if (IS_ENABLED(CONFIG_FINEIBT))
-			poison_endbr(addr - 16, false);
+			poison_cfi(addr - 16);
 	}
 }
 
 #else
 
-void __init_or_module apply_ibt_endbr(s32 *start, s32 *end) { }
+void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
 
 #endif /* CONFIG_X86_KERNEL_IBT */
 
@@ -1063,6 +1068,17 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end)
 	return 0;
 }
 
+static void cfi_rewrite_endbr(s32 *start, s32 *end)
+{
+	s32 *s;
+
+	for (s = start; s < end; s++) {
+		void *addr = (void *)s + *s;
+
+		poison_endbr(addr+16, false);
+	}
+}
+
 /* .retpoline_sites */
 static int cfi_rand_callers(s32 *start, s32 *end)
 {
@@ -1157,14 +1173,19 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
 		return;
 
 	case CFI_FINEIBT:
+		/* place the FineIBT preamble at func()-16 */
 		ret = cfi_rewrite_preamble(start_cfi, end_cfi);
 		if (ret)
 			goto err;
 
+		/* rewrite the callers to target func()-16 */
 		ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
 		if (ret)
 			goto err;
 
+		/* now that nobody targets func()+0, remove ENDBR there */
+		cfi_rewrite_endbr(start_cfi, end_cfi);
+
 		if (builtin)
 			pr_info("Using FineIBT CFI\n");
 		return;
@@ -1177,6 +1198,41 @@ err:
 	pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
 }
 
+static inline void poison_hash(void *addr)
+{
+	*(u32 *)addr = 0;
+}
+
+static void poison_cfi(void *addr)
+{
+	switch (cfi_mode) {
+	case CFI_FINEIBT:
+		/*
+		 * __cfi_\func:
+		 *	osp nopl (%rax)
+		 *	subl	$0, %r10d
+		 *	jz	1f
+		 *	ud2
+		 * 1:	nop
+		 */
+		poison_endbr(addr, false);
+		poison_hash(addr + fineibt_preamble_hash);
+		break;
+
+	case CFI_KCFI:
+		/*
+		 * __cfi_\func:
+		 *	movl	$0, %eax
+		 *	.skip	11, 0x90
+		 */
+		poison_hash(addr + 1);
+		break;
+
+	default:
+		break;
+	}
+}
+
 #else
 
 static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
@@ -1184,6 +1240,10 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
 {
 }
 
+#ifdef CONFIG_X86_KERNEL_IBT
+static void poison_cfi(void *addr) { }
+#endif
+
 #endif
 
 void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
@@ -1565,7 +1625,10 @@ void __init alternative_instructions(void)
 	 */
 	callthunks_patch_builtin_calls();
 
-	apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
+	/*
+	 * Seal all functions that do not have their address taken.
+	 */
+	apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
 
 #ifdef CONFIG_SMP
 	/* Patch to UP if other cpus not imminent. */
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index b05f62ee2344..5f71a0cf4399 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -358,7 +358,7 @@ int module_finalize(const Elf_Ehdr *hdr,
 	}
 	if (ibt_endbr) {
 		void *iseg = (void *)ibt_endbr->sh_addr;
-		apply_ibt_endbr(iseg, iseg + ibt_endbr->sh_size);
+		apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size);
 	}
 	if (locks) {
 		void *lseg = (void *)locks->sh_addr;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ff9b80a0e3e3..72015dba72ab 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@
 #include <linux/static_call.h>
 #include <trace/events/power.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/entry-common.h>
 #include <asm/cpu.h>
 #include <asm/apic.h>
 #include <linux/uaccess.h>
@@ -134,6 +135,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
 		return do_set_thread_area_64(p, ARCH_SET_FS, tls);
 }
 
+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+				     int (*fn)(void *), void *fn_arg)
+{
+	schedule_tail(prev);
+
+	/* Is this a kernel thread? */
+	if (unlikely(fn)) {
+		fn(fn_arg);
+		/*
+		 * A kernel thread is allowed to return here after successfully
+		 * calling kernel_execve().  Exit to userspace to complete the
+		 * execve() syscall.
+		 */
+		regs->ax = 0;
+	}
+
+	syscall_exit_to_user_mode(regs);
+}
+
 int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 {
 	unsigned long clone_flags = args->flags;
@@ -149,7 +169,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 	frame = &fork_frame->frame;
 
 	frame->bp = encode_frame_pointer(childregs);
-	frame->ret_addr = (unsigned long) ret_from_fork;
+	frame->ret_addr = (unsigned long) ret_from_fork_asm;
 	p->thread.sp = (unsigned long) fork_frame;
 	p->thread.io_bitmap = NULL;
 	p->thread.iopl_warn = 0;
author	Linus Torvalds <torvalds@linux-foundation.org>	2023-07-14 20:19:25 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2023-07-14 20:19:25 -0700
commit	b6e6cc1f78c772e952495b7416c9ac9029f9390c (patch)
tree	f43d33a19e988dcec55b8ce4597e165deb1459d7 /arch/x86
parent	be522ac7cdcc1b7dd19fa348205363041ab65a98 (diff)
parent	535d0ae39185a266536a1e97ff9a8956d7fbb9df (diff)