summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Ingo Molnar <mingo@kernel.org> 2015-02-03 12:24:08 +0100
committer: Ingo Molnar <mingo@kernel.org> 2015-02-03 12:24:08 +0100
commit: b57c0b5175ddbe9b477801f9994a5b330702c1ba (patch)
tree: 69d5d9276255650af1914a30c2c7900a4522a9d4
parent: ad6e46869aa7228750c99d662837ef01862f84d6 (diff)
parent: 96b6352c12711d5c0bb7157f49c92580248e8146 (diff)
Merge tag 'pr-20150201-x86-entry' of git://git.kernel.org/pub/scm/linux/kernel/git/luto/linux into x86/asm
Pull "x86: Entry cleanups and a bugfix for 3.20" from Andy Lutomirski:

 "This fixes a bug in the RCU code I added in ist_enter. It also includes
  the sysret stuff discussed here:

    http://lkml.kernel.org/g/cover.1421453410.git.luto@amacapital.net"

Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r-- arch/x86/kernel/entry_64.S 106
-rw-r--r-- arch/x86/kernel/traps.c 25
2 files changed, 77 insertions, 54 deletions
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 501212f14c87..db13655c3a2a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -361,15 +361,12 @@ system_call_fastpath:
* Has incomplete stack frame and undefined top of stack.
*/
ret_from_sys_call:
- movl $_TIF_ALLWORK_MASK,%edi
- /* edi: flagmask */
-sysret_check:
+ testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ jnz int_ret_from_sys_call_fixup /* Go to the slow path */
+
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
- andl %edi,%edx
- jnz sysret_careful
CFI_REMEMBER_STATE
/*
* sysretq will re-enable interrupts:
@@ -383,49 +380,10 @@ sysret_check:
USERGS_SYSRET64
CFI_RESTORE_STATE
- /* Handle reschedules */
- /* edx: work, edi: workmask */
-sysret_careful:
- bt $TIF_NEED_RESCHED,%edx
- jnc sysret_signal
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- pushq_cfi %rdi
- SCHEDULE_USER
- popq_cfi %rdi
- jmp sysret_check
- /* Handle a signal */
-sysret_signal:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
-#ifdef CONFIG_AUDITSYSCALL
- bt $TIF_SYSCALL_AUDIT,%edx
- jc sysret_audit
-#endif
- /*
- * We have a signal, or exit tracing or single-step.
- * These all wind up with the iret return path anyway,
- * so just join that path right now.
- */
+int_ret_from_sys_call_fixup:
FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
- jmp int_check_syscall_exit_work
-
-#ifdef CONFIG_AUDITSYSCALL
- /*
- * Return fast path for syscall audit. Call __audit_syscall_exit()
- * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
- * masked off.
- */
-sysret_audit:
- movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
- cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
- setbe %al /* 1 if so, 0 if not */
- movzbl %al,%edi /* zero-extend that into %edi */
- call __audit_syscall_exit
- movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
- jmp sysret_check
-#endif /* CONFIG_AUDITSYSCALL */
+ jmp int_ret_from_sys_call
/* Do syscall tracing */
tracesys:
@@ -794,6 +752,60 @@ retint_swapgs: /* return to user-space */
*/
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_IRETQ
+
+ /*
+ * Try to use SYSRET instead of IRET if we're returning to
+ * a completely clean 64-bit userspace context.
+ */
+ movq (RCX-R11)(%rsp), %rcx
+ cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */
+ jne opportunistic_sysret_failed
+
+ /*
+ * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP. It's not worth
+ * testing for canonicalness exactly -- this check detects any
+ * of the 17 high bits set, which is true for non-canonical
+ * or kernel addresses. (This will pessimize vsyscall=native.
+ * Big deal.)
+ *
+ * If virtual addresses ever become wider, this will need
+ * to be updated to remain correct on both old and new CPUs.
+ */
+ .ifne __VIRTUAL_MASK_SHIFT - 47
+ .error "virtual address width changed -- sysret checks need update"
+ .endif
+ shr $__VIRTUAL_MASK_SHIFT, %rcx
+ jnz opportunistic_sysret_failed
+
+ cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ movq (R11-ARGOFFSET)(%rsp), %r11
+ cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */
+ jne opportunistic_sysret_failed
+
+ testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */
+ jnz opportunistic_sysret_failed
+
+ /* nothing to check for RSP */
+
+ cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ /*
+ * We win! This label is here just for ease of understanding
+ * perf profiles. Nothing jumps here.
+ */
+irq_return_via_sysret:
+ CFI_REMEMBER_STATE
+ RESTORE_ARGS 1,8,1
+ movq (RSP-RIP)(%rsp),%rsp
+ USERGS_SYSRET64
+ CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
SWAPGS
jmp restore_args
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7176f84f95a4..c74f2f5652da 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -110,15 +110,11 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
enum ctx_state ist_enter(struct pt_regs *regs)
{
- /*
- * We are atomic because we're on the IST stack (or we're on x86_32,
- * in which case we still shouldn't schedule.
- */
- preempt_count_add(HARDIRQ_OFFSET);
+ enum ctx_state prev_state;
if (user_mode_vm(regs)) {
/* Other than that, we're just an exception. */
- return exception_enter();
+ prev_state = exception_enter();
} else {
/*
* We might have interrupted pretty much anything. In
@@ -127,12 +123,27 @@ enum ctx_state ist_enter(struct pt_regs *regs)
* but we need to notify RCU.
*/
rcu_nmi_enter();
- return IN_KERNEL; /* the value is irrelevant. */
+ prev_state = IN_KERNEL; /* the value is irrelevant. */
}
+
+ /*
+ * We are atomic because we're on the IST stack (or we're on x86_32,
+ * in which case we still shouldn't schedule).
+ *
+ * This must be after exception_enter(), because exception_enter()
+ * won't do anything if in_interrupt() returns true.
+ */
+ preempt_count_add(HARDIRQ_OFFSET);
+
+ /* This code is a bit fragile. Test it. */
+ rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
+
+ return prev_state;
}
void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
{
+ /* Must be before exception_exit. */
preempt_count_sub(HARDIRQ_OFFSET);
if (user_mode_vm(regs))