summaryrefslogtreecommitdiff
path: root/kernel/sys.c
diff options
context:
space:
mode:
authorJoey Gouly <joey.gouly@arm.com>2023-01-19 16:03:43 +0000
committerAndrew Morton <akpm@linux-foundation.org>2023-02-02 22:33:24 -0800
commitb507808ebce23561d4ff8c2aa1fb949fe402bc61 (patch)
treef38296cee8417053ea3a21d3a58772edf382a248 /kernel/sys.c
parente6d2c436ff693869e83a65e61643b922e193e162 (diff)
mm: implement memory-deny-write-execute as a prctl
Patch series "mm: In-kernel support for memory-deny-write-execute (MDWE)", v2. The background to this is that systemd has a configuration option called MemoryDenyWriteExecute [2], implemented as a SECCOMP BPF filter. Its aim is to prevent a user task from inadvertently creating an executable mapping that is (or was) writeable. Since such BPF filter is stateless, it cannot detect mappings that were previously writeable but subsequently changed to read-only. Therefore the filter simply rejects any mprotect(PROT_EXEC). The side-effect is that on arm64 with BTI support (Branch Target Identification), the dynamic loader cannot change an ELF section from PROT_EXEC to PROT_EXEC|PROT_BTI using mprotect(). For libraries, it can resort to unmapping and re-mapping but for the main executable it does not have a file descriptor. The original bug report in the Red Hat bugzilla - [3] - and subsequent glibc workaround for libraries - [4]. This series adds in-kernel support for this feature as a prctl PR_SET_MDWE, that is inherited on fork(). The prctl denies PROT_WRITE | PROT_EXEC mappings. Like the systemd BPF filter it also denies adding PROT_EXEC to mappings. However unlike the BPF filter it only denies it if the mapping didn't previous have PROT_EXEC. This allows to PROT_EXEC -> PROT_EXEC | PROT_BTI with mprotect(), which is a problem with the BPF filter. This patch (of 2): The aim of such policy is to prevent a user task from creating an executable mapping that is also writeable. An example of mmap() returning -EACCESS if the policy is enabled: mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, 0, 0); Similarly, mprotect() would return -EACCESS below: addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0); mprotect(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC); The BPF filter that systemd MDWE uses is stateless, and disallows mprotect() with PROT_EXEC completely. This new prctl allows PROT_EXEC to be enabled if it was already PROT_EXEC, which allows the following case: addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0); mprotect(addr, size, PROT_READ | PROT_EXEC | PROT_BTI); where PROT_BTI enables branch tracking identification on arm64. Link: https://lkml.kernel.org/r/20230119160344.54358-1-joey.gouly@arm.com Link: https://lkml.kernel.org/r/20230119160344.54358-2-joey.gouly@arm.com Signed-off-by: Joey Gouly <joey.gouly@arm.com> Co-developed-by: Catalin Marinas <catalin.marinas@arm.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Jeremy Linton <jeremy.linton@arm.com> Cc: Kees Cook <keescook@chromium.org> Cc: Lennart Poettering <lennart@poettering.net> Cc: Mark Brown <broonie@kernel.org> Cc: nd <nd@arm.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Szabolcs Nagy <szabolcs.nagy@arm.com> Cc: Topi Miettinen <toiwoton@gmail.com> Cc: Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl> Cc: David Hildenbrand <david@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'kernel/sys.c')
-rw-r--r--kernel/sys.c33
1 files changed, 33 insertions, 0 deletions
diff --git a/kernel/sys.c b/kernel/sys.c
index 5fd54bf0e886..b3cab94545ed 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2348,6 +2348,33 @@ static int prctl_set_vma(unsigned long opt, unsigned long start,
}
#endif /* CONFIG_ANON_VMA_NAME */
+static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN))
+ return -EINVAL;
+
+ if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
+ set_bit(MMF_HAS_MDWE, &current->mm->flags);
+ else if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
+ return -EPERM; /* Cannot unset the flag */
+
+ return 0;
+}
+
+static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ return test_bit(MMF_HAS_MDWE, &current->mm->flags) ?
+ PR_MDWE_REFUSE_EXEC_GAIN : 0;
+}
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2623,6 +2650,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = sched_core_share_pid(arg2, arg3, arg4, arg5);
break;
#endif
+ case PR_SET_MDWE:
+ error = prctl_set_mdwe(arg2, arg3, arg4, arg5);
+ break;
+ case PR_GET_MDWE:
+ error = prctl_get_mdwe(arg2, arg3, arg4, arg5);
+ break;
case PR_SET_VMA:
error = prctl_set_vma(arg2, arg3, arg4, arg5);
break;