diff options
Diffstat (limited to 'kernel')
71 files changed, 5023 insertions, 2684 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 12c679f769c6..b302b4731d16 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -64,10 +64,7 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o -obj-$(CONFIG_CGROUPS) += cgroup.o -obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o -obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o -obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_CGROUPS) += cgroup/ obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o diff --git a/kernel/audit.c b/kernel/audit.c index 6e399bb69d7c..e794544f5e63 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -121,7 +121,7 @@ u32 audit_sig_sid = 0; 3) suppressed due to audit_rate_limit 4) suppressed due to audit_backlog_limit */ -static atomic_t audit_lost = ATOMIC_INIT(0); +static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. */ static struct sock *audit_sock; @@ -1058,6 +1058,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err < 0) return err; } + if (s.mask == AUDIT_STATUS_LOST) { + u32 lost = atomic_xchg(&audit_lost, 0); + + audit_log_config_change("lost", 0, lost, 1); + return lost; + } break; } case AUDIT_GET_FEATURE: @@ -1349,7 +1355,9 @@ static int __init audit_init(void) panic("audit: failed to start the kauditd thread (%d)\n", err); } - audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); + audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, + "state=initialized audit_enabled=%u res=1", + audit_enabled); return 0; } diff --git a/kernel/audit.h b/kernel/audit.h index 960d49c9db5e..ca579880303a 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -199,6 +199,9 @@ struct audit_context { struct { int argc; } execve; + struct { + char *name; + } module; }; int fds[2]; struct audit_proctitle proctitle; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index cf1fa43512c1..d6a8de5f8fa3 
100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1221,7 +1221,7 @@ static void show_special(struct audit_context *context, int *call_panic) context->ipc.perm_mode); } break; } - case AUDIT_MQ_OPEN: { + case AUDIT_MQ_OPEN: audit_log_format(ab, "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld " "mq_msgsize=%ld mq_curmsgs=%ld", @@ -1230,8 +1230,8 @@ static void show_special(struct audit_context *context, int *call_panic) context->mq_open.attr.mq_maxmsg, context->mq_open.attr.mq_msgsize, context->mq_open.attr.mq_curmsgs); - break; } - case AUDIT_MQ_SENDRECV: { + break; + case AUDIT_MQ_SENDRECV: audit_log_format(ab, "mqdes=%d msg_len=%zd msg_prio=%u " "abs_timeout_sec=%ld abs_timeout_nsec=%ld", @@ -1240,12 +1240,12 @@ static void show_special(struct audit_context *context, int *call_panic) context->mq_sendrecv.msg_prio, context->mq_sendrecv.abs_timeout.tv_sec, context->mq_sendrecv.abs_timeout.tv_nsec); - break; } - case AUDIT_MQ_NOTIFY: { + break; + case AUDIT_MQ_NOTIFY: audit_log_format(ab, "mqdes=%d sigev_signo=%d", context->mq_notify.mqdes, context->mq_notify.sigev_signo); - break; } + break; case AUDIT_MQ_GETSETATTR: { struct mq_attr *attr = &context->mq_getsetattr.mqstat; audit_log_format(ab, @@ -1255,19 +1255,24 @@ static void show_special(struct audit_context *context, int *call_panic) attr->mq_flags, attr->mq_maxmsg, attr->mq_msgsize, attr->mq_curmsgs); break; } - case AUDIT_CAPSET: { + case AUDIT_CAPSET: audit_log_format(ab, "pid=%d", context->capset.pid); audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); - break; } - case AUDIT_MMAP: { + break; + case AUDIT_MMAP: audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, context->mmap.flags); - break; } - case AUDIT_EXECVE: { + break; + case AUDIT_EXECVE: audit_log_execve_info(context, &ab); - break; } + break; + case AUDIT_KERN_MODULE: + audit_log_format(ab, 
"name="); + audit_log_untrustedstring(ab, context->module.name); + kfree(context->module.name); + break; } audit_log_end(ab); } @@ -2368,6 +2373,15 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } +void __audit_log_kern_module(char *name) +{ + struct audit_context *context = current->audit_context; + + context->module.name = kmalloc(strlen(name) + 1, GFP_KERNEL); + strcpy(context->module.name, name); + context->type = AUDIT_KERN_MODULE; +} + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; @@ -2411,7 +2425,7 @@ void audit_core_dumps(long signr) if (unlikely(!ab)) return; audit_log_task(ab); - audit_log_format(ab, " sig=%ld", signr); + audit_log_format(ab, " sig=%ld res=1", signr); audit_log_end(ab); } diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 1276474ac3cd..e1ce4f4fd7fd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3d55d95dcf49..6b6f41f0b211 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -269,7 +269,7 @@ static const struct bpf_map_ops array_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map_type_list array_type __read_mostly = { +static struct bpf_map_type_list array_type __ro_after_init = { .ops = &array_ops, .type = BPF_MAP_TYPE_ARRAY, }; @@ -283,7 +283,7 @@ static const struct bpf_map_ops percpu_array_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map_type_list percpu_array_type __read_mostly = { +static struct bpf_map_type_list percpu_array_type __ro_after_init = { .ops = &percpu_array_ops, .type = 
BPF_MAP_TYPE_PERCPU_ARRAY, }; @@ -409,7 +409,7 @@ static const struct bpf_map_ops prog_array_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, }; -static struct bpf_map_type_list prog_array_type __read_mostly = { +static struct bpf_map_type_list prog_array_type __ro_after_init = { .ops = &prog_array_ops, .type = BPF_MAP_TYPE_PROG_ARRAY, }; @@ -522,7 +522,7 @@ static const struct bpf_map_ops perf_event_array_ops = { .map_release = perf_event_fd_array_release, }; -static struct bpf_map_type_list perf_event_array_type __read_mostly = { +static struct bpf_map_type_list perf_event_array_type __ro_after_init = { .ops = &perf_event_array_ops, .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, }; @@ -564,7 +564,7 @@ static const struct bpf_map_ops cgroup_array_ops = { .map_fd_put_ptr = cgroup_fd_array_put_ptr, }; -static struct bpf_map_type_list cgroup_array_type __read_mostly = { +static struct bpf_map_type_list cgroup_array_type __ro_after_init = { .ops = &cgroup_array_ops, .type = BPF_MAP_TYPE_CGROUP_ARRAY, }; diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 89b7ef41c86b..f62d1d56f41d 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -213,11 +213,10 @@ __bpf_lru_list_shrink_inactive(struct bpf_lru *lru, enum bpf_lru_list_type tgt_free_type) { struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; - struct bpf_lru_node *node, *tmp_node, *first_node; + struct bpf_lru_node *node, *tmp_node; unsigned int nshrinked = 0; unsigned int i = 0; - first_node = list_first_entry(inactive, struct bpf_lru_node, list); list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { if (bpf_lru_node_is_ref(node)) { __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); @@ -361,7 +360,8 @@ static void __local_list_add_pending(struct bpf_lru *lru, list_add(&node->list, local_pending_list(loc_l)); } -struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) +static struct bpf_lru_node * +__local_list_pop_free(struct 
bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; @@ -374,8 +374,8 @@ struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) return node; } -struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru, - struct bpf_lru_locallist *loc_l) +static struct bpf_lru_node * +__local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; bool force = false; @@ -558,8 +558,9 @@ void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) bpf_common_lru_push_free(lru, node); } -void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, - u32 elem_size, u32 nr_elems) +static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, + u32 node_offset, u32 elem_size, + u32 nr_elems) { struct bpf_lru_list *l = &lru->common_lru.lru_list; u32 i; @@ -575,8 +576,9 @@ void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, } } -void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, - u32 elem_size, u32 nr_elems) +static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, + u32 node_offset, u32 elem_size, + u32 nr_elems) { u32 i, pcpu_entries; int cpu; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 503d4211988a..f45827e205d3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -28,6 +28,9 @@ #include <linux/moduleloader.h> #include <linux/bpf.h> #include <linux/frame.h> +#include <linux/rbtree_latch.h> +#include <linux/kallsyms.h> +#include <linux/rcupdate.h> #include <asm/unaligned.h> @@ -95,6 +98,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) fp->aux = aux; fp->aux->prog = fp; + INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); + return fp; } EXPORT_SYMBOL_GPL(bpf_prog_alloc); @@ -290,6 +295,206 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, } #ifdef CONFIG_BPF_JIT +static __always_inline void +bpf_get_prog_addr_region(const struct bpf_prog *prog, + unsigned 
long *symbol_start, + unsigned long *symbol_end) +{ + const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog); + unsigned long addr = (unsigned long)hdr; + + WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); + + *symbol_start = addr; + *symbol_end = addr + hdr->pages * PAGE_SIZE; +} + +static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +{ + BUILD_BUG_ON(sizeof("bpf_prog_") + + sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN); + + sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); + sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); + *sym = 0; +} + +static __always_inline unsigned long +bpf_get_prog_addr_start(struct latch_tree_node *n) +{ + unsigned long symbol_start, symbol_end; + const struct bpf_prog_aux *aux; + + aux = container_of(n, struct bpf_prog_aux, ksym_tnode); + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + + return symbol_start; +} + +static __always_inline bool bpf_tree_less(struct latch_tree_node *a, + struct latch_tree_node *b) +{ + return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b); +} + +static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) +{ + unsigned long val = (unsigned long)key; + unsigned long symbol_start, symbol_end; + const struct bpf_prog_aux *aux; + + aux = container_of(n, struct bpf_prog_aux, ksym_tnode); + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + + if (val < symbol_start) + return -1; + if (val >= symbol_end) + return 1; + + return 0; +} + +static const struct latch_tree_ops bpf_tree_ops = { + .less = bpf_tree_less, + .comp = bpf_tree_comp, +}; + +static DEFINE_SPINLOCK(bpf_lock); +static LIST_HEAD(bpf_kallsyms); +static struct latch_tree_root bpf_tree __cacheline_aligned; + +int bpf_jit_kallsyms __read_mostly; + +static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) +{ + WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); + list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms); + latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); 
+} + +static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) +{ + if (list_empty(&aux->ksym_lnode)) + return; + + latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); + list_del_rcu(&aux->ksym_lnode); +} + +static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) +{ + return fp->jited && !bpf_prog_was_classic(fp); +} + +static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) +{ + return list_empty(&fp->aux->ksym_lnode) || + fp->aux->ksym_lnode.prev == LIST_POISON2; +} + +void bpf_prog_kallsyms_add(struct bpf_prog *fp) +{ + unsigned long flags; + + if (!bpf_prog_kallsyms_candidate(fp) || + !capable(CAP_SYS_ADMIN)) + return; + + spin_lock_irqsave(&bpf_lock, flags); + bpf_prog_ksym_node_add(fp->aux); + spin_unlock_irqrestore(&bpf_lock, flags); +} + +void bpf_prog_kallsyms_del(struct bpf_prog *fp) +{ + unsigned long flags; + + if (!bpf_prog_kallsyms_candidate(fp)) + return; + + spin_lock_irqsave(&bpf_lock, flags); + bpf_prog_ksym_node_del(fp->aux); + spin_unlock_irqrestore(&bpf_lock, flags); +} + +static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) +{ + struct latch_tree_node *n; + + if (!bpf_jit_kallsyms_enabled()) + return NULL; + + n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); + return n ? 
+ container_of(n, struct bpf_prog_aux, ksym_tnode)->prog : + NULL; +} + +const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char *sym) +{ + unsigned long symbol_start, symbol_end; + struct bpf_prog *prog; + char *ret = NULL; + + rcu_read_lock(); + prog = bpf_prog_kallsyms_find(addr); + if (prog) { + bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end); + bpf_get_prog_name(prog, sym); + + ret = sym; + if (size) + *size = symbol_end - symbol_start; + if (off) + *off = addr - symbol_start; + } + rcu_read_unlock(); + + return ret; +} + +bool is_bpf_text_address(unsigned long addr) +{ + bool ret; + + rcu_read_lock(); + ret = bpf_prog_kallsyms_find(addr) != NULL; + rcu_read_unlock(); + + return ret; +} + +int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym) +{ + unsigned long symbol_start, symbol_end; + struct bpf_prog_aux *aux; + unsigned int it = 0; + int ret = -ERANGE; + + if (!bpf_jit_kallsyms_enabled()) + return ret; + + rcu_read_lock(); + list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) { + if (it++ != symnum) + continue; + + bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); + bpf_get_prog_name(aux->prog, sym); + + *value = symbol_start; + *type = BPF_SYM_ELF_TYPE; + + ret = 0; + break; + } + rcu_read_unlock(); + + return ret; +} + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -326,6 +531,24 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) module_memfree(hdr); } +/* This symbol is only overridden by archs that have different + * requirements than the usual eBPF JITs, f.e. when they only + * implement cBPF JIT, do not set images read-only, etc. 
+ */ +void __weak bpf_jit_free(struct bpf_prog *fp) +{ + if (fp->jited) { + struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); + + bpf_jit_binary_unlock_ro(hdr); + bpf_jit_binary_free(hdr); + + WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); + } + + bpf_prog_unlock_free(fp); +} + int bpf_jit_harden __read_mostly; static int bpf_jit_blind_insn(const struct bpf_insn *from, @@ -1154,12 +1377,22 @@ const struct bpf_func_proto bpf_tail_call_proto = { .arg3_type = ARG_ANYTHING, }; -/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */ +/* Stub for JITs that only support cBPF. eBPF programs are interpreted. + * It is encouraged to implement bpf_int_jit_compile() instead, so that + * eBPF and implicitly also cBPF can get JITed! + */ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) { return prog; } +/* Stub for JITs that support eBPF. All cBPF code gets transformed into + * eBPF by the kernel and is later compiled by bpf_int_jit_compile(). + */ +void __weak bpf_jit_compile(struct bpf_prog *prog) +{ +} + bool __weak bpf_helper_changes_pkt_data(void *func) { return false; @@ -1173,3 +1406,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, { return -EFAULT; } + +/* All definitions of tracepoints related to BPF. 
*/ +#define CREATE_TRACE_POINTS +#include <linux/bpf_trace.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type); +EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a753bbe7df0a..3ea87fb19a94 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1023,7 +1023,7 @@ static const struct bpf_map_ops htab_ops = { .map_delete_elem = htab_map_delete_elem, }; -static struct bpf_map_type_list htab_type __read_mostly = { +static struct bpf_map_type_list htab_type __ro_after_init = { .ops = &htab_ops, .type = BPF_MAP_TYPE_HASH, }; @@ -1037,7 +1037,7 @@ static const struct bpf_map_ops htab_lru_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map_type_list htab_lru_type __read_mostly = { +static struct bpf_map_type_list htab_lru_type __ro_after_init = { .ops = &htab_lru_ops, .type = BPF_MAP_TYPE_LRU_HASH, }; @@ -1124,7 +1124,7 @@ static const struct bpf_map_ops htab_percpu_ops = { .map_delete_elem = htab_map_delete_elem, }; -static struct bpf_map_type_list htab_percpu_type __read_mostly = { +static struct bpf_map_type_list htab_percpu_type __ro_after_init = { .ops = &htab_percpu_ops, .type = BPF_MAP_TYPE_PERCPU_HASH, }; @@ -1138,7 +1138,7 @@ static const struct bpf_map_ops htab_lru_percpu_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = { +static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = { .ops = &htab_lru_percpu_ops, .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 045cbe673356..3d24e238221e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -176,6 +176,6 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { .func = bpf_get_current_comm, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_RAW_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + 
.arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, }; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 0b030c9126d3..fddcae801724 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -21,6 +21,7 @@ #include <linux/parser.h> #include <linux/filter.h> #include <linux/bpf.h> +#include <linux/bpf_trace.h> enum bpf_type { BPF_TYPE_UNSPEC = 0, @@ -281,6 +282,13 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname) ret = bpf_obj_do_pin(pname, raw, type); if (ret != 0) bpf_any_put(raw, type); + if ((trace_bpf_obj_pin_prog_enabled() || + trace_bpf_obj_pin_map_enabled()) && !ret) { + if (type == BPF_TYPE_PROG) + trace_bpf_obj_pin_prog(raw, ufd, pname); + if (type == BPF_TYPE_MAP) + trace_bpf_obj_pin_map(raw, ufd, pname); + } out: putname(pname); return ret; @@ -342,8 +350,15 @@ int bpf_obj_get_user(const char __user *pathname) else goto out; - if (ret < 0) + if (ret < 0) { bpf_any_put(raw, type); + } else if (trace_bpf_obj_get_prog_enabled() || + trace_bpf_obj_get_map_enabled()) { + if (type == BPF_TYPE_PROG) + trace_bpf_obj_get_prog(raw, ret, pname); + if (type == BPF_TYPE_MAP) + trace_bpf_obj_get_map(raw, ret, pname); + } out: putname(pname); return ret; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c new file mode 100644 index 000000000000..8bfe0afaee10 --- /dev/null +++ b/kernel/bpf/lpm_trie.c @@ -0,0 +1,521 @@ +/* + * Longest prefix match list implementation + * + * Copyright (c) 2016,2017 Daniel Mack + * Copyright (c) 2016 David Herrmann + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. 
+ */ + +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include <net/ipv6.h> + +/* Intermediate node */ +#define LPM_TREE_NODE_FLAG_IM BIT(0) + +struct lpm_trie_node; + +struct lpm_trie_node { + struct rcu_head rcu; + struct lpm_trie_node __rcu *child[2]; + u32 prefixlen; + u32 flags; + u8 data[0]; +}; + +struct lpm_trie { + struct bpf_map map; + struct lpm_trie_node __rcu *root; + size_t n_entries; + size_t max_prefixlen; + size_t data_size; + raw_spinlock_t lock; +}; + +/* This trie implements a longest prefix match algorithm that can be used to + * match IP addresses to a stored set of ranges. + * + * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is + * interpreted as big endian, so data[0] stores the most significant byte. + * + * Match ranges are internally stored in instances of struct lpm_trie_node + * which each contain their prefix length as well as two pointers that may + * lead to more nodes containing more specific matches. Each node also stores + * a value that is defined by and returned to userspace via the update_elem + * and lookup functions. + * + * For instance, let's start with a trie that was created with a prefix length + * of 32, so it can be used for IPv4 addresses, and one single element that + * matches 192.168.0.0/16. The data array would hence contain + * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will + * stick to IP-address notation for readability though. + * + * As the trie is empty initially, the new node (1) will be places as root + * node, denoted as (R) in the example below. As there are no other node, both + * child pointers are %NULL. + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * + * Next, let's add a new node (2) matching 192.168.0.0/24. 
As there is already + * a node with the same data and a smaller prefix (ie, a less specific one), + * node (2) will become a child of (1). In child index depends on the next bit + * that is outside of what (1) matches, and that bit is 0, so (2) will be + * child[0] of (1): + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | + * +----------------+ + * | (2) | + * | 192.168.0.0/24 | + * | value: 2 | + * | [0] [1] | + * +----------------+ + * + * The child[1] slot of (1) could be filled with another node which has bit #17 + * (the next bit after the ones that (1) matches on) set to 1. For instance, + * 192.168.128.0/24: + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | | + * +----------------+ +------------------+ + * | (2) | | (3) | + * | 192.168.0.0/24 | | 192.168.128.0/24 | + * | value: 2 | | value: 3 | + * | [0] [1] | | [0] [1] | + * +----------------+ +------------------+ + * + * Let's add another node (4) to the game for 192.168.1.0/24. In order to place + * it, node (1) is looked at first, and because (4) of the semantics laid out + * above (bit #17 is 0), it would normally be attached to (1) as child[0]. + * However, that slot is already allocated, so a new node is needed in between. + * That node does not have a value attached to it and it will never be + * returned to users as result of a lookup. It is only there to differentiate + * the traversal further. 
It will get a prefix as wide as necessary to + * distinguish its two children: + * + * +----------------+ + * | (1) (R) | + * | 192.168.0.0/16 | + * | value: 1 | + * | [0] [1] | + * +----------------+ + * | | + * +----------------+ +------------------+ + * | (4) (I) | | (3) | + * | 192.168.0.0/23 | | 192.168.128.0/24 | + * | value: --- | | value: 3 | + * | [0] [1] | | [0] [1] | + * +----------------+ +------------------+ + * | | + * +----------------+ +----------------+ + * | (2) | | (5) | + * | 192.168.0.0/24 | | 192.168.1.0/24 | + * | value: 2 | | value: 5 | + * | [0] [1] | | [0] [1] | + * +----------------+ +----------------+ + * + * 192.168.1.1/32 would be a child of (5) etc. + * + * An intermediate node will be turned into a 'real' node on demand. In the + * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie. + * + * A fully populated trie would have a height of 32 nodes, as the trie was + * created with a prefix length of 32. + * + * The lookup starts at the root node. If the current node matches and if there + * is a child that can be used to become more specific, the trie is traversed + * downwards. The last node in the traversal that is a non-intermediate one is + * returned. + */ + +static inline int extract_bit(const u8 *data, size_t index) +{ + return !!(data[index / 8] & (1 << (7 - (index % 8)))); +} + +/** + * longest_prefix_match() - determine the longest prefix + * @trie: The trie to get internal sizes from + * @node: The node to operate on + * @key: The key to compare to @node + * + * Determine the longest prefix of @node that matches the bits in @key. 
+ */ +static size_t longest_prefix_match(const struct lpm_trie *trie, + const struct lpm_trie_node *node, + const struct bpf_lpm_trie_key *key) +{ + size_t prefixlen = 0; + size_t i; + + for (i = 0; i < trie->data_size; i++) { + size_t b; + + b = 8 - fls(node->data[i] ^ key->data[i]); + prefixlen += b; + + if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen) + return min(node->prefixlen, key->prefixlen); + + if (b < 8) + break; + } + + return prefixlen; +} + +/* Called from syscall or from eBPF program */ +static void *trie_lookup_elem(struct bpf_map *map, void *_key) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *node, *found = NULL; + struct bpf_lpm_trie_key *key = _key; + + /* Start walking the trie from the root node ... */ + + for (node = rcu_dereference(trie->root); node;) { + unsigned int next_bit; + size_t matchlen; + + /* Determine the longest prefix of @node that matches @key. + * If it's the maximum possible prefix for this trie, we have + * an exact match and can return it directly. + */ + matchlen = longest_prefix_match(trie, node, key); + if (matchlen == trie->max_prefixlen) { + found = node; + break; + } + + /* If the number of bits that match is smaller than the prefix + * length of @node, bail out and return the node we have seen + * last in the traversal (ie, the parent). + */ + if (matchlen < node->prefixlen) + break; + + /* Consider this node as return candidate unless it is an + * artificially added intermediate one. + */ + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + found = node; + + /* If the node match is fully satisfied, let's see if we can + * become more specific. Determine the next bit in the key and + * traverse down. 
+ */ + next_bit = extract_bit(key->data, node->prefixlen); + node = rcu_dereference(node->child[next_bit]); + } + + if (!found) + return NULL; + + return found->data + trie->data_size; +} + +static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, + const void *value) +{ + struct lpm_trie_node *node; + size_t size = sizeof(struct lpm_trie_node) + trie->data_size; + + if (value) + size += trie->map.value_size; + + node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); + if (!node) + return NULL; + + node->flags = 0; + + if (value) + memcpy(node->data + trie->data_size, value, + trie->map.value_size); + + return node; +} + +/* Called from syscall or from eBPF program */ +static int trie_update_elem(struct bpf_map *map, + void *_key, void *value, u64 flags) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; + struct lpm_trie_node __rcu **slot; + struct bpf_lpm_trie_key *key = _key; + unsigned long irq_flags; + unsigned int next_bit; + size_t matchlen = 0; + int ret = 0; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + if (key->prefixlen > trie->max_prefixlen) + return -EINVAL; + + raw_spin_lock_irqsave(&trie->lock, irq_flags); + + /* Allocate and fill a new node */ + + if (trie->n_entries == trie->map.max_entries) { + ret = -ENOSPC; + goto out; + } + + new_node = lpm_trie_node_alloc(trie, value); + if (!new_node) { + ret = -ENOMEM; + goto out; + } + + trie->n_entries++; + + new_node->prefixlen = key->prefixlen; + RCU_INIT_POINTER(new_node->child[0], NULL); + RCU_INIT_POINTER(new_node->child[1], NULL); + memcpy(new_node->data, key->data, trie->data_size); + + /* Now find a slot to attach the new node. To do that, walk the tree + * from the root and match as many bits as possible for each node until + * we either find an empty slot or a slot that needs to be replaced by + * an intermediate node. 
+ */ + slot = &trie->root; + + while ((node = rcu_dereference_protected(*slot, + lockdep_is_held(&trie->lock)))) { + matchlen = longest_prefix_match(trie, node, key); + + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen || + node->prefixlen == trie->max_prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + slot = &node->child[next_bit]; + } + + /* If the slot is empty (a free child pointer or an empty root), + * simply assign the @new_node to that slot and be done. + */ + if (!node) { + rcu_assign_pointer(*slot, new_node); + goto out; + } + + /* If the slot we picked already exists, replace it with @new_node + * which already has the correct data array set. + */ + if (node->prefixlen == matchlen) { + new_node->child[0] = node->child[0]; + new_node->child[1] = node->child[1]; + + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + trie->n_entries--; + + rcu_assign_pointer(*slot, new_node); + kfree_rcu(node, rcu); + + goto out; + } + + /* If the new node matches the prefix completely, it must be inserted + * as an ancestor. Simply insert it between @node and *@slot. 
+ */ + if (matchlen == key->prefixlen) { + next_bit = extract_bit(node->data, matchlen); + rcu_assign_pointer(new_node->child[next_bit], node); + rcu_assign_pointer(*slot, new_node); + goto out; + } + + im_node = lpm_trie_node_alloc(trie, NULL); + if (!im_node) { + ret = -ENOMEM; + goto out; + } + + im_node->prefixlen = matchlen; + im_node->flags |= LPM_TREE_NODE_FLAG_IM; + memcpy(im_node->data, node->data, trie->data_size); + + /* Now determine which child to install in which slot */ + if (extract_bit(key->data, matchlen)) { + rcu_assign_pointer(im_node->child[0], node); + rcu_assign_pointer(im_node->child[1], new_node); + } else { + rcu_assign_pointer(im_node->child[0], new_node); + rcu_assign_pointer(im_node->child[1], node); + } + + /* Finally, assign the intermediate node to the determined spot */ + rcu_assign_pointer(*slot, im_node); + +out: + if (ret) { + if (new_node) + trie->n_entries--; + + kfree(new_node); + kfree(im_node); + } + + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + + return ret; +} + +static int trie_delete_elem(struct bpf_map *map, void *key) +{ + /* TODO */ + return -ENOSYS; +} + +#define LPM_DATA_SIZE_MAX 256 +#define LPM_DATA_SIZE_MIN 1 + +#define LPM_VAL_SIZE_MAX (KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \ + sizeof(struct lpm_trie_node)) +#define LPM_VAL_SIZE_MIN 1 + +#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key) + (X)) +#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) +#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) + +static struct bpf_map *trie_alloc(union bpf_attr *attr) +{ + struct lpm_trie *trie; + u64 cost = sizeof(*trie), cost_per_node; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || + attr->map_flags != BPF_F_NO_PREALLOC || + attr->key_size < LPM_KEY_SIZE_MIN || + attr->key_size > LPM_KEY_SIZE_MAX || + attr->value_size < LPM_VAL_SIZE_MIN || + attr->value_size > LPM_VAL_SIZE_MAX) + return ERR_PTR(-EINVAL); 
+ + trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); + if (!trie) + return ERR_PTR(-ENOMEM); + + /* copy mandatory map attributes */ + trie->map.map_type = attr->map_type; + trie->map.key_size = attr->key_size; + trie->map.value_size = attr->value_size; + trie->map.max_entries = attr->max_entries; + trie->data_size = attr->key_size - + offsetof(struct bpf_lpm_trie_key, data); + trie->max_prefixlen = trie->data_size * 8; + + cost_per_node = sizeof(struct lpm_trie_node) + + attr->value_size + trie->data_size; + cost += (u64) attr->max_entries * cost_per_node; + if (cost >= U32_MAX - PAGE_SIZE) { + ret = -E2BIG; + goto out_err; + } + + trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + ret = bpf_map_precharge_memlock(trie->map.pages); + if (ret) + goto out_err; + + raw_spin_lock_init(&trie->lock); + + return &trie->map; +out_err: + kfree(trie); + return ERR_PTR(ret); +} + +static void trie_free(struct bpf_map *map) +{ + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct lpm_trie_node __rcu **slot; + struct lpm_trie_node *node; + + raw_spin_lock(&trie->lock); + + /* Always start at the root and walk down to a node that has no + * children. Then free that node, nullify its reference in the parent + * and start over. 
+ */ + + for (;;) { + slot = &trie->root; + + for (;;) { + node = rcu_dereference_protected(*slot, + lockdep_is_held(&trie->lock)); + if (!node) + goto unlock; + + if (rcu_access_pointer(node->child[0])) { + slot = &node->child[0]; + continue; + } + + if (rcu_access_pointer(node->child[1])) { + slot = &node->child[1]; + continue; + } + + kfree(node); + RCU_INIT_POINTER(*slot, NULL); + break; + } + } + +unlock: + raw_spin_unlock(&trie->lock); +} + +static const struct bpf_map_ops trie_ops = { + .map_alloc = trie_alloc, + .map_free = trie_free, + .map_lookup_elem = trie_lookup_elem, + .map_update_elem = trie_update_elem, + .map_delete_elem = trie_delete_elem, +}; + +static struct bpf_map_type_list trie_type __ro_after_init = { + .ops = &trie_ops, + .type = BPF_MAP_TYPE_LPM_TRIE, +}; + +static int __init register_trie_map(void) +{ + bpf_register_map_type(&trie_type); + return 0; +} +late_initcall(register_trie_map); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index be8519148c25..22aa45cd0324 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -273,7 +273,7 @@ static const struct bpf_map_ops stack_map_ops = { .map_delete_elem = stack_map_delete_elem, }; -static struct bpf_map_type_list stack_map_type __read_mostly = { +static struct bpf_map_type_list stack_map_type __ro_after_init = { .ops = &stack_map_ops, .type = BPF_MAP_TYPE_STACK_TRACE, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bbb016adbaeb..461eb1e66a0f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -10,6 +10,7 @@ * General Public License for more details. 
*/ #include <linux/bpf.h> +#include <linux/bpf_trace.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -241,6 +242,7 @@ static int map_create(union bpf_attr *attr) /* failed to allocate fd */ goto free_map; + trace_bpf_map_create(map, err); return err; free_map: @@ -365,6 +367,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; + trace_bpf_map_lookup_elem(map, ufd, key, value); err = 0; free_value: @@ -447,6 +450,8 @@ static int map_update_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); + if (!err) + trace_bpf_map_update_elem(map, ufd, key, value); free_value: kfree(value); free_key: @@ -492,6 +497,8 @@ static int map_delete_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); + if (!err) + trace_bpf_map_delete_elem(map, ufd, key); free_key: kfree(key); err_put: @@ -544,6 +551,7 @@ static int map_get_next_key(union bpf_attr *attr) if (copy_to_user(unext_key, next_key, map->key_size) != 0) goto free_next_key; + trace_bpf_map_next_key(map, ufd, key, next_key); err = 0; free_next_key: @@ -697,8 +705,11 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) void bpf_prog_put(struct bpf_prog *prog) { - if (atomic_dec_and_test(&prog->aux->refcnt)) + if (atomic_dec_and_test(&prog->aux->refcnt)) { + trace_bpf_prog_put_rcu(prog); + bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + } } EXPORT_SYMBOL_GPL(bpf_prog_put); @@ -807,7 +818,11 @@ struct bpf_prog *bpf_prog_get(u32 ufd) struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { - return __bpf_prog_get(ufd, &type); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type); + + if (!IS_ERR(prog)) + trace_bpf_prog_get_type(prog); + return prog; } EXPORT_SYMBOL_GPL(bpf_prog_get_type); @@ -889,6 +904,8 @@ static int bpf_prog_load(union bpf_attr *attr) /* failed to allocate fd */ goto free_used_maps; + bpf_prog_kallsyms_add(prog); + 
trace_bpf_prog_load(prog, err); return err; free_used_maps: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cdc43b899f28..3fc6e39b223e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -481,6 +481,13 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) regs[regno].max_value = BPF_REGISTER_MAX_RANGE; } +static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs, + u32 regno) +{ + mark_reg_unknown_value(regs, regno); + reset_reg_range_values(regs, regno); +} + enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ @@ -532,6 +539,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) switch (type) { case PTR_TO_MAP_VALUE: case PTR_TO_MAP_VALUE_OR_NULL: + case PTR_TO_MAP_VALUE_ADJ: case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: @@ -616,7 +624,8 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } if (value_regno >= 0) /* have read misc data from the stack */ - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); return 0; } } @@ -627,7 +636,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, { struct bpf_map *map = env->cur_state.regs[regno].map_ptr; - if (off < 0 || off + size > map->value_size) { + if (off < 0 || size <= 0 || off + size > map->value_size) { verbose("invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; @@ -635,6 +644,51 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, return 0; } +/* check read/write into an adjusted map element */ +static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno, + int off, int size) +{ + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *reg = &state->regs[regno]; + int err; + + /* We adjusted the register to this map value, 
so we + * need to change off and size to min_value and max_value + * respectively to make sure our theoretical access will be + * safe. + */ + if (log_level) + print_verifier_state(state); + env->varlen_map_value_access = true; + /* The minimum value is only important with signed + * comparisons where we can't assume the floor of a + * value is 0. If we are using signed variables for our + * index'es we need to make sure that whatever we use + * will have a set floor within our range. + */ + if (reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + regno); + return -EACCES; + } + err = check_map_access(env, regno, reg->min_value + off, size); + if (err) { + verbose("R%d min value is outside of the array range\n", + regno); + return err; + } + + /* If we haven't set a max value then we need to bail + * since we can't be sure we won't do bad things. + */ + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + regno); + return -EACCES; + } + return check_map_access(env, regno, reg->max_value + off, size); +} + #define MAX_PACKET_OFF 0xffff static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, @@ -647,6 +701,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; + /* fallthrough */ case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: @@ -775,47 +830,13 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, return -EACCES; } - /* If we adjusted the register to this map value at all then we - * need to change off and size to min_value and max_value - * respectively to make sure our theoretical access will be - * safe. 
- */ - if (reg->type == PTR_TO_MAP_VALUE_ADJ) { - if (log_level) - print_verifier_state(state); - env->varlen_map_value_access = true; - /* The minimum value is only important with signed - * comparisons where we can't assume the floor of a - * value is 0. If we are using signed variables for our - * index'es we need to make sure that whatever we use - * will have a set floor within our range. - */ - if (reg->min_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); - return -EACCES; - } - err = check_map_access(env, regno, reg->min_value + off, - size); - if (err) { - verbose("R%d min value is outside of the array range\n", - regno); - return err; - } - - /* If we haven't set a max value then we need to bail - * since we can't be sure we won't do bad things. - */ - if (reg->max_value == BPF_REGISTER_MAX_RANGE) { - verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", - regno); - return -EACCES; - } - off += reg->max_value; - } - err = check_map_access(env, regno, off, size); + if (reg->type == PTR_TO_MAP_VALUE_ADJ) + err = check_map_access_adj(env, regno, off, size); + else + err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = UNKNOWN_VALUE; @@ -827,7 +848,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, } err = check_ctx_access(env, off, size, t, &reg_type); if (!err && t == BPF_READ && value_regno >= 0) { - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); /* note that reg.[id|off|range] == 0 */ state->regs[value_regno].type = reg_type; } @@ -860,7 +882,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, } err 
= check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown_value(state->regs, value_regno); + mark_reg_unknown_value_and_range(state->regs, + value_regno); } else { verbose("R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -958,6 +981,25 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return 0; } +static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, + int access_size, bool zero_size_allowed, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = env->cur_state.regs; + + switch (regs[regno].type) { + case PTR_TO_PACKET: + return check_packet_access(env, regno, 0, access_size); + case PTR_TO_MAP_VALUE: + return check_map_access(env, regno, 0, access_size); + case PTR_TO_MAP_VALUE_ADJ: + return check_map_access_adj(env, regno, 0, access_size); + default: /* const_imm|ptr_to_stack or invalid ptr */ + return check_stack_boundary(env, regno, access_size, + zero_size_allowed, meta); + } +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -993,10 +1035,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_STACK; if (type != PTR_TO_PACKET && type != expected_type) goto err_type; - } else if (arg_type == ARG_CONST_STACK_SIZE || - arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { + } else if (arg_type == ARG_CONST_SIZE || + arg_type == ARG_CONST_SIZE_OR_ZERO) { expected_type = CONST_IMM; - if (type != expected_type) + /* One exception. 
Allow UNKNOWN_VALUE registers when the + * boundaries are known and don't cause unsafe memory accesses + */ + if (type != UNKNOWN_VALUE && type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; @@ -1006,8 +1051,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_CTX; if (type != expected_type) goto err_type; - } else if (arg_type == ARG_PTR_TO_STACK || - arg_type == ARG_PTR_TO_RAW_STACK) { + } else if (arg_type == ARG_PTR_TO_MEM || + arg_type == ARG_PTR_TO_UNINIT_MEM) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a CONST_IMM type. Final test @@ -1015,9 +1060,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (type == CONST_IMM && reg->imm == 0) /* final test in check_stack_boundary() */; - else if (type != PTR_TO_PACKET && type != expected_type) + else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && + type != PTR_TO_MAP_VALUE_ADJ && type != expected_type) goto err_type; - meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; + meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -1063,9 +1109,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_stack_boundary(env, regno, meta->map_ptr->value_size, false, NULL); - } else if (arg_type == ARG_CONST_STACK_SIZE || - arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { - bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); + } else if (arg_type == ARG_CONST_SIZE || + arg_type == ARG_CONST_SIZE_OR_ZERO) { + bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); /* bpf_xxx(..., buf, len) call will access 'len' bytes * from stack pointer 'buf'. 
Check it @@ -1073,14 +1119,50 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (regno == 0) { /* kernel subsystem misconfigured verifier */ - verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); + verbose("ARG_CONST_SIZE cannot be first argument\n"); return -EACCES; } - if (regs[regno - 1].type == PTR_TO_PACKET) - err = check_packet_access(env, regno - 1, 0, reg->imm); - else - err = check_stack_boundary(env, regno - 1, reg->imm, - zero_size_allowed, meta); + + /* If the register is UNKNOWN_VALUE, the access check happens + * using its boundaries. Otherwise, just use its imm + */ + if (type == UNKNOWN_VALUE) { + /* For unprivileged variable accesses, disable raw + * mode so that the program is required to + * initialize all the memory that the helper could + * just partially fill up. + */ + meta = NULL; + + if (reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", + regno); + return -EACCES; + } + + if (reg->min_value == 0) { + err = check_helper_mem_access(env, regno - 1, 0, + zero_size_allowed, + meta); + if (err) + return err; + } + + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + err = check_helper_mem_access(env, regno - 1, + reg->max_value, + zero_size_allowed, meta); + if (err) + return err; + } else { + /* register is CONST_IMM */ + err = check_helper_mem_access(env, regno - 1, reg->imm, + zero_size_allowed, meta); + } } return err; @@ -1154,15 +1236,15 @@ static int check_raw_mode(const struct bpf_func_proto *fn) { int count = 0; - if (fn->arg1_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg2_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg3_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg4_type == 
ARG_PTR_TO_RAW_STACK) + if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM) count++; - if (fn->arg5_type == ARG_PTR_TO_RAW_STACK) + if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) count++; return count > 1 ? -EINVAL : 0; @@ -1316,7 +1398,7 @@ static int check_packet_ptr_add(struct bpf_verifier_env *env, imm = insn->imm; add_imm: - if (imm <= 0) { + if (imm < 0) { verbose("addition of negative constant to packet pointer is not allowed\n"); return -EACCES; } @@ -1485,22 +1567,54 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, struct bpf_reg_state *dst_reg = &regs[insn->dst_reg]; struct bpf_reg_state *src_reg = &regs[insn->src_reg]; u8 opcode = BPF_OP(insn->code); + u64 dst_imm = dst_reg->imm; - /* dst_reg->type == CONST_IMM here, simulate execution of 'add'/'or' - * insn. Don't care about overflow or negative values, just add them + /* dst_reg->type == CONST_IMM here. Simulate execution of insns + * containing ALU ops. Don't care about overflow or negative + * values, just add/sub/... them; registers are in u64. 
*/ - if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) - dst_reg->imm += insn->imm; - else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) - dst_reg->imm += src_reg->imm; - else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) - dst_reg->imm |= insn->imm; - else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) - dst_reg->imm |= src_reg->imm; - else + if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) { + dst_imm += insn->imm; + } else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm += src_reg->imm; + } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_K) { + dst_imm -= insn->imm; + } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm -= src_reg->imm; + } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_K) { + dst_imm *= insn->imm; + } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm *= src_reg->imm; + } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) { + dst_imm |= insn->imm; + } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm |= src_reg->imm; + } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_K) { + dst_imm &= insn->imm; + } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm &= src_reg->imm; + } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_K) { + dst_imm >>= insn->imm; + } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm >>= src_reg->imm; + } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_K) { + dst_imm <<= insn->imm; + } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) { + dst_imm <<= src_reg->imm; + } else { mark_reg_unknown_value(regs, insn->dst_reg); + goto 
out; + } + + dst_reg->imm = dst_imm; +out: return 0; } @@ -1894,6 +2008,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, case BPF_JGT: /* Unsigned comparison, the minimum value is 0. */ false_reg->min_value = 0; + /* fallthrough */ case BPF_JSGT: /* If this is false then we know the maximum val is val, * otherwise we know the min val is val+1. @@ -1904,6 +2019,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, case BPF_JGE: /* Unsigned comparison, the minimum value is 0. */ false_reg->min_value = 0; + /* fallthrough */ case BPF_JSGE: /* If this is false then we know the maximum value is val - 1, * otherwise we know the mimimum value is val. @@ -1942,6 +2058,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, case BPF_JGT: /* Unsigned comparison, the minimum value is 0. */ true_reg->min_value = 0; + /* fallthrough */ case BPF_JSGT: /* * If this is false, then the val is <= the register, if it is @@ -1953,6 +2070,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, case BPF_JGE: /* Unsigned comparison, the minimum value is 0. */ true_reg->min_value = 0; + /* fallthrough */ case BPF_JSGE: /* If this is false then constant < register, if it is true then * the register < constant. @@ -2144,14 +2262,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (insn->src_reg == 0) { - /* generic move 64-bit immediate into a register, - * only analyzer needs to collect the ld_imm value. - */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - if (!env->analyzer_ops) - return 0; - regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = imm; return 0; @@ -2664,7 +2776,7 @@ static int do_check(struct bpf_verifier_env *env) class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { - verbose("BPF program is too large. Proccessed %d insn\n", + verbose("BPF program is too large. 
Processed %d insn\n", insn_processed); return -E2BIG; } @@ -2729,7 +2841,6 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - reset_reg_range_values(regs, insn->dst_reg); if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { insn_idx++; @@ -3085,10 +3196,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn = env->prog->insnsi + delta; for (i = 0; i < insn_cnt; i++, insn++) { - if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || + if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || + insn->code == (BPF_LDX | BPF_MEM | BPF_H) || + insn->code == (BPF_LDX | BPF_MEM | BPF_W) || insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) type = BPF_READ; - else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) || + else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || + insn->code == (BPF_STX | BPF_MEM | BPF_H) || + insn->code == (BPF_STX | BPF_MEM | BPF_W) || insn->code == (BPF_STX | BPF_MEM | BPF_DW)) type = BPF_WRITE; else @@ -3097,8 +3212,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) continue; - cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, - insn->off, insn_buf, env->prog); + cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile new file mode 100644 index 000000000000..387348a40c64 --- /dev/null +++ b/kernel/cgroup/Makefile @@ -0,0 +1,6 @@ +obj-y := cgroup.o namespace.o cgroup-v1.o + +obj-$(CONFIG_CGROUP_FREEZER) += freezer.o +obj-$(CONFIG_CGROUP_PIDS) += pids.o +obj-$(CONFIG_CGROUP_RDMA) += rdma.o +obj-$(CONFIG_CPUSETS) += cpuset.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h new file mode 100644 index 000000000000..9203bfb05603 --- /dev/null +++ b/kernel/cgroup/cgroup-internal.h @@ -0,0 +1,214 @@ +#ifndef __CGROUP_INTERNAL_H 
+#define __CGROUP_INTERNAL_H + +#include <linux/cgroup.h> +#include <linux/kernfs.h> +#include <linux/workqueue.h> +#include <linux/list.h> + +/* + * A cgroup can be associated with multiple css_sets as different tasks may + * belong to different cgroups on different hierarchies. In the other + * direction, a css_set is naturally associated with multiple cgroups. + * This M:N relationship is represented by the following link structure + * which exists for each association and allows traversing the associations + * from both sides. + */ +struct cgrp_cset_link { + /* the cgroup and css_set this link associates */ + struct cgroup *cgrp; + struct css_set *cset; + + /* list of cgrp_cset_links anchored at cgrp->cset_links */ + struct list_head cset_link; + + /* list of cgrp_cset_links anchored at css_set->cgrp_links */ + struct list_head cgrp_link; +}; + +/* used to track tasks and csets during migration */ +struct cgroup_taskset { + /* the src and dst cset list running through cset->mg_node */ + struct list_head src_csets; + struct list_head dst_csets; + + /* the subsys currently being processed */ + int ssid; + + /* + * Fields for cgroup_taskset_*() iteration. + * + * Before migration is committed, the target migration tasks are on + * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of + * the csets on ->dst_csets. ->csets point to either ->src_csets + * or ->dst_csets depending on whether migration is committed. + * + * ->cur_csets and ->cur_task point to the current task position + * during iteration. + */ + struct list_head *csets; + struct css_set *cur_cset; + struct task_struct *cur_task; +}; + +/* migration context also tracks preloading */ +struct cgroup_mgctx { + /* + * Preloaded source and destination csets. Used to guarantee + * atomic success or failure on actual migration. 
+ */ + struct list_head preloaded_src_csets; + struct list_head preloaded_dst_csets; + + /* tasks and csets to migrate */ + struct cgroup_taskset tset; + + /* subsystems affected by migration */ + u16 ss_mask; +}; + +#define CGROUP_TASKSET_INIT(tset) \ +{ \ + .src_csets = LIST_HEAD_INIT(tset.src_csets), \ + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ + .csets = &tset.src_csets, \ +} + +#define CGROUP_MGCTX_INIT(name) \ +{ \ + LIST_HEAD_INIT(name.preloaded_src_csets), \ + LIST_HEAD_INIT(name.preloaded_dst_csets), \ + CGROUP_TASKSET_INIT(name.tset), \ +} + +#define DEFINE_CGROUP_MGCTX(name) \ + struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) + +struct cgroup_sb_opts { + u16 subsys_mask; + unsigned int flags; + char *release_agent; + bool cpuset_clone_children; + char *name; + /* User explicitly requested empty subsystem */ + bool none; +}; + +extern struct mutex cgroup_mutex; +extern spinlock_t css_set_lock; +extern struct cgroup_subsys *cgroup_subsys[]; +extern struct list_head cgroup_roots; +extern struct file_system_type cgroup_fs_type; + +/* iterate across the hierarchies */ +#define for_each_root(root) \ + list_for_each_entry((root), &cgroup_roots, root_list) + +/** + * for_each_subsys - iterate all enabled cgroup subsystems + * @ss: the iteration cursor + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + */ +#define for_each_subsys(ss, ssid) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ + (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) + +static inline bool cgroup_is_dead(const struct cgroup *cgrp) +{ + return !(cgrp->self.flags & CSS_ONLINE); +} + +static inline bool notify_on_release(const struct cgroup *cgrp) +{ + return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); +} + +void put_css_set_locked(struct css_set *cset); + +static inline void put_css_set(struct css_set *cset) +{ + unsigned long flags; + + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it. 
Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cset->refcount, -1, 1)) + return; + + spin_lock_irqsave(&css_set_lock, flags); + put_css_set_locked(cset); + spin_unlock_irqrestore(&css_set_lock, flags); +} + +/* + * refcounted get/put for css_set objects + */ +static inline void get_css_set(struct css_set *cset) +{ + atomic_inc(&cset->refcount); +} + +bool cgroup_ssid_enabled(int ssid); +bool cgroup_on_dfl(const struct cgroup *cgrp); + +struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); +struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root); +struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline); +void cgroup_kn_unlock(struct kernfs_node *kn); +int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns); + +void cgroup_free_root(struct cgroup_root *root); +void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); +int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, + struct cgroup_root *root, unsigned long magic, + struct cgroup_namespace *ns); + +bool cgroup_may_migrate_to(struct cgroup *dst_cgrp); +void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); +void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, + struct cgroup_mgctx *mgctx); +int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx); +int cgroup_migrate(struct task_struct *leader, bool threadgroup, + struct cgroup_mgctx *mgctx); + +int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, + bool threadgroup); +ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, bool threadgroup); +ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off); + 
+void cgroup_lock_and_drain_offline(struct cgroup *cgrp); + +int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode); +int cgroup_rmdir(struct kernfs_node *kn); +int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root); + +/* + * namespace.c + */ +extern const struct proc_ns_operations cgroupns_operations; + +/* + * cgroup-v1.c + */ +extern struct cftype cgroup1_base_files[]; +extern const struct file_operations proc_cgroupstats_operations; +extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; + +bool cgroup1_ssid_disabled(int ssid); +void cgroup1_pidlist_destroy_all(struct cgroup *cgrp); +void cgroup1_release_agent(struct work_struct *work); +void cgroup1_check_for_release(struct cgroup *cgrp); +struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, + void *data, unsigned long magic, + struct cgroup_namespace *ns); + +#endif /* __CGROUP_INTERNAL_H */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c new file mode 100644 index 000000000000..fc34bcf2329f --- /dev/null +++ b/kernel/cgroup/cgroup-v1.c @@ -0,0 +1,1395 @@ +#include "cgroup-internal.h" + +#include <linux/ctype.h> +#include <linux/kmod.h> +#include <linux/sort.h> +#include <linux/delay.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/delayacct.h> +#include <linux/pid_namespace.h> +#include <linux/cgroupstats.h> + +#include <trace/events/cgroup.h> + +/* + * pidlists linger the following amount before being destroyed. The goal + * is avoiding frequent destruction in the middle of consecutive read calls + * Expiring in the middle is a performance problem not a correctness one. + * 1 sec should be enough. + */ +#define CGROUP_PIDLIST_DESTROY_DELAY HZ + +/* Controllers blocked by the commandline in v1 */ +static u16 cgroup_no_v1_mask; + +/* + * pidlist destructions need to be flushed on cgroup destruction. Use a + * separate workqueue as flush domain. 
+ */ +static struct workqueue_struct *cgroup_pidlist_destroy_wq; + +/* + * Protects cgroup_subsys->release_agent_path. Modifying it also requires + * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. + */ +static DEFINE_SPINLOCK(release_agent_path_lock); + +bool cgroup1_ssid_disabled(int ssid) +{ + return cgroup_no_v1_mask & (1 << ssid); +} + +/** + * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' + * @from: attach to all cgroups of a given task + * @tsk: the task to be attached + */ +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) +{ + struct cgroup_root *root; + int retval = 0; + + mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + for_each_root(root) { + struct cgroup *from_cgrp; + + if (root == &cgrp_dfl_root) + continue; + + spin_lock_irq(&css_set_lock); + from_cgrp = task_cgroup_from_root(from, root); + spin_unlock_irq(&css_set_lock); + + retval = cgroup_attach_task(from_cgrp, tsk, false); + if (retval) + break; + } + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_all); + +/** + * cgroup_transfer_tasks - move tasks from one cgroup to another + * @to: cgroup to which the tasks will be moved + * @from: cgroup in which the tasks currently reside + * + * Locking rules between cgroup_post_fork() and the migration path + * guarantee that, if a task is forking while being migrated, the new child + * is guaranteed to be either visible in the source cgroup after the + * parent's migration is complete or put into the target cgroup. No task + * can slip out of migration through forking. 
+ */ +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) +{ + DEFINE_CGROUP_MGCTX(mgctx); + struct cgrp_cset_link *link; + struct css_task_iter it; + struct task_struct *task; + int ret; + + if (cgroup_on_dfl(to)) + return -EINVAL; + + if (!cgroup_may_migrate_to(to)) + return -EBUSY; + + mutex_lock(&cgroup_mutex); + + percpu_down_write(&cgroup_threadgroup_rwsem); + + /* all tasks in @from are being moved, all csets are source */ + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &from->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, to, &mgctx); + spin_unlock_irq(&css_set_lock); + + ret = cgroup_migrate_prepare_dst(&mgctx); + if (ret) + goto out_err; + + /* + * Migrate tasks one-by-one until @from is empty. This fails iff + * ->can_attach() fails. + */ + do { + css_task_iter_start(&from->self, &it); + task = css_task_iter_next(&it); + if (task) + get_task_struct(task); + css_task_iter_end(&it); + + if (task) { + ret = cgroup_migrate(task, false, &mgctx); + if (!ret) + trace_cgroup_transfer_tasks(to, task, false); + put_task_struct(task); + } + } while (task && !ret); +out_err: + cgroup_migrate_finish(&mgctx); + percpu_up_write(&cgroup_threadgroup_rwsem); + mutex_unlock(&cgroup_mutex); + return ret; +} + +/* + * Stuff for reading the 'tasks'/'procs' files. + * + * Reading this file can return large amounts of data if a cgroup has + * *lots* of attached tasks. So it may need several calls to read(), + * but we cannot guarantee that the information we produce is correct + * unless we produce it entirely atomically. + * + */ + +/* which pidlist file are we talking about? */ +enum cgroup_filetype { + CGROUP_FILE_PROCS, + CGROUP_FILE_TASKS, +}; + +/* + * A pidlist is a list of pids that virtually represents the contents of one + * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, + * a pair (one each for procs, tasks) for each pid namespace that's relevant + * to the cgroup. 
+ */ +struct cgroup_pidlist { + /* + * used to find which pidlist is wanted. doesn't change as long as + * this particular list stays in the list. + */ + struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; + /* array of xids */ + pid_t *list; + /* how many elements the above list has */ + int length; + /* each of these stored in a list by its cgroup */ + struct list_head links; + /* pointer to the cgroup we belong to, for list removal purposes */ + struct cgroup *owner; + /* for delayed destruction */ + struct delayed_work destroy_dwork; +}; + +/* + * The following two functions "fix" the issue where there are more pids + * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. + * TODO: replace with a kernel-wide solution to this problem + */ +#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) +static void *pidlist_allocate(int count) +{ + if (PIDLIST_TOO_LARGE(count)) + return vmalloc(count * sizeof(pid_t)); + else + return kmalloc(count * sizeof(pid_t), GFP_KERNEL); +} + +static void pidlist_free(void *p) +{ + kvfree(p); +} + +/* + * Used to destroy all pidlists lingering waiting for destroy timer. None + * should be left afterwards. + */ +void cgroup1_pidlist_destroy_all(struct cgroup *cgrp) +{ + struct cgroup_pidlist *l, *tmp_l; + + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); + mutex_unlock(&cgrp->pidlist_mutex); + + flush_workqueue(cgroup_pidlist_destroy_wq); + BUG_ON(!list_empty(&cgrp->pidlists)); +} + +static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, + destroy_dwork); + struct cgroup_pidlist *tofree = NULL; + + mutex_lock(&l->owner->pidlist_mutex); + + /* + * Destroy iff we didn't get queued again. 
The state won't change + * as destroy_dwork can only be queued while locked. + */ + if (!delayed_work_pending(dwork)) { + list_del(&l->links); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + tofree = l; + } + + mutex_unlock(&l->owner->pidlist_mutex); + kfree(tofree); +} + +/* + * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries + * Returns the number of unique elements. + */ +static int pidlist_uniq(pid_t *list, int length) +{ + int src, dest = 1; + + /* + * we presume the 0th element is unique, so i starts at 1. trivial + * edge cases first; no work needs to be done for either + */ + if (length == 0 || length == 1) + return length; + /* src and dest walk down the list; dest counts unique elements */ + for (src = 1; src < length; src++) { + /* find next unique element */ + while (list[src] == list[src-1]) { + src++; + if (src == length) + goto after; + } + /* dest always points to where the next unique element goes */ + list[dest] = list[src]; + dest++; + } +after: + return dest; +} + +/* + * The two pid files - task and cgroup.procs - guaranteed that the result + * is sorted, which forced this whole pidlist fiasco. As pid order is + * different per namespace, each namespace needs differently sorted list, + * making it impossible to use, for example, single rbtree of member tasks + * sorted by task pointer. As pidlists can be fairly large, allocating one + * per open file is dangerous, so cgroup had to implement shared pool of + * pidlists keyed by cgroup and namespace. 
+ */ +static int cmppid(const void *a, const void *b) +{ + return *(pid_t *)a - *(pid_t *)b; +} + +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct pid_namespace *ns = task_active_pid_ns(current); + + lockdep_assert_held(&cgrp->pidlist_mutex); + + list_for_each_entry(l, &cgrp->pidlists, links) + if (l->key.type == type && l->key.ns == ns) + return l; + return NULL; +} + +/* + * find the appropriate pidlist for our purpose (given procs vs tasks) + * returns with the lock on that pidlist already held, and takes care + * of the use count, or returns NULL with no locks held if we're out of + * memory. + */ +static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + + lockdep_assert_held(&cgrp->pidlist_mutex); + + l = cgroup_pidlist_find(cgrp, type); + if (l) + return l; + + /* entry not found; create a new one */ + l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); + if (!l) + return l; + + INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); + l->key.type = type; + /* don't need task_nsproxy() if we're looking at ourself */ + l->key.ns = get_pid_ns(task_active_pid_ns(current)); + l->owner = cgrp; + list_add(&l->links, &cgrp->pidlists); + return l; +} + +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + * + * Return the number of tasks in the cgroup. The returned number can be + * higher than the actual number of tasks due to css_set references from + * namespace roots and temporary usages. 
+ */ +static int cgroup_task_count(const struct cgroup *cgrp) +{ + int count = 0; + struct cgrp_cset_link *link; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += atomic_read(&link->cset->refcount); + spin_unlock_irq(&css_set_lock); + return count; +} + +/* + * Load a cgroup's pidarray with either procs' tgids or tasks' pids + */ +static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, + struct cgroup_pidlist **lp) +{ + pid_t *array; + int length; + int pid, n = 0; /* used for populating the array */ + struct css_task_iter it; + struct task_struct *tsk; + struct cgroup_pidlist *l; + + lockdep_assert_held(&cgrp->pidlist_mutex); + + /* + * If cgroup gets more users after we read count, we won't have + * enough space - tough. This race is indistinguishable to the + * caller from the case that the additional cgroup users didn't + * show up until sometime later on. + */ + length = cgroup_task_count(cgrp); + array = pidlist_allocate(length); + if (!array) + return -ENOMEM; + /* now, populate the array */ + css_task_iter_start(&cgrp->self, &it); + while ((tsk = css_task_iter_next(&it))) { + if (unlikely(n == length)) + break; + /* get tgid or pid for procs or tasks file respectively */ + if (type == CGROUP_FILE_PROCS) + pid = task_tgid_vnr(tsk); + else + pid = task_pid_vnr(tsk); + if (pid > 0) /* make sure to only use valid results */ + array[n++] = pid; + } + css_task_iter_end(&it); + length = n; + /* now sort & (if procs) strip out duplicates */ + sort(array, length, sizeof(pid_t), cmppid, NULL); + if (type == CGROUP_FILE_PROCS) + length = pidlist_uniq(array, length); + + l = cgroup_pidlist_find_create(cgrp, type); + if (!l) { + pidlist_free(array); + return -ENOMEM; + } + + /* store array, freeing old if necessary */ + pidlist_free(l->list); + l->list = array; + l->length = length; + *lp = l; + return 0; +} + +/* + * seq_file methods for the tasks/procs files. 
The seq_file position is the + * next pid to display; the seq_file iterator is a pointer to the pid + * in the cgroup->l->list array. + */ + +static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) +{ + /* + * Initially we receive a position value that corresponds to + * one more than the last pid shown (or 0 on the first call or + * after a seek to the start). Use a binary-search to find the + * next pid to display, if any + */ + struct kernfs_open_file *of = s->private; + struct cgroup *cgrp = seq_css(s)->cgroup; + struct cgroup_pidlist *l; + enum cgroup_filetype type = seq_cft(s)->private; + int index = 0, pid = *pos; + int *iter, ret; + + mutex_lock(&cgrp->pidlist_mutex); + + /* + * !NULL @of->priv indicates that this isn't the first start() + * after open. If the matching pidlist is around, we can use that. + * Look for it. Note that @of->priv can't be used directly. It + * could already have been destroyed. + */ + if (of->priv) + of->priv = cgroup_pidlist_find(cgrp, type); + + /* + * Either this is the first start() after open or the matching + * pidlist has been destroyed inbetween. Create a new one. 
+ */ + if (!of->priv) { + ret = pidlist_array_load(cgrp, type, + (struct cgroup_pidlist **)&of->priv); + if (ret) + return ERR_PTR(ret); + } + l = of->priv; + + if (pid) { + int end = l->length; + + while (index < end) { + int mid = (index + end) / 2; + if (l->list[mid] == pid) { + index = mid; + break; + } else if (l->list[mid] <= pid) + index = mid + 1; + else + end = mid; + } + } + /* If we're off the end of the array, we're done */ + if (index >= l->length) + return NULL; + /* Update the abstract position to be the actual pid that we found */ + iter = l->list + index; + *pos = *iter; + return iter; +} + +static void cgroup_pidlist_stop(struct seq_file *s, void *v) +{ + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + + if (l) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, + CGROUP_PIDLIST_DESTROY_DELAY); + mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); +} + +static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + pid_t *p = v; + pid_t *end = l->list + l->length; + /* + * Advance to the next pid in the array. 
If this goes off the + * end, we're done + */ + p++; + if (p >= end) { + return NULL; + } else { + *pos = *p; + return p; + } +} + +static int cgroup_pidlist_show(struct seq_file *s, void *v) +{ + seq_printf(s, "%d\n", *(int *)v); + + return 0; +} + +static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return __cgroup_procs_write(of, buf, nbytes, off, false); +} + +static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + spin_lock(&release_agent_path_lock); + strlcpy(cgrp->root->release_agent_path, strstrip(buf), + sizeof(cgrp->root->release_agent_path)); + spin_unlock(&release_agent_path_lock); + cgroup_kn_unlock(of->kn); + return nbytes; +} + +static int cgroup_release_agent_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + spin_lock(&release_agent_path_lock); + seq_puts(seq, cgrp->root->release_agent_path); + spin_unlock(&release_agent_path_lock); + seq_putc(seq, '\n'); + return 0; +} + +static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) +{ + seq_puts(seq, "0\n"); + return 0; +} + +static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return notify_on_release(css->cgroup); +} + +static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (val) + set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); + else + clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); + return 0; +} + +static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); +} + +static int cgroup_clone_children_write(struct cgroup_subsys_state *css, + struct 
cftype *cft, u64 val) +{ + if (val) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + else + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + return 0; +} + +/* cgroup core interface files for the legacy hierarchies */ +struct cftype cgroup1_base_files[] = { + { + .name = "cgroup.procs", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, + .write = cgroup_procs_write, + }, + { + .name = "cgroup.clone_children", + .read_u64 = cgroup_clone_children_read, + .write_u64 = cgroup_clone_children_write, + }, + { + .name = "cgroup.sane_behavior", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_sane_behavior_show, + }, + { + .name = "tasks", + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_TASKS, + .write = cgroup_tasks_write, + }, + { + .name = "notify_on_release", + .read_u64 = cgroup_read_notify_on_release, + .write_u64 = cgroup_write_notify_on_release, + }, + { + .name = "release_agent", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_release_agent_show, + .write = cgroup_release_agent_write, + .max_write_len = PATH_MAX - 1, + }, + { } /* terminate */ +}; + +/* Display information about each subsystem and each hierarchy */ +static int proc_cgroupstats_show(struct seq_file *m, void *v) +{ + struct cgroup_subsys *ss; + int i; + + seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); + /* + * ideally we don't want subsystems moving around while we do this. + * cgroup_mutex is also necessary to guarantee an atomic snapshot of + * subsys/hierarchy state. 
+ */ + mutex_lock(&cgroup_mutex); + + for_each_subsys(ss, i) + seq_printf(m, "%s\t%d\t%d\t%d\n", + ss->legacy_name, ss->root->hierarchy_id, + atomic_read(&ss->root->nr_cgrps), + cgroup_ssid_enabled(i)); + + mutex_unlock(&cgroup_mutex); + return 0; +} + +static int cgroupstats_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_cgroupstats_show, NULL); +} + +const struct file_operations proc_cgroupstats_operations = { + .open = cgroupstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/** + * cgroupstats_build - build and fill cgroupstats + * @stats: cgroupstats to fill information into + * @dentry: A dentry entry belonging to the cgroup for which stats have + * been requested. + * + * Build and fill cgroupstats so that taskstats can export it to user + * space. + */ +int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) +{ + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct cgroup *cgrp; + struct css_task_iter it; + struct task_struct *tsk; + + /* it should be kernfs_node belonging to cgroupfs and is a directory */ + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) + return -EINVAL; + + mutex_lock(&cgroup_mutex); + + /* + * We aren't being called from kernfs and there's no guarantee on + * @kn->priv's validity. For this and css_tryget_online_from_dir(), + * @kn->priv is RCU safe. Let's do the RCU dancing. 
+ */ + rcu_read_lock(); + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); + if (!cgrp || cgroup_is_dead(cgrp)) { + rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); + return -ENOENT; + } + rcu_read_unlock(); + + css_task_iter_start(&cgrp->self, &it); + while ((tsk = css_task_iter_next(&it))) { + switch (tsk->state) { + case TASK_RUNNING: + stats->nr_running++; + break; + case TASK_INTERRUPTIBLE: + stats->nr_sleeping++; + break; + case TASK_UNINTERRUPTIBLE: + stats->nr_uninterruptible++; + break; + case TASK_STOPPED: + stats->nr_stopped++; + break; + default: + if (delayacct_is_task_waiting_on_io(tsk)) + stats->nr_io_wait++; + break; + } + } + css_task_iter_end(&it); + + mutex_unlock(&cgroup_mutex); + return 0; +} + +void cgroup1_check_for_release(struct cgroup *cgrp) +{ + if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && + !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) + schedule_work(&cgrp->release_agent_work); +} + +/* + * Notify userspace when a cgroup is released, by running the + * configured release agent with the name of the cgroup (path + * relative to the root of cgroup file system) as the argument. + * + * Most likely, this user command will try to rmdir this cgroup. + * + * This races with the possibility that some other task will be + * attached to this cgroup before it is removed, or that some other + * user task will 'mkdir' a child cgroup of this cgroup. That's ok. + * The presumed 'rmdir' will fail quietly if this cgroup is no longer + * unused, and this cgroup will be reprieved from its death sentence, + * to continue to serve a useful existence. Next time it's released, + * we will get notified again, if it still has 'notify_on_release' set. + * + * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which + * means only wait until the task is successfully execve()'d. 
The + * separate release agent task is forked by call_usermodehelper(), + * then control in this thread returns here, without waiting for the + * release agent task. We don't bother to wait because the caller of + * this routine has no use for the exit status of the release agent + * task, so no sense holding our caller up for that. + */ +void cgroup1_release_agent(struct work_struct *work) +{ + struct cgroup *cgrp = + container_of(work, struct cgroup, release_agent_work); + char *pathbuf = NULL, *agentbuf = NULL; + char *argv[3], *envp[3]; + int ret; + + mutex_lock(&cgroup_mutex); + + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); + agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); + if (!pathbuf || !agentbuf) + goto out; + + spin_lock_irq(&css_set_lock); + ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + spin_unlock_irq(&css_set_lock); + if (ret < 0 || ret >= PATH_MAX) + goto out; + + argv[0] = agentbuf; + argv[1] = pathbuf; + argv[2] = NULL; + + /* minimal command environment */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + mutex_unlock(&cgroup_mutex); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + goto out_free; +out: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(agentbuf); + kfree(pathbuf); +} + +/* + * cgroup_rename - Only allow simple rename of directories in place. + */ +static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) +{ + struct cgroup *cgrp = kn->priv; + int ret; + + if (kernfs_type(kn) != KERNFS_DIR) + return -ENOTDIR; + if (kn->parent != new_parent) + return -EIO; + + /* + * We're gonna grab cgroup_mutex which nests outside kernfs + * active_ref. kernfs_rename() doesn't require active_ref + * protection. Break them before grabbing cgroup_mutex. 
+ */ + kernfs_break_active_protection(new_parent); + kernfs_break_active_protection(kn); + + mutex_lock(&cgroup_mutex); + + ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) + trace_cgroup_rename(cgrp); + + mutex_unlock(&cgroup_mutex); + + kernfs_unbreak_active_protection(kn); + kernfs_unbreak_active_protection(new_parent); + return ret; +} + +static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) +{ + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_subsys *ss; + int ssid; + + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_show_option(seq, ss->legacy_name, NULL); + if (root->flags & CGRP_ROOT_NOPREFIX) + seq_puts(seq, ",noprefix"); + if (root->flags & CGRP_ROOT_XATTR) + seq_puts(seq, ",xattr"); + + spin_lock(&release_agent_path_lock); + if (strlen(root->release_agent_path)) + seq_show_option(seq, "release_agent", + root->release_agent_path); + spin_unlock(&release_agent_path_lock); + + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) + seq_puts(seq, ",clone_children"); + if (strlen(root->name)) + seq_show_option(seq, "name", root->name); + return 0; +} + +static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) +{ + char *token, *o = data; + bool all_ss = false, one_ss = false; + u16 mask = U16_MAX; + struct cgroup_subsys *ss; + int nr_opts = 0; + int i; + +#ifdef CONFIG_CPUSETS + mask = ~((u16)1 << cpuset_cgrp_id); +#endif + + memset(opts, 0, sizeof(*opts)); + + while ((token = strsep(&o, ",")) != NULL) { + nr_opts++; + + if (!*token) + return -EINVAL; + if (!strcmp(token, "none")) { + /* Explicitly have no subsystems */ + opts->none = true; + continue; + } + if (!strcmp(token, "all")) { + /* Mutually exclusive option 'all' + subsystem name */ + if (one_ss) + return -EINVAL; + all_ss = true; + continue; + } + if (!strcmp(token, "noprefix")) { + opts->flags |= CGRP_ROOT_NOPREFIX; + continue; + } + if (!strcmp(token, "clone_children")) { + 
opts->cpuset_clone_children = true; + continue; + } + if (!strcmp(token, "xattr")) { + opts->flags |= CGRP_ROOT_XATTR; + continue; + } + if (!strncmp(token, "release_agent=", 14)) { + /* Specifying two release agents is forbidden */ + if (opts->release_agent) + return -EINVAL; + opts->release_agent = + kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); + if (!opts->release_agent) + return -ENOMEM; + continue; + } + if (!strncmp(token, "name=", 5)) { + const char *name = token + 5; + /* Can't specify an empty name */ + if (!strlen(name)) + return -EINVAL; + /* Must match [\w.-]+ */ + for (i = 0; i < strlen(name); i++) { + char c = name[i]; + if (isalnum(c)) + continue; + if ((c == '.') || (c == '-') || (c == '_')) + continue; + return -EINVAL; + } + /* Specifying two names is forbidden */ + if (opts->name) + return -EINVAL; + opts->name = kstrndup(name, + MAX_CGROUP_ROOT_NAMELEN - 1, + GFP_KERNEL); + if (!opts->name) + return -ENOMEM; + + continue; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->legacy_name)) + continue; + if (!cgroup_ssid_enabled(i)) + continue; + if (cgroup1_ssid_disabled(i)) + continue; + + /* Mutually exclusive option 'all' + subsystem name */ + if (all_ss) + return -EINVAL; + opts->subsys_mask |= (1 << i); + one_ss = true; + + break; + } + if (i == CGROUP_SUBSYS_COUNT) + return -ENOENT; + } + + /* + * If the 'all' option was specified select all the subsystems, + * otherwise if 'none', 'name=' and a subsystem name options were + * not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) + opts->subsys_mask |= (1 << i); + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). 
+ */ + if (!opts->subsys_mask && !opts->name) + return -EINVAL; + + /* + * Option noprefix was introduced just for backward compatibility + * with the old cpuset, so we allow noprefix only if mounting just + * the cpuset subsystem. + */ + if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) + return -EINVAL; + + /* Can't specify "none" and some subsystems */ + if (opts->subsys_mask && opts->none) + return -EINVAL; + + return 0; +} + +static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) +{ + int ret = 0; + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_sb_opts opts; + u16 added_mask, removed_mask; + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + /* See what subsystems are wanted */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + if (opts.subsys_mask != root->subsys_mask || opts.release_agent) + pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", + task_tgid_nr(current), current->comm); + + added_mask = opts.subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~opts.subsys_mask; + + /* Don't allow flags or name to change at remount */ + if ((opts.flags ^ root->flags) || + (opts.name && strcmp(opts.name, root->name))) { + pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", + opts.flags, opts.name ?: "", root->flags, root->name); + ret = -EINVAL; + goto out_unlock; + } + + /* remounting is not allowed for populated hierarchies */ + if (!list_empty(&root->cgrp.self.children)) { + ret = -EBUSY; + goto out_unlock; + } + + ret = rebind_subsystems(root, added_mask); + if (ret) + goto out_unlock; + + WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); + + if (opts.release_agent) { + spin_lock(&release_agent_path_lock); + strcpy(root->release_agent_path, opts.release_agent); + spin_unlock(&release_agent_path_lock); + } + + trace_cgroup_remount(root); + + out_unlock: + kfree(opts.release_agent); + 
kfree(opts.name); + mutex_unlock(&cgroup_mutex); + return ret; +} + +struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { + .rename = cgroup1_rename, + .show_options = cgroup1_show_options, + .remount_fs = cgroup1_remount, + .mkdir = cgroup_mkdir, + .rmdir = cgroup_rmdir, + .show_path = cgroup_show_path, +}; + +struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, + void *data, unsigned long magic, + struct cgroup_namespace *ns) +{ + struct super_block *pinned_sb = NULL; + struct cgroup_sb_opts opts; + struct cgroup_root *root; + struct cgroup_subsys *ss; + struct dentry *dentry; + int i, ret; + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + /* First find the desired set of subsystems */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + /* + * Destruction of cgroup root is asynchronous, so subsystems may + * still be dying after the previous unmount. Let's drain the + * dying subsystems. We just need to ensure that the ones + * unmounted previously finish dying and don't care about new ones + * starting. Testing ref liveliness is good enough. + */ + for_each_subsys(ss, i) { + if (!(opts.subsys_mask & (1 << i)) || + ss->root == &cgrp_dfl_root) + continue; + + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + cgroup_put(&ss->root->cgrp); + } + + for_each_root(root) { + bool name_match = false; + + if (root == &cgrp_dfl_root) + continue; + + /* + * If we asked for a name then it must match. Also, if + * name matches but sybsys_mask doesn't, we should fail. + * Remember whether name matched. + */ + if (opts.name) { + if (strcmp(opts.name, root->name)) + continue; + name_match = true; + } + + /* + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match. 
+ */ + if ((opts.subsys_mask || opts.none) && + (opts.subsys_mask != root->subsys_mask)) { + if (!name_match) + continue; + ret = -EBUSY; + goto out_unlock; + } + + if (root->flags ^ opts.flags) + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); + + /* + * We want to reuse @root whose lifetime is governed by its + * ->cgrp. Let's check whether @root is alive and keep it + * that way. As cgroup_kill_sb() can happen anytime, we + * want to block it by pinning the sb so that @root doesn't + * get killed before mount is complete. + * + * With the sb pinned, tryget_live can reliably indicate + * whether @root can be reused. If it's being killed, + * drain it. We can use wait_queue for the wait but this + * path is super cold. Let's just sleep a bit and retry. + */ + pinned_sb = kernfs_pin_sb(root->kf_root, NULL); + if (IS_ERR(pinned_sb) || + !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + if (!IS_ERR_OR_NULL(pinned_sb)) + deactivate_super(pinned_sb); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + + ret = 0; + goto out_unlock; + } + + /* + * No such thing, create a new one. name= matching without subsys + * specification is allowed for already existing hierarchies but we + * can't create new one without subsys specification. + */ + if (!opts.subsys_mask && !opts.none) { + ret = -EINVAL; + goto out_unlock; + } + + /* Hierarchies may only be created in the initial cgroup namespace. 
*/ + if (ns != &init_cgroup_ns) { + ret = -EPERM; + goto out_unlock; + } + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) { + ret = -ENOMEM; + goto out_unlock; + } + + init_cgroup_root(root, &opts); + + ret = cgroup_setup_root(root, opts.subsys_mask); + if (ret) + cgroup_free_root(root); + +out_unlock: + mutex_unlock(&cgroup_mutex); +out_free: + kfree(opts.release_agent); + kfree(opts.name); + + if (ret) + return ERR_PTR(ret); + + dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, + CGROUP_SUPER_MAGIC, ns); + + /* + * If @pinned_sb, we're reusing an existing root and holding an + * extra ref on its sb. Mount is complete. Put the extra ref. + */ + if (pinned_sb) + deactivate_super(pinned_sb); + + return dentry; +} + +static int __init cgroup1_wq_init(void) +{ + /* + * Used to destroy pidlists and separate to serve as flush domain. + * Cap @max_active to 1 too. + */ + cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", + 0, 1); + BUG_ON(!cgroup_pidlist_destroy_wq); + return 0; +} +core_initcall(cgroup1_wq_init); + +static int __init cgroup_no_v1(char *str) +{ + struct cgroup_subsys *ss; + char *token; + int i; + + while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + if (!strcmp(token, "all")) { + cgroup_no_v1_mask = U16_MAX; + break; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + + cgroup_no_v1_mask |= 1 << i; + } + } + return 1; +} +__setup("cgroup_no_v1=", cgroup_no_v1); + + +#ifdef CONFIG_CGROUP_DEBUG +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_css_free(struct cgroup_subsys_state *css) +{ + kfree(css); +} + +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return cgroup_task_count(css->cgroup); +} + 
+static u64 current_css_set_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = atomic_read(&task_css_set(current)->refcount); + rcu_read_unlock(); + return count; +} + +static int current_css_set_cg_links_read(struct seq_file *seq, void *v) +{ + struct cgrp_cset_link *link; + struct css_set *cset; + char *name_buf; + + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; + + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + cset = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + cgroup_name(c, name_buf, NAME_MAX + 1); + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name_buf); + } + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + kfree(name_buf); + return 0; +} + +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct seq_file *seq, void *v) +{ + struct cgroup_subsys_state *css = seq_css(seq); + struct cgrp_cset_link *link; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { + struct css_set *cset = link->cset; + struct task_struct *task; + int count = 0; + + seq_printf(seq, "css_set %p\n", cset); + + list_for_each_entry(task, &cset->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + + list_for_each_entry(task, &cset->mg_tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + continue; + overflow: + seq_puts(seq, " ...\n"); + } + spin_unlock_irq(&css_set_lock); + return 0; +} + +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return 
(!cgroup_is_populated(css->cgroup) && + !css_has_online_children(&css->cgroup->self)); +} + +static struct cftype debug_files[] = { + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .seq_show = current_css_set_cg_links_read, + }, + + { + .name = "cgroup_css_links", + .seq_show = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, + + { } /* terminate */ +}; + +struct cgroup_subsys debug_cgrp_subsys = { + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, + .legacy_cftypes = debug_files, +}; +#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup.c b/kernel/cgroup/cgroup.c index 53bbca7c4859..e8f87bf9840c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -28,15 +28,13 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/cgroup.h> +#include "cgroup-internal.h" + #include <linux/cred.h> -#include <linux/ctype.h> #include <linux/errno.h> #include <linux/init_task.h> #include <linux/kernel.h> -#include <linux/list.h> #include <linux/magic.h> -#include <linux/mm.h> #include <linux/mutex.h> #include <linux/mount.h> #include <linux/pagemap.h> @@ -47,16 +45,9 @@ #include <linux/spinlock.h> #include <linux/percpu-rwsem.h> #include <linux/string.h> -#include <linux/sort.h> -#include <linux/kmod.h> -#include <linux/delayacct.h> -#include <linux/cgroupstats.h> #include <linux/hashtable.h> -#include <linux/pid_namespace.h> #include <linux/idr.h> -#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/kthread.h> -#include <linux/delay.h> #include <linux/atomic.h> #include <linux/cpuset.h> #include <linux/proc_ns.h> @@ -67,14 +58,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/cgroup.h> -/* - * pidlists linger the following 
amount before being destroyed. The goal - * is avoiding frequent destruction in the middle of consecutive read calls - * Expiring in the middle is a performance problem not a correctness one. - * 1 sec should be enough. - */ -#define CGROUP_PIDLIST_DESTROY_DELAY HZ - #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) @@ -88,14 +71,12 @@ * These locks are exported if CONFIG_PROVE_RCU so that accessors in * cgroup.h can use them for lockdep annotations. */ -#ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); + +#ifdef CONFIG_PROVE_RCU EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); -#else -static DEFINE_MUTEX(cgroup_mutex); -static DEFINE_SPINLOCK(css_set_lock); #endif /* @@ -110,12 +91,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); -/* - * Protects cgroup_subsys->release_agent_path. Modifying it also requires - * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. - */ -static DEFINE_SPINLOCK(release_agent_path_lock); - struct percpu_rw_semaphore cgroup_threadgroup_rwsem; #define cgroup_assert_mutex_or_rcu_locked() \ @@ -131,15 +106,9 @@ struct percpu_rw_semaphore cgroup_threadgroup_rwsem; */ static struct workqueue_struct *cgroup_destroy_wq; -/* - * pidlist destructions need to be flushed on cgroup destruction. Use a - * separate workqueue as flush domain. 
- */ -static struct workqueue_struct *cgroup_pidlist_destroy_wq; - /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, -static struct cgroup_subsys *cgroup_subsys[] = { +struct cgroup_subsys *cgroup_subsys[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS @@ -186,18 +155,14 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); */ static bool cgrp_dfl_visible; -/* Controllers blocked by the commandline in v1 */ -static u16 cgroup_no_v1_mask; - /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ -static unsigned long cgrp_dfl_implicit_ss_mask; +static u16 cgrp_dfl_implicit_ss_mask; /* The list of hierarchy roots */ - -static LIST_HEAD(cgroup_roots); +LIST_HEAD(cgroup_roots); static int cgroup_root_count; /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ @@ -213,13 +178,13 @@ static DEFINE_IDR(cgroup_hierarchy_idr); static u64 css_serial_nr_next = 1; /* - * These bitmask flags indicate whether tasks in the fork and exit paths have - * fork/exit handlers to call. This avoids us having to do extra work in the - * fork/exit path to check which subsystems have fork/exit callbacks. + * These bitmasks identify subsystems with specific features to avoid + * having to do iterative checks repeatedly. */ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; static u16 have_free_callback __read_mostly; +static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { @@ -230,15 +195,9 @@ struct cgroup_namespace init_cgroup_ns = { .root_cset = &init_css_set, }; -/* Ditto for the can_fork callback. 
*/ -static u16 have_canfork_callback __read_mostly; - static struct file_system_type cgroup2_fs_type; -static struct cftype cgroup_dfl_base_files[]; -static struct cftype cgroup_legacy_base_files[]; +static struct cftype cgroup_base_files[]; -static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); -static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_advance(struct css_task_iter *it); @@ -259,7 +218,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, * is fine for individual subsystems but unsuitable for cgroup core. This * is slower static_key_enabled() based test indexed by @ssid. */ -static bool cgroup_ssid_enabled(int ssid) +bool cgroup_ssid_enabled(int ssid) { if (CGROUP_SUBSYS_COUNT == 0) return false; @@ -267,11 +226,6 @@ static bool cgroup_ssid_enabled(int ssid) return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } -static bool cgroup_ssid_no_v1(int ssid) -{ - return cgroup_no_v1_mask & (1 << ssid); -} - /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest @@ -325,7 +279,7 @@ static bool cgroup_ssid_no_v1(int ssid) * * - debug: disallowed on the default hierarchy. 
*/ -static bool cgroup_on_dfl(const struct cgroup *cgrp) +bool cgroup_on_dfl(const struct cgroup *cgrp) { return cgrp->root == &cgrp_dfl_root; } @@ -481,12 +435,6 @@ out_unlock: return css; } -/* convenient tests for these bits */ -static inline bool cgroup_is_dead(const struct cgroup *cgrp) -{ - return !(cgrp->self.flags & CSS_ONLINE); -} - static void cgroup_get(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); @@ -518,11 +466,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) } EXPORT_SYMBOL_GPL(of_css); -static int notify_on_release(const struct cgroup *cgrp) -{ - return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); -} - /** * for_each_css - iterate all css's of a cgroup * @css: the iteration cursor @@ -553,15 +496,6 @@ static int notify_on_release(const struct cgroup *cgrp) else /** - * for_each_subsys - iterate all enabled cgroup subsystems - * @ss: the iteration cursor - * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - */ -#define for_each_subsys(ss, ssid) \ - for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ - (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) - -/** * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end @@ -585,10 +519,6 @@ static int notify_on_release(const struct cgroup *cgrp) } \ } while (false) -/* iterate across the hierarchies */ -#define for_each_root(root) \ - list_for_each_entry((root), &cgroup_roots, root_list) - /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ @@ -615,29 +545,6 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else -static void cgroup_release_agent(struct work_struct *work); -static void check_for_release(struct cgroup *cgrp); - -/* - * A cgroup can be associated with multiple css_sets as different tasks 
may - * belong to different cgroups on different hierarchies. In the other - * direction, a css_set is naturally associated with multiple cgroups. - * This M:N relationship is represented by the following link structure - * which exists for each association and allows traversing the associations - * from both sides. - */ -struct cgrp_cset_link { - /* the cgroup and css_set this link associates */ - struct cgroup *cgrp; - struct css_set *cset; - - /* list of cgrp_cset_links anchored at cgrp->cset_links */ - struct list_head cset_link; - - /* list of cgrp_cset_links anchored at css_set->cgrp_links */ - struct list_head cgrp_link; -}; - /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state @@ -647,12 +554,12 @@ struct cgrp_cset_link { */ struct css_set init_css_set = { .refcount = ATOMIC_INIT(1), - .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), + .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), + .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), - .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), }; static int css_set_count = 1; /* 1 for init_css_set */ @@ -699,7 +606,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (!trigger) break; - check_for_release(cgrp); + cgroup1_check_for_release(cgrp); cgroup_file_notify(&cgrp->events_file); cgrp = cgroup_parent(cgrp); @@ -808,7 +715,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -static void put_css_set_locked(struct css_set *cset) +void put_css_set_locked(struct css_set *cset) { struct cgrp_cset_link *link, *tmp_link; struct cgroup_subsys *ss; @@ -838,31 +745,6 @@ static void put_css_set_locked(struct css_set *cset) kfree_rcu(cset, 
rcu_head); } -static void put_css_set(struct css_set *cset) -{ - unsigned long flags; - - /* - * Ensure that the refcount doesn't hit zero while any readers - * can see it. Similar to atomic_dec_and_lock(), but for an - * rwlock - */ - if (atomic_add_unless(&cset->refcount, -1, 1)) - return; - - spin_lock_irqsave(&css_set_lock, flags); - put_css_set_locked(cset); - spin_unlock_irqrestore(&css_set_lock, flags); -} - -/* - * refcounted get/put for css_set objects - */ -static inline void get_css_set(struct css_set *cset) -{ - atomic_inc(&cset->refcount); -} - /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested @@ -1095,13 +977,13 @@ static struct css_set *find_css_set(struct css_set *old_cset, } atomic_set(&cset->refcount, 1); - INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); - INIT_LIST_HEAD(&cset->mg_preload_node); - INIT_LIST_HEAD(&cset->mg_node); INIT_LIST_HEAD(&cset->task_iters); INIT_HLIST_NODE(&cset->hlist); + INIT_LIST_HEAD(&cset->cgrp_links); + INIT_LIST_HEAD(&cset->mg_preload_node); + INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ @@ -1138,7 +1020,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } -static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) +struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kf_root->kn->priv; @@ -1166,7 +1048,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } -static void cgroup_free_root(struct cgroup_root *root) +void cgroup_free_root(struct cgroup_root *root) { if (root) { idr_destroy(&root->cgroup_idr); @@ -1283,8 +1165,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, * Return the cgroup for "task" from the given hierarchy. 
Must be * called with cgroup_mutex and css_set_lock held. */ -static struct cgroup *task_cgroup_from_root(struct task_struct *task, - struct cgroup_root *root) +struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root) { /* * No need to lock the task - since we hold cgroup_mutex the @@ -1321,7 +1203,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, */ static struct kernfs_syscall_ops cgroup_kf_syscall_ops; -static const struct file_operations proc_cgroupstats_operations; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) @@ -1415,7 +1296,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) * inaccessible any time. If the caller intends to continue to access the * cgroup, it should pin it before invoking this function. */ -static void cgroup_kn_unlock(struct kernfs_node *kn) +void cgroup_kn_unlock(struct kernfs_node *kn) { struct cgroup *cgrp; @@ -1447,8 +1328,7 @@ static void cgroup_kn_unlock(struct kernfs_node *kn) * locking under kernfs active protection and allows all kernfs operations * including self-removal. 
*/ -static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, - bool drain_offline) +struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) { struct cgroup *cgrp; @@ -1532,9 +1412,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (!css->ss) { if (cgroup_on_dfl(cgrp)) - cfts = cgroup_dfl_base_files; + cfts = cgroup_base_files; else - cfts = cgroup_legacy_base_files; + cfts = cgroup1_base_files; return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); } @@ -1559,7 +1439,7 @@ err: return ret; } -static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) +int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; @@ -1629,8 +1509,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) return 0; } -static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, - struct kernfs_root *kf_root) +int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root) { int len = 0; char *buf = NULL; @@ -1656,237 +1536,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } -static int cgroup_show_options(struct seq_file *seq, - struct kernfs_root *kf_root) -{ - struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_subsys *ss; - int ssid; - - if (root != &cgrp_dfl_root) - for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) - seq_show_option(seq, ss->legacy_name, NULL); - if (root->flags & CGRP_ROOT_NOPREFIX) - seq_puts(seq, ",noprefix"); - if (root->flags & CGRP_ROOT_XATTR) - seq_puts(seq, ",xattr"); - - spin_lock(&release_agent_path_lock); - if (strlen(root->release_agent_path)) - seq_show_option(seq, "release_agent", - root->release_agent_path); - spin_unlock(&release_agent_path_lock); - - if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) - seq_puts(seq, ",clone_children"); - if 
(strlen(root->name)) - seq_show_option(seq, "name", root->name); - return 0; -} - -struct cgroup_sb_opts { - u16 subsys_mask; - unsigned int flags; - char *release_agent; - bool cpuset_clone_children; - char *name; - /* User explicitly requested empty subsystem */ - bool none; -}; - -static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) -{ - char *token, *o = data; - bool all_ss = false, one_ss = false; - u16 mask = U16_MAX; - struct cgroup_subsys *ss; - int nr_opts = 0; - int i; - -#ifdef CONFIG_CPUSETS - mask = ~((u16)1 << cpuset_cgrp_id); -#endif - - memset(opts, 0, sizeof(*opts)); - - while ((token = strsep(&o, ",")) != NULL) { - nr_opts++; - - if (!*token) - return -EINVAL; - if (!strcmp(token, "none")) { - /* Explicitly have no subsystems */ - opts->none = true; - continue; - } - if (!strcmp(token, "all")) { - /* Mutually exclusive option 'all' + subsystem name */ - if (one_ss) - return -EINVAL; - all_ss = true; - continue; - } - if (!strcmp(token, "noprefix")) { - opts->flags |= CGRP_ROOT_NOPREFIX; - continue; - } - if (!strcmp(token, "clone_children")) { - opts->cpuset_clone_children = true; - continue; - } - if (!strcmp(token, "xattr")) { - opts->flags |= CGRP_ROOT_XATTR; - continue; - } - if (!strncmp(token, "release_agent=", 14)) { - /* Specifying two release agents is forbidden */ - if (opts->release_agent) - return -EINVAL; - opts->release_agent = - kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); - if (!opts->release_agent) - return -ENOMEM; - continue; - } - if (!strncmp(token, "name=", 5)) { - const char *name = token + 5; - /* Can't specify an empty name */ - if (!strlen(name)) - return -EINVAL; - /* Must match [\w.-]+ */ - for (i = 0; i < strlen(name); i++) { - char c = name[i]; - if (isalnum(c)) - continue; - if ((c == '.') || (c == '-') || (c == '_')) - continue; - return -EINVAL; - } - /* Specifying two names is forbidden */ - if (opts->name) - return -EINVAL; - opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN - 1, - 
GFP_KERNEL); - if (!opts->name) - return -ENOMEM; - - continue; - } - - for_each_subsys(ss, i) { - if (strcmp(token, ss->legacy_name)) - continue; - if (!cgroup_ssid_enabled(i)) - continue; - if (cgroup_ssid_no_v1(i)) - continue; - - /* Mutually exclusive option 'all' + subsystem name */ - if (all_ss) - return -EINVAL; - opts->subsys_mask |= (1 << i); - one_ss = true; - - break; - } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; - } - - /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options were - * not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) - opts->subsys_mask |= (1 << i); - - /* - * We either have to specify by name or by subsystems. (So all - * empty hierarchies must have a name). - */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; - - /* - * Option noprefix was introduced just for backward compatibility - * with the old cpuset, so we allow noprefix only if mounting just - * the cpuset subsystem. 
- */ - if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) - return -EINVAL; - - /* Can't specify "none" and some subsystems */ - if (opts->subsys_mask && opts->none) - return -EINVAL; - - return 0; -} - static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { - int ret = 0; - struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_sb_opts opts; - u16 added_mask, removed_mask; - - if (root == &cgrp_dfl_root) { - pr_err("remount is not allowed\n"); - return -EINVAL; - } - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - - /* See what subsystems are wanted */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - if (opts.subsys_mask != root->subsys_mask || opts.release_agent) - pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", - task_tgid_nr(current), current->comm); - - added_mask = opts.subsys_mask & ~root->subsys_mask; - removed_mask = root->subsys_mask & ~opts.subsys_mask; - - /* Don't allow flags or name to change at remount */ - if ((opts.flags ^ root->flags) || - (opts.name && strcmp(opts.name, root->name))) { - pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags, opts.name ?: "", root->flags, root->name); - ret = -EINVAL; - goto out_unlock; - } - - /* remounting is not allowed for populated hierarchies */ - if (!list_empty(&root->cgrp.self.children)) { - ret = -EBUSY; - goto out_unlock; - } - - ret = rebind_subsystems(root, added_mask); - if (ret) - goto out_unlock; - - WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); - - if (opts.release_agent) { - spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, opts.release_agent); - spin_unlock(&release_agent_path_lock); - } - - trace_cgroup_remount(root); - - out_unlock: - kfree(opts.release_agent); - kfree(opts.name); - mutex_unlock(&cgroup_mutex); - return ret; + pr_err("remount is not allowed\n"); + return -EINVAL; } /* @@ -1964,11 +1617,10 @@ 
static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); init_waitqueue_head(&cgrp->offline_waitq); - INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); + INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } -static void init_cgroup_root(struct cgroup_root *root, - struct cgroup_sb_opts *opts) +void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) { struct cgroup *cgrp = &root->cgrp; @@ -1987,10 +1639,11 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; + struct kernfs_syscall_ops *kf_sops; struct css_set *cset; int i, ret; @@ -2022,7 +1675,10 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto cancel_ref; - root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, + kf_sops = root == &cgrp_dfl_root ? + &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; + + root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED, root_cgrp); if (IS_ERR(root->kf_root)) { @@ -2080,182 +1736,18 @@ out: return ret; } -static struct dentry *cgroup_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) +struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, + struct cgroup_root *root, unsigned long magic, + struct cgroup_namespace *ns) { - bool is_v2 = fs_type == &cgroup2_fs_type; - struct super_block *pinned_sb = NULL; - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; - struct cgroup_subsys *ss; - struct cgroup_root *root; - struct cgroup_sb_opts opts; struct dentry *dentry; - int ret; - int i; bool new_sb; - get_cgroup_ns(ns); - - /* Check if the caller has permission to mount. 
*/ - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { - put_cgroup_ns(ns); - return ERR_PTR(-EPERM); - } - - /* - * The first time anyone tries to mount a cgroup, enable the list - * linking each css_set to its tasks and fix up all existing tasks. - */ - if (!use_task_css_set_links) - cgroup_enable_task_cg_lists(); - - if (is_v2) { - if (data) { - pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); - put_cgroup_ns(ns); - return ERR_PTR(-EINVAL); - } - cgrp_dfl_visible = true; - root = &cgrp_dfl_root; - cgroup_get(&root->cgrp); - goto out_mount; - } - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - - /* First find the desired set of subsystems */ - ret = parse_cgroupfs_options(data, &opts); - if (ret) - goto out_unlock; - - /* - * Destruction of cgroup root is asynchronous, so subsystems may - * still be dying after the previous unmount. Let's drain the - * dying subsystems. We just need to ensure that the ones - * unmounted previously finish dying and don't care about new ones - * starting. Testing ref liveliness is good enough. - */ - for_each_subsys(ss, i) { - if (!(opts.subsys_mask & (1 << i)) || - ss->root == &cgrp_dfl_root) - continue; - - if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - cgroup_put(&ss->root->cgrp); - } - - for_each_root(root) { - bool name_match = false; - - if (root == &cgrp_dfl_root) - continue; - - /* - * If we asked for a name then it must match. Also, if - * name matches but sybsys_mask doesn't, we should fail. - * Remember whether name matched. - */ - if (opts.name) { - if (strcmp(opts.name, root->name)) - continue; - name_match = true; - } - - /* - * If we asked for subsystems (or explicitly for no - * subsystems) then they must match. 
- */ - if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->subsys_mask)) { - if (!name_match) - continue; - ret = -EBUSY; - goto out_unlock; - } - - if (root->flags ^ opts.flags) - pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - - /* - * We want to reuse @root whose lifetime is governed by its - * ->cgrp. Let's check whether @root is alive and keep it - * that way. As cgroup_kill_sb() can happen anytime, we - * want to block it by pinning the sb so that @root doesn't - * get killed before mount is complete. - * - * With the sb pinned, tryget_live can reliably indicate - * whether @root can be reused. If it's being killed, - * drain it. We can use wait_queue for the wait but this - * path is super cold. Let's just sleep a bit and retry. - */ - pinned_sb = kernfs_pin_sb(root->kf_root, NULL); - if (IS_ERR(pinned_sb) || - !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - if (!IS_ERR_OR_NULL(pinned_sb)) - deactivate_super(pinned_sb); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - - ret = 0; - goto out_unlock; - } + dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb); /* - * No such thing, create a new one. name= matching without subsys - * specification is allowed for already existing hierarchies but we - * can't create new one without subsys specification. - */ - if (!opts.subsys_mask && !opts.none) { - ret = -EINVAL; - goto out_unlock; - } - - /* Hierarchies may only be created in the initial cgroup namespace. 
*/ - if (ns != &init_cgroup_ns) { - ret = -EPERM; - goto out_unlock; - } - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - ret = -ENOMEM; - goto out_unlock; - } - - init_cgroup_root(root, &opts); - - ret = cgroup_setup_root(root, opts.subsys_mask); - if (ret) - cgroup_free_root(root); - -out_unlock: - mutex_unlock(&cgroup_mutex); -out_free: - kfree(opts.release_agent); - kfree(opts.name); - - if (ret) { - put_cgroup_ns(ns); - return ERR_PTR(ret); - } -out_mount: - dentry = kernfs_mount(fs_type, flags, root->kf_root, - is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, - &new_sb); - - /* - * In non-init cgroup namespace, instead of root cgroup's - * dentry, we return the dentry corresponding to the - * cgroupns->root_cgrp. + * In non-init cgroup namespace, instead of root cgroup's dentry, + * we return the dentry corresponding to the cgroupns->root_cgrp. */ if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { struct dentry *nsdentry; @@ -2277,13 +1769,45 @@ out_mount: if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); + return dentry; +} + +static struct dentry *cgroup_mount(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data) +{ + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct dentry *dentry; + + get_cgroup_ns(ns); + + /* Check if the caller has permission to mount. */ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { + put_cgroup_ns(ns); + return ERR_PTR(-EPERM); + } + /* - * If @pinned_sb, we're reusing an existing root and holding an - * extra ref on its sb. Mount is complete. Put the extra ref. + * The first time anyone tries to mount a cgroup, enable the list + * linking each css_set to its tasks and fix up all existing tasks. 
*/ - if (pinned_sb) { - WARN_ON(new_sb); - deactivate_super(pinned_sb); + if (!use_task_css_set_links) + cgroup_enable_task_cg_lists(); + + if (fs_type == &cgroup2_fs_type) { + if (data) { + pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + put_cgroup_ns(ns); + return ERR_PTR(-EINVAL); + } + cgrp_dfl_visible = true; + cgroup_get(&cgrp_dfl_root.cgrp); + + dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, + CGROUP2_SUPER_MAGIC, ns); + } else { + dentry = cgroup1_mount(&cgroup_fs_type, flags, data, + CGROUP_SUPER_MAGIC, ns); } put_cgroup_ns(ns); @@ -2311,7 +1835,7 @@ static void cgroup_kill_sb(struct super_block *sb) kernfs_kill_sb(sb); } -static struct file_system_type cgroup_fs_type = { +struct file_system_type cgroup_fs_type = { .name = "cgroup", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, @@ -2325,8 +1849,8 @@ static struct file_system_type cgroup2_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); @@ -2389,49 +1913,18 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) } EXPORT_SYMBOL_GPL(task_cgroup_path); -/* used to track tasks and other necessary states during migration */ -struct cgroup_taskset { - /* the src and dst cset list running through cset->mg_node */ - struct list_head src_csets; - struct list_head dst_csets; - - /* the subsys currently being processed */ - int ssid; - - /* - * Fields for cgroup_taskset_*() iteration. - * - * Before migration is committed, the target migration tasks are on - * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of - * the csets on ->dst_csets. ->csets point to either ->src_csets - * or ->dst_csets depending on whether migration is committed. 
- * - * ->cur_csets and ->cur_task point to the current task position - * during iteration. - */ - struct list_head *csets; - struct css_set *cur_cset; - struct task_struct *cur_task; -}; - -#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \ - .src_csets = LIST_HEAD_INIT(tset.src_csets), \ - .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ - .csets = &tset.src_csets, \ -} - /** - * cgroup_taskset_add - try to add a migration target task to a taskset + * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task - * @tset: target taskset + * @mgctx: target migration context * - * Add @task, which is a migration target, to @tset. This function becomes - * noop if @task doesn't need to be migrated. @task's css_set should have - * been added as a migration source and @task->cg_list will be moved from - * the css_set's tasks list to mg_tasks one. + * Add @task, which is a migration target, to @mgctx->tset. This function + * becomes noop if @task doesn't need to be migrated. @task's css_set + * should have been added as a migration source and @task->cg_list will be + * moved from the css_set's tasks list to mg_tasks one. 
*/ -static void cgroup_taskset_add(struct task_struct *task, - struct cgroup_taskset *tset) +static void cgroup_migrate_add_task(struct task_struct *task, + struct cgroup_mgctx *mgctx) { struct css_set *cset; @@ -2451,10 +1944,11 @@ static void cgroup_taskset_add(struct task_struct *task, list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) - list_add_tail(&cset->mg_node, &tset->src_csets); + list_add_tail(&cset->mg_node, + &mgctx->tset.src_csets); if (list_empty(&cset->mg_dst_cset->mg_node)) - list_move_tail(&cset->mg_dst_cset->mg_node, - &tset->dst_csets); + list_add_tail(&cset->mg_dst_cset->mg_node, + &mgctx->tset.dst_csets); } /** @@ -2521,17 +2015,16 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, /** * cgroup_taskset_migrate - migrate a taskset - * @tset: taget taskset - * @root: cgroup root the migration is taking place on + * @mgctx: migration context * - * Migrate tasks in @tset as setup by migration preparation functions. + * Migrate tasks in @mgctx as setup by migration preparation functions. * This function fails iff one of the ->can_attach callbacks fails and - * guarantees that either all or none of the tasks in @tset are migrated. - * @tset is consumed regardless of success. + * guarantees that either all or none of the tasks in @mgctx are migrated. + * @mgctx is consumed regardless of success. 
*/ -static int cgroup_taskset_migrate(struct cgroup_taskset *tset, - struct cgroup_root *root) +static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) { + struct cgroup_taskset *tset = &mgctx->tset; struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; @@ -2542,7 +2035,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, return 0; /* check that we can legitimately attach to the cgroup */ - do_each_subsys_mask(ss, ssid, root->subsys_mask) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->can_attach) { tset->ssid = ssid; ret = ss->can_attach(tset); @@ -2578,7 +2071,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, */ tset->csets = &tset->dst_csets; - do_each_subsys_mask(ss, ssid, root->subsys_mask) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->attach) { tset->ssid = ssid; ss->attach(tset); @@ -2589,7 +2082,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, goto out_release_tset; out_cancel_attach: - do_each_subsys_mask(ss, ssid, root->subsys_mask) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ssid == failed_ssid) break; if (ss->cancel_attach) { @@ -2616,7 +2109,7 @@ out_release_tset: * zero for migration destination cgroups with tasks so that child cgroups * don't compete against tasks. */ -static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) +bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) { return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || !dst_cgrp->subtree_control; @@ -2624,25 +2117,31 @@ static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) /** * cgroup_migrate_finish - cleanup after attach - * @preloaded_csets: list of preloaded css_sets + * @mgctx: migration context * * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See * those functions for details. 
*/ -static void cgroup_migrate_finish(struct list_head *preloaded_csets) +void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { + LIST_HEAD(preloaded); struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { + + list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded); + list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded); + + list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); } + spin_unlock_irq(&css_set_lock); } @@ -2650,10 +2149,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) * cgroup_migrate_add_src - add a migration source css_set * @src_cset: the source css_set to add * @dst_cgrp: the destination cgroup - * @preloaded_csets: list of preloaded css_sets + * @mgctx: migration context * * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin - * @src_cset and add it to @preloaded_csets, which should later be cleaned + * @src_cset and add it to @mgctx->src_csets, which should later be cleaned * up by cgroup_migrate_finish(). * * This function may be called without holding cgroup_threadgroup_rwsem @@ -2662,9 +2161,9 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) * into play and the preloaded css_sets are guaranteed to cover all * migrations. 
*/ -static void cgroup_migrate_add_src(struct css_set *src_cset, - struct cgroup *dst_cgrp, - struct list_head *preloaded_csets) +void cgroup_migrate_add_src(struct css_set *src_cset, + struct cgroup *dst_cgrp, + struct cgroup_mgctx *mgctx) { struct cgroup *src_cgrp; @@ -2692,33 +2191,35 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); - list_add(&src_cset->mg_preload_node, preloaded_csets); + list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @preloaded_csets: list of preloaded source css_sets + * @mgctx: migration context * * Tasks are about to be moved and all the source css_sets have been - * preloaded to @preloaded_csets. This function looks up and pins all - * destination css_sets, links each to its source, and append them to - * @preloaded_csets. + * preloaded to @mgctx->preloaded_src_csets. This function looks up and + * pins all destination css_sets, links each to its source, and append them + * to @mgctx->preloaded_dst_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on - * @preloaded_csets. + * @mgctx. 
*/ -static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) +int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) { - LIST_HEAD(csets); struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ - list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { + list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, + mg_preload_node) { struct css_set *dst_cset; + struct cgroup_subsys *ss; + int ssid; dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) @@ -2743,15 +2244,19 @@ static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) src_cset->mg_dst_cset = dst_cset; if (list_empty(&dst_cset->mg_preload_node)) - list_add(&dst_cset->mg_preload_node, &csets); + list_add_tail(&dst_cset->mg_preload_node, + &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); + + for_each_subsys(ss, ssid) + if (src_cset->subsys[ssid] != dst_cset->subsys[ssid]) + mgctx->ss_mask |= 1 << ssid; } - list_splice_tail(&csets, preloaded_csets); return 0; err: - cgroup_migrate_finish(&csets); + cgroup_migrate_finish(mgctx); return -ENOMEM; } @@ -2759,7 +2264,7 @@ err: * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task - * @root: cgroup root migration is taking place on + * @mgctx: migration context * * Migrate a process or task denoted by @leader. If migrating a process, * the caller must be holding cgroup_threadgroup_rwsem. The caller is also @@ -2773,10 +2278,9 @@ err: * decided for all targets by invoking group_migrate_prepare_dst() before * actually starting migrating. 
*/ -static int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup_root *root) +int cgroup_migrate(struct task_struct *leader, bool threadgroup, + struct cgroup_mgctx *mgctx) { - struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct task_struct *task; /* @@ -2788,14 +2292,14 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, rcu_read_lock(); task = leader; do { - cgroup_taskset_add(task, &tset); + cgroup_migrate_add_task(task, mgctx); if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); spin_unlock_irq(&css_set_lock); - return cgroup_taskset_migrate(&tset, root); + return cgroup_migrate_execute(mgctx); } /** @@ -2806,10 +2310,10 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, * * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. */ -static int cgroup_attach_task(struct cgroup *dst_cgrp, - struct task_struct *leader, bool threadgroup) +int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, + bool threadgroup) { - LIST_HEAD(preloaded_csets); + DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; int ret; @@ -2821,8 +2325,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, rcu_read_lock(); task = leader; do { - cgroup_migrate_add_src(task_css_set(task), dst_cgrp, - &preloaded_csets); + cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); @@ -2830,11 +2333,11 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ - ret = cgroup_migrate_prepare_dst(&preloaded_csets); + ret = cgroup_migrate_prepare_dst(&mgctx); if (!ret) - ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); + ret = cgroup_migrate(leader, threadgroup, &mgctx); - cgroup_migrate_finish(&preloaded_csets); + cgroup_migrate_finish(&mgctx); if (!ret) trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); @@ -2846,20 
+2349,9 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *dst_cgrp, struct kernfs_open_file *of) { - const struct cred *cred = current_cred(); - const struct cred *tcred = get_task_cred(task); int ret = 0; - /* - * even if we're attaching all tasks in the thread group, we only - * need to check permissions on one of them. - */ - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) - ret = -EACCES; - - if (!ret && cgroup_on_dfl(dst_cgrp)) { + if (cgroup_on_dfl(dst_cgrp)) { struct super_block *sb = of->file->f_path.dentry->d_sb; struct cgroup *cgrp; struct inode *inode; @@ -2877,9 +2369,21 @@ static int cgroup_procs_write_permission(struct task_struct *task, ret = inode_permission(inode, MAY_WRITE); iput(inode); } + } else { + const struct cred *cred = current_cred(); + const struct cred *tcred = get_task_cred(task); + + /* + * even if we're attaching all tasks in the thread group, + * we only need to check permissions on one of them. + */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + ret = -EACCES; + put_cred(tcred); } - put_cred(tcred); return ret; } @@ -2888,8 +2392,8 @@ static int cgroup_procs_write_permission(struct task_struct *task, * function to attach either it or all tasks in its threadgroup. Will lock * cgroup_mutex and threadgroup. 
*/ -static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off, bool threadgroup) +ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; struct cgroup_subsys *ss; @@ -2950,86 +2454,12 @@ out_unlock_threadgroup: return ret ?: nbytes; } -/** - * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' - * @from: attach to all cgroups of a given task - * @tsk: the task to be attached - */ -int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) -{ - struct cgroup_root *root; - int retval = 0; - - mutex_lock(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); - for_each_root(root) { - struct cgroup *from_cgrp; - - if (root == &cgrp_dfl_root) - continue; - - spin_lock_irq(&css_set_lock); - from_cgrp = task_cgroup_from_root(from, root); - spin_unlock_irq(&css_set_lock); - - retval = cgroup_attach_task(from_cgrp, tsk, false); - if (retval) - break; - } - percpu_up_write(&cgroup_threadgroup_rwsem); - mutex_unlock(&cgroup_mutex); - - return retval; -} -EXPORT_SYMBOL_GPL(cgroup_attach_task_all); - -static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - return __cgroup_procs_write(of, buf, nbytes, off, false); -} - -static ssize_t cgroup_procs_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) +ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off) { return __cgroup_procs_write(of, buf, nbytes, off, true); } -static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct cgroup *cgrp; - - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - - cgrp = cgroup_kn_lock_live(of->kn, false); - if (!cgrp) - return -ENODEV; - spin_lock(&release_agent_path_lock); - strlcpy(cgrp->root->release_agent_path, 
strstrip(buf), - sizeof(cgrp->root->release_agent_path)); - spin_unlock(&release_agent_path_lock); - cgroup_kn_unlock(of->kn); - return nbytes; -} - -static int cgroup_release_agent_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - spin_lock(&release_agent_path_lock); - seq_puts(seq, cgrp->root->release_agent_path); - spin_unlock(&release_agent_path_lock); - seq_putc(seq, '\n'); - return 0; -} - -static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) -{ - seq_puts(seq, "0\n"); - return 0; -} - static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; @@ -3075,8 +2505,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { - LIST_HEAD(preloaded_csets); - struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); + DEFINE_CGROUP_MGCTX(mgctx); struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; @@ -3092,33 +2521,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgrp_cset_link *link; list_for_each_entry(link, &dsct->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, dsct, - &preloaded_csets); + cgroup_migrate_add_src(link->cset, dsct, &mgctx); } spin_unlock_irq(&css_set_lock); /* NULL dst indicates self on default hierarchy */ - ret = cgroup_migrate_prepare_dst(&preloaded_csets); + ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); - list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { + list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) { struct task_struct *task, *ntask; - /* src_csets precede dst_csets, break on the first dst_cset */ - if (!src_cset->mg_src_cgrp) - break; - /* all tasks in src_csets need to be migrated */ list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) - cgroup_taskset_add(task, &tset); + cgroup_migrate_add_task(task, &mgctx); } 
spin_unlock_irq(&css_set_lock); - ret = cgroup_taskset_migrate(&tset, cgrp->root); + ret = cgroup_migrate_execute(&mgctx); out_finish: - cgroup_migrate_finish(&preloaded_csets); + cgroup_migrate_finish(&mgctx); percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } @@ -3131,7 +2555,7 @@ out_finish: * controller while the previous css is still around. This function grabs * cgroup_mutex and drains the previous css instances of @cgrp's subtree. */ -static void cgroup_lock_and_drain_offline(struct cgroup *cgrp) +void cgroup_lock_and_drain_offline(struct cgroup *cgrp) __acquires(&cgroup_mutex) { struct cgroup *dsct; @@ -3503,6 +2927,23 @@ static int cgroup_events_show(struct seq_file *seq, void *v) return 0; } +static int cgroup_file_open(struct kernfs_open_file *of) +{ + struct cftype *cft = of->kn->priv; + + if (cft->open) + return cft->open(of); + return 0; +} + +static void cgroup_file_release(struct kernfs_open_file *of) +{ + struct cftype *cft = of->kn->priv; + + if (cft->release) + cft->release(of); +} + static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -3553,7 +2994,8 @@ static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { - seq_cft(seq)->seq_stop(seq, v); + if (seq_cft(seq)->seq_stop) + seq_cft(seq)->seq_stop(seq, v); } static int cgroup_seqfile_show(struct seq_file *m, void *arg) @@ -3575,12 +3017,16 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) static struct kernfs_ops cgroup_kf_single_ops = { .atomic_write_len = PAGE_SIZE, + .open = cgroup_file_open, + .release = cgroup_file_release, .write = cgroup_file_write, .seq_show = cgroup_seqfile_show, }; static struct kernfs_ops cgroup_kf_ops = { .atomic_write_len = PAGE_SIZE, + .open = cgroup_file_open, + .release = cgroup_file_release, .write = cgroup_file_write, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, @@ -3588,48 +3034,6 
@@ static struct kernfs_ops cgroup_kf_ops = { .seq_show = cgroup_seqfile_show, }; -/* - * cgroup_rename - Only allow simple rename of directories in place. - */ -static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, - const char *new_name_str) -{ - struct cgroup *cgrp = kn->priv; - int ret; - - if (kernfs_type(kn) != KERNFS_DIR) - return -ENOTDIR; - if (kn->parent != new_parent) - return -EIO; - - /* - * This isn't a proper migration and its usefulness is very - * limited. Disallow on the default hierarchy. - */ - if (cgroup_on_dfl(cgrp)) - return -EPERM; - - /* - * We're gonna grab cgroup_mutex which nests outside kernfs - * active_ref. kernfs_rename() doesn't require active_ref - * protection. Break them before grabbing cgroup_mutex. - */ - kernfs_break_active_protection(new_parent); - kernfs_break_active_protection(kn); - - mutex_lock(&cgroup_mutex); - - ret = kernfs_rename(kn, new_parent, new_name_str); - if (!ret) - trace_cgroup_rename(cgrp); - - mutex_unlock(&cgroup_mutex); - - kernfs_unbreak_active_protection(kn); - kernfs_unbreak_active_protection(new_parent); - return ret; -} - /* set uid and gid of cgroup dirs and files to that of the creator */ static int cgroup_kn_set_ugid(struct kernfs_node *kn) { @@ -3926,26 +3330,6 @@ void cgroup_file_notify(struct cgroup_file *cfile) } /** - * cgroup_task_count - count the number of tasks in a cgroup. - * @cgrp: the cgroup in question - * - * Return the number of tasks in the cgroup. The returned number can be - * higher than the actual number of tasks due to css_set references from - * namespace roots and temporary usages. 
- */ -static int cgroup_task_count(const struct cgroup *cgrp) -{ - int count = 0; - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += atomic_read(&link->cset->refcount); - spin_unlock_irq(&css_set_lock); - return count; -} - -/** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to initiate traversal) * @parent: css whose children to walk @@ -4343,560 +3727,69 @@ void css_task_iter_end(struct css_task_iter *it) put_task_struct(it->cur_task); } -/** - * cgroup_trasnsfer_tasks - move tasks from one cgroup to another - * @to: cgroup to which the tasks will be moved - * @from: cgroup in which the tasks currently reside - * - * Locking rules between cgroup_post_fork() and the migration path - * guarantee that, if a task is forking while being migrated, the new child - * is guaranteed to be either visible in the source cgroup after the - * parent's migration is complete or put into the target cgroup. No task - * can slip out of migration through forking. - */ -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) -{ - LIST_HEAD(preloaded_csets); - struct cgrp_cset_link *link; - struct css_task_iter it; - struct task_struct *task; - int ret; - - if (!cgroup_may_migrate_to(to)) - return -EBUSY; - - mutex_lock(&cgroup_mutex); - - percpu_down_write(&cgroup_threadgroup_rwsem); - - /* all tasks in @from are being moved, all csets are source */ - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &from->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, to, &preloaded_csets); - spin_unlock_irq(&css_set_lock); - - ret = cgroup_migrate_prepare_dst(&preloaded_csets); - if (ret) - goto out_err; - - /* - * Migrate tasks one-by-one until @from is empty. This fails iff - * ->can_attach() fails. 
- */ - do { - css_task_iter_start(&from->self, &it); - task = css_task_iter_next(&it); - if (task) - get_task_struct(task); - css_task_iter_end(&it); - - if (task) { - ret = cgroup_migrate(task, false, to->root); - if (!ret) - trace_cgroup_transfer_tasks(to, task, false); - put_task_struct(task); - } - } while (task && !ret); -out_err: - cgroup_migrate_finish(&preloaded_csets); - percpu_up_write(&cgroup_threadgroup_rwsem); - mutex_unlock(&cgroup_mutex); - return ret; -} - -/* - * Stuff for reading the 'tasks'/'procs' files. - * - * Reading this file can return large amounts of data if a cgroup has - * *lots* of attached tasks. So it may need several calls to read(), - * but we cannot guarantee that the information we produce is correct - * unless we produce it entirely atomically. - * - */ - -/* which pidlist file are we talking about? */ -enum cgroup_filetype { - CGROUP_FILE_PROCS, - CGROUP_FILE_TASKS, -}; - -/* - * A pidlist is a list of pids that virtually represents the contents of one - * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, - * a pair (one each for procs, tasks) for each pid namespace that's relevant - * to the cgroup. - */ -struct cgroup_pidlist { - /* - * used to find which pidlist is wanted. doesn't change as long as - * this particular list stays in the list. - */ - struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; - /* array of xids */ - pid_t *list; - /* how many elements the above list has */ - int length; - /* each of these stored in a list by its cgroup */ - struct list_head links; - /* pointer to the cgroup we belong to, for list removal purposes */ - struct cgroup *owner; - /* for delayed destruction */ - struct delayed_work destroy_dwork; -}; - -/* - * The following two functions "fix" the issue where there are more pids - * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 
- * TODO: replace with a kernel-wide solution to this problem - */ -#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) -static void *pidlist_allocate(int count) -{ - if (PIDLIST_TOO_LARGE(count)) - return vmalloc(count * sizeof(pid_t)); - else - return kmalloc(count * sizeof(pid_t), GFP_KERNEL); -} - -static void pidlist_free(void *p) -{ - kvfree(p); -} - -/* - * Used to destroy all pidlists lingering waiting for destroy timer. None - * should be left afterwards. - */ -static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) -{ - struct cgroup_pidlist *l, *tmp_l; - - mutex_lock(&cgrp->pidlist_mutex); - list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) - mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); - mutex_unlock(&cgrp->pidlist_mutex); - - flush_workqueue(cgroup_pidlist_destroy_wq); - BUG_ON(!list_empty(&cgrp->pidlists)); -} - -static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) -{ - struct delayed_work *dwork = to_delayed_work(work); - struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, - destroy_dwork); - struct cgroup_pidlist *tofree = NULL; - - mutex_lock(&l->owner->pidlist_mutex); - - /* - * Destroy iff we didn't get queued again. The state won't change - * as destroy_dwork can only be queued while locked. - */ - if (!delayed_work_pending(dwork)) { - list_del(&l->links); - pidlist_free(l->list); - put_pid_ns(l->key.ns); - tofree = l; - } - - mutex_unlock(&l->owner->pidlist_mutex); - kfree(tofree); -} - -/* - * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries - * Returns the number of unique elements. - */ -static int pidlist_uniq(pid_t *list, int length) -{ - int src, dest = 1; - - /* - * we presume the 0th element is unique, so i starts at 1. 
trivial - * edge cases first; no work needs to be done for either - */ - if (length == 0 || length == 1) - return length; - /* src and dest walk down the list; dest counts unique elements */ - for (src = 1; src < length; src++) { - /* find next unique element */ - while (list[src] == list[src-1]) { - src++; - if (src == length) - goto after; - } - /* dest always points to where the next unique element goes */ - list[dest] = list[src]; - dest++; - } -after: - return dest; -} - -/* - * The two pid files - task and cgroup.procs - guaranteed that the result - * is sorted, which forced this whole pidlist fiasco. As pid order is - * different per namespace, each namespace needs differently sorted list, - * making it impossible to use, for example, single rbtree of member tasks - * sorted by task pointer. As pidlists can be fairly large, allocating one - * per open file is dangerous, so cgroup had to implement shared pool of - * pidlists keyed by cgroup and namespace. - * - * All this extra complexity was caused by the original implementation - * committing to an entirely unnecessary property. In the long term, we - * want to do away with it. Explicitly scramble sort order if on the - * default hierarchy so that no such expectation exists in the new - * interface. - * - * Scrambling is done by swapping every two consecutive bits, which is - * non-identity one-to-one mapping which disturbs sort order sufficiently. 
- */ -static pid_t pid_fry(pid_t pid) +static void cgroup_procs_release(struct kernfs_open_file *of) { - unsigned a = pid & 0x55555555; - unsigned b = pid & 0xAAAAAAAA; - - return (a << 1) | (b >> 1); -} - -static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) -{ - if (cgroup_on_dfl(cgrp)) - return pid_fry(pid); - else - return pid; -} - -static int cmppid(const void *a, const void *b) -{ - return *(pid_t *)a - *(pid_t *)b; -} - -static int fried_cmppid(const void *a, const void *b) -{ - return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b); -} - -static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, - enum cgroup_filetype type) -{ - struct cgroup_pidlist *l; - /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = task_active_pid_ns(current); - - lockdep_assert_held(&cgrp->pidlist_mutex); - - list_for_each_entry(l, &cgrp->pidlists, links) - if (l->key.type == type && l->key.ns == ns) - return l; - return NULL; -} - -/* - * find the appropriate pidlist for our purpose (given procs vs tasks) - * returns with the lock on that pidlist already held, and takes care - * of the use count, or returns NULL with no locks held if we're out of - * memory. 
- */ -static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, - enum cgroup_filetype type) -{ - struct cgroup_pidlist *l; - - lockdep_assert_held(&cgrp->pidlist_mutex); - - l = cgroup_pidlist_find(cgrp, type); - if (l) - return l; - - /* entry not found; create a new one */ - l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); - if (!l) - return l; - - INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); - l->key.type = type; - /* don't need task_nsproxy() if we're looking at ourself */ - l->key.ns = get_pid_ns(task_active_pid_ns(current)); - l->owner = cgrp; - list_add(&l->links, &cgrp->pidlists); - return l; -} - -/* - * Load a cgroup's pidarray with either procs' tgids or tasks' pids - */ -static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, - struct cgroup_pidlist **lp) -{ - pid_t *array; - int length; - int pid, n = 0; /* used for populating the array */ - struct css_task_iter it; - struct task_struct *tsk; - struct cgroup_pidlist *l; - - lockdep_assert_held(&cgrp->pidlist_mutex); - - /* - * If cgroup gets more users after we read count, we won't have - * enough space - tough. This race is indistinguishable to the - * caller from the case that the additional cgroup users didn't - * show up until sometime later on. 
- */ - length = cgroup_task_count(cgrp); - array = pidlist_allocate(length); - if (!array) - return -ENOMEM; - /* now, populate the array */ - css_task_iter_start(&cgrp->self, &it); - while ((tsk = css_task_iter_next(&it))) { - if (unlikely(n == length)) - break; - /* get tgid or pid for procs or tasks file respectively */ - if (type == CGROUP_FILE_PROCS) - pid = task_tgid_vnr(tsk); - else - pid = task_pid_vnr(tsk); - if (pid > 0) /* make sure to only use valid results */ - array[n++] = pid; - } - css_task_iter_end(&it); - length = n; - /* now sort & (if procs) strip out duplicates */ - if (cgroup_on_dfl(cgrp)) - sort(array, length, sizeof(pid_t), fried_cmppid, NULL); - else - sort(array, length, sizeof(pid_t), cmppid, NULL); - if (type == CGROUP_FILE_PROCS) - length = pidlist_uniq(array, length); - - l = cgroup_pidlist_find_create(cgrp, type); - if (!l) { - pidlist_free(array); - return -ENOMEM; + if (of->priv) { + css_task_iter_end(of->priv); + kfree(of->priv); } - - /* store array, freeing old if necessary */ - pidlist_free(l->list); - l->list = array; - l->length = length; - *lp = l; - return 0; } -/** - * cgroupstats_build - build and fill cgroupstats - * @stats: cgroupstats to fill information into - * @dentry: A dentry entry belonging to the cgroup for which stats have - * been requested. - * - * Build and fill cgroupstats so that taskstats can export it to user - * space. 
- */ -int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) +static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { - struct kernfs_node *kn = kernfs_node_from_dentry(dentry); - struct cgroup *cgrp; - struct css_task_iter it; - struct task_struct *tsk; - - /* it should be kernfs_node belonging to cgroupfs and is a directory */ - if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || - kernfs_type(kn) != KERNFS_DIR) - return -EINVAL; - - mutex_lock(&cgroup_mutex); - - /* - * We aren't being called from kernfs and there's no guarantee on - * @kn->priv's validity. For this and css_tryget_online_from_dir(), - * @kn->priv is RCU safe. Let's do the RCU dancing. - */ - rcu_read_lock(); - cgrp = rcu_dereference(kn->priv); - if (!cgrp || cgroup_is_dead(cgrp)) { - rcu_read_unlock(); - mutex_unlock(&cgroup_mutex); - return -ENOENT; - } - rcu_read_unlock(); + struct kernfs_open_file *of = s->private; + struct css_task_iter *it = of->priv; + struct task_struct *task; - css_task_iter_start(&cgrp->self, &it); - while ((tsk = css_task_iter_next(&it))) { - switch (tsk->state) { - case TASK_RUNNING: - stats->nr_running++; - break; - case TASK_INTERRUPTIBLE: - stats->nr_sleeping++; - break; - case TASK_UNINTERRUPTIBLE: - stats->nr_uninterruptible++; - break; - case TASK_STOPPED: - stats->nr_stopped++; - break; - default: - if (delayacct_is_task_waiting_on_io(tsk)) - stats->nr_io_wait++; - break; - } - } - css_task_iter_end(&it); + do { + task = css_task_iter_next(it); + } while (task && !thread_group_leader(task)); - mutex_unlock(&cgroup_mutex); - return 0; + return task; } - -/* - * seq_file methods for the tasks/procs files. The seq_file position is the - * next pid to display; the seq_file iterator is a pointer to the pid - * in the cgroup->l->list array. 
- */ - -static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) +static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) { - /* - * Initially we receive a position value that corresponds to - * one more than the last pid shown (or 0 on the first call or - * after a seek to the start). Use a binary-search to find the - * next pid to display, if any - */ struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; - struct cgroup_pidlist *l; - enum cgroup_filetype type = seq_cft(s)->private; - int index = 0, pid = *pos; - int *iter, ret; - - mutex_lock(&cgrp->pidlist_mutex); + struct css_task_iter *it = of->priv; /* - * !NULL @of->priv indicates that this isn't the first start() - * after open. If the matching pidlist is around, we can use that. - * Look for it. Note that @of->priv can't be used directly. It - * could already have been destroyed. + * When a seq_file is seeked, it's always traversed sequentially + * from position 0, so we can simply keep iterating on !0 *pos. */ - if (of->priv) - of->priv = cgroup_pidlist_find(cgrp, type); - - /* - * Either this is the first start() after open or the matching - * pidlist has been destroyed inbetween. Create a new one. 
- */ - if (!of->priv) { - ret = pidlist_array_load(cgrp, type, - (struct cgroup_pidlist **)&of->priv); - if (ret) - return ERR_PTR(ret); - } - l = of->priv; - - if (pid) { - int end = l->length; - - while (index < end) { - int mid = (index + end) / 2; - if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) { - index = mid; - break; - } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid) - index = mid + 1; - else - end = mid; - } - } - /* If we're off the end of the array, we're done */ - if (index >= l->length) - return NULL; - /* Update the abstract position to be the actual pid that we found */ - iter = l->list + index; - *pos = cgroup_pid_fry(cgrp, *iter); - return iter; -} - -static void cgroup_pidlist_stop(struct seq_file *s, void *v) -{ - struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; - - if (l) - mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, - CGROUP_PIDLIST_DESTROY_DELAY); - mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); -} + if (!it) { + if (WARN_ON_ONCE((*pos)++)) + return ERR_PTR(-EINVAL); -static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; - pid_t *p = v; - pid_t *end = l->list + l->length; - /* - * Advance to the next pid in the array. 
If this goes off the - * end, we're done - */ - p++; - if (p >= end) { - return NULL; - } else { - *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p); - return p; + it = kzalloc(sizeof(*it), GFP_KERNEL); + if (!it) + return ERR_PTR(-ENOMEM); + of->priv = it; + css_task_iter_start(&cgrp->self, it); + } else if (!(*pos)++) { + css_task_iter_end(it); + css_task_iter_start(&cgrp->self, it); } -} - -static int cgroup_pidlist_show(struct seq_file *s, void *v) -{ - seq_printf(s, "%d\n", *(int *)v); - return 0; + return cgroup_procs_next(s, NULL, NULL); } -static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, - struct cftype *cft) +static int cgroup_procs_show(struct seq_file *s, void *v) { - return notify_on_release(css->cgroup); -} - -static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, - struct cftype *cft, u64 val) -{ - if (val) - set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); - else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); - return 0; -} - -static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); -} - -static int cgroup_clone_children_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 val) -{ - if (val) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); - else - clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); + seq_printf(s, "%d\n", task_tgid_vnr(v)); return 0; } /* cgroup core interface files for the default hierarchy */ -static struct cftype cgroup_dfl_base_files[] = { +static struct cftype cgroup_base_files[] = { { .name = "cgroup.procs", .file_offset = offsetof(struct cgroup, procs_file), - .seq_start = cgroup_pidlist_start, - .seq_next = cgroup_pidlist_next, - .seq_stop = cgroup_pidlist_stop, - .seq_show = cgroup_pidlist_show, - .private = CGROUP_FILE_PROCS, + .release = cgroup_procs_release, + .seq_start = cgroup_procs_start, + .seq_next = cgroup_procs_next, + 
.seq_show = cgroup_procs_show, .write = cgroup_procs_write, }, { @@ -4917,51 +3810,6 @@ static struct cftype cgroup_dfl_base_files[] = { { } /* terminate */ }; -/* cgroup core interface files for the legacy hierarchies */ -static struct cftype cgroup_legacy_base_files[] = { - { - .name = "cgroup.procs", - .seq_start = cgroup_pidlist_start, - .seq_next = cgroup_pidlist_next, - .seq_stop = cgroup_pidlist_stop, - .seq_show = cgroup_pidlist_show, - .private = CGROUP_FILE_PROCS, - .write = cgroup_procs_write, - }, - { - .name = "cgroup.clone_children", - .read_u64 = cgroup_clone_children_read, - .write_u64 = cgroup_clone_children_write, - }, - { - .name = "cgroup.sane_behavior", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_sane_behavior_show, - }, - { - .name = "tasks", - .seq_start = cgroup_pidlist_start, - .seq_next = cgroup_pidlist_next, - .seq_stop = cgroup_pidlist_stop, - .seq_show = cgroup_pidlist_show, - .private = CGROUP_FILE_TASKS, - .write = cgroup_tasks_write, - }, - { - .name = "notify_on_release", - .read_u64 = cgroup_read_notify_on_release, - .write_u64 = cgroup_write_notify_on_release, - }, - { - .name = "release_agent", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_release_agent_show, - .write = cgroup_release_agent_write, - .max_write_len = PATH_MAX - 1, - }, - { } /* terminate */ -}; - /* * css destruction is four-stage process. 
* @@ -5007,7 +3855,7 @@ static void css_free_work_fn(struct work_struct *work) } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); - cgroup_pidlist_destroy_all(cgrp); + cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); if (cgroup_parent(cgrp)) { @@ -5302,8 +4150,7 @@ out_free_cgrp: return ERR_PTR(ret); } -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; struct kernfs_node *kn; @@ -5507,7 +4354,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); - check_for_release(cgroup_parent(cgrp)); + cgroup1_check_for_release(cgroup_parent(cgrp)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -5515,7 +4362,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return 0; }; -static int cgroup_rmdir(struct kernfs_node *kn) +int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; int ret = 0; @@ -5535,10 +4382,8 @@ static int cgroup_rmdir(struct kernfs_node *kn) static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .remount_fs = cgroup_remount, - .show_options = cgroup_show_options, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, - .rename = cgroup_rename, .show_path = cgroup_show_path, }; @@ -5646,8 +4491,8 @@ int __init cgroup_init(void) BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); - BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); - BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); /* * The latency of the synchronize_sched() is too high for cgroups, @@ -5697,7 +4542,7 @@ int __init cgroup_init(void) continue; } - if (cgroup_ssid_no_v1(ssid)) + if (cgroup1_ssid_disabled(ssid)) printk(KERN_INFO "Disabling %s control group subsystem in 
v1 mounts\n", ss->name); @@ -5744,15 +4589,6 @@ static int __init cgroup_wq_init(void) */ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); - - /* - * Used to destroy pidlists and separate to serve as flush domain. - * Cap @max_active to 1 too. - */ - cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", - 0, 1); - BUG_ON(!cgroup_pidlist_destroy_wq); - return 0; } core_initcall(cgroup_wq_init); @@ -5835,42 +4671,6 @@ out: return retval; } -/* Display information about each subsystem and each hierarchy */ -static int proc_cgroupstats_show(struct seq_file *m, void *v) -{ - struct cgroup_subsys *ss; - int i; - - seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); - /* - * ideally we don't want subsystems moving around while we do this. - * cgroup_mutex is also necessary to guarantee an atomic snapshot of - * subsys/hierarchy state. - */ - mutex_lock(&cgroup_mutex); - - for_each_subsys(ss, i) - seq_printf(m, "%s\t%d\t%d\t%d\n", - ss->legacy_name, ss->root->hierarchy_id, - atomic_read(&ss->root->nr_cgrps), - cgroup_ssid_enabled(i)); - - mutex_unlock(&cgroup_mutex); - return 0; -} - -static int cgroupstats_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_cgroupstats_show, NULL); -} - -static const struct file_operations proc_cgroupstats_operations = { - .open = cgroupstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. 
@@ -6050,76 +4850,6 @@ void cgroup_free(struct task_struct *task) put_css_set(cset); } -static void check_for_release(struct cgroup *cgrp) -{ - if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && - !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) - schedule_work(&cgrp->release_agent_work); -} - -/* - * Notify userspace when a cgroup is released, by running the - * configured release agent with the name of the cgroup (path - * relative to the root of cgroup file system) as the argument. - * - * Most likely, this user command will try to rmdir this cgroup. - * - * This races with the possibility that some other task will be - * attached to this cgroup before it is removed, or that some other - * user task will 'mkdir' a child cgroup of this cgroup. That's ok. - * The presumed 'rmdir' will fail quietly if this cgroup is no longer - * unused, and this cgroup will be reprieved from its death sentence, - * to continue to serve a useful existence. Next time it's released, - * we will get notified again, if it still has 'notify_on_release' set. - * - * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which - * means only wait until the task is successfully execve()'d. The - * separate release agent task is forked by call_usermodehelper(), - * then control in this thread returns here, without waiting for the - * release agent task. We don't bother to wait because the caller of - * this routine has no use for the exit status of the release agent - * task, so no sense holding our caller up for that. 
- */ -static void cgroup_release_agent(struct work_struct *work) -{ - struct cgroup *cgrp = - container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL; - char *argv[3], *envp[3]; - int ret; - - mutex_lock(&cgroup_mutex); - - pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf) - goto out; - - spin_lock_irq(&css_set_lock); - ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); - spin_unlock_irq(&css_set_lock); - if (ret < 0 || ret >= PATH_MAX) - goto out; - - argv[0] = agentbuf; - argv[1] = pathbuf; - argv[2] = NULL; - - /* minimal command environment */ - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[2] = NULL; - - mutex_unlock(&cgroup_mutex); - call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - goto out_free; -out: - mutex_unlock(&cgroup_mutex); -out_free: - kfree(agentbuf); - kfree(pathbuf); -} - static int __init cgroup_disable(char *str) { struct cgroup_subsys *ss; @@ -6141,33 +4871,6 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); -static int __init cgroup_no_v1(char *str) -{ - struct cgroup_subsys *ss; - char *token; - int i; - - while ((token = strsep(&str, ",")) != NULL) { - if (!*token) - continue; - - if (!strcmp(token, "all")) { - cgroup_no_v1_mask = U16_MAX; - break; - } - - for_each_subsys(ss, i) { - if (strcmp(token, ss->name) && - strcmp(token, ss->legacy_name)) - continue; - - cgroup_no_v1_mask |= 1 << i; - } - } - return 1; -} -__setup("cgroup_no_v1=", cgroup_no_v1); - /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -6197,7 +4900,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, * have been or be removed at any point. @kn->priv is RCU * protected for this access. See css_release_work_fn() for details. 
*/ - cgrp = rcu_dereference(kn->priv); + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (cgrp) css = cgroup_css(cgrp, ss); @@ -6349,154 +5052,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ -/* cgroup namespaces */ - -static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) -{ - return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); -} - -static void dec_cgroup_namespaces(struct ucounts *ucounts) -{ - dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); -} - -static struct cgroup_namespace *alloc_cgroup_ns(void) -{ - struct cgroup_namespace *new_ns; - int ret; - - new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); - if (!new_ns) - return ERR_PTR(-ENOMEM); - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); - return ERR_PTR(ret); - } - atomic_set(&new_ns->count, 1); - new_ns->ns.ops = &cgroupns_operations; - return new_ns; -} - -void free_cgroup_ns(struct cgroup_namespace *ns) -{ - put_css_set(ns->root_cset); - dec_cgroup_namespaces(ns->ucounts); - put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - kfree(ns); -} -EXPORT_SYMBOL(free_cgroup_ns); - -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, - struct user_namespace *user_ns, - struct cgroup_namespace *old_ns) -{ - struct cgroup_namespace *new_ns; - struct ucounts *ucounts; - struct css_set *cset; - - BUG_ON(!old_ns); - - if (!(flags & CLONE_NEWCGROUP)) { - get_cgroup_ns(old_ns); - return old_ns; - } - - /* Allow only sysadmin to create cgroup namespace. 
*/ - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - ucounts = inc_cgroup_namespaces(user_ns); - if (!ucounts) - return ERR_PTR(-ENOSPC); - - /* It is not safe to take cgroup_mutex here */ - spin_lock_irq(&css_set_lock); - cset = task_css_set(current); - get_css_set(cset); - spin_unlock_irq(&css_set_lock); - - new_ns = alloc_cgroup_ns(); - if (IS_ERR(new_ns)) { - put_css_set(cset); - dec_cgroup_namespaces(ucounts); - return new_ns; - } - - new_ns->user_ns = get_user_ns(user_ns); - new_ns->ucounts = ucounts; - new_ns->root_cset = cset; - - return new_ns; -} - -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - -static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) -{ - struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); - - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || - !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) - return -EPERM; - - /* Don't need to do anything if we are attaching to our own cgroupns. */ - if (cgroup_ns == nsproxy->cgroup_ns) - return 0; - - get_cgroup_ns(cgroup_ns); - put_cgroup_ns(nsproxy->cgroup_ns); - nsproxy->cgroup_ns = cgroup_ns; - - return 0; -} - -static struct ns_common *cgroupns_get(struct task_struct *task) -{ - struct cgroup_namespace *ns = NULL; - struct nsproxy *nsproxy; - - task_lock(task); - nsproxy = task->nsproxy; - if (nsproxy) { - ns = nsproxy->cgroup_ns; - get_cgroup_ns(ns); - } - task_unlock(task); - - return ns ? 
&ns->ns : NULL; -} - -static void cgroupns_put(struct ns_common *ns) -{ - put_cgroup_ns(to_cg_ns(ns)); -} - -static struct user_namespace *cgroupns_owner(struct ns_common *ns) -{ - return to_cg_ns(ns)->user_ns; -} - -const struct proc_ns_operations cgroupns_operations = { - .name = "cgroup", - .type = CLONE_NEWCGROUP, - .get = cgroupns_get, - .put = cgroupns_put, - .install = cgroupns_install, - .owner = cgroupns_owner, -}; - -static __init int cgroup_namespaces_init(void) -{ - return 0; -} -subsys_initcall(cgroup_namespaces_init); - #ifdef CONFIG_CGROUP_BPF int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, bool overridable) @@ -6510,149 +5065,3 @@ int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, return ret; } #endif /* CONFIG_CGROUP_BPF */ - -#ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state * -debug_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_css_free(struct cgroup_subsys_state *css) -{ - kfree(css); -} - -static u64 debug_taskcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return cgroup_task_count(css->cgroup); -} - -static u64 current_css_set_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return (u64)(unsigned long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = atomic_read(&task_css_set(current)->refcount); - rcu_read_unlock(); - return count; -} - -static int current_css_set_cg_links_read(struct seq_file *seq, void *v) -{ - struct cgrp_cset_link *link; - struct css_set *cset; - char *name_buf; - - name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!name_buf) - return -ENOMEM; - - spin_lock_irq(&css_set_lock); - rcu_read_lock(); - cset = rcu_dereference(current->cgroups); - 
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - cgroup_name(c, name_buf, NAME_MAX + 1); - seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name_buf); - } - rcu_read_unlock(); - spin_unlock_irq(&css_set_lock); - kfree(name_buf); - return 0; -} - -#define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct seq_file *seq, void *v) -{ - struct cgroup_subsys_state *css = seq_css(seq); - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { - struct css_set *cset = link->cset; - struct task_struct *task; - int count = 0; - - seq_printf(seq, "css_set %p\n", cset); - - list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); - } - - list_for_each_entry(task, &cset->mg_tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); - } - continue; - overflow: - seq_puts(seq, " ...\n"); - } - spin_unlock_irq(&css_set_lock); - return 0; -} - -static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) -{ - return (!cgroup_is_populated(css->cgroup) && - !css_has_online_children(&css->cgroup->self)); -} - -static struct cftype debug_files[] = { - { - .name = "taskcount", - .read_u64 = debug_taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "current_css_set_cg_links", - .seq_show = current_css_set_cg_links_read, - }, - - { - .name = "cgroup_css_links", - .seq_show = cgroup_css_links_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, - - { } /* terminate */ -}; - -struct cgroup_subsys debug_cgrp_subsys = { - .css_alloc = debug_css_alloc, - .css_free = 
debug_css_free, - .legacy_cftypes = debug_files, -}; -#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cpuset.c b/kernel/cgroup/cpuset.c index b3088886cd37..b3088886cd37 100644 --- a/kernel/cpuset.c +++ b/kernel/cgroup/cpuset.c diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup/freezer.c index 1b72d56edce5..1b72d56edce5 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup/freezer.c diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c new file mode 100644 index 000000000000..cff7ea62c38f --- /dev/null +++ b/kernel/cgroup/namespace.c @@ -0,0 +1,155 @@ +#include "cgroup-internal.h" + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/nsproxy.h> +#include <linux/proc_ns.h> + + +/* cgroup namespaces */ + +static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); +} + +static void dec_cgroup_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); +} + +static struct cgroup_namespace *alloc_cgroup_ns(void) +{ + struct cgroup_namespace *new_ns; + int ret; + + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + if (!new_ns) + return ERR_PTR(-ENOMEM); + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + atomic_set(&new_ns->count, 1); + new_ns->ns.ops = &cgroupns_operations; + return new_ns; +} + +void free_cgroup_ns(struct cgroup_namespace *ns) +{ + put_css_set(ns->root_cset); + dec_cgroup_namespaces(ns->ucounts); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} +EXPORT_SYMBOL(free_cgroup_ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + struct cgroup_namespace *new_ns; + struct ucounts *ucounts; + struct css_set *cset; + + BUG_ON(!old_ns); + + if (!(flags & CLONE_NEWCGROUP)) { + get_cgroup_ns(old_ns); + return old_ns; + } + + /* Allow only sysadmin to 
create cgroup namespace. */ + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + ucounts = inc_cgroup_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + + /* It is not safe to take cgroup_mutex here */ + spin_lock_irq(&css_set_lock); + cset = task_css_set(current); + get_css_set(cset); + spin_unlock_irq(&css_set_lock); + + new_ns = alloc_cgroup_ns(); + if (IS_ERR(new_ns)) { + put_css_set(cset); + dec_cgroup_namespaces(ucounts); + return new_ns; + } + + new_ns->user_ns = get_user_ns(user_ns); + new_ns->ucounts = ucounts; + new_ns->root_cset = cset; + + return new_ns; +} + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) +{ + struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); + + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* Don't need to do anything if we are attaching to our own cgroupns. */ + if (cgroup_ns == nsproxy->cgroup_ns) + return 0; + + get_cgroup_ns(cgroup_ns); + put_cgroup_ns(nsproxy->cgroup_ns); + nsproxy->cgroup_ns = cgroup_ns; + + return 0; +} + +static struct ns_common *cgroupns_get(struct task_struct *task) +{ + struct cgroup_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->cgroup_ns; + get_cgroup_ns(ns); + } + task_unlock(task); + + return ns ? 
&ns->ns : NULL; +} + +static void cgroupns_put(struct ns_common *ns) +{ + put_cgroup_ns(to_cg_ns(ns)); +} + +static struct user_namespace *cgroupns_owner(struct ns_common *ns) +{ + return to_cg_ns(ns)->user_ns; +} + +const struct proc_ns_operations cgroupns_operations = { + .name = "cgroup", + .type = CLONE_NEWCGROUP, + .get = cgroupns_get, + .put = cgroupns_put, + .install = cgroupns_install, + .owner = cgroupns_owner, +}; + +static __init int cgroup_namespaces_init(void) +{ + return 0; +} +subsys_initcall(cgroup_namespaces_init); diff --git a/kernel/cgroup_pids.c b/kernel/cgroup/pids.c index 2bd673783f1a..2bd673783f1a 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup/pids.c diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c new file mode 100644 index 000000000000..defad3c5e7dc --- /dev/null +++ b/kernel/cgroup/rdma.c @@ -0,0 +1,619 @@ +/* + * RDMA resource limiting controller for cgroups. + * + * Used to allow a cgroup hierarchy to stop processes from consuming + * additional RDMA resources after a certain limit is reached. + * + * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/seq_file.h> +#include <linux/cgroup.h> +#include <linux/parser.h> +#include <linux/cgroup_rdma.h> + +#define RDMACG_MAX_STR "max" + +/* + * Protects list of resource pools maintained on per cgroup basis + * and rdma device list. + */ +static DEFINE_MUTEX(rdmacg_mutex); +static LIST_HEAD(rdmacg_devices); + +enum rdmacg_file_type { + RDMACG_RESOURCE_TYPE_MAX, + RDMACG_RESOURCE_TYPE_STAT, +}; + +/* + * resource table definition as to be seen by the user. + * Need to add entries to it when more resources are + * added/defined at IB verb/core layer. 
+ */ +static char const *rdmacg_resource_names[] = { + [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle", + [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object", +}; + +/* resource tracker for each resource of rdma cgroup */ +struct rdmacg_resource { + int max; + int usage; +}; + +/* + * resource pool object which represents per cgroup, per device + * resources. There are multiple instances of this object per cgroup, + * therefore it cannot be embedded within rdma_cgroup structure. It + * is maintained as list. + */ +struct rdmacg_resource_pool { + struct rdmacg_device *device; + struct rdmacg_resource resources[RDMACG_RESOURCE_MAX]; + + struct list_head cg_node; + struct list_head dev_node; + + /* count active user tasks of this pool */ + u64 usage_sum; + /* total number counts which are set to max */ + int num_max_cnt; +}; + +static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css) +{ + return container_of(css, struct rdma_cgroup, css); +} + +static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg) +{ + return css_rdmacg(cg->css.parent); +} + +static inline struct rdma_cgroup *get_current_rdmacg(void) +{ + return css_rdmacg(task_get_css(current, rdma_cgrp_id)); +} + +static void set_resource_limit(struct rdmacg_resource_pool *rpool, + int index, int new_max) +{ + if (new_max == S32_MAX) { + if (rpool->resources[index].max != S32_MAX) + rpool->num_max_cnt++; + } else { + if (rpool->resources[index].max == S32_MAX) + rpool->num_max_cnt--; + } + rpool->resources[index].max = new_max; +} + +static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool) +{ + int i; + + for (i = 0; i < RDMACG_RESOURCE_MAX; i++) + set_resource_limit(rpool, i, S32_MAX); +} + +static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool) +{ + lockdep_assert_held(&rdmacg_mutex); + + list_del(&rpool->cg_node); + list_del(&rpool->dev_node); + kfree(rpool); +} + +static struct rdmacg_resource_pool * +find_cg_rpool_locked(struct rdma_cgroup *cg, + struct rdmacg_device 
*device) + +{ + struct rdmacg_resource_pool *pool; + + lockdep_assert_held(&rdmacg_mutex); + + list_for_each_entry(pool, &cg->rpools, cg_node) + if (pool->device == device) + return pool; + + return NULL; +} + +static struct rdmacg_resource_pool * +get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) +{ + struct rdmacg_resource_pool *rpool; + + rpool = find_cg_rpool_locked(cg, device); + if (rpool) + return rpool; + + rpool = kzalloc(sizeof(*rpool), GFP_KERNEL); + if (!rpool) + return ERR_PTR(-ENOMEM); + + rpool->device = device; + set_all_resource_max_limit(rpool); + + INIT_LIST_HEAD(&rpool->cg_node); + INIT_LIST_HEAD(&rpool->dev_node); + list_add_tail(&rpool->cg_node, &cg->rpools); + list_add_tail(&rpool->dev_node, &device->rpools); + return rpool; +} + +/** + * uncharge_cg_locked - uncharge resource for rdma cgroup + * @cg: pointer to cg to uncharge and all parents in hierarchy + * @device: pointer to rdmacg device + * @index: index of the resource to uncharge in cg (resource pool) + * + * It also frees the resource pool which was created as part of + * charging operation when there are no resources attached to + * resource pool. + */ +static void +uncharge_cg_locked(struct rdma_cgroup *cg, + struct rdmacg_device *device, + enum rdmacg_resource_type index) +{ + struct rdmacg_resource_pool *rpool; + + rpool = find_cg_rpool_locked(cg, device); + + /* + * rpool cannot be null at this stage. Let kernel operate in case + * if there a bug in IB stack or rdma controller, instead of crashing + * the system. + */ + if (unlikely(!rpool)) { + pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device); + return; + } + + rpool->resources[index].usage--; + + /* + * A negative count (or overflow) is invalid, + * it indicates a bug in the rdma controller. 
+ */ + WARN_ON_ONCE(rpool->resources[index].usage < 0); + rpool->usage_sum--; + if (rpool->usage_sum == 0 && + rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { + /* + * No user of the rpool and all entries are set to max, so + * safe to delete this rpool. + */ + free_cg_rpool_locked(rpool); + } +} + +/** + * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count + * @device: pointer to rdmacg device + * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup + * stop uncharging + * @index: index of the resource to uncharge in cg in given resource pool + */ +static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg, + struct rdmacg_device *device, + struct rdma_cgroup *stop_cg, + enum rdmacg_resource_type index) +{ + struct rdma_cgroup *p; + + mutex_lock(&rdmacg_mutex); + + for (p = cg; p != stop_cg; p = parent_rdmacg(p)) + uncharge_cg_locked(p, device, index); + + mutex_unlock(&rdmacg_mutex); + + css_put(&cg->css); +} + +/** + * rdmacg_uncharge - hierarchically uncharge rdma resource count + * @device: pointer to rdmacg device + * @index: index of the resource to uncharge in cgroup in given resource pool + */ +void rdmacg_uncharge(struct rdma_cgroup *cg, + struct rdmacg_device *device, + enum rdmacg_resource_type index) +{ + if (index >= RDMACG_RESOURCE_MAX) + return; + + rdmacg_uncharge_hierarchy(cg, device, NULL, index); +} +EXPORT_SYMBOL(rdmacg_uncharge); + +/** + * rdmacg_try_charge - hierarchically try to charge the rdma resource + * @rdmacg: pointer to rdma cgroup which will own this resource + * @device: pointer to rdmacg device + * @index: index of the resource to charge in cgroup (resource pool) + * + * This function follows charging resource in hierarchical way. + * It will fail if the charge would cause the new value to exceed the + * hierarchical limit. + * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. + * Returns pointer to rdmacg for this resource when charging is successful. 
+ * + * Charger needs to account resources on two criteria. + * (a) per cgroup & (b) per device resource usage. + * Per cgroup resource usage ensures that tasks of cgroup doesn't cross + * the configured limits. Per device provides granular configuration + * in multi device usage. It allocates resource pool in the hierarchy + * for each parent it come across for first resource. Later on resource + * pool will be available. Therefore it will be much faster thereon + * to charge/uncharge. + */ +int rdmacg_try_charge(struct rdma_cgroup **rdmacg, + struct rdmacg_device *device, + enum rdmacg_resource_type index) +{ + struct rdma_cgroup *cg, *p; + struct rdmacg_resource_pool *rpool; + s64 new; + int ret = 0; + + if (index >= RDMACG_RESOURCE_MAX) + return -EINVAL; + + /* + * hold on to css, as cgroup can be removed but resource + * accounting happens on css. + */ + cg = get_current_rdmacg(); + + mutex_lock(&rdmacg_mutex); + for (p = cg; p; p = parent_rdmacg(p)) { + rpool = get_cg_rpool_locked(p, device); + if (IS_ERR(rpool)) { + ret = PTR_ERR(rpool); + goto err; + } else { + new = rpool->resources[index].usage + 1; + if (new > rpool->resources[index].max) { + ret = -EAGAIN; + goto err; + } else { + rpool->resources[index].usage = new; + rpool->usage_sum++; + } + } + } + mutex_unlock(&rdmacg_mutex); + + *rdmacg = cg; + return 0; + +err: + mutex_unlock(&rdmacg_mutex); + rdmacg_uncharge_hierarchy(cg, device, p, index); + return ret; +} +EXPORT_SYMBOL(rdmacg_try_charge); + +/** + * rdmacg_register_device - register rdmacg device to rdma controller. + * @device: pointer to rdmacg device whose resources need to be accounted. + * + * If IB stack wish a device to participate in rdma cgroup resource + * tracking, it must invoke this API to register with rdma cgroup before + * any user space application can start using the RDMA resources. + * Returns 0 on success or EINVAL when table length given is beyond + * supported size. 
+ */ +int rdmacg_register_device(struct rdmacg_device *device) +{ + INIT_LIST_HEAD(&device->dev_node); + INIT_LIST_HEAD(&device->rpools); + + mutex_lock(&rdmacg_mutex); + list_add_tail(&device->dev_node, &rdmacg_devices); + mutex_unlock(&rdmacg_mutex); + return 0; +} +EXPORT_SYMBOL(rdmacg_register_device); + +/** + * rdmacg_unregister_device - unregister rdmacg device from rdma controller. + * @device: pointer to rdmacg device which was previously registered with rdma + * controller using rdmacg_register_device(). + * + * IB stack must invoke this after all the resources of the IB device + * are destroyed and after ensuring that no more resources will be created + * when this API is invoked. + */ +void rdmacg_unregister_device(struct rdmacg_device *device) +{ + struct rdmacg_resource_pool *rpool, *tmp; + + /* + * Synchronize with any active resource settings, + * usage query happening via configfs. + */ + mutex_lock(&rdmacg_mutex); + list_del_init(&device->dev_node); + + /* + * Now that this device is off the cgroup list, its safe to free + * all the rpool resources. 
+ */ + list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node) + free_cg_rpool_locked(rpool); + + mutex_unlock(&rdmacg_mutex); +} +EXPORT_SYMBOL(rdmacg_unregister_device); + +static int parse_resource(char *c, int *intval) +{ + substring_t argstr; + const char **table = &rdmacg_resource_names[0]; + char *name, *value = c; + size_t len; + int ret, i = 0; + + name = strsep(&value, "="); + if (!name || !value) + return -EINVAL; + + len = strlen(value); + + for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { + if (strcmp(table[i], name)) + continue; + + argstr.from = value; + argstr.to = value + len; + + ret = match_int(&argstr, intval); + if (ret >= 0) { + if (*intval < 0) + break; + return i; + } + if (strncmp(value, RDMACG_MAX_STR, len) == 0) { + *intval = S32_MAX; + return i; + } + break; + } + return -EINVAL; +} + +static int rdmacg_parse_limits(char *options, + int *new_limits, unsigned long *enables) +{ + char *c; + int err = -EINVAL; + + /* parse resource options */ + while ((c = strsep(&options, " ")) != NULL) { + int index, intval; + + index = parse_resource(c, &intval); + if (index < 0) + goto err; + + new_limits[index] = intval; + *enables |= BIT(index); + } + return 0; + +err: + return err; +} + +static struct rdmacg_device *rdmacg_get_device_locked(const char *name) +{ + struct rdmacg_device *device; + + lockdep_assert_held(&rdmacg_mutex); + + list_for_each_entry(device, &rdmacg_devices, dev_node) + if (!strcmp(name, device->name)) + return device; + + return NULL; +} + +static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdma_cgroup *cg = css_rdmacg(of_css(of)); + const char *dev_name; + struct rdmacg_resource_pool *rpool; + struct rdmacg_device *device; + char *options = strstrip(buf); + int *new_limits; + unsigned long enables = 0; + int i = 0, ret = 0; + + /* extract the device name first */ + dev_name = strsep(&options, " "); + if (!dev_name) { + ret = -EINVAL; + goto err; + } + + 
new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL); + if (!new_limits) { + ret = -ENOMEM; + goto err; + } + + ret = rdmacg_parse_limits(options, new_limits, &enables); + if (ret) + goto parse_err; + + /* acquire lock to synchronize with hot plug devices */ + mutex_lock(&rdmacg_mutex); + + device = rdmacg_get_device_locked(dev_name); + if (!device) { + ret = -ENODEV; + goto dev_err; + } + + rpool = get_cg_rpool_locked(cg, device); + if (IS_ERR(rpool)) { + ret = PTR_ERR(rpool); + goto dev_err; + } + + /* now set the new limits of the rpool */ + for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX) + set_resource_limit(rpool, i, new_limits[i]); + + if (rpool->usage_sum == 0 && + rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { + /* + * No user of the rpool and all entries are set to max, so + * safe to delete this rpool. + */ + free_cg_rpool_locked(rpool); + } + +dev_err: + mutex_unlock(&rdmacg_mutex); + +parse_err: + kfree(new_limits); + +err: + return ret ?: nbytes; +} + +static void print_rpool_values(struct seq_file *sf, + struct rdmacg_resource_pool *rpool) +{ + enum rdmacg_file_type sf_type; + int i; + u32 value; + + sf_type = seq_cft(sf)->private; + + for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { + seq_puts(sf, rdmacg_resource_names[i]); + seq_putc(sf, '='); + if (sf_type == RDMACG_RESOURCE_TYPE_MAX) { + if (rpool) + value = rpool->resources[i].max; + else + value = S32_MAX; + } else { + if (rpool) + value = rpool->resources[i].usage; + else + value = 0; + } + + if (value == S32_MAX) + seq_puts(sf, RDMACG_MAX_STR); + else + seq_printf(sf, "%d", value); + seq_putc(sf, ' '); + } +} + +static int rdmacg_resource_read(struct seq_file *sf, void *v) +{ + struct rdmacg_device *device; + struct rdmacg_resource_pool *rpool; + struct rdma_cgroup *cg = css_rdmacg(seq_css(sf)); + + mutex_lock(&rdmacg_mutex); + + list_for_each_entry(device, &rdmacg_devices, dev_node) { + seq_printf(sf, "%s ", device->name); + + rpool = find_cg_rpool_locked(cg, device); + 
print_rpool_values(sf, rpool); + + seq_putc(sf, '\n'); + } + + mutex_unlock(&rdmacg_mutex); + return 0; +} + +static struct cftype rdmacg_files[] = { + { + .name = "max", + .write = rdmacg_resource_set_max, + .seq_show = rdmacg_resource_read, + .private = RDMACG_RESOURCE_TYPE_MAX, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "current", + .seq_show = rdmacg_resource_read, + .private = RDMACG_RESOURCE_TYPE_STAT, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* terminate */ +}; + +static struct cgroup_subsys_state * +rdmacg_css_alloc(struct cgroup_subsys_state *parent) +{ + struct rdma_cgroup *cg; + + cg = kzalloc(sizeof(*cg), GFP_KERNEL); + if (!cg) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&cg->rpools); + return &cg->css; +} + +static void rdmacg_css_free(struct cgroup_subsys_state *css) +{ + struct rdma_cgroup *cg = css_rdmacg(css); + + kfree(cg); +} + +/** + * rdmacg_css_offline - cgroup css_offline callback + * @css: css of interest + * + * This function is called when @css is about to go away and is responsible + * for shooting down all rdmacg associated with @css. As part of that it + * marks all the resource pool entries to max value, so that when resources are + * uncharged, associated resource pool can be freed as well. 
+ */ +static void rdmacg_css_offline(struct cgroup_subsys_state *css) +{ + struct rdma_cgroup *cg = css_rdmacg(css); + struct rdmacg_resource_pool *rpool; + + mutex_lock(&rdmacg_mutex); + + list_for_each_entry(rpool, &cg->rpools, cg_node) + set_all_resource_max_limit(rpool); + + mutex_unlock(&rdmacg_mutex); +} + +struct cgroup_subsys rdma_cgrp_subsys = { + .css_alloc = rdmacg_css_alloc, + .css_free = rdmacg_css_free, + .css_offline = rdmacg_css_offline, + .legacy_cftypes = rdmacg_files, + .dfl_cftypes = rdmacg_files, +}; diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 1a8f34f63601..26a06e09a5bd 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -21,6 +21,7 @@ CONFIG_CP15_BARRIER_EMULATION=y CONFIG_DEFAULT_SECURITY_SELINUX=y CONFIG_EMBEDDED=y CONFIG_FB=y +CONFIG_HARDENED_USERCOPY=y CONFIG_HIGH_RES_TIMERS=y CONFIG_INET6_AH=y CONFIG_INET6_ESP=y @@ -129,6 +130,7 @@ CONFIG_PPP_DEFLATE=y CONFIG_PPP_MPPE=y CONFIG_PREEMPT=y CONFIG_QUOTA=y +CONFIG_RANDOMIZE_BASE=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y CONFIG_SECCOMP=y diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 297756be369c..28ee064b6744 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -1,4 +1,5 @@ # KEEP ALPHABETICALLY SORTED +# CONFIG_AIO is not set # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_LEGACY_PTYS is not set @@ -11,7 +12,7 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_COMPACTION=y -CONFIG_DEBUG_RODATA=y +CONFIG_STRICT_KERNEL_RWX=y CONFIG_DM_CRYPT=y CONFIG_DM_UEVENT=y CONFIG_DM_VERITY=y diff --git a/kernel/events/core.c b/kernel/events/core.c index 77a932b54a64..1031bdf9f012 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -455,7 +455,7 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, void 
__user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec(table, write, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; @@ -3522,6 +3522,8 @@ static void perf_event_enable_on_exec(int ctxn) if (enabled) { clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, event_type); + } else { + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); } perf_ctx_unlock(cpuctx, ctx); @@ -4925,9 +4927,9 @@ unlock: rcu_read_unlock(); } -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int perf_mmap_fault(struct vm_fault *vmf) { - struct perf_event *event = vma->vm_file->private_data; + struct perf_event *event = vmf->vma->vm_file->private_data; struct ring_buffer *rb; int ret = VM_FAULT_SIGBUS; @@ -4950,7 +4952,7 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) goto unlock; get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->mapping = vmf->vma->vm_file->f_mapping; vmf->page->index = vmf->pgoff; ret = 0; @@ -9955,6 +9957,7 @@ SYSCALL_DEFINE5(perf_event_open, * of swizzling perf_event::ctx. */ perf_remove_from_context(group_leader, 0); + put_ctx(gctx); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { @@ -9993,13 +9996,6 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); get_ctx(ctx); - - /* - * Now that all events are installed in @ctx, nothing - * references @gctx anymore, so drop the last reference we have - * on it. - */ - put_ctx(gctx); } /* @@ -10959,5 +10955,11 @@ struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, .attach = perf_cgroup_attach, + /* + * Implicitly enable on dfl hierarchy so that perf events can + * always be filtered by cgroup2 path as long as perf_event + * controller is not mounted on a legacy hierarchy. 
+ */ + .implicit_on_dfl = true, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d416f3baf392..d630f8ac4d2f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -153,14 +153,19 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { struct mm_struct *mm = vma->vm_mm; - spinlock_t *ptl; - pte_t *ptep; + struct page_vma_mapped_walk pvmw = { + .page = old_page, + .vma = vma, + .address = addr, + }; int err; /* For mmu_notifiers */ const unsigned long mmun_start = addr; const unsigned long mmun_end = addr + PAGE_SIZE; struct mem_cgroup *memcg; + VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); + err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, false); if (err) @@ -171,11 +176,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; - ptep = page_check_address(old_page, mm, addr, &ptl, 0); - if (!ptep) { + if (!page_vma_mapped_walk(&pvmw)) { mem_cgroup_cancel_charge(new_page, memcg, false); goto unlock; } + VM_BUG_ON_PAGE(addr != pvmw.address, old_page); get_page(new_page); page_add_new_anon_rmap(new_page, vma, addr, false); @@ -187,14 +192,15 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, inc_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush_notify(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot)); + flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); + ptep_clear_flush_notify(vma, addr, pvmw.pte); + set_pte_at_notify(mm, addr, pvmw.pte, + mk_pte(new_page, vma->vm_page_prot)); page_remove_rmap(old_page, false); if (!page_mapped(old_page)) try_to_free_swap(old_page); - pte_unmap_unlock(ptep, ptl); + page_vma_mapped_walk_done(&pvmw); if (vma->vm_flags & VM_LOCKED) munlock_vma_page(old_page); @@ 
-300,8 +306,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, - &vma, NULL); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, + FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL); if (ret <= 0) return ret; @@ -741,7 +747,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) continue; } - if (!atomic_inc_not_zero(&vma->vm_mm->mm_users)) + if (!mmget_not_zero(vma->vm_mm)) continue; info = prev; diff --git a/kernel/exit.c b/kernel/exit.c index b67c57faa705..8a768a3672a5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -14,7 +14,6 @@ #include <linux/tty.h> #include <linux/iocontext.h> #include <linux/key.h> -#include <linux/security.h> #include <linux/cpu.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> @@ -46,6 +45,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/tracehook.h> #include <linux/fs_struct.h> +#include <linux/userfaultfd_k.h> #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> @@ -539,7 +539,7 @@ static void exit_mm(void) __set_current_state(TASK_RUNNING); down_read(&mm->mmap_sem); } - atomic_inc(&mm->mm_count); + mmgrab(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); @@ -548,6 +548,7 @@ static void exit_mm(void) enter_lazy_tlb(mm, current); task_unlock(current); mm_update_next_owner(mm); + userfaultfd_exit(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); @@ -608,15 +609,18 @@ static struct task_struct *find_new_reaper(struct task_struct *father, return thread; if (father->signal->has_child_subreaper) { + unsigned int ns_level = task_pid(father)->level; /* * Find the first ->is_child_subreaper ancestor in our pid_ns. - * We start from father to ensure we can not look into another - * namespace, this is safe because all its threads are dead. 
+ * We can't check reaper != child_reaper to ensure we do not + * cross the namespaces, the exiting parent could be injected + * by setns() + fork(). + * We check pid->level, this is slightly more efficient than + * task_active_pid_ns(reaper) != task_active_pid_ns(father). */ - for (reaper = father; - !same_thread_group(reaper, child_reaper); + for (reaper = father->real_parent; + task_pid(reaper)->level == ns_level; reaper = reaper->real_parent) { - /* call_usermodehelper() descendants need this check */ if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) @@ -1390,7 +1394,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; * then ->notask_error is 0 if @p is an eligible child, - * or another error from security_task_wait(), or still -ECHILD. + * or still -ECHILD. */ static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) @@ -1410,20 +1414,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, if (!ret) return ret; - ret = security_task_wait(p); - if (unlikely(ret < 0)) { - /* - * If we have not yet seen any eligible child, - * then let this error code replace -ECHILD. - * A permission error will give the user a clue - * to look for security policy problems, rather - * than for mysterious wait bugs. - */ - if (wo->notask_error) - wo->notask_error = ret; - return 0; - } - if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case @@ -1516,7 +1506,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; then * ->notask_error is 0 if there were any eligible children, - * or another error from security_task_wait(), or still -ECHILD. + * or still -ECHILD. 
*/ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) { diff --git a/kernel/extable.c b/kernel/extable.c index e1359474baa5..2676d7f8baf6 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -17,10 +17,12 @@ */ #include <linux/ftrace.h> #include <linux/memory.h> +#include <linux/extable.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/init.h> #include <linux/kprobes.h> +#include <linux/filter.h> #include <asm/sections.h> #include <linux/uaccess.h> @@ -107,6 +109,8 @@ int __kernel_text_address(unsigned long addr) return 1; if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) return 1; + if (is_bpf_text_address(addr)) + return 1; /* * There might be init symbols in saved stacktraces. * Give those symbols a chance to be printed in @@ -130,6 +134,8 @@ int kernel_text_address(unsigned long addr) return 1; if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr)) return 1; + if (is_bpf_text_address(addr)) + return 1; return 0; } diff --git a/kernel/fork.c b/kernel/fork.c index ff82e24573b6..246bf9aaf9df 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -55,6 +55,7 @@ #include <linux/rmap.h> #include <linux/ksm.h> #include <linux/acct.h> +#include <linux/userfaultfd_k.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/freezer.h> @@ -561,6 +562,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; + LIST_HEAD(uf); uprobe_start_dup_mmap(); if (down_write_killable(&oldmm->mmap_sem)) { @@ -617,12 +619,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (retval) goto fail_nomem_policy; tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= - ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP); + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); tmp->vm_next = 
tmp->vm_prev = NULL; - tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); @@ -678,6 +681,7 @@ out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); + dup_userfaultfd_complete(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); return retval; @@ -996,7 +1000,7 @@ struct mm_struct *get_task_mm(struct task_struct *task) if (task->flags & PF_KTHREAD) mm = NULL; else - atomic_inc(&mm->mm_users); + mmget(mm); } task_unlock(task); return mm; @@ -1184,7 +1188,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) vmacache_flush(tsk); if (clone_flags & CLONE_VM) { - atomic_inc(&oldmm->mm_users); + mmget(oldmm); mm = oldmm; goto good_mm; } @@ -1373,9 +1377,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; - sig->has_child_subreaper = current->signal->has_child_subreaper || - current->signal->is_child_subreaper; - mutex_init(&sig->cred_guard_mutex); return 0; @@ -1810,6 +1811,13 @@ static __latent_entropy struct task_struct *copy_process( p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); + /* + * Inherit has_child_subreaper flag under the same + * tasklist_lock with adding child to the process tree + * for propagate_has_child_subreaper optimization. 
+ */ + p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || + p->real_parent->signal->is_child_subreaper; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_PGID); @@ -2063,6 +2071,38 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, } #endif +void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data) +{ + struct task_struct *leader, *parent, *child; + int res; + + read_lock(&tasklist_lock); + leader = top = top->group_leader; +down: + for_each_thread(leader, parent) { + list_for_each_entry(child, &parent->children, sibling) { + res = visitor(child, data); + if (res) { + if (res < 0) + goto out; + leader = child; + goto down; + } +up: + ; + } + } + + if (leader != top) { + child = leader; + parent = child->real_parent; + leader = parent->group_leader; + goto up; + } +out: + read_unlock(&tasklist_lock); +} + #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif diff --git a/kernel/futex.c b/kernel/futex.c index cdf365036141..b687cb22301c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -338,7 +338,7 @@ static inline bool should_fail_futex(bool fshared) static inline void futex_get_mm(union futex_key *key) { - atomic_inc(&key->private.mm->mm_count); + mmgrab(key->private.mm); /* * Ensure futex_get_mm() implies a full barrier such that * get_futex_key() implies a full barrier. This is relied upon diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6b669593e7eb..944d068b6c48 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -353,7 +353,7 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) return 0; /* - * Preserve the managed affinity setting and an userspace affinity + * Preserve the managed affinity setting and a userspace affinity * setup, but make sure that one of the targets is online. 
*/ if (irqd_affinity_is_managed(&desc->irq_data) || diff --git a/kernel/jump_label.c b/kernel/jump_label.c index a9b8cf500591..6c9cb208ac48 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -236,12 +236,28 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry static inline struct jump_entry *static_key_entries(struct static_key *key) { - return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK); + WARN_ON_ONCE(key->type & JUMP_TYPE_LINKED); + return (struct jump_entry *)(key->type & ~JUMP_TYPE_MASK); } static inline bool static_key_type(struct static_key *key) { - return (unsigned long)key->entries & JUMP_TYPE_MASK; + return key->type & JUMP_TYPE_TRUE; +} + +static inline bool static_key_linked(struct static_key *key) +{ + return key->type & JUMP_TYPE_LINKED; +} + +static inline void static_key_clear_linked(struct static_key *key) +{ + key->type &= ~JUMP_TYPE_LINKED; +} + +static inline void static_key_set_linked(struct static_key *key) +{ + key->type |= JUMP_TYPE_LINKED; } static inline struct static_key *jump_entry_key(struct jump_entry *entry) @@ -254,6 +270,26 @@ static bool jump_entry_branch(struct jump_entry *entry) return (unsigned long)entry->key & 1UL; } +/*** + * A 'struct static_key' uses a union such that it either points directly + * to a table of 'struct jump_entry' or to a linked list of modules which in + * turn point to 'struct jump_entry' tables. + * + * The two lower bits of the pointer are used to keep track of which pointer + * type is in use and to store the initial branch direction, we use an access + * function which preserves these bits. 
+ */ +static void static_key_set_entries(struct static_key *key, + struct jump_entry *entries) +{ + unsigned long type; + + WARN_ON_ONCE((unsigned long)entries & JUMP_TYPE_MASK); + type = key->type & JUMP_TYPE_MASK; + key->entries = entries; + key->type |= type; +} + static enum jump_label_type jump_label_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); @@ -313,13 +349,7 @@ void __init jump_label_init(void) continue; key = iterk; - /* - * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. - */ - *((unsigned long *)&key->entries) += (unsigned long)iter; -#ifdef CONFIG_MODULES - key->next = NULL; -#endif + static_key_set_entries(key, iter); } static_key_initialized = true; jump_label_unlock(); @@ -343,6 +373,29 @@ struct static_key_mod { struct module *mod; }; +static inline struct static_key_mod *static_key_mod(struct static_key *key) +{ + WARN_ON_ONCE(!(key->type & JUMP_TYPE_LINKED)); + return (struct static_key_mod *)(key->type & ~JUMP_TYPE_MASK); +} + +/*** + * key->type and key->next are the same via union. + * This sets key->next and preserves the type bits. + * + * See additional comments above static_key_set_entries(). 
+ */ +static void static_key_set_mod(struct static_key *key, + struct static_key_mod *mod) +{ + unsigned long type; + + WARN_ON_ONCE((unsigned long)mod & JUMP_TYPE_MASK); + type = key->type & JUMP_TYPE_MASK; + key->next = mod; + key->type |= type; +} + static int __jump_label_mod_text_reserved(void *start, void *end) { struct module *mod; @@ -365,11 +418,23 @@ static void __jump_label_mod_update(struct static_key *key) { struct static_key_mod *mod; - for (mod = key->next; mod; mod = mod->next) { - struct module *m = mod->mod; + for (mod = static_key_mod(key); mod; mod = mod->next) { + struct jump_entry *stop; + struct module *m; + + /* + * NULL if the static_key is defined in a module + * that does not use it + */ + if (!mod->entries) + continue; - __jump_label_update(key, mod->entries, - m->jump_entries + m->num_jump_entries); + m = mod->mod; + if (!m) + stop = __stop___jump_table; + else + stop = m->jump_entries + m->num_jump_entries; + __jump_label_update(key, mod->entries, stop); } } @@ -404,7 +469,7 @@ static int jump_label_add_module(struct module *mod) struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; struct static_key *key = NULL; - struct static_key_mod *jlm; + struct static_key_mod *jlm, *jlm2; /* if the module doesn't have jump label entries, just return */ if (iter_start == iter_stop) @@ -421,20 +486,32 @@ static int jump_label_add_module(struct module *mod) key = iterk; if (within_module(iter->key, mod)) { - /* - * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. 
- */ - *((unsigned long *)&key->entries) += (unsigned long)iter; - key->next = NULL; + static_key_set_entries(key, iter); continue; } jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; + if (!static_key_linked(key)) { + jlm2 = kzalloc(sizeof(struct static_key_mod), + GFP_KERNEL); + if (!jlm2) { + kfree(jlm); + return -ENOMEM; + } + preempt_disable(); + jlm2->mod = __module_address((unsigned long)key); + preempt_enable(); + jlm2->entries = static_key_entries(key); + jlm2->next = NULL; + static_key_set_mod(key, jlm2); + static_key_set_linked(key); + } jlm->mod = mod; jlm->entries = iter; - jlm->next = key->next; - key->next = jlm; + jlm->next = static_key_mod(key); + static_key_set_mod(key, jlm); + static_key_set_linked(key); /* Only update if we've changed from our initial state */ if (jump_label_type(iter) != jump_label_init_type(iter)) @@ -461,16 +538,34 @@ static void jump_label_del_module(struct module *mod) if (within_module(iter->key, mod)) continue; + /* No memory during module load */ + if (WARN_ON(!static_key_linked(key))) + continue; + prev = &key->next; - jlm = key->next; + jlm = static_key_mod(key); while (jlm && jlm->mod != mod) { prev = &jlm->next; jlm = jlm->next; } - if (jlm) { + /* No memory during module load */ + if (WARN_ON(!jlm)) + continue; + + if (prev == &key->next) + static_key_set_mod(key, jlm->next); + else *prev = jlm->next; + + kfree(jlm); + + jlm = static_key_mod(key); + /* if only one entry is left, fold it back into the static_key */ + if (jlm->next == NULL) { + static_key_set_entries(key, jlm->entries); + static_key_clear_linked(key); kfree(jlm); } } @@ -499,8 +594,10 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, case MODULE_STATE_COMING: jump_label_lock(); ret = jump_label_add_module(mod); - if (ret) + if (ret) { + WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n"); jump_label_del_module(mod); + } jump_label_unlock(); break; case 
MODULE_STATE_GOING: @@ -561,11 +658,14 @@ int jump_label_text_reserved(void *start, void *end) static void jump_label_update(struct static_key *key) { struct jump_entry *stop = __stop___jump_table; - struct jump_entry *entry = static_key_entries(key); + struct jump_entry *entry; #ifdef CONFIG_MODULES struct module *mod; - __jump_label_mod_update(key); + if (static_key_linked(key)) { + __jump_label_mod_update(key); + return; + } preempt_disable(); mod = __module_address((unsigned long)key); @@ -573,6 +673,7 @@ static void jump_label_update(struct static_key *key) stop = mod->jump_entries + mod->num_jump_entries; preempt_enable(); #endif + entry = static_key_entries(key); /* if there are no users, entry can be NULL */ if (entry) __jump_label_update(key, entry, stop); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fafd1a3ef0da..6a3b249a2ae1 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -23,6 +23,7 @@ #include <linux/mm.h> #include <linux/ctype.h> #include <linux/slab.h> +#include <linux/filter.h> #include <linux/compiler.h> #include <asm/sections.h> @@ -300,10 +301,11 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { char namebuf[KSYM_NAME_LEN]; + if (is_ksym_addr(addr)) return !!get_symbol_pos(addr, symbolsize, offset); - - return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf); + return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) || + !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } /* @@ -318,6 +320,8 @@ const char *kallsyms_lookup(unsigned long addr, unsigned long *offset, char **modname, char *namebuf) { + const char *ret; + namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; @@ -333,9 +337,13 @@ const char *kallsyms_lookup(unsigned long addr, return namebuf; } - /* See if it's in a module. */ - return module_address_lookup(addr, symbolsize, offset, modname, - namebuf); + /* See if it's in a module or a BPF JITed image. 
*/ + ret = module_address_lookup(addr, symbolsize, offset, + modname, namebuf); + if (!ret) + ret = bpf_address_lookup(addr, symbolsize, + offset, modname, namebuf); + return ret; } int lookup_symbol_name(unsigned long addr, char *symname) @@ -471,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol); /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ struct kallsym_iter { loff_t pos; + loff_t pos_mod_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; @@ -481,13 +490,27 @@ struct kallsym_iter { static int get_ksymbol_mod(struct kallsym_iter *iter) { - if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, - &iter->type, iter->name, iter->module_name, - &iter->exported) < 0) + int ret = module_get_kallsym(iter->pos - kallsyms_num_syms, + &iter->value, &iter->type, + iter->name, iter->module_name, + &iter->exported); + if (ret < 0) { + iter->pos_mod_end = iter->pos; return 0; + } + return 1; } +static int get_ksymbol_bpf(struct kallsym_iter *iter) +{ + iter->module_name[0] = '\0'; + iter->exported = 0; + return bpf_get_kallsym(iter->pos - iter->pos_mod_end, + &iter->value, &iter->type, + iter->name) < 0 ? 0 : 1; +} + /* Returns space to next name. */ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) { @@ -508,16 +531,30 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->name[0] = '\0'; iter->nameoff = get_symbol_offset(new_pos); iter->pos = new_pos; + if (new_pos == 0) + iter->pos_mod_end = 0; +} + +static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) +{ + iter->pos = pos; + + if (iter->pos_mod_end > 0 && + iter->pos_mod_end < iter->pos) + return get_ksymbol_bpf(iter); + + if (!get_ksymbol_mod(iter)) + return get_ksymbol_bpf(iter); + + return 1; } /* Returns false if pos at or past end of file. */ static int update_iter(struct kallsym_iter *iter, loff_t pos) { /* Module symbols can be accessed randomly. 
*/ - if (pos >= kallsyms_num_syms) { - iter->pos = pos; - return get_ksymbol_mod(iter); - } + if (pos >= kallsyms_num_syms) + return update_iter_mod(iter, pos); /* If we're not on the desired position, reset to new position. */ if (pos != iter->pos) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 5617cc412444..bfe62d5b3872 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -916,7 +916,7 @@ void crash_kexec(struct pt_regs *regs) old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. */ - printk_nmi_flush_on_panic(); + printk_safe_flush_on_panic(); __crash_kexec(regs); /* @@ -1399,7 +1399,7 @@ void __weak arch_crash_save_vmcoreinfo(void) phys_addr_t __weak paddr_vmcoreinfo_note(void) { - return __pa((unsigned long)(char *)&vmcoreinfo_note); + return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); } static int __init crash_save_vmcoreinfo_init(void) diff --git a/kernel/kmod.c b/kernel/kmod.c index d45c96073afb..0c407f905ca4 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -516,7 +516,7 @@ static void helper_unlock(void) * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. 
*/ -struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, +struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, char **envp, gfp_t gfp_mask, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), @@ -528,7 +528,12 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, goto out; INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); + +#ifdef CONFIG_STATIC_USERMODEHELPER + sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH; +#else sub_info->path = path; +#endif sub_info->argv = argv; sub_info->envp = envp; @@ -566,6 +571,15 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) retval = -EBUSY; goto out; } + + /* + * If there is no binary for us to call, then just return and get out of + * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and + * disable all call_usermodehelper() calls. + */ + if (strlen(sub_info->path) == 0) + goto out; + /* * Set the completion pointer only if there is a waiter. * This makes it possible to use umh_complete to free @@ -613,7 +627,7 @@ EXPORT_SYMBOL(call_usermodehelper_exec); * This function is the equivalent to use call_usermodehelper_setup() and * call_usermodehelper_exec(). */ -int call_usermodehelper(char *path, char **argv, char **envp, int wait) +int call_usermodehelper(const char *path, char **argv, char **envp, int wait) { struct subprocess_info *info; gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? 
GFP_ATOMIC : GFP_KERNEL; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ebb4dadca66b..699c5bc51a92 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1740,6 +1740,12 @@ void unregister_kprobes(struct kprobe **kps, int num) } EXPORT_SYMBOL_GPL(unregister_kprobes); +int __weak __kprobes kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return NOTIFY_DONE; +} + static struct notifier_block kprobe_exceptions_nb = { .notifier_call = kprobe_exceptions_notify, .priority = 0x7fffffff /* we need to be notified first */ diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index ee1bc1bb8feb..0999679d6f26 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -195,7 +195,7 @@ static ssize_t notes_read(struct file *filp, struct kobject *kobj, return count; } -static struct bin_attribute notes_attr = { +static struct bin_attribute notes_attr __ro_after_init = { .attr = { .name = "notes", .mode = S_IRUGO, diff --git a/kernel/memremap.c b/kernel/memremap.c index 9ecedc28b928..06123234f118 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -246,9 +246,13 @@ static void devm_memremap_pages_release(struct device *dev, void *data) /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); + + lock_device_hotplug(); mem_hotplug_begin(); arch_remove_memory(align_start, align_size); mem_hotplug_done(); + unlock_device_hotplug(); + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, @@ -360,9 +364,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (error) goto err_pfn_remap; + lock_device_hotplug(); mem_hotplug_begin(); error = arch_add_memory(nid, align_start, align_size, true); mem_hotplug_done(); + unlock_device_hotplug(); if (error) goto err_add_memory; diff --git a/kernel/module.c b/kernel/module.c index 
3d8f126208e3..7eba6dea4f41 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -17,6 +17,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/export.h> +#include <linux/extable.h> #include <linux/moduleloader.h> #include <linux/trace_events.h> #include <linux/init.h> @@ -61,6 +62,7 @@ #include <linux/pfn.h> #include <linux/bsearch.h> #include <linux/dynamic_debug.h> +#include <linux/audit.h> #include <uapi/linux/module.h> #include "module-internal.h" @@ -74,9 +76,9 @@ /* * Modules' sections will be aligned on page boundaries * to ensure complete separation of code and data, but - * only when CONFIG_DEBUG_SET_MODULE_RONX=y + * only when CONFIG_STRICT_MODULE_RWX=y */ -#ifdef CONFIG_DEBUG_SET_MODULE_RONX +#ifdef CONFIG_STRICT_MODULE_RWX # define debug_align(X) ALIGN(X, PAGE_SIZE) #else # define debug_align(X) (X) @@ -1844,7 +1846,7 @@ static void mod_sysfs_teardown(struct module *mod) mod_sysfs_fini(mod); } -#ifdef CONFIG_DEBUG_SET_MODULE_RONX +#ifdef CONFIG_STRICT_MODULE_RWX /* * LKM RO/NX protection: protect module's text/ro-data * from modification and any data from execution. @@ -2809,6 +2811,8 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info) if (get_modinfo(info, "livepatch")) { mod->klp = true; add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); + pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n", + mod->name); } return 0; @@ -3608,6 +3612,8 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_copy; } + audit_log_kern_module(mod->name); + /* Reserve our place in the list. */ err = add_unformed_module(mod); if (err) @@ -3696,7 +3702,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod->name, after_dashes); } - /* Link in to syfs. */ + /* Link in to sysfs. 
*/ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) goto coming_cleanup; @@ -3719,6 +3725,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod_sysfs_teardown(mod); coming_cleanup: mod->state = MODULE_STATE_GOING; + destroy_params(mod->kp, mod->num_kp); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); klp_module_going(mod); @@ -4165,22 +4172,23 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) struct module *mod; preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (mod->num_exentries == 0) - continue; + mod = __module_address(addr); + if (!mod) + goto out; - e = search_extable(mod->extable, - mod->extable + mod->num_exentries - 1, - addr); - if (e) - break; - } + if (!mod->num_exentries) + goto out; + + e = search_extable(mod->extable, + mod->extable + mod->num_exentries - 1, + addr); +out: preempt_enable(); - /* Now, if we found one, we are running inside it now, hence - we cannot unload the module, hence no refcnt needed. */ + /* + * Now, if we found one, we are running inside it now, hence + * we cannot unload the module, hence no refcnt needed. + */ return e; } diff --git a/kernel/notifier.c b/kernel/notifier.c index fd2c9acbcc19..6196af8a8223 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -95,7 +95,7 @@ static int notifier_call_chain(struct notifier_block **nl, if (nr_calls) (*nr_calls)++; - if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) + if (ret & NOTIFY_STOP_MASK) break; nb = next_nb; nr_to_call--; diff --git a/kernel/panic.c b/kernel/panic.c index 08aa88dde7de..3ec16e603e88 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -188,7 +188,7 @@ void panic(const char *fmt, ...) * Bypass the panic_cpu check and call __crash_kexec directly. 
*/ if (!_crash_kexec_post_notifiers) { - printk_nmi_flush_on_panic(); + printk_safe_flush_on_panic(); __crash_kexec(NULL); /* @@ -213,7 +213,7 @@ void panic(const char *fmt, ...) atomic_notifier_call_chain(&panic_notifier_list, 0, buf); /* Call flush even twice. It tries harder with a single online CPU */ - printk_nmi_flush_on_panic(); + printk_safe_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); /* @@ -273,7 +273,8 @@ void panic(const char *fmt, ...) extern int stop_a_enabled; /* Make sure the user can actually press Stop-A (L1-A) */ stop_a_enabled = 1; - pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n"); + pr_emerg("Press Stop-A (L1-A) from sun keyboard or send break\n" + "twice on console to return to the boot prom\n"); } #endif #if defined(CONFIG_S390) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8951d0d04810..9e1cba069385 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1151,7 +1151,7 @@ static int __init hibernate_setup(char *str) } else if (!strncmp(str, "no", 2)) { noresume = 1; nohibernate = 1; - } else if (IS_ENABLED(CONFIG_DEBUG_RODATA) + } else if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && !strncmp(str, "protect_image", 13)) { enable_restore_image_protection(); } diff --git a/kernel/power/power.h b/kernel/power/power.h index 1dfa0da827d3..7fdc40d31b7d 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -61,12 +61,12 @@ extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); -#ifdef CONFIG_DEBUG_RODATA +#ifdef CONFIG_STRICT_KERNEL_RWX /* kernel/power/snapshot.c */ extern void enable_restore_image_protection(void); #else static inline void enable_restore_image_protection(void) {} -#endif /* CONFIG_DEBUG_RODATA */ +#endif /* CONFIG_STRICT_KERNEL_RWX */ #else /* !CONFIG_HIBERNATION */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 2d8e2b227db8..905d5bbd595f 100644 --- 
a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -38,7 +38,7 @@ #include "power.h" -#ifdef CONFIG_DEBUG_RODATA +#ifdef CONFIG_STRICT_KERNEL_RWX static bool hibernate_restore_protection; static bool hibernate_restore_protection_active; @@ -73,7 +73,7 @@ static inline void hibernate_restore_protection_begin(void) {} static inline void hibernate_restore_protection_end(void) {} static inline void hibernate_restore_protect_page(void *page_address) {} static inline void hibernate_restore_unprotect_page(void *page_address) {} -#endif /* CONFIG_DEBUG_RODATA */ +#endif /* CONFIG_STRICT_KERNEL_RWX */ static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index abb0042a427b..4a2ffc39eb95 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,3 +1,3 @@ obj-y = printk.o -obj-$(CONFIG_PRINTK_NMI) += nmi.o +obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 7fd2838fa417..1db044f808b7 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -16,42 +16,55 @@ */ #include <linux/percpu.h> -typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); +#ifdef CONFIG_PRINTK -int __printf(1, 0) vprintk_default(const char *fmt, va_list args); - -#ifdef CONFIG_PRINTK_NMI +#define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff +#define PRINTK_NMI_CONTEXT_MASK 0x80000000 extern raw_spinlock_t logbuf_lock; +__printf(1, 0) int vprintk_default(const char *fmt, va_list args); +__printf(1, 0) int vprintk_func(const char *fmt, va_list args); +void __printk_safe_enter(void); +void __printk_safe_exit(void); + +#define printk_safe_enter_irqsave(flags) \ + do { \ + local_irq_save(flags); \ + __printk_safe_enter(); \ + } while (0) + +#define printk_safe_exit_irqrestore(flags) \ + do { \ + __printk_safe_exit(); \ + local_irq_restore(flags); \ + } 
while (0) + +#define printk_safe_enter_irq() \ + do { \ + local_irq_disable(); \ + __printk_safe_enter(); \ + } while (0) + +#define printk_safe_exit_irq() \ + do { \ + __printk_safe_exit(); \ + local_irq_enable(); \ + } while (0) + +#else + +__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; } + /* - * printk() could not take logbuf_lock in NMI context. Instead, - * it temporary stores the strings into a per-CPU buffer. - * The alternative implementation is chosen transparently - * via per-CPU variable. + * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem + * semaphore and some of console functions (console_unlock()/etc.), so + * printk-safe must preserve the existing local IRQ guarantees. */ -DECLARE_PER_CPU(printk_func_t, printk_func); -static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) -{ - return this_cpu_read(printk_func)(fmt, args); -} - -extern atomic_t nmi_message_lost; -static inline int get_nmi_message_lost(void) -{ - return atomic_xchg(&nmi_message_lost, 0); -} - -#else /* CONFIG_PRINTK_NMI */ - -static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) -{ - return vprintk_default(fmt, args); -} - -static inline int get_nmi_message_lost(void) -{ - return 0; -} - -#endif /* CONFIG_PRINTK_NMI */ +#define printk_safe_enter_irqsave(flags) local_irq_save(flags) +#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags) + +#define printk_safe_enter_irq() local_irq_disable() +#define printk_safe_exit_irq() local_irq_enable() + +#endif /* CONFIG_PRINTK */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4ba3d34938c0..34da86e73d00 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -213,17 +213,36 @@ static int nr_ext_console_drivers; static int __down_trylock_console_sem(unsigned long ip) { - if (down_trylock(&console_sem)) + int lock_failed; + unsigned long flags; + + /* + * Here and in __up_console_sem() we need to be in safe 
mode, + * because spindump/WARN/etc from under console ->lock will + * deadlock in printk()->down_trylock_console_sem() otherwise. + */ + printk_safe_enter_irqsave(flags); + lock_failed = down_trylock(&console_sem); + printk_safe_exit_irqrestore(flags); + + if (lock_failed) return 1; mutex_acquire(&console_lock_dep_map, 0, 1, ip); return 0; } #define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) -#define up_console_sem() do { \ - mutex_release(&console_lock_dep_map, 1, _RET_IP_);\ - up(&console_sem);\ -} while (0) +static void __up_console_sem(unsigned long ip) +{ + unsigned long flags; + + mutex_release(&console_lock_dep_map, 1, ip); + + printk_safe_enter_irqsave(flags); + up(&console_sem); + printk_safe_exit_irqrestore(flags); +} +#define up_console_sem() __up_console_sem(_RET_IP_) /* * This is used for debugging the mess that is the VT code by @@ -351,6 +370,34 @@ __packed __aligned(4) */ DEFINE_RAW_SPINLOCK(logbuf_lock); +/* + * Helper macros to lock/unlock logbuf_lock and switch between + * printk-safe/unsafe modes. 
+ */ +#define logbuf_lock_irq() \ + do { \ + printk_safe_enter_irq(); \ + raw_spin_lock(&logbuf_lock); \ + } while (0) + +#define logbuf_unlock_irq() \ + do { \ + raw_spin_unlock(&logbuf_lock); \ + printk_safe_exit_irq(); \ + } while (0) + +#define logbuf_lock_irqsave(flags) \ + do { \ + printk_safe_enter_irqsave(flags); \ + raw_spin_lock(&logbuf_lock); \ + } while (0) + +#define logbuf_unlock_irqrestore(flags) \ + do { \ + raw_spin_unlock(&logbuf_lock); \ + printk_safe_exit_irqrestore(flags); \ + } while (0) + #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ @@ -782,20 +829,21 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; - raw_spin_lock_irq(&logbuf_lock); + + logbuf_lock_irq(); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); goto out; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); ret = wait_event_interruptible(log_wait, user->seq != log_next_seq); if (ret) goto out; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); } if (user->seq < log_first_seq) { @@ -803,7 +851,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_first_idx; user->seq = log_first_seq; ret = -EPIPE; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); goto out; } @@ -816,7 +864,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_next(user->idx); user->seq++; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); if (len > count) { ret = -EINVAL; @@ -843,7 +891,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) if (offset) return -ESPIPE; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); switch (whence) { case SEEK_SET: /* the first record */ @@ -867,7 +915,7 @@ static loff_t devkmsg_llseek(struct file *file, 
loff_t offset, int whence) default: ret = -EINVAL; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); return ret; } @@ -881,7 +929,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) @@ -889,7 +937,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) else ret = POLLIN|POLLRDNORM; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); return ret; } @@ -919,10 +967,10 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); user->idx = log_first_idx; user->seq = log_first_seq; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); file->private_data = user; return 0; @@ -1064,13 +1112,13 @@ void __init setup_log_buf(int early) return; } - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; free = __LOG_BUF_LEN - log_next_idx; memcpy(log_buf, __log_buf, __LOG_BUF_LEN); - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); pr_info("log_buf_len: %d bytes\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", @@ -1248,7 +1296,7 @@ static int syslog_print(char __user *buf, int size) size_t n; size_t skip; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -1256,7 +1304,7 @@ static int syslog_print(char __user *buf, int size) syslog_partial = 0; } if (syslog_seq == log_next_seq) { - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); break; } @@ -1275,7 +1323,7 @@ static int syslog_print(char __user *buf, int size) syslog_partial += n; } else n = 0; - 
raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); if (!n) break; @@ -1304,7 +1352,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) if (!text) return -ENOMEM; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (buf) { u64 next_seq; u64 seq; @@ -1352,12 +1400,12 @@ static int syslog_print_all(char __user *buf, int size, bool clear) idx = log_next(idx); seq++; - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); if (copy_to_user(buf + len, text, textlen)) len = -EFAULT; else len += textlen; - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (seq < log_first_seq) { /* messages are gone, move to next one */ @@ -1371,7 +1419,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) clear_seq = log_next_seq; clear_idx = log_next_idx; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); kfree(text); return len; @@ -1458,7 +1506,7 @@ int do_syslog(int type, char __user *buf, int len, int source) break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: - raw_spin_lock_irq(&logbuf_lock); + logbuf_lock_irq(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -1486,7 +1534,7 @@ int do_syslog(int type, char __user *buf, int len, int source) } error -= syslog_partial; } - raw_spin_unlock_irq(&logbuf_lock); + logbuf_unlock_irq(); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: @@ -1510,8 +1558,7 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) * log_buf[start] to log_buf[end - 1]. * The console_lock must be held. */ -static void call_console_drivers(int level, - const char *ext_text, size_t ext_len, +static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) { struct console *con; @@ -1538,28 +1585,6 @@ static void call_console_drivers(int level, } } -/* - * Zap console related locks when oopsing. 
- * To leave time for slow consoles to print a full oops, - * only zap at most once every 30 seconds. - */ -static void zap_locks(void) -{ - static unsigned long oops_timestamp; - - if (time_after_eq(jiffies, oops_timestamp) && - !time_after(jiffies, oops_timestamp + 30 * HZ)) - return; - - oops_timestamp = jiffies; - - debug_locks_off(); - /* If a crash is occurring, make sure we can't deadlock */ - raw_spin_lock_init(&logbuf_lock); - /* And make sure that we print immediately */ - sema_init(&console_sem, 1); -} - int printk_delay_msec __read_mostly; static inline void printk_delay(void) @@ -1669,18 +1694,13 @@ asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) { - static bool recursion_bug; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len = 0; enum log_flags lflags = 0; unsigned long flags; - int this_cpu; int printed_len = 0; - int nmi_message_lost; bool in_sched = false; - /* cpu currently holding logbuf_lock in this function */ - static unsigned int logbuf_cpu = UINT_MAX; if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; @@ -1690,53 +1710,8 @@ asmlinkage int vprintk_emit(int facility, int level, boot_delay_msec(level); printk_delay(); - local_irq_save(flags); - this_cpu = smp_processor_id(); - - /* - * Ouch, printk recursed into itself! - */ - if (unlikely(logbuf_cpu == this_cpu)) { - /* - * If a crash is occurring during printk() on this CPU, - * then try to get the crash message out but make sure - * we can't deadlock. 
Otherwise just return to avoid the - * recursion and return - but flag the recursion so that - * it can be printed at the next appropriate moment: - */ - if (!oops_in_progress && !lockdep_recursing(current)) { - recursion_bug = true; - local_irq_restore(flags); - return 0; - } - zap_locks(); - } - - lockdep_off(); /* This stops the holder of console_sem just where we want him */ - raw_spin_lock(&logbuf_lock); - logbuf_cpu = this_cpu; - - if (unlikely(recursion_bug)) { - static const char recursion_msg[] = - "BUG: recent printk recursion!"; - - recursion_bug = false; - /* emit KERN_CRIT message */ - printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, recursion_msg, - strlen(recursion_msg)); - } - - nmi_message_lost = get_nmi_message_lost(); - if (unlikely(nmi_message_lost)) { - text_len = scnprintf(textbuf, sizeof(textbuf), - "BAD LUCK: lost %d message(s) from NMI context!", - nmi_message_lost); - printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, textbuf, text_len); - } - + logbuf_lock_irqsave(flags); /* * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. @@ -1779,14 +1754,10 @@ asmlinkage int vprintk_emit(int facility, int level, printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len); - logbuf_cpu = UINT_MAX; - raw_spin_unlock(&logbuf_lock); - lockdep_on(); - local_irq_restore(flags); + logbuf_unlock_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { - lockdep_off(); /* * Try to acquire and then immediately release the console * semaphore. 
The release will print out buffers and wake up @@ -1794,7 +1765,6 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (console_trylock()) console_unlock(); - lockdep_on(); } return printed_len; @@ -1803,7 +1773,7 @@ EXPORT_SYMBOL(vprintk_emit); asmlinkage int vprintk(const char *fmt, va_list args) { - return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + return vprintk_func(fmt, args); } EXPORT_SYMBOL(vprintk); @@ -1895,16 +1865,12 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, static ssize_t msg_print_ext_body(char *buf, size_t size, char *dict, size_t dict_len, char *text, size_t text_len) { return 0; } -static void call_console_drivers(int level, - const char *ext_text, size_t ext_len, +static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } static bool suppress_message_printing(int level) { return false; } -/* Still needs to be defined for users */ -DEFINE_PER_CPU(printk_func_t, printk_func); - #endif /* CONFIG_PRINTK */ #ifdef CONFIG_EARLY_PRINTK @@ -2220,9 +2186,9 @@ again: struct printk_log *msg; size_t ext_len = 0; size_t len; - int level; - raw_spin_lock_irqsave(&logbuf_lock, flags); + printk_safe_enter_irqsave(flags); + raw_spin_lock(&logbuf_lock); if (seen_seq != log_next_seq) { wake_klogd = true; seen_seq = log_next_seq; @@ -2243,8 +2209,7 @@ skip: break; msg = log_from_idx(console_idx); - level = msg->level; - if (suppress_message_printing(level)) { + if (suppress_message_printing(msg->level)) { /* * Skip record we have buffered and already printed * directly to the console when we received it, and @@ -2270,9 +2235,9 @@ skip: raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(level, ext_text, ext_len, text, len); + call_console_drivers(ext_text, ext_len, text, len); start_critical_timings(); - 
local_irq_restore(flags); + printk_safe_exit_irqrestore(flags); if (do_cond_resched) cond_resched(); @@ -2295,7 +2260,8 @@ skip: */ raw_spin_lock(&logbuf_lock); retry = console_seq != log_next_seq; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock(&logbuf_lock); + printk_safe_exit_irqrestore(flags); if (retry && console_trylock()) goto again; @@ -2558,10 +2524,10 @@ void register_console(struct console *newcon) * console_unlock(); will print out the buffered messages * for us. */ - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); console_seq = syslog_seq; console_idx = syslog_idx; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); /* * We're about to replay the log buffer. Only do this to the * just-registered console to avoid excessive message spam to @@ -2860,12 +2826,12 @@ void kmsg_dump(enum kmsg_dump_reason reason) /* initialize iterator with data about the stored records */ dumper->active = true; - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); dumper->cur_seq = clear_seq; dumper->cur_idx = clear_idx; dumper->next_seq = log_next_seq; dumper->next_idx = log_next_idx; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); /* invoke dumper which will iterate over records */ dumper->dump(dumper, reason); @@ -2950,9 +2916,9 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, unsigned long flags; bool ret; - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); return ret; } @@ -2991,7 +2957,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, if (!dumper->active) goto out; - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); if (dumper->cur_seq < log_first_seq) { /* messages are gone, move to first 
available one */ dumper->cur_seq = log_first_seq; @@ -3000,7 +2966,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* last entry */ if (dumper->cur_seq >= dumper->next_seq) { - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); goto out; } @@ -3042,7 +3008,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, dumper->next_seq = next_seq; dumper->next_idx = next_idx; ret = true; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); out: if (len) *len = l; @@ -3080,9 +3046,9 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) { unsigned long flags; - raw_spin_lock_irqsave(&logbuf_lock, flags); + logbuf_lock_irqsave(flags); kmsg_dump_rewind_nolock(dumper); - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + logbuf_unlock_irqrestore(flags); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); diff --git a/kernel/printk/nmi.c b/kernel/printk/printk_safe.c index f011aaef583c..033e50a7d706 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/printk_safe.c @@ -1,5 +1,5 @@ /* - * nmi.c - Safe printk in NMI context + * printk_safe.c - Safe printk for printk-deadlock-prone contexts * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -32,36 +32,58 @@ * is later flushed into the main ring buffer via IRQ work. * * The alternative implementation is chosen transparently - * via @printk_func per-CPU variable. + * by examining the current printk() context mask stored in @printk_context + * per-CPU variable. * * The implementation allows to flush the strings also from another CPU. * There are situations when we want to make sure that all buffers * were handled or when IRQs are blocked.
*/ -DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; -static int printk_nmi_irq_ready; -atomic_t nmi_message_lost; +static int printk_safe_irq_ready; -#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) - \ - sizeof(atomic_t) - sizeof(struct irq_work)) +#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \ + sizeof(atomic_t) - \ + sizeof(atomic_t) - \ + sizeof(struct irq_work)) -struct nmi_seq_buf { +struct printk_safe_seq_buf { atomic_t len; /* length of written data */ + atomic_t message_lost; struct irq_work work; /* IRQ work that flushes the buffer */ - unsigned char buffer[NMI_LOG_BUF_LEN]; + unsigned char buffer[SAFE_LOG_BUF_LEN]; }; -static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); + +static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq); +static DEFINE_PER_CPU(int, printk_context); + +#ifdef CONFIG_PRINTK_NMI +static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); +#endif + +/* Get flushed in a more safe context. */ +static void queue_flush_work(struct printk_safe_seq_buf *s) +{ + if (printk_safe_irq_ready) { + /* Make sure that IRQ work is really initialized. */ + smp_rmb(); + irq_work_queue(&s->work); + } +} /* - * Safe printk() for NMI context. It uses a per-CPU buffer to - * store the message. NMIs are not nested, so there is always only - * one writer running. But the buffer might get flushed from another - * CPU, so we need to be careful. + * Add a message to per-CPU context-dependent buffer. NMI and printk-safe + * have dedicated buffers, because otherwise printk-safe preempted by + * NMI-printk would have overwritten the NMI messages. + * + * The messages are flushed from irq work (or from panic()), possibly + * from another CPU, concurrently with printk_safe_log_store(). Should this + * happen, printk_safe_log_store() will notice the buffer->len mismatch + * and repeat the write.
*/ -static int vprintk_nmi(const char *fmt, va_list args) +static int printk_safe_log_store(struct printk_safe_seq_buf *s, + const char *fmt, va_list args) { - struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - int add = 0; + int add; size_t len; again: @@ -69,18 +91,21 @@ again: /* The trailing '\0' is not counted into len. */ if (len >= sizeof(s->buffer) - 1) { - atomic_inc(&nmi_message_lost); + atomic_inc(&s->message_lost); + queue_flush_work(s); return 0; } /* - * Make sure that all old data have been read before the buffer was - * reseted. This is not needed when we just append data. + * Make sure that all old data have been read before the buffer + * was reset. This is not needed when we just append data. */ if (!len) smp_rmb(); add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + if (!add) + return 0; /* * Do it once again if the buffer has been flushed in the meantime. @@ -90,32 +115,23 @@ again: if (atomic_cmpxchg(&s->len, len, len + add) != len) goto again; - /* Get flushed in a more safe context. */ - if (add && printk_nmi_irq_ready) { - /* Make sure that IRQ work is really initialized. */ - smp_rmb(); - irq_work_queue(&s->work); - } - + queue_flush_work(s); return add; } -static void printk_nmi_flush_line(const char *text, int len) +static inline void printk_safe_flush_line(const char *text, int len) { /* - * The buffers are flushed in NMI only on panic. The messages must - * go only into the ring buffer at this stage. Consoles will get - * explicitly called later when a crashdump is not generated. + * Avoid any console drivers calls from here, because we may be + * in NMI or printk_safe context (when in panic). The messages + * must go only into the ring buffer at this stage. Consoles will + * get explicitly called later when a crashdump is not generated. 
*/ - if (in_nmi()) - printk_deferred("%.*s", len, text); - else - printk("%.*s", len, text); - + printk_deferred("%.*s", len, text); } /* printk part of the temporary buffer line by line */ -static int printk_nmi_flush_buffer(const char *start, size_t len) +static int printk_safe_flush_buffer(const char *start, size_t len) { const char *c, *end; bool header; @@ -127,7 +143,7 @@ static int printk_nmi_flush_buffer(const char *start, size_t len) /* Print line by line. */ while (c < end) { if (*c == '\n') { - printk_nmi_flush_line(start, c - start + 1); + printk_safe_flush_line(start, c - start + 1); start = ++c; header = true; continue; @@ -140,7 +156,7 @@ static int printk_nmi_flush_buffer(const char *start, size_t len) continue; } - printk_nmi_flush_line(start, c - start); + printk_safe_flush_line(start, c - start); start = c++; header = true; continue; @@ -154,22 +170,31 @@ static int printk_nmi_flush_buffer(const char *start, size_t len) if (start < end && !header) { static const char newline[] = KERN_CONT "\n"; - printk_nmi_flush_line(start, end - start); - printk_nmi_flush_line(newline, strlen(newline)); + printk_safe_flush_line(start, end - start); + printk_safe_flush_line(newline, strlen(newline)); } return len; } +static void report_message_lost(struct printk_safe_seq_buf *s) +{ + int lost = atomic_xchg(&s->message_lost, 0); + + if (lost) + printk_deferred("Lost %d message(s)!\n", lost); +} + /* - * Flush data from the associated per_CPU buffer. The function + * Flush data from the associated per-CPU buffer. The function * can be called either via IRQ work or independently. 
*/ -static void __printk_nmi_flush(struct irq_work *work) +static void __printk_safe_flush(struct irq_work *work) { static raw_spinlock_t read_lock = __RAW_SPIN_LOCK_INITIALIZER(read_lock); - struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); + struct printk_safe_seq_buf *s = + container_of(work, struct printk_safe_seq_buf, work); unsigned long flags; size_t len; int i; @@ -194,9 +219,9 @@ more: * buffer size. */ if ((i && i >= len) || len > sizeof(s->buffer)) { - const char *msg = "printk_nmi_flush: internal error\n"; + const char *msg = "printk_safe_flush: internal error\n"; - printk_nmi_flush_line(msg, strlen(msg)); + printk_safe_flush_line(msg, strlen(msg)); len = 0; } @@ -205,7 +230,7 @@ more: /* Make sure that data has been written up to the @len */ smp_rmb(); - i += printk_nmi_flush_buffer(s->buffer + i, len - i); + i += printk_safe_flush_buffer(s->buffer + i, len - i); /* * Check that nothing has got added in the meantime and truncate @@ -217,35 +242,40 @@ more: goto more; out: + report_message_lost(s); raw_spin_unlock_irqrestore(&read_lock, flags); } /** - * printk_nmi_flush - flush all per-cpu nmi buffers. + * printk_safe_flush - flush all per-cpu nmi and printk-safe buffers. * * The buffers are flushed automatically via IRQ work. This function * is useful only when someone wants to be sure that all buffers have * been flushed at some point. */ -void printk_nmi_flush(void) +void printk_safe_flush(void) { int cpu; - for_each_possible_cpu(cpu) - __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); + for_each_possible_cpu(cpu) { +#ifdef CONFIG_PRINTK_NMI + __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work); +#endif + __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work); + } } /** - * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system + * printk_safe_flush_on_panic - flush all per-cpu nmi and printk-safe buffers when the system goes down.
* - * Similar to printk_nmi_flush() but it can be called even in NMI context when + * Similar to printk_safe_flush() but it can be called even in NMI context when * the system goes down. It does the best effort to get NMI messages into * the main ring buffer. * * Note that it could try harder when there is only one CPU online. */ -void printk_nmi_flush_on_panic(void) +void printk_safe_flush_on_panic(void) { /* * Make sure that we could access the main ring buffer. @@ -259,33 +289,97 @@ void printk_nmi_flush_on_panic(void) raw_spin_lock_init(&logbuf_lock); } - printk_nmi_flush(); + printk_safe_flush(); } -void __init printk_nmi_init(void) +#ifdef CONFIG_PRINTK_NMI +/* + * Safe printk() for NMI context. It uses a per-CPU buffer to + * store the message. NMIs are not nested, so there is always only + * one writer running. But the buffer might get flushed from another + * CPU, so we need to be careful. + */ +static int vprintk_nmi(const char *fmt, va_list args) { - int cpu; + struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - for_each_possible_cpu(cpu) { - struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu); + return printk_safe_log_store(s, fmt, args); +} - init_irq_work(&s->work, __printk_nmi_flush); - } +void printk_nmi_enter(void) +{ + this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); +} - /* Make sure that IRQ works are initialized before enabling. */ - smp_wmb(); - printk_nmi_irq_ready = 1; +void printk_nmi_exit(void) +{ + this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); +} - /* Flush pending messages that did not have scheduled IRQ works. */ - printk_nmi_flush(); +#else + +static int vprintk_nmi(const char *fmt, va_list args) +{ + return 0; } -void printk_nmi_enter(void) +#endif /* CONFIG_PRINTK_NMI */ + +/* + * Lock-less printk(), to avoid deadlocks should the printk() recurse + * into itself. It uses a per-CPU buffer to store the message, just like + * NMI. 
+ */ +static int vprintk_safe(const char *fmt, va_list args) { - this_cpu_write(printk_func, vprintk_nmi); + struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); + + return printk_safe_log_store(s, fmt, args); } -void printk_nmi_exit(void) +/* Can be preempted by NMI. */ +void __printk_safe_enter(void) +{ + this_cpu_inc(printk_context); +} + +/* Can be preempted by NMI. */ +void __printk_safe_exit(void) { - this_cpu_write(printk_func, vprintk_default); + this_cpu_dec(printk_context); +} + +__printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) + return vprintk_nmi(fmt, args); + + if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) + return vprintk_safe(fmt, args); + + return vprintk_default(fmt, args); +} + +void __init printk_safe_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct printk_safe_seq_buf *s; + + s = &per_cpu(safe_print_seq, cpu); + init_irq_work(&s->work, __printk_safe_flush); + +#ifdef CONFIG_PRINTK_NMI + s = &per_cpu(nmi_print_seq, cpu); + init_irq_work(&s->work, __printk_safe_flush); +#endif + } + + /* Make sure that IRQ works are initialized before enabling. */ + smp_wmb(); + printk_safe_irq_ready = 1; + + /* Flush pending messages that did not have scheduled IRQ works. */ + printk_safe_flush(); } diff --git a/kernel/relay.c b/kernel/relay.c index 8f18d314a96a..0e413d9eec8a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -39,10 +39,10 @@ static void relay_file_mmap_close(struct vm_area_struct *vma) /* * fault() vm_op implementation for relay file mapping. 
*/ -static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int relay_buf_fault(struct vm_fault *vmf) { struct page *page; - struct rchan_buf *buf = vma->vm_private_data; + struct rchan_buf *buf = vmf->vma->vm_private_data; pgoff_t pgoff = vmf->pgoff; if (!buf) @@ -847,7 +847,7 @@ void relay_close(struct rchan *chan) if (chan->last_toobig) printk(KERN_WARNING "relay: one or more items not logged " - "[item size (%Zd) > sub-buffer size (%Zd)]\n", + "[item size (%zd) > sub-buffer size (%zd)]\n", chan->last_toobig, chan->subbuf_size); list_del(&chan->list); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 34e2291a9a6c..bbfb917a9b49 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -23,6 +23,9 @@ #include <asm/switch_to.h> #include <asm/tlb.h> +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif #include "sched.h" #include "../workqueue_internal.h" @@ -1087,6 +1090,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, int ret = 0; rq = task_rq_lock(p, &rf); + update_rq_clock(rq); if (p->flags & PF_KTHREAD) { /* @@ -2844,7 +2848,7 @@ context_switch(struct rq *rq, struct task_struct *prev, if (!mm) { next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); + mmgrab(oldmm); enter_lazy_tlb(oldmm, next); } else switch_mm_irqs_off(oldmm, mm, next); @@ -5557,7 +5561,7 @@ static void migrate_tasks(struct rq *dead_rq) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; - struct rq_flags rf, old_rf; + struct rq_flags rf; int dest_cpu; /* @@ -5576,7 +5580,9 @@ static void migrate_tasks(struct rq *dead_rq) * class method both need to have an up-to-date * value of rq->clock[_task] */ + rq_pin_lock(rq, &rf); update_rq_clock(rq); + rq_unpin_lock(rq, &rf); for (;;) { /* @@ -5589,7 +5595,7 @@ static void migrate_tasks(struct rq *dead_rq) /* * pick_next_task() assumes pinned rq->lock: */ - rq_pin_lock(rq, &rf); + rq_repin_lock(rq, &rf); next = pick_next_task(rq, &fake_task, &rf); BUG_ON(!next); 
next->sched_class->put_prev_task(rq, next); @@ -5618,13 +5624,6 @@ static void migrate_tasks(struct rq *dead_rq) continue; } - /* - * __migrate_task() may return with a different - * rq->lock held and a new cookie in 'rf', but we need - * to preserve rf::clock_update_flags for 'dead_rq'. - */ - old_rf = rf; - /* Find suitable destination for @next, with force if needed. */ dest_cpu = select_fallback_rq(dead_rq->cpu, next); @@ -5633,7 +5632,6 @@ static void migrate_tasks(struct rq *dead_rq) raw_spin_unlock(&rq->lock); rq = dead_rq; raw_spin_lock(&rq->lock); - rf = old_rf; } raw_spin_unlock(&next->pi_lock); } @@ -6095,7 +6093,7 @@ void __init sched_init(void) /* * The boot idle thread does lazy MMU switching as well: */ - atomic_inc(&init_mm.mm_count); + mmgrab(&init_mm); enter_lazy_tlb(&init_mm, current); /* @@ -6816,11 +6814,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); - sched_online_group(tg, parent); - return &tg->css; } +/* Expose task group only after completing cgroup initialization */ +static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + struct task_group *parent = css_tg(css->parent); + + if (parent) + sched_online_group(tg, parent); + return 0; +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ -7226,6 +7233,7 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, + .css_online = cpu_cgroup_css_online, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .fork = cpu_cgroup_fork, diff --git a/kernel/seccomp.c b/kernel/seccomp.c index f7ce79a46050..e15185c28de5 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -16,6 +16,7 @@ #include <linux/atomic.h> #include <linux/audit.h> #include <linux/compat.h> +#include <linux/coredump.h> #include <linux/sched.h> #include <linux/seccomp.h> #include 
<linux/slab.h> @@ -486,6 +487,17 @@ void put_seccomp_filter(struct task_struct *tsk) } } +static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) +{ + memset(info, 0, sizeof(*info)); + info->si_signo = SIGSYS; + info->si_code = SYS_SECCOMP; + info->si_call_addr = (void __user *)KSTK_EIP(current); + info->si_errno = reason; + info->si_arch = syscall_get_arch(); + info->si_syscall = syscall; +} + /** * seccomp_send_sigsys - signals the task to allow in-process syscall emulation * @syscall: syscall number to send to userland @@ -496,13 +508,7 @@ void put_seccomp_filter(struct task_struct *tsk) static void seccomp_send_sigsys(int syscall, int reason) { struct siginfo info; - memset(&info, 0, sizeof(info)); - info.si_signo = SIGSYS; - info.si_code = SYS_SECCOMP; - info.si_call_addr = (void __user *)KSTK_EIP(current); - info.si_errno = reason; - info.si_arch = syscall_get_arch(); - info.si_syscall = syscall; + seccomp_init_siginfo(&info, syscall, reason); force_sig_info(SIGSYS, &info, current); } #endif /* CONFIG_SECCOMP_FILTER */ @@ -634,10 +640,20 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; case SECCOMP_RET_KILL: - default: + default: { + siginfo_t info; audit_seccomp(this_syscall, SIGSYS, action); + /* Dump core only if this is the last remaining thread. */ + if (get_nr_threads(current) == 1) { + /* Show the original registers in the dump. */ + syscall_rollback(current, task_pt_regs(current)); + /* Trigger a manual coredump since do_exit skips it. 
*/ + seccomp_init_siginfo(&info, this_syscall, data); + do_coredump(&info); + } do_exit(SIGSYS); } + } unreachable(); diff --git a/kernel/signal.c b/kernel/signal.c index 13f9def8b24a..214a8feeb771 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3239,10 +3239,17 @@ int compat_restore_altstack(const compat_stack_t __user *uss) int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) { + int err; struct task_struct *t = current; - return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) | - __put_user(sas_ss_flags(sp), &uss->ss_flags) | + err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), + &uss->ss_sp) | + __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); + if (err) + return err; + if (t->sas_ss_flags & SS_AUTODISARM) + sas_ss_reset(t); + return 0; } #endif diff --git a/kernel/sys.c b/kernel/sys.c index 7d4a9a6df956..b07adca97ea3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2063,6 +2063,24 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) } #endif +static int propagate_has_child_subreaper(struct task_struct *p, void *data) +{ + /* + * If task has has_child_subreaper - all its descendants + * already have this flag too and new descendants will + * inherit it on fork, skip them. + * + * If we've found child_reaper - skip descendants in + * its subtree as they will never get out pidns.
+ */ + if (p->signal->has_child_subreaper || + is_child_reaper(task_pid(p))) + return 0; + + p->signal->has_child_subreaper = 1; + return 1; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2214,6 +2232,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; + if (!arg2) + break; + + walk_process_tree(me, propagate_has_child_subreaper, NULL); break; case PR_GET_CHILD_SUBREAPER: error = put_user(me->signal->is_child_subreaper, diff --git a/kernel/torture.c b/kernel/torture.c index 0d887eb62856..01a99976f072 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -311,7 +311,7 @@ EXPORT_SYMBOL_GPL(torture_random); /* * Variables for shuffling. The idea is to ensure that each CPU stays * idle for an extended period to test interactions with dyntick idle, - * as well as interactions with any per-CPU varibles. + * as well as interactions with any per-CPU variables. 
*/ struct shuffle_task { struct list_head st_l; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 95cecbf67f5c..b2058a7f94bd 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -28,6 +28,8 @@ #include <linux/uaccess.h> #include <linux/list.h> +#include "../../block/blk.h" + #include <trace/events/block.h> #include "trace_output.h" @@ -292,9 +294,6 @@ record_it: local_irq_restore(flags); } -static struct dentry *blk_tree_root; -static DEFINE_MUTEX(blk_tree_mutex); - static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); @@ -433,9 +432,9 @@ static void blk_trace_setup_lba(struct blk_trace *bt, /* * Setup everything required to start tracing */ -int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - struct blk_user_trace_setup *buts) +static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + struct blk_user_trace_setup *buts) { struct blk_trace *bt = NULL; struct dentry *dir = NULL; @@ -468,22 +467,15 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = -ENOENT; - mutex_lock(&blk_tree_mutex); - if (!blk_tree_root) { - blk_tree_root = debugfs_create_dir("block", NULL); - if (!blk_tree_root) { - mutex_unlock(&blk_tree_mutex); - goto err; - } - } - mutex_unlock(&blk_tree_mutex); - - dir = debugfs_create_dir(buts->name, blk_tree_root); + if (!blk_debugfs_root) + goto err; + dir = debugfs_lookup(buts->name, blk_debugfs_root); + if (!dir) + bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); if (!dir) goto err; - bt->dir = dir; bt->dev = dev; atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); @@ -525,9 +517,12 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (atomic_inc_return(&blk_probes_ref) == 1) blk_register_tracepoints(); - return 0; + ret = 0; err: - blk_trace_free(bt); + if (dir && !bt->dir) + dput(dir); + if (ret) + 
blk_trace_free(bt); return ret; } @@ -712,15 +707,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (likely(!bt)) return; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { + if (blk_rq_is_passthrough(rq)) what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags, - what, rq->errors, rq->cmd_len, rq->cmd); - } else { + else what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, rq->errors, 0, NULL); - } + + __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), + rq->cmd_flags, what, rq->errors, 0, NULL); } static void blk_add_trace_rq_abort(void *ignore, @@ -972,11 +965,7 @@ void blk_add_driver_data(struct request_queue *q, if (likely(!bt)) return; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) - __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0, - BLK_TA_DRV_DATA, rq->errors, len, data); - else - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0, + __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, rq->errors, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1752,31 +1741,6 @@ void blk_trace_remove_sysfs(struct device *dev) #ifdef CONFIG_EVENT_TRACING -void blk_dump_cmd(char *buf, struct request *rq) -{ - int i, end; - int len = rq->cmd_len; - unsigned char *cmd = rq->cmd; - - if (rq->cmd_type != REQ_TYPE_BLOCK_PC) { - buf[0] = '\0'; - return; - } - - for (end = len - 1; end >= 0; end--) - if (cmd[end]) - break; - end++; - - for (i = 0; i < len; i++) { - buf += sprintf(buf, "%s%02x", i == 0 ? 
"" : " ", cmd[i]); - if (i == end && end != len - 1) { - sprintf(buf, " .."); - break; - } - } -} - void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes) { int i = 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fa77311dadb2..cee9802cf3e0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -76,8 +76,8 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .func = bpf_probe_read, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_RAW_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, }; @@ -109,8 +109,8 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto *bpf_get_probe_write_proto(void) @@ -213,8 +213,8 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .func = bpf_trace_printk, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_STACK, - .arg2_type = ARG_CONST_STACK_SIZE, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, }; const struct bpf_func_proto *bpf_get_trace_printk_proto(void) @@ -329,8 +329,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); @@ -395,6 +395,36 @@ static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + int ret; + + /* + * The strncpy_from_unsafe() call will 
likely not fill the entire + * buffer, but that's okay in this circumstance as we're probing + * arbitrary memory anyway similar to bpf_probe_read() and might + * as well probe the stack. Thus, memory is explicitly cleared + * only in error case, so that improper users ignoring return + * code altogether don't copy garbage; otherwise length of string + * is returned that can be used for bpf_perf_event_output() et al. + */ + ret = strncpy_from_unsafe(dst, unsafe_ptr, size); + if (unlikely(ret < 0)) + memset(dst, 0, size); + + return ret; +} + +static const struct bpf_func_proto bpf_probe_read_str_proto = { + .func = bpf_probe_read_str, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -432,6 +462,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_current_task_under_cgroup_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; + case BPF_FUNC_probe_read_str: + return &bpf_probe_read_str_proto; default: return NULL; } @@ -459,6 +491,13 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return false; if (off % size != 0) return false; + /* + * Assertion for 32 bit to make sure last 8 byte access + * (BPF_DW) to the last 4 byte member is disallowed. 
+ */ + if (off + size > sizeof(struct pt_regs)) + return false; + return true; } @@ -467,7 +506,7 @@ static const struct bpf_verifier_ops kprobe_prog_ops = { .is_valid_access = kprobe_prog_is_valid_access, }; -static struct bpf_prog_type_list kprobe_tl = { +static struct bpf_prog_type_list kprobe_tl __ro_after_init = { .ops = &kprobe_prog_ops, .type = BPF_PROG_TYPE_KPROBE, }; @@ -492,8 +531,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, @@ -540,6 +579,8 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type return false; if (off % size != 0) return false; + + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); return true; } @@ -548,7 +589,7 @@ static const struct bpf_verifier_ops tracepoint_prog_ops = { .is_valid_access = tp_prog_is_valid_access, }; -static struct bpf_prog_type_list tracepoint_tl = { +static struct bpf_prog_type_list tracepoint_tl __ro_after_init = { .ops = &tracepoint_prog_ops, .type = BPF_PROG_TYPE_TRACEPOINT, }; @@ -572,28 +613,29 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type return true; } -static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct bpf_perf_event_data, sample_period): BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, - data), dst_reg, src_reg, + data), si->dst_reg, 
si->src_reg, offsetof(struct bpf_perf_event_data_kern, data)); - *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, + *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, offsetof(struct perf_sample_data, period)); break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, - regs), dst_reg, src_reg, + regs), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, regs)); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg, + si->off); break; } @@ -606,7 +648,7 @@ static const struct bpf_verifier_ops perf_event_prog_ops = { .convert_ctx_access = pe_prog_convert_ctx_access, }; -static struct bpf_prog_type_list perf_event_tl = { +static struct bpf_prog_type_list perf_event_tl __ro_after_init = { .ops = &perf_event_prog_ops, .type = BPF_PROG_TYPE_PERF_EVENT, }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eb230f06ba41..0c0609326391 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1110,13 +1110,6 @@ struct ftrace_func_entry { unsigned long ip; }; -struct ftrace_hash { - unsigned long size_bits; - struct hlist_head *buckets; - unsigned long count; - struct rcu_head rcu; -}; - /* * We make these constant because no one should touch them, * but they are used as the default "empty hash", to avoid allocating @@ -1192,26 +1185,24 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash) +static __always_inline unsigned long +ftrace_hash_key(struct ftrace_hash *hash, unsigned long ip) { - return !hash || !hash->count; + if (hash->size_bits > 0) + return hash_long(ip, hash->size_bits); + + return 0; } -static struct ftrace_func_entry * -ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +/* Only use this function if ftrace_hash_empty() has already been tested */ +static 
__always_inline struct ftrace_func_entry * +__ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) { unsigned long key; struct ftrace_func_entry *entry; struct hlist_head *hhd; - if (ftrace_hash_empty(hash)) - return NULL; - - if (hash->size_bits > 0) - key = hash_long(ip, hash->size_bits); - else - key = 0; - + key = ftrace_hash_key(hash, ip); hhd = &hash->buckets[key]; hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) { @@ -1221,17 +1212,32 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) return NULL; } +/** + * ftrace_lookup_ip - Test to see if an ip exists in an ftrace_hash + * @hash: The hash to look at + * @ip: The instruction pointer to test + * + * Search a given @hash to see if a given instruction pointer (@ip) + * exists in it. + * + * Returns the entry that holds the @ip if found. NULL otherwise. + */ +struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +{ + if (ftrace_hash_empty(hash)) + return NULL; + + return __ftrace_lookup_ip(hash, ip); +} + static void __add_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) { struct hlist_head *hhd; unsigned long key; - if (hash->size_bits) - key = hash_long(entry->ip, hash->size_bits); - else - key = 0; - + key = ftrace_hash_key(hash, entry->ip); hhd = &hash->buckets[key]; hlist_add_head(&entry->hlist, hhd); hash->count++; @@ -1383,9 +1389,8 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, struct ftrace_hash *new_hash); -static int -ftrace_hash_move(struct ftrace_ops *ops, int enable, - struct ftrace_hash **dst, struct ftrace_hash *src) +static struct ftrace_hash * +__ftrace_hash_move(struct ftrace_hash *src) { struct ftrace_func_entry *entry; struct hlist_node *tn; @@ -1393,21 +1398,13 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash *new_hash; int size = src->count; int bits = 0; - int ret; int i; - /* Reject setting 
notrace hash on IPMODIFY ftrace_ops */ - if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable) - return -EINVAL; - /* - * If the new source is empty, just free dst and assign it - * the empty_hash. + * If the new source is empty, just return the empty_hash. */ - if (!src->count) { - new_hash = EMPTY_HASH; - goto update; - } + if (!src->count) + return EMPTY_HASH; /* * Make the hash size about 1/2 the # found @@ -1421,7 +1418,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, new_hash = alloc_ftrace_hash(bits); if (!new_hash) - return -ENOMEM; + return NULL; size = 1 << src->size_bits; for (i = 0; i < size; i++) { @@ -1432,7 +1429,24 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, } } -update: + return new_hash; +} + +static int +ftrace_hash_move(struct ftrace_ops *ops, int enable, + struct ftrace_hash **dst, struct ftrace_hash *src) +{ + struct ftrace_hash *new_hash; + int ret; + + /* Reject setting notrace hash on IPMODIFY ftrace_ops */ + if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable) + return -EINVAL; + + new_hash = __ftrace_hash_move(src); + if (!new_hash) + return -ENOMEM; + /* Make sure this can be applied if it is IPMODIFY ftrace_ops */ if (enable) { /* IPMODIFY should be updated only when filter_hash updating */ @@ -1466,9 +1480,9 @@ static bool hash_contains_ip(unsigned long ip, * notrace hash is considered not in the notrace hash. 
*/ return (ftrace_hash_empty(hash->filter_hash) || - ftrace_lookup_ip(hash->filter_hash, ip)) && + __ftrace_lookup_ip(hash->filter_hash, ip)) && (ftrace_hash_empty(hash->notrace_hash) || - !ftrace_lookup_ip(hash->notrace_hash, ip)); + !__ftrace_lookup_ip(hash->notrace_hash, ip)); } /* @@ -2880,7 +2894,7 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) /* The function must be in the filter */ if (!ftrace_hash_empty(ops->func_hash->filter_hash) && - !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) + !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) return 0; /* If in notrace hash, we ignore it too */ @@ -4382,7 +4396,7 @@ __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; -static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); +static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); static unsigned long save_global_trampoline; static unsigned long save_global_flags; @@ -4405,18 +4419,17 @@ static void __init set_ftrace_early_graph(char *buf, int enable) { int ret; char *func; - unsigned long *table = ftrace_graph_funcs; - int *count = &ftrace_graph_count; + struct ftrace_hash *hash; - if (!enable) { - table = ftrace_graph_notrace_funcs; - count = &ftrace_graph_notrace_count; - } + if (enable) + hash = ftrace_graph_hash; + else + hash = ftrace_graph_notrace_hash; while (buf) { func = strsep(&buf, ","); /* we allow only one expression at a time */ - ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func); + ret = ftrace_graph_set_hash(hash, func); if (ret) printk(KERN_DEBUG "ftrace: function %s not " "traceable\n", func); @@ -4540,26 +4553,55 @@ static const struct file_operations ftrace_notrace_fops = { static DEFINE_MUTEX(graph_lock); -int ftrace_graph_count; -int ftrace_graph_notrace_count; -unsigned long 
ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; -unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; +struct ftrace_hash *ftrace_graph_hash = EMPTY_HASH; +struct ftrace_hash *ftrace_graph_notrace_hash = EMPTY_HASH; + +enum graph_filter_type { + GRAPH_FILTER_NOTRACE = 0, + GRAPH_FILTER_FUNCTION, +}; + +#define FTRACE_GRAPH_EMPTY ((void *)1) struct ftrace_graph_data { - unsigned long *table; - size_t size; - int *count; - const struct seq_operations *seq_ops; + struct ftrace_hash *hash; + struct ftrace_func_entry *entry; + int idx; /* for hash table iteration */ + enum graph_filter_type type; + struct ftrace_hash *new_hash; + const struct seq_operations *seq_ops; + struct trace_parser parser; }; static void * __g_next(struct seq_file *m, loff_t *pos) { struct ftrace_graph_data *fgd = m->private; + struct ftrace_func_entry *entry = fgd->entry; + struct hlist_head *head; + int i, idx = fgd->idx; - if (*pos >= *fgd->count) + if (*pos >= fgd->hash->count) return NULL; - return &fgd->table[*pos]; + + if (entry) { + hlist_for_each_entry_continue(entry, hlist) { + fgd->entry = entry; + return entry; + } + + idx++; + } + + for (i = idx; i < 1 << fgd->hash->size_bits; i++) { + head = &fgd->hash->buckets[i]; + hlist_for_each_entry(entry, head, hlist) { + fgd->entry = entry; + fgd->idx = i; + return entry; + } + } + return NULL; } static void * @@ -4575,10 +4617,19 @@ static void *g_start(struct seq_file *m, loff_t *pos) mutex_lock(&graph_lock); + if (fgd->type == GRAPH_FILTER_FUNCTION) + fgd->hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); + else + fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); + /* Nothing, tell g_show to print all functions are enabled */ - if (!*fgd->count && !*pos) - return (void *)1; + if (ftrace_hash_empty(fgd->hash) && !*pos) + return FTRACE_GRAPH_EMPTY; + fgd->idx = 0; + fgd->entry = NULL; return __g_next(m, pos); } @@ -4589,22 
+4640,22 @@ static void g_stop(struct seq_file *m, void *p) static int g_show(struct seq_file *m, void *v) { - unsigned long *ptr = v; + struct ftrace_func_entry *entry = v; - if (!ptr) + if (!entry) return 0; - if (ptr == (unsigned long *)1) { + if (entry == FTRACE_GRAPH_EMPTY) { struct ftrace_graph_data *fgd = m->private; - if (fgd->table == ftrace_graph_funcs) + if (fgd->type == GRAPH_FILTER_FUNCTION) seq_puts(m, "#### all functions enabled ####\n"); else seq_puts(m, "#### no functions disabled ####\n"); return 0; } - seq_printf(m, "%ps\n", (void *)*ptr); + seq_printf(m, "%ps\n", (void *)entry->ip); return 0; } @@ -4621,24 +4672,51 @@ __ftrace_graph_open(struct inode *inode, struct file *file, struct ftrace_graph_data *fgd) { int ret = 0; + struct ftrace_hash *new_hash = NULL; - mutex_lock(&graph_lock); - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) { - *fgd->count = 0; - memset(fgd->table, 0, fgd->size * sizeof(*fgd->table)); + if (file->f_mode & FMODE_WRITE) { + const int size_bits = FTRACE_HASH_DEFAULT_BITS; + + if (trace_parser_get_init(&fgd->parser, FTRACE_BUFF_MAX)) + return -ENOMEM; + + if (file->f_flags & O_TRUNC) + new_hash = alloc_ftrace_hash(size_bits); + else + new_hash = alloc_and_copy_ftrace_hash(size_bits, + fgd->hash); + if (!new_hash) { + ret = -ENOMEM; + goto out; + } } - mutex_unlock(&graph_lock); if (file->f_mode & FMODE_READ) { - ret = seq_open(file, fgd->seq_ops); + ret = seq_open(file, &ftrace_graph_seq_ops); if (!ret) { struct seq_file *m = file->private_data; m->private = fgd; + } else { + /* Failed */ + free_ftrace_hash(new_hash); + new_hash = NULL; } } else file->private_data = fgd; +out: + if (ret < 0 && file->f_mode & FMODE_WRITE) + trace_parser_put(&fgd->parser); + + fgd->new_hash = new_hash; + + /* + * All uses of fgd->hash must be taken with the graph_lock + * held. The graph_lock is going to be released, so force + * fgd->hash to be reinitialized when it is taken again. 
+ */ + fgd->hash = NULL; + return ret; } @@ -4646,6 +4724,7 @@ static int ftrace_graph_open(struct inode *inode, struct file *file) { struct ftrace_graph_data *fgd; + int ret; if (unlikely(ftrace_disabled)) return -ENODEV; @@ -4654,18 +4733,26 @@ ftrace_graph_open(struct inode *inode, struct file *file) if (fgd == NULL) return -ENOMEM; - fgd->table = ftrace_graph_funcs; - fgd->size = FTRACE_GRAPH_MAX_FUNCS; - fgd->count = &ftrace_graph_count; + mutex_lock(&graph_lock); + + fgd->hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); + fgd->type = GRAPH_FILTER_FUNCTION; fgd->seq_ops = &ftrace_graph_seq_ops; - return __ftrace_graph_open(inode, file, fgd); + ret = __ftrace_graph_open(inode, file, fgd); + if (ret < 0) + kfree(fgd); + + mutex_unlock(&graph_lock); + return ret; } static int ftrace_graph_notrace_open(struct inode *inode, struct file *file) { struct ftrace_graph_data *fgd; + int ret; if (unlikely(ftrace_disabled)) return -ENODEV; @@ -4674,45 +4761,97 @@ ftrace_graph_notrace_open(struct inode *inode, struct file *file) if (fgd == NULL) return -ENOMEM; - fgd->table = ftrace_graph_notrace_funcs; - fgd->size = FTRACE_GRAPH_MAX_FUNCS; - fgd->count = &ftrace_graph_notrace_count; + mutex_lock(&graph_lock); + + fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); + fgd->type = GRAPH_FILTER_NOTRACE; fgd->seq_ops = &ftrace_graph_seq_ops; - return __ftrace_graph_open(inode, file, fgd); + ret = __ftrace_graph_open(inode, file, fgd); + if (ret < 0) + kfree(fgd); + + mutex_unlock(&graph_lock); + return ret; } static int ftrace_graph_release(struct inode *inode, struct file *file) { + struct ftrace_graph_data *fgd; + struct ftrace_hash *old_hash, *new_hash; + struct trace_parser *parser; + int ret = 0; + if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; - kfree(m->private); + fgd = m->private; seq_release(inode, file); } else { - kfree(file->private_data); + fgd = 
file->private_data; } - return 0; + + if (file->f_mode & FMODE_WRITE) { + + parser = &fgd->parser; + + if (trace_parser_loaded((parser))) { + parser->buffer[parser->idx] = 0; + ret = ftrace_graph_set_hash(fgd->new_hash, + parser->buffer); + } + + trace_parser_put(parser); + + new_hash = __ftrace_hash_move(fgd->new_hash); + if (!new_hash) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&graph_lock); + + if (fgd->type == GRAPH_FILTER_FUNCTION) { + old_hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); + rcu_assign_pointer(ftrace_graph_hash, new_hash); + } else { + old_hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); + rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash); + } + + mutex_unlock(&graph_lock); + + /* Wait till all users are no longer using the old hash */ + synchronize_sched(); + + free_ftrace_hash(old_hash); + } + + out: + kfree(fgd->new_hash); + kfree(fgd); + + return ret; } static int -ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) +ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) { struct ftrace_glob func_g; struct dyn_ftrace *rec; struct ftrace_page *pg; + struct ftrace_func_entry *entry; int fail = 1; int not; - bool exists; - int i; /* decode regex */ func_g.type = filter_parse_regex(buffer, strlen(buffer), &func_g.search, &not); - if (!not && *idx >= size) - return -EBUSY; func_g.len = strlen(func_g.search); @@ -4729,26 +4868,18 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) continue; if (ftrace_match_record(rec, &func_g, NULL, 0)) { - /* if it is in the array */ - exists = false; - for (i = 0; i < *idx; i++) { - if (array[i] == rec->ip) { - exists = true; - break; - } - } + entry = ftrace_lookup_ip(hash, rec->ip); if (!not) { fail = 0; - if (!exists) { - array[(*idx)++] = rec->ip; - if (*idx >= size) - goto out; - } + + if (entry) + continue; + if (add_hash_entry(hash, rec->ip) < 0) + goto out; } else {
- if (exists) { - array[i] = array[--(*idx)]; - array[*idx] = 0; + if (entry) { + free_hash_entry(hash, entry); fail = 0; } } @@ -4767,35 +4898,34 @@ static ssize_t ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_parser parser; ssize_t read, ret = 0; struct ftrace_graph_data *fgd = file->private_data; + struct trace_parser *parser; if (!cnt) return 0; - if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) - return -ENOMEM; - - read = trace_get_user(&parser, ubuf, cnt, ppos); + /* Read mode uses seq functions */ + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + fgd = m->private; + } - if (read >= 0 && trace_parser_loaded((&parser))) { - parser.buffer[parser.idx] = 0; + parser = &fgd->parser; - mutex_lock(&graph_lock); + read = trace_get_user(parser, ubuf, cnt, ppos); - /* we allow only one expression at a time */ - ret = ftrace_set_func(fgd->table, fgd->count, fgd->size, - parser.buffer); + if (read >= 0 && trace_parser_loaded(parser) && + !trace_parser_cont(parser)) { - mutex_unlock(&graph_lock); + ret = ftrace_graph_set_hash(fgd->new_hash, + parser->buffer); + trace_parser_clear(parser); } if (!ret) ret = read; - trace_parser_put(&parser); - return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d7449783987a..707445ceb7ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -260,16 +260,8 @@ unsigned long long ns2usecs(u64 nsec) TRACE_ITER_EVENT_FORK /* - * The global_trace is the descriptor that holds the tracing - * buffers for the live tracing. For each CPU, it contains - * a link list of pages that will store trace entries. The - * page descriptor of the pages in the memory is used to hold - * the link list by linking the lru item in the page descriptor - * to each of the pages in the buffer per CPU. - * - * For each active CPU there is a data field that holds the - * pages for the buffer for that CPU. 
Each CPU has the same number - * of pages allocated for its buffer. + * The global_trace is the descriptor that holds the top-level tracing + * buffers for the live tracing. */ static struct trace_array global_trace = { .trace_flags = TRACE_DEFAULT_FLAGS, @@ -1193,6 +1185,7 @@ int trace_parser_get_init(struct trace_parser *parser, int size) void trace_parser_put(struct trace_parser *parser) { kfree(parser->buffer); + parser->buffer = NULL; } /* @@ -7503,7 +7496,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) ftrace_init_tracefs(tr, d_tracer); } -static struct vfsmount *trace_automount(void *ingore) +static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; struct file_system_type *type; @@ -7516,7 +7509,7 @@ static struct vfsmount *trace_automount(void *ingore) type = get_fs_type("tracefs"); if (!type) return NULL; - mnt = vfs_kern_mount(type, 0, "tracefs", NULL); + mnt = vfs_submount(mntpt, type, "tracefs", NULL); put_filesystem(type); if (IS_ERR(mnt)) return NULL; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1ea51ab53edf..ae1cce91fead 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -753,6 +753,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); extern char trace_find_mark(unsigned long long duration); +struct ftrace_hash { + unsigned long size_bits; + struct hlist_head *buckets; + unsigned long count; + struct rcu_head rcu; +}; + +struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip); + +static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) +{ + return !hash || !hash->count; +} + /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -787,53 +802,50 @@ extern void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, int pc); - #ifdef CONFIG_DYNAMIC_FTRACE -/* TODO: make this variable */ 
-#define FTRACE_GRAPH_MAX_FUNCS 32 -extern int ftrace_graph_count; -extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; -extern int ftrace_graph_notrace_count; -extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS]; +extern struct ftrace_hash *ftrace_graph_hash; +extern struct ftrace_hash *ftrace_graph_notrace_hash; static inline int ftrace_graph_addr(unsigned long addr) { - int i; - - if (!ftrace_graph_count) - return 1; - - for (i = 0; i < ftrace_graph_count; i++) { - if (addr == ftrace_graph_funcs[i]) { - /* - * If no irqs are to be traced, but a set_graph_function - * is set, and called by an interrupt handler, we still - * want to trace it. - */ - if (in_irq()) - trace_recursion_set(TRACE_IRQ_BIT); - else - trace_recursion_clear(TRACE_IRQ_BIT); - return 1; - } + int ret = 0; + + preempt_disable_notrace(); + + if (ftrace_hash_empty(ftrace_graph_hash)) { + ret = 1; + goto out; } - return 0; + if (ftrace_lookup_ip(ftrace_graph_hash, addr)) { + /* + * If no irqs are to be traced, but a set_graph_function + * is set, and called by an interrupt handler, we still + * want to trace it. 
+ */ + if (in_irq()) + trace_recursion_set(TRACE_IRQ_BIT); + else + trace_recursion_clear(TRACE_IRQ_BIT); + ret = 1; + } + +out: + preempt_enable_notrace(); + return ret; } static inline int ftrace_graph_notrace_addr(unsigned long addr) { - int i; + int ret = 0; - if (!ftrace_graph_notrace_count) - return 0; + preempt_disable_notrace(); - for (i = 0; i < ftrace_graph_notrace_count; i++) { - if (addr == ftrace_graph_notrace_funcs[i]) - return 1; - } + if (ftrace_lookup_ip(ftrace_graph_notrace_hash, addr)) + ret = 1; - return 0; + preempt_enable_notrace(); + return ret; } #else static inline int ftrace_graph_addr(unsigned long addr) @@ -1300,7 +1312,8 @@ static inline bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || field->filter_type == FILTER_STATIC_STRING || - field->filter_type == FILTER_PTR_STRING; + field->filter_type == FILTER_PTR_STRING || + field->filter_type == FILTER_COMM; } static inline bool is_function_field(struct ftrace_event_field *field) diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index e3b488825ae3..e49fbe901cfc 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -175,9 +175,9 @@ int trace_benchmark_reg(void) bm_event_thread = kthread_run(benchmark_event_kthread, NULL, "event_benchmark"); - if (!bm_event_thread) { + if (IS_ERR(bm_event_thread)) { pr_warning("trace benchmark failed to create kernel thread\n"); - return -ENOMEM; + return PTR_ERR(bm_event_thread); } return 0; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 75489de546b6..4d8fdf3184dc 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -27,7 +27,7 @@ static DEFINE_MUTEX(branch_tracing_mutex); static struct trace_array *branch_tracer; static void -probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) +probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) { struct 
trace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; @@ -68,16 +68,17 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry = ring_buffer_event_data(event); /* Strip off the path, only save the file */ - p = f->file + strlen(f->file); - while (p >= f->file && *p != '/') + p = f->data.file + strlen(f->data.file); + while (p >= f->data.file && *p != '/') p--; p++; - strncpy(entry->func, f->func, TRACE_FUNC_SIZE); + strncpy(entry->func, f->data.func, TRACE_FUNC_SIZE); strncpy(entry->file, p, TRACE_FILE_SIZE); entry->func[TRACE_FUNC_SIZE] = 0; entry->file[TRACE_FILE_SIZE] = 0; - entry->line = f->line; + entry->constant = f->constant; + entry->line = f->data.line; entry->correct = val == expect; if (!call_filter_check_discard(call, entry, buffer, event)) @@ -89,7 +90,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) } static inline -void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) { if (!branch_tracing_enabled) return; @@ -195,13 +196,19 @@ core_initcall(init_branch_tracer); #else static inline -void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) { } #endif /* CONFIG_BRANCH_TRACER */ -void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) +void ftrace_likely_update(struct ftrace_likely_data *f, int val, + int expect, int is_constant) { + /* A constant is always correct */ + if (is_constant) { + f->constant++; + val = expect; + } /* * I would love to have a trace point here instead, but the * trace point code is so inundated with unlikely and likely @@ -212,9 +219,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) /* FIXME: Make this atomic! 
*/ if (val == expect) - f->correct++; + f->data.correct++; else - f->incorrect++; + f->data.incorrect++; } EXPORT_SYMBOL(ftrace_likely_update); @@ -245,29 +252,60 @@ static inline long get_incorrect_percent(struct ftrace_branch_data *p) return percent; } -static int branch_stat_show(struct seq_file *m, void *v) +static const char *branch_stat_process_file(struct ftrace_branch_data *p) { - struct ftrace_branch_data *p = v; const char *f; - long percent; /* Only print the file, not the path */ f = p->file + strlen(p->file); while (f >= p->file && *f != '/') f--; - f++; + return ++f; +} + +static void branch_stat_show(struct seq_file *m, + struct ftrace_branch_data *p, const char *f) +{ + long percent; /* * The miss is overlayed on correct, and hit on incorrect. */ percent = get_incorrect_percent(p); - seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); if (percent < 0) seq_puts(m, " X "); else seq_printf(m, "%3ld ", percent); + seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); +} + +static int branch_stat_show_normal(struct seq_file *m, + struct ftrace_branch_data *p, const char *f) +{ + seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); + branch_stat_show(m, p, f); + return 0; +} + +static int annotate_branch_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_likely_data *p = v; + const char *f; + int l; + + f = branch_stat_process_file(&p->data); + + if (!p->constant) + return branch_stat_show_normal(m, &p->data, f); + + l = snprintf(NULL, 0, "/%lu", p->constant); + l = l > 8 ? 
0 : 8 - l; + + seq_printf(m, "%8lu/%lu %*lu ", + p->data.correct, p->constant, l, p->data.incorrect); + branch_stat_show(m, &p->data, f); return 0; } @@ -279,7 +317,7 @@ static void *annotated_branch_stat_start(struct tracer_stat *trace) static void * annotated_branch_stat_next(void *v, int idx) { - struct ftrace_branch_data *p = v; + struct ftrace_likely_data *p = v; ++p; @@ -328,7 +366,7 @@ static struct tracer_stat annotated_branch_stats = { .stat_next = annotated_branch_stat_next, .stat_cmp = annotated_branch_stat_cmp, .stat_headers = annotated_branch_stat_headers, - .stat_show = branch_stat_show + .stat_show = annotate_branch_stat_show }; __init static int init_annotated_branch_stats(void) @@ -379,12 +417,21 @@ all_branch_stat_next(void *v, int idx) return p; } +static int all_branch_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_branch_data *p = v; + const char *f; + + f = branch_stat_process_file(p); + return branch_stat_show_normal(m, p, f); +} + static struct tracer_stat all_branch_stats = { .name = "branch_all", .stat_start = all_branch_stat_start, .stat_next = all_branch_stat_next, .stat_headers = all_branch_stat_headers, - .stat_show = branch_stat_show + .stat_show = all_branch_stat_show }; __init static int all_annotated_branch_stats(void) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index eb7396b7e7c3..c203ac4df791 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -328,11 +328,13 @@ FTRACE_ENTRY(branch, trace_branch, __array( char, func, TRACE_FUNC_SIZE+1 ) __array( char, file, TRACE_FILE_SIZE+1 ) __field( char, correct ) + __field( char, constant ) ), - F_printk("%u:%s:%s (%u)", + F_printk("%u:%s:%s (%u)%s", __entry->line, - __entry->func, __entry->file, __entry->correct), + __entry->func, __entry->file, __entry->correct, + __entry->constant ? 
" CONSTANT" : ""), FILTER_OTHER ); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index af344a1bf0d0..edfacd954e1b 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -266,24 +266,13 @@ out: static struct cpumask save_cpumask; static bool disable_migrate; -static void move_to_next_cpu(bool initmask) +static void move_to_next_cpu(void) { - static struct cpumask *current_mask; + struct cpumask *current_mask = &save_cpumask; int next_cpu; if (disable_migrate) return; - - /* Just pick the first CPU on first iteration */ - if (initmask) { - current_mask = &save_cpumask; - get_online_cpus(); - cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); - put_online_cpus(); - next_cpu = cpumask_first(current_mask); - goto set_affinity; - } - /* * If for some reason the user modifies the CPU affinity * of this thread, than stop migrating for the duration @@ -300,7 +289,6 @@ static void move_to_next_cpu(bool initmask) if (next_cpu >= nr_cpu_ids) next_cpu = cpumask_first(current_mask); - set_affinity: if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */ goto disable; @@ -322,20 +310,15 @@ static void move_to_next_cpu(bool initmask) * need to ensure nothing else might be running (and thus preempting). * Obviously this should never be used in production environments. * - * Currently this runs on which ever CPU it was scheduled on, but most - * real-world hardware latency situations occur across several CPUs, - * but we might later generalize this if we find there are any actualy - * systems with alternate SMI delivery or other hardware latencies. + * Executes one loop interaction on each CPU in tracing_cpumask sysfs file. 
*/ static int kthread_fn(void *data) { u64 interval; - bool initmask = true; while (!kthread_should_stop()) { - move_to_next_cpu(initmask); - initmask = false; + move_to_next_cpu(); local_irq_disable(); get_sample(); @@ -366,13 +349,27 @@ static int kthread_fn(void *data) */ static int start_kthread(struct trace_array *tr) { + struct cpumask *current_mask = &save_cpumask; struct task_struct *kthread; + int next_cpu; + + /* Just pick the first CPU on first iteration */ + current_mask = &save_cpumask; + get_online_cpus(); + cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + put_online_cpus(); + next_cpu = cpumask_first(current_mask); kthread = kthread_create(kthread_fn, NULL, "hwlatd"); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); return -ENOMEM; } + + cpumask_clear(current_mask); + cpumask_set_cpu(next_cpu, current_mask); + sched_setaffinity(kthread->pid, current_mask); + hwlat_kthread = kthread; wake_up_process(kthread); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7ad9e53ad174..eadd96ef772f 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -16,6 +16,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) "trace_kprobe: " fmt #include <linux/module.h> #include <linux/uaccess.h> diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 5d33a7352919..070866c32eb9 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -124,6 +124,44 @@ EXPORT_SYMBOL(trace_print_symbols_seq); #if BITS_PER_LONG == 32 const char * +trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, + unsigned long long flags, + const struct trace_print_flags_u64 *flag_array) +{ + unsigned long long mask; + const char *str; + const char *ret = trace_seq_buffer_ptr(p); + int i, first = 1; + + for (i = 0; flag_array[i].name && flags; i++) { 
+ + mask = flag_array[i].mask; + if ((flags & mask) != mask) + continue; + + str = flag_array[i].name; + flags &= ~mask; + if (!first && delim) + trace_seq_puts(p, delim); + else + first = 0; + trace_seq_puts(p, str); + } + + /* check for left over flags */ + if (flags) { + if (!first && delim) + trace_seq_puts(p, delim); + trace_seq_printf(p, "0x%llx", flags); + } + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(trace_print_flags_seq_u64); + +const char * trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 *symbol_array) { @@ -162,15 +200,27 @@ trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, } EXPORT_SYMBOL_GPL(trace_print_bitmask_seq); +/** + * trace_print_hex_seq - print buffer as hex sequence + * @p: trace seq struct to write to + * @buf: The buffer to print + * @buf_len: Length of @buf in bytes + * @concatenate: Print @buf as single hex string or with spacing + * + * Prints the passed buffer as a hex sequence either as a whole, + * single hex string if @concatenate is true or with spacing after + * each byte in case @concatenate is false. + */ const char * -trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) +trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len, + bool concatenate) { int i; const char *ret = trace_seq_buffer_ptr(p); for (i = 0; i < buf_len; i++) - trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); - + trace_seq_printf(p, "%s%2.2x", concatenate || i == 0 ? 
"" : " ", + buf[i]); trace_seq_putc(p, 0); return ret; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8c0553d9afd3..52478f033f88 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -21,6 +21,7 @@ * Copyright (C) IBM Corporation, 2010-2011 * Author: Srikar Dronamraju */ +#define pr_fmt(fmt) "trace_probe: " fmt #include "trace_probe.h" @@ -647,7 +648,7 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(int, char **)) { - char *kbuf, *tmp; + char *kbuf, *buf, *tmp; int ret = 0; size_t done = 0; size_t size; @@ -667,27 +668,38 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, goto out; } kbuf[size] = '\0'; - tmp = strchr(kbuf, '\n'); + buf = kbuf; + do { + tmp = strchr(buf, '\n'); + if (tmp) { + *tmp = '\0'; + size = tmp - buf + 1; + } else { + size = strlen(buf); + if (done + size < count) { + if (buf != kbuf) + break; + /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ + pr_warn("Line length is too long: Should be less than %d\n", + WRITE_BUFSIZE - 2); + ret = -EINVAL; + goto out; + } + } + done += size; - if (tmp) { - *tmp = '\0'; - size = tmp - kbuf + 1; - } else if (done + size < count) { - pr_warn("Line length is too long: Should be less than %d\n", - WRITE_BUFSIZE); - ret = -EINVAL; - goto out; - } - done += size; - /* Remove comments */ - tmp = strchr(kbuf, '#'); + /* Remove comments */ + tmp = strchr(buf, '#'); - if (tmp) - *tmp = '\0'; + if (tmp) + *tmp = '\0'; - ret = traceprobe_command(kbuf, createfn); - if (ret) - goto out; + ret = traceprobe_command(buf, createfn); + if (ret) + goto out; + buf += size; + + } while (done < count); } ret = done; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 0913693caf6e..f4379e772171 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -17,6 +17,7 @@ * Copyright (C) IBM Corporation, 2010-2012 * Author: Srikar 
Dronamraju <srikar@linux.vnet.ibm.com> */ +#define pr_fmt(fmt) "trace_kprobe: " fmt #include <linux/module.h> #include <linux/uaccess.h> @@ -431,7 +432,8 @@ static int create_trace_uprobe(int argc, char **argv) pr_info("Probe point is not specified.\n"); return -EINVAL; } - arg = strchr(argv[1], ':'); + /* Find the last occurrence, in case the path contains ':' too. */ + arg = strrchr(argv[1], ':'); if (!arg) { ret = -EINVAL; goto fail_address_parse; diff --git a/kernel/ucount.c b/kernel/ucount.c index 95c6336fc2b3..8a11fc0cb459 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -57,7 +57,7 @@ static struct ctl_table_root set_root = { static int zero = 0; static int int_max = INT_MAX; -#define UCOUNT_ENTRY(name) \ +#define UCOUNT_ENTRY(name) \ { \ .procname = name, \ .maxlen = sizeof(int), \ @@ -74,6 +74,10 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_net_namespaces"), UCOUNT_ENTRY("max_mnt_namespaces"), UCOUNT_ENTRY("max_cgroup_namespaces"), +#ifdef CONFIG_INOTIFY_USER + UCOUNT_ENTRY("max_inotify_instances"), + UCOUNT_ENTRY("max_inotify_watches"), +#endif { } }; #endif /* CONFIG_SYSCTL */ diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 12b8dd640786..b5de262a9eb9 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -137,12 +137,14 @@ static void watchdog_overflow_callback(struct perf_event *event, * Reduce the watchdog noise by only printing messages * that are different from what cpu0 displayed. 
*/ -static unsigned long cpu0_err; +static unsigned long firstcpu_err; +static atomic_t watchdog_cpus; int watchdog_nmi_enable(unsigned int cpu) { struct perf_event_attr *wd_attr; struct perf_event *event = per_cpu(watchdog_ev, cpu); + int firstcpu = 0; /* nothing to do if the hard lockup detector is disabled */ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) @@ -156,19 +158,22 @@ int watchdog_nmi_enable(unsigned int cpu) if (event != NULL) goto out_enable; + if (atomic_inc_return(&watchdog_cpus) == 1) + firstcpu = 1; + wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); - /* save cpu0 error for future comparision */ - if (cpu == 0 && IS_ERR(event)) - cpu0_err = PTR_ERR(event); + /* save the first cpu's error for future comparision */ + if (firstcpu && IS_ERR(event)) + firstcpu_err = PTR_ERR(event); if (!IS_ERR(event)) { - /* only print for cpu0 or different than cpu0 */ - if (cpu == 0 || cpu0_err) + /* only print for the first cpu initialized */ + if (firstcpu || firstcpu_err) pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); goto out_save; } @@ -186,7 +191,7 @@ int watchdog_nmi_enable(unsigned int cpu) smp_mb__after_atomic(); /* skip displaying the same error again */ - if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) + if (!firstcpu && (PTR_ERR(event) == firstcpu_err)) return PTR_ERR(event); /* vary the KERN level based on the returned errno */ @@ -222,9 +227,9 @@ void watchdog_nmi_disable(unsigned int cpu) /* should be in cleanup, but blocks oprofile */ perf_event_release_kernel(event); - } - if (cpu == 0) { + /* watchdog_nmi_enable() expects this to be zero initially. */ - cpu0_err = 0; + if (atomic_dec_and_test(&watchdog_cpus)) + firstcpu_err = 0; } } |