diff options
Diffstat (limited to 'kernel')
153 files changed, 8358 insertions, 3400 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index f85ae5dfa474..d2001624fe7a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -112,7 +112,9 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o -obj-$(CONFIG_HAS_IOMEM) += memremap.o +obj-$(CONFIG_HAS_IOMEM) += iomem.o +obj-$(CONFIG_ZONE_DEVICE) += memremap.o +obj-$(CONFIG_RSEQ) += rseq.o $(obj)/configs.o: $(obj)/config_data.h diff --git a/kernel/audit.c b/kernel/audit.c index 670665c6e2a6..e7478cb58079 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1099,8 +1099,7 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature if (audit_enabled == AUDIT_OFF) return; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE); if (!ab) return; audit_log_task_info(ab, current); @@ -2317,8 +2316,7 @@ void audit_log_link_denied(const char *operation) return; /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */ - ab = audit_log_start(current->audit_context, GFP_KERNEL, - AUDIT_ANOM_LINK); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_LINK); if (!ab) return; audit_log_format(ab, "op=%s", operation); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 9eb8b3511636..f1ba88994508 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -274,7 +274,7 @@ static void audit_update_watch(struct audit_parent *parent, /* If the update involves invalidating rules, do the inode-based * filtering now, so we don't omit records. */ if (invalidating && !audit_dummy_context()) - audit_filter_inodes(current, current->audit_context); + audit_filter_inodes(current, audit_context()); /* updating ino will likely change which audit_hash_list we * are on so we need a new watch for the new list */ diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index d7a807e81451..eaa320148d97 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -426,7 +426,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) return -EINVAL; break; case AUDIT_EXE: - if (f->op != Audit_equal) + if (f->op != Audit_not_equal && f->op != Audit_equal) return -EINVAL; if (entry->rule.listnr != AUDIT_FILTER_EXIT) return -EINVAL; @@ -1089,8 +1089,6 @@ static void audit_list_rules(int seq, struct sk_buff_head *q) static void audit_log_rule_change(char *action, struct audit_krule *rule, int res) { struct audit_buffer *ab; - uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); - unsigned int sessionid = audit_get_sessionid(current); if (!audit_enabled) return; @@ -1098,7 +1096,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (!ab) return; - audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid); + audit_log_session_info(ab); audit_log_task_context(ab); audit_log_format(ab, " op=%s", action); audit_log_key(ab, rule->filterkey); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4e0a4ac803db..ceb1c4596c51 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -374,7 +374,7 @@ static int audit_field_compare(struct task_struct *tsk, case AUDIT_COMPARE_EGID_TO_OBJ_GID: return audit_compare_gid(cred->egid, name, f, ctx); case AUDIT_COMPARE_AUID_TO_OBJ_UID: - return audit_compare_uid(tsk->loginuid, name, f, ctx); + return audit_compare_uid(audit_get_loginuid(tsk), name, f, ctx); case AUDIT_COMPARE_SUID_TO_OBJ_UID: return audit_compare_uid(cred->suid, name, f, ctx); case AUDIT_COMPARE_SGID_TO_OBJ_GID: @@ -385,7 +385,8 @@ static int audit_field_compare(struct task_struct *tsk, return audit_compare_gid(cred->fsgid, name, f, ctx); /* uid comparisons */ case AUDIT_COMPARE_UID_TO_AUID: - return audit_uid_comparator(cred->uid, f->op, tsk->loginuid); + return audit_uid_comparator(cred->uid, f->op, + audit_get_loginuid(tsk)); case AUDIT_COMPARE_UID_TO_EUID: return audit_uid_comparator(cred->uid, f->op, cred->euid); case AUDIT_COMPARE_UID_TO_SUID: @@ -394,11 +395,14 @@ static int audit_field_compare(struct task_struct *tsk, return audit_uid_comparator(cred->uid, f->op, cred->fsuid); /* auid comparisons */ case AUDIT_COMPARE_AUID_TO_EUID: - return audit_uid_comparator(tsk->loginuid, f->op, cred->euid); + return audit_uid_comparator(audit_get_loginuid(tsk), f->op, + cred->euid); case AUDIT_COMPARE_AUID_TO_SUID: - return audit_uid_comparator(tsk->loginuid, f->op, cred->suid); + return audit_uid_comparator(audit_get_loginuid(tsk), f->op, + cred->suid); case AUDIT_COMPARE_AUID_TO_FSUID: - return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid); + return audit_uid_comparator(audit_get_loginuid(tsk), f->op, + cred->fsuid); /* euid comparisons */ case AUDIT_COMPARE_EUID_TO_SUID: return audit_uid_comparator(cred->euid, f->op, cred->suid); @@ -471,6 +475,8 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_EXE: result = audit_exe_compare(tsk, rule->exe); + if (f->op == Audit_not_equal) + result = !result; break; case AUDIT_UID: result = audit_uid_comparator(cred->uid, f->op, f->uid); @@ -511,7 +517,7 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_gid_comparator(cred->fsgid, f->op, f->gid); break; case AUDIT_SESSIONID: - sessionid = audit_get_sessionid(current); + sessionid = audit_get_sessionid(tsk); result = audit_comparator(sessionid, f->op, f->val); break; case AUDIT_PERS: @@ -609,7 +615,8 @@ static int audit_filter_rules(struct task_struct *tsk, result = match_tree_refs(ctx, rule->tree); break; case AUDIT_LOGINUID: - result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); + result = audit_uid_comparator(audit_get_loginuid(tsk), + f->op, f->uid); break; case AUDIT_LOGINUID_SET: result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); @@ -863,7 +870,7 @@ static inline struct audit_context *audit_take_context(struct task_struct *tsk, audit_filter_inodes(tsk, context); } - tsk->audit_context = NULL; + audit_set_context(tsk, NULL); return context; } @@ -950,7 +957,7 @@ int audit_alloc(struct task_struct *tsk) } context->filterkey = key; - tsk->audit_context = context; + audit_set_context(tsk, context); set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); return 0; } @@ -1507,8 +1514,7 @@ void __audit_free(struct task_struct *tsk) void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { - struct task_struct *tsk = current; - struct audit_context *context = tsk->audit_context; + struct audit_context *context = audit_context(); enum audit_state state; if (!audit_enabled || !context) @@ -1523,7 +1529,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, context->dummy = !audit_n_rules; if (!context->dummy && state == AUDIT_BUILD_CONTEXT) { context->prio = 0; - if (auditd_test_task(tsk)) + if (auditd_test_task(current)) return; } @@ -1553,7 +1559,6 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, */ void __audit_syscall_exit(int success, long return_code) { - struct task_struct *tsk = current; struct audit_context *context; if (success) @@ -1561,12 +1566,12 @@ void __audit_syscall_exit(int success, long return_code) else success = AUDITSC_FAILURE; - context = audit_take_context(tsk, success, return_code); + context = audit_take_context(current, success, return_code); if (!context) return; if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) - audit_log_exit(context, tsk); + audit_log_exit(context, current); context->in_syscall = 0; context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; @@ -1588,7 +1593,7 @@ void __audit_syscall_exit(int success, long return_code) kfree(context->filterkey); context->filterkey = NULL; } - tsk->audit_context = context; + audit_set_context(current, context); } static inline void handle_one(const struct inode *inode) @@ -1600,7 +1605,7 @@ static inline void handle_one(const struct inode *inode) int count; if (likely(!inode->i_fsnotify_marks)) return; - context = current->audit_context; + context = audit_context(); p = context->trees; count = context->tree_count; rcu_read_lock(); @@ -1631,7 +1636,7 @@ static void handle_path(const struct dentry *dentry) unsigned long seq; int count; - context = current->audit_context; + context = audit_context(); p = context->trees; count = context->tree_count; retry: @@ -1713,7 +1718,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, struct filename * __audit_reusename(const __user char *uptr) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); struct audit_names *n; list_for_each_entry(n, &context->names_list, list) { @@ -1736,7 +1741,7 @@ __audit_reusename(const __user char *uptr) */ void __audit_getname(struct filename *name) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); struct audit_names *n; if (!context->in_syscall) @@ -1764,7 +1769,7 @@ void __audit_getname(struct filename *name) void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); struct inode *inode = d_backing_inode(dentry); struct audit_names *n; bool parent = flags & AUDIT_INODE_PARENT; @@ -1863,7 +1868,7 @@ void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); struct inode *inode = d_backing_inode(dentry); const char *dname = dentry->d_name.name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; @@ -2048,7 +2053,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, int audit_set_loginuid(kuid_t loginuid) { struct task_struct *task = current; - unsigned int oldsessionid, sessionid = (unsigned int)-1; + unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; kuid_t oldloginuid; int rc; @@ -2062,7 +2067,7 @@ int audit_set_loginuid(kuid_t loginuid) /* are we setting or clearing? */ if (uid_valid(loginuid)) { sessionid = (unsigned int)atomic_inc_return(&session_id); - if (unlikely(sessionid == (unsigned int)-1)) + if (unlikely(sessionid == AUDIT_SID_UNSET)) sessionid = (unsigned int)atomic_inc_return(&session_id); } @@ -2082,7 +2087,7 @@ out: */ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); if (attr) memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr)); @@ -2106,7 +2111,7 @@ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); struct timespec64 *p = &context->mq_sendrecv.abs_timeout; if (abs_timeout) @@ -2130,7 +2135,7 @@ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); if (notification) context->mq_notify.sigev_signo = notification->sigev_signo; @@ -2149,7 +2154,7 @@ void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) */ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->mq_getsetattr.mqdes = mqdes; context->mq_getsetattr.mqstat = *mqstat; context->type = AUDIT_MQ_GETSETATTR; @@ -2162,7 +2167,7 @@ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) */ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->ipc.uid = ipcp->uid; context->ipc.gid = ipcp->gid; context->ipc.mode = ipcp->mode; @@ -2182,7 +2187,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) */ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->ipc.qbytes = qbytes; context->ipc.perm_uid = uid; @@ -2193,7 +2198,7 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo void __audit_bprm(struct linux_binprm *bprm) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->type = AUDIT_EXECVE; context->execve.argc = bprm->argc; @@ -2208,7 +2213,7 @@ void __audit_bprm(struct linux_binprm *bprm) */ int __audit_socketcall(int nargs, unsigned long *args) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); if (nargs <= 0 || nargs > AUDITSC_ARGS || !args) return -EINVAL; @@ -2226,7 +2231,7 @@ int __audit_socketcall(int nargs, unsigned long *args) */ void __audit_fd_pair(int fd1, int fd2) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->fds[0] = fd1; context->fds[1] = fd2; } @@ -2240,7 +2245,7 @@ void __audit_fd_pair(int fd1, int fd2) */ int __audit_sockaddr(int len, void *a) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); if (!context->sockaddr) { void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); @@ -2256,7 +2261,7 @@ int __audit_sockaddr(int len, void *a) void __audit_ptrace(struct task_struct *t) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->target_pid = task_tgid_nr(t); context->target_auid = audit_get_loginuid(t); @@ -2277,19 +2282,19 @@ void __audit_ptrace(struct task_struct *t) int audit_signal_info(int sig, struct task_struct *t) { struct audit_aux_data_pids *axp; - struct task_struct *tsk = current; - struct audit_context *ctx = tsk->audit_context; - kuid_t uid = current_uid(), t_uid = task_uid(t); + struct audit_context *ctx = audit_context(); + kuid_t uid = current_uid(), auid, t_uid = task_uid(t); if (auditd_test_task(t) && (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2)) { - audit_sig_pid = task_tgid_nr(tsk); - if (uid_valid(tsk->loginuid)) - audit_sig_uid = tsk->loginuid; + audit_sig_pid = task_tgid_nr(current); + auid = audit_get_loginuid(current); + if (uid_valid(auid)) + audit_sig_uid = auid; else audit_sig_uid = uid; - security_task_getsecid(tsk, &audit_sig_sid); + security_task_getsecid(current, &audit_sig_sid); } if (!audit_signals || audit_dummy_context()) @@ -2345,7 +2350,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { struct audit_aux_data_bprm_fcaps *ax; - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); struct cpu_vfs_cap_data vcaps; ax = kmalloc(sizeof(*ax), GFP_KERNEL); @@ -2385,7 +2390,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, */ void __audit_log_capset(const struct cred *new, const struct cred *old) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->capset.pid = task_tgid_nr(current); context->capset.cap.effective = new->cap_effective; context->capset.cap.inheritable = new->cap_effective; @@ -2396,7 +2401,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old) void __audit_mmap_fd(int fd, int flags) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->mmap.fd = fd; context->mmap.flags = flags; context->type = AUDIT_MMAP; @@ -2404,7 +2409,7 @@ void __audit_mmap_fd(int fd, int flags) void __audit_log_kern_module(char *name) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->module.name = kmalloc(strlen(name) + 1, GFP_KERNEL); strcpy(context->module.name, name); @@ -2413,7 +2418,7 @@ void __audit_log_kern_module(char *name) void __audit_fanotify(unsigned int response) { - audit_log(current->audit_context, GFP_KERNEL, + audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY, "resp=%u", response); } @@ -2464,7 +2469,19 @@ void audit_core_dumps(long signr) audit_log_end(ab); } -void __audit_seccomp(unsigned long syscall, long signr, int code) +/** + * audit_seccomp - record information about a seccomp action + * @syscall: syscall number + * @signr: signal value + * @code: the seccomp action + * + * Record the information associated with a seccomp action. Event filtering for + * seccomp actions that are not to be logged is done in seccomp_log(). + * Therefore, this function forces auditing independent of the audit_enabled + * and dummy context state because seccomp actions should be logged even when + * audit is not in use. + */ +void audit_seccomp(unsigned long syscall, long signr, int code) { struct audit_buffer *ab; @@ -2478,9 +2495,29 @@ void __audit_seccomp(unsigned long syscall, long signr, int code) audit_log_end(ab); } +void audit_seccomp_actions_logged(const char *names, const char *old_names, + int res) +{ + struct audit_buffer *ab; + + if (!audit_enabled) + return; + + ab = audit_log_start(audit_context(), GFP_KERNEL, + AUDIT_CONFIG_CHANGE); + if (unlikely(!ab)) + return; + + audit_log_format(ab, "op=seccomp-logging"); + audit_log_format(ab, " actions=%s", names); + audit_log_format(ab, " old-actions=%s", old_names); + audit_log_format(ab, " res=%d", res); + audit_log_end(ab); +} + struct list_head *audit_killed_trees(void) { - struct audit_context *ctx = current->audit_context; + struct audit_context *ctx = audit_context(); if (likely(!ctx || !ctx->in_syscall)) return NULL; return &ctx->killed_trees; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index a713fd23ec88..f27f5496d6fe 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -4,9 +4,13 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o +ifeq ($(CONFIG_XDP_SOCKETS),y) +obj-$(CONFIG_BPF_SYSCALL) += xskmap.o +endif obj-$(CONFIG_BPF_SYSCALL) += offload.o ifeq ($(CONFIG_STREAM_PARSER),y) ifeq ($(CONFIG_INET),y) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 14750e7c5ee4..544e58f5f642 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -11,11 +11,13 @@ * General Public License for more details. */ #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/filter.h> #include <linux/perf_event.h> +#include <uapi/linux/btf.h> #include "map_in_map.h" @@ -336,6 +338,52 @@ static void array_map_free(struct bpf_map *map) bpf_map_area_free(array); } +static void array_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void *value; + + rcu_read_lock(); + + value = array_map_lookup_elem(map, key); + if (!value) { + rcu_read_unlock(); + return; + } + + seq_printf(m, "%u: ", *(u32 *)key); + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); + seq_puts(m, "\n"); + + rcu_read_unlock(); +} + +static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, + u32 btf_key_id, u32 btf_value_id) +{ + const struct btf_type *key_type, *value_type; + u32 key_size, value_size; + u32 int_data; + + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EINVAL; + + int_data = *(u32 *)(key_type + 1); + /* bpf array can only take a u32 key. This check makes + * sure that the btf matches the attr used during map_create. + */ + if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || + BTF_INT_OFFSET(int_data)) + return -EINVAL; + + value_type = btf_type_id_size(btf, &btf_value_id, &value_size); + if (!value_type || value_size > map->value_size) + return -EINVAL; + + return 0; +} + const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -345,6 +393,8 @@ const struct bpf_map_ops array_map_ops = { .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, + .map_seq_show_elem = array_map_seq_show_elem, + .map_check_btf = array_map_check_btf, }; const struct bpf_map_ops percpu_array_map_ops = { @@ -476,7 +526,7 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr) } /* decrement refcnt of all bpf_progs that are stored in this map */ -void bpf_fd_array_map_clear(struct bpf_map *map) +static void bpf_fd_array_map_clear(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; @@ -495,6 +545,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, + .map_release_uref = bpf_fd_array_map_clear, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c new file mode 100644 index 000000000000..2d49d18b793a --- /dev/null +++ b/kernel/bpf/btf.c @@ -0,0 +1,2348 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018 Facebook */ + +#include <uapi/linux/btf.h> +#include <uapi/linux/types.h> +#include <linux/seq_file.h> +#include <linux/compiler.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/idr.h> +#include <linux/sort.h> +#include <linux/bpf_verifier.h> +#include <linux/btf.h> + +/* BTF (BPF Type Format) is the meta data format which describes + * the data types of BPF program/map. Hence, it basically focus + * on the C programming language which the modern BPF is primary + * using. + * + * ELF Section: + * ~~~~~~~~~~~ + * The BTF data is stored under the ".BTF" ELF section + * + * struct btf_type: + * ~~~~~~~~~~~~~~~ + * Each 'struct btf_type' object describes a C data type. + * Depending on the type it is describing, a 'struct btf_type' + * object may be followed by more data. F.e. + * To describe an array, 'struct btf_type' is followed by + * 'struct btf_array'. + * + * 'struct btf_type' and any extra data following it are + * 4 bytes aligned. + * + * Type section: + * ~~~~~~~~~~~~~ + * The BTF type section contains a list of 'struct btf_type' objects. + * Each one describes a C type. Recall from the above section + * that a 'struct btf_type' object could be immediately followed by extra + * data in order to desribe some particular C types. + * + * type_id: + * ~~~~~~~ + * Each btf_type object is identified by a type_id. The type_id + * is implicitly implied by the location of the btf_type object in + * the BTF type section. The first one has type_id 1. The second + * one has type_id 2...etc. Hence, an earlier btf_type has + * a smaller type_id. + * + * A btf_type object may refer to another btf_type object by using + * type_id (i.e. the "type" in the "struct btf_type"). + * + * NOTE that we cannot assume any reference-order. + * A btf_type object can refer to an earlier btf_type object + * but it can also refer to a later btf_type object. + * + * For example, to describe "const void *". A btf_type + * object describing "const" may refer to another btf_type + * object describing "void *". This type-reference is done + * by specifying type_id: + * + * [1] CONST (anon) type_id=2 + * [2] PTR (anon) type_id=0 + * + * The above is the btf_verifier debug log: + * - Each line started with "[?]" is a btf_type object + * - [?] is the type_id of the btf_type object. + * - CONST/PTR is the BTF_KIND_XXX + * - "(anon)" is the name of the type. It just + * happens that CONST and PTR has no name. + * - type_id=XXX is the 'u32 type' in btf_type + * + * NOTE: "void" has type_id 0 + * + * String section: + * ~~~~~~~~~~~~~~ + * The BTF string section contains the names used by the type section. + * Each string is referred by an "offset" from the beginning of the + * string section. + * + * Each string is '\0' terminated. + * + * The first character in the string section must be '\0' + * which is used to mean 'anonymous'. Some btf_type may not + * have a name. + */ + +/* BTF verification: + * + * To verify BTF data, two passes are needed. + * + * Pass #1 + * ~~~~~~~ + * The first pass is to collect all btf_type objects to + * an array: "btf->types". + * + * Depending on the C type that a btf_type is describing, + * a btf_type may be followed by extra data. We don't know + * how many btf_type is there, and more importantly we don't + * know where each btf_type is located in the type section. + * + * Without knowing the location of each type_id, most verifications + * cannot be done. e.g. an earlier btf_type may refer to a later + * btf_type (recall the "const void *" above), so we cannot + * check this type-reference in the first pass. + * + * In the first pass, it still does some verifications (e.g. + * checking the name is a valid offset to the string section). + * + * Pass #2 + * ~~~~~~~ + * The main focus is to resolve a btf_type that is referring + * to another type. + * + * We have to ensure the referring type: + * 1) does exist in the BTF (i.e. in btf->types[]) + * 2) does not cause a loop: + * struct A { + * struct B b; + * }; + * + * struct B { + * struct A a; + * }; + * + * btf_type_needs_resolve() decides if a btf_type needs + * to be resolved. + * + * The needs_resolve type implements the "resolve()" ops which + * essentially does a DFS and detects backedge. + * + * During resolve (or DFS), different C types have different + * "RESOLVED" conditions. + * + * When resolving a BTF_KIND_STRUCT, we need to resolve all its + * members because a member is always referring to another + * type. A struct's member can be treated as "RESOLVED" if + * it is referring to a BTF_KIND_PTR. Otherwise, the + * following valid C struct would be rejected: + * + * struct A { + * int m; + * struct A *a; + * }; + * + * When resolving a BTF_KIND_PTR, it needs to keep resolving if + * it is referring to another BTF_KIND_PTR. Otherwise, we cannot + * detect a pointer loop, e.g.: + * BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR + + * ^ | + * +-----------------------------------------+ + * + */ + +#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE) +#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1) +#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK) +#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) +#define BITS_ROUNDUP_BYTES(bits) \ + (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) + +#define BTF_INFO_MASK 0x0f00ffff +#define BTF_INT_MASK 0x0fffffff +#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) +#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) + +/* 16MB for 64k structs and each has 16 members and + * a few MB spaces for the string section. + * The hard limit is S32_MAX. + */ +#define BTF_MAX_SIZE (16 * 1024 * 1024) + +#define for_each_member(i, struct_type, member) \ + for (i = 0, member = btf_type_member(struct_type); \ + i < btf_type_vlen(struct_type); \ + i++, member++) + +#define for_each_member_from(i, from, struct_type, member) \ + for (i = from, member = btf_type_member(struct_type) + from; \ + i < btf_type_vlen(struct_type); \ + i++, member++) + +static DEFINE_IDR(btf_idr); +static DEFINE_SPINLOCK(btf_idr_lock); + +struct btf { + void *data; + struct btf_type **types; + u32 *resolved_ids; + u32 *resolved_sizes; + const char *strings; + void *nohdr_data; + struct btf_header hdr; + u32 nr_types; + u32 types_size; + u32 data_size; + refcount_t refcnt; + u32 id; + struct rcu_head rcu; +}; + +enum verifier_phase { + CHECK_META, + CHECK_TYPE, +}; + +struct resolve_vertex { + const struct btf_type *t; + u32 type_id; + u16 next_member; +}; + +enum visit_state { + NOT_VISITED, + VISITED, + RESOLVED, +}; + +enum resolve_mode { + RESOLVE_TBD, /* To Be Determined */ + RESOLVE_PTR, /* Resolving for Pointer */ + RESOLVE_STRUCT_OR_ARRAY, /* Resolving for struct/union + * or array + */ +}; + +#define MAX_RESOLVE_DEPTH 32 + +struct btf_sec_info { + u32 off; + u32 len; +}; + +struct btf_verifier_env { + struct btf *btf; + u8 *visit_states; + struct resolve_vertex stack[MAX_RESOLVE_DEPTH]; + struct bpf_verifier_log log; + u32 log_type_id; + u32 top_stack; + enum verifier_phase phase; + enum resolve_mode resolve_mode; +}; + +static const char * const btf_kind_str[NR_BTF_KINDS] = { + [BTF_KIND_UNKN] = "UNKNOWN", + [BTF_KIND_INT] = "INT", + [BTF_KIND_PTR] = "PTR", + [BTF_KIND_ARRAY] = "ARRAY", + [BTF_KIND_STRUCT] = "STRUCT", + [BTF_KIND_UNION] = "UNION", + [BTF_KIND_ENUM] = "ENUM", + [BTF_KIND_FWD] = "FWD", + [BTF_KIND_TYPEDEF] = "TYPEDEF", + [BTF_KIND_VOLATILE] = "VOLATILE", + [BTF_KIND_CONST] = "CONST", + [BTF_KIND_RESTRICT] = "RESTRICT", +}; + +struct btf_kind_operations { + s32 (*check_meta)(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left); + int (*resolve)(struct btf_verifier_env *env, + const struct resolve_vertex *v); + int (*check_member)(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type); + void (*log_details)(struct btf_verifier_env *env, + const struct btf_type *t); + void (*seq_show)(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offsets, + struct seq_file *m); +}; + +static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; +static struct btf_type btf_void; + +static bool btf_type_is_modifier(const struct btf_type *t) +{ + /* Some of them is not strictly a C modifier + * but they are grouped into the same bucket + * for BTF concern: + * A type (t) that refers to another + * type through t->type AND its size cannot + * be determined without following the t->type. + * + * ptr does not fall into this bucket + * because its size is always sizeof(void *). + */ + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + return true; + } + + return false; +} + +static bool btf_type_is_void(const struct btf_type *t) +{ + /* void => no type and size info. + * Hence, FWD is also treated as void. + */ + return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD; +} + +static bool btf_type_is_void_or_null(const struct btf_type *t) +{ + return !t || btf_type_is_void(t); +} + +/* union is only a special case of struct: + * all its offsetof(member) == 0 + */ +static bool btf_type_is_struct(const struct btf_type *t) +{ + u8 kind = BTF_INFO_KIND(t->info); + + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +} + +static bool btf_type_is_array(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; +} + +static bool btf_type_is_ptr(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; +} + +static bool btf_type_is_int(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_INT; +} + +/* What types need to be resolved? + * + * btf_type_is_modifier() is an obvious one. + * + * btf_type_is_struct() because its member refers to + * another type (through member->type). + + * btf_type_is_array() because its element (array->type) + * refers to another type. Array can be thought of a + * special case of struct while array just has the same + * member-type repeated by array->nelems of times. + */ +static bool btf_type_needs_resolve(const struct btf_type *t) +{ + return btf_type_is_modifier(t) || + btf_type_is_ptr(t) || + btf_type_is_struct(t) || + btf_type_is_array(t); +} + +/* t->size can be used */ +static bool btf_type_has_size(const struct btf_type *t) +{ + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_INT: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + return true; + } + + return false; +} + +static const char *btf_int_encoding_str(u8 encoding) +{ + if (encoding == 0) + return "(none)"; + else if (encoding == BTF_INT_SIGNED) + return "SIGNED"; + else if (encoding == BTF_INT_CHAR) + return "CHAR"; + else if (encoding == BTF_INT_BOOL) + return "BOOL"; + else + return "UNKN"; +} + +static u16 btf_type_vlen(const struct btf_type *t) +{ + return BTF_INFO_VLEN(t->info); +} + +static u32 btf_type_int(const struct btf_type *t) +{ + return *(u32 *)(t + 1); +} + +static const struct btf_array *btf_type_array(const struct btf_type *t) +{ + return (const struct btf_array *)(t + 1); +} + +static const struct btf_member *btf_type_member(const struct btf_type *t) +{ + return (const struct btf_member *)(t + 1); +} + +static const struct btf_enum *btf_type_enum(const struct btf_type *t) +{ + return (const struct btf_enum *)(t + 1); +} + +static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) +{ + return kind_ops[BTF_INFO_KIND(t->info)]; +} + +static bool btf_name_offset_valid(const struct btf *btf, u32 offset) +{ + return BTF_STR_OFFSET_VALID(offset) && + offset < btf->hdr.str_len; +} + +static const char *btf_name_by_offset(const struct btf *btf, u32 offset) +{ + if (!offset) + return "(anon)"; + else if (offset < btf->hdr.str_len) + return &btf->strings[offset]; + else + return "(invalid-name-offset)"; +} + +static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) +{ + if (type_id > btf->nr_types) + return NULL; + + return btf->types[type_id]; +} + +/* + * Regular int is not a bit field and it must be either + * u8/u16/u32/u64. + */ +static bool btf_type_int_is_regular(const struct btf_type *t) +{ + u16 nr_bits, nr_bytes; + u32 int_data; + + int_data = btf_type_int(t); + nr_bits = BTF_INT_BITS(int_data); + nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); + if (BITS_PER_BYTE_MASKED(nr_bits) || + BTF_INT_OFFSET(int_data) || + (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && + nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { + return false; + } + + return true; +} + +__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); +} + +__printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env, + const char *fmt, ...) +{ + struct bpf_verifier_log *log = &env->log; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); +} + +__printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, + const struct btf_type *t, + bool log_details, + const char *fmt, ...) +{ + struct bpf_verifier_log *log = &env->log; + u8 kind = BTF_INFO_KIND(t->info); + struct btf *btf = env->btf; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + __btf_verifier_log(log, "[%u] %s %s%s", + env->log_type_id, + btf_kind_str[kind], + btf_name_by_offset(btf, t->name_off), + log_details ? " " : ""); + + if (log_details) + btf_type_ops(t)->log_details(env, t); + + if (fmt && *fmt) { + __btf_verifier_log(log, " "); + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); + } + + __btf_verifier_log(log, "\n"); +} + +#define btf_verifier_log_type(env, t, ...) \ + __btf_verifier_log_type((env), (t), true, __VA_ARGS__) +#define btf_verifier_log_basic(env, t, ...) \ + __btf_verifier_log_type((env), (t), false, __VA_ARGS__) + +__printf(4, 5) +static void btf_verifier_log_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const char *fmt, ...) +{ + struct bpf_verifier_log *log = &env->log; + struct btf *btf = env->btf; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + /* The CHECK_META phase already did a btf dump. + * + * If member is logged again, it must hit an error in + * parsing this member. It is useful to print out which + * struct this member belongs to. + */ + if (env->phase != CHECK_META) + btf_verifier_log_type(env, struct_type, NULL); + + __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", + btf_name_by_offset(btf, member->name_off), + member->type, member->offset); + + if (fmt && *fmt) { + __btf_verifier_log(log, " "); + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); + } + + __btf_verifier_log(log, "\n"); +} + +static void btf_verifier_log_hdr(struct btf_verifier_env *env, + u32 btf_data_size) +{ + struct bpf_verifier_log *log = &env->log; + const struct btf *btf = env->btf; + const struct btf_header *hdr; + + if (!bpf_verifier_log_needed(log)) + return; + + hdr = &btf->hdr; + __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic); + __btf_verifier_log(log, "version: %u\n", hdr->version); + __btf_verifier_log(log, "flags: 0x%x\n", hdr->flags); + __btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len); + __btf_verifier_log(log, "type_off: %u\n", hdr->type_off); + __btf_verifier_log(log, "type_len: %u\n", hdr->type_len); + __btf_verifier_log(log, "str_off: %u\n", hdr->str_off); + __btf_verifier_log(log, "str_len: %u\n", hdr->str_len); + __btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size); +} + +static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) +{ + struct btf *btf = env->btf; + + /* < 2 because +1 for btf_void which is always in btf->types[0]. + * btf_void is not accounted in btf->nr_types because btf_void + * does not come from the BTF file. + */ + if (btf->types_size - btf->nr_types < 2) { + /* Expand 'types' array */ + + struct btf_type **new_types; + u32 expand_by, new_size; + + if (btf->types_size == BTF_MAX_TYPE) { + btf_verifier_log(env, "Exceeded max num of types"); + return -E2BIG; + } + + expand_by = max_t(u32, btf->types_size >> 2, 16); + new_size = min_t(u32, BTF_MAX_TYPE, + btf->types_size + expand_by); + + new_types = kvcalloc(new_size, sizeof(*new_types), + GFP_KERNEL | __GFP_NOWARN); + if (!new_types) + return -ENOMEM; + + if (btf->nr_types == 0) + new_types[0] = &btf_void; + else + memcpy(new_types, btf->types, + sizeof(*btf->types) * (btf->nr_types + 1)); + + kvfree(btf->types); + btf->types = new_types; + btf->types_size = new_size; + } + + btf->types[++(btf->nr_types)] = t; + + return 0; +} + +static int btf_alloc_id(struct btf *btf) +{ + int id; + + idr_preload(GFP_KERNEL); + spin_lock_bh(&btf_idr_lock); + id = idr_alloc_cyclic(&btf_idr, btf, 1, INT_MAX, GFP_ATOMIC); + if (id > 0) + btf->id = id; + spin_unlock_bh(&btf_idr_lock); + idr_preload_end(); + + if (WARN_ON_ONCE(!id)) + return -ENOSPC; + + return id > 0 ? 0 : id; +} + +static void btf_free_id(struct btf *btf) +{ + unsigned long flags; + + /* + * In map-in-map, calling map_delete_elem() on outer + * map will call bpf_map_put on the inner map. + * It will then eventually call btf_free_id() + * on the inner map. Some of the map_delete_elem() + * implementation may have irq disabled, so + * we need to use the _irqsave() version instead + * of the _bh() version. + */ + spin_lock_irqsave(&btf_idr_lock, flags); + idr_remove(&btf_idr, btf->id); + spin_unlock_irqrestore(&btf_idr_lock, flags); +} + +static void btf_free(struct btf *btf) +{ + kvfree(btf->types); + kvfree(btf->resolved_sizes); + kvfree(btf->resolved_ids); + kvfree(btf->data); + kfree(btf); +} + +static void btf_free_rcu(struct rcu_head *rcu) +{ + struct btf *btf = container_of(rcu, struct btf, rcu); + + btf_free(btf); +} + +void btf_put(struct btf *btf) +{ + if (btf && refcount_dec_and_test(&btf->refcnt)) { + btf_free_id(btf); + call_rcu(&btf->rcu, btf_free_rcu); + } +} + +static int env_resolve_init(struct btf_verifier_env *env) +{ + struct btf *btf = env->btf; + u32 nr_types = btf->nr_types; + u32 *resolved_sizes = NULL; + u32 *resolved_ids = NULL; + u8 *visit_states = NULL; + + /* +1 for btf_void */ + resolved_sizes = kvcalloc(nr_types + 1, sizeof(*resolved_sizes), + GFP_KERNEL | __GFP_NOWARN); + if (!resolved_sizes) + goto nomem; + + resolved_ids = kvcalloc(nr_types + 1, sizeof(*resolved_ids), + GFP_KERNEL | __GFP_NOWARN); + if (!resolved_ids) + goto nomem; + + visit_states = kvcalloc(nr_types + 1, sizeof(*visit_states), + GFP_KERNEL | __GFP_NOWARN); + if (!visit_states) + goto nomem; + + btf->resolved_sizes = resolved_sizes; + btf->resolved_ids = resolved_ids; + env->visit_states = visit_states; + + return 0; + +nomem: + kvfree(resolved_sizes); + kvfree(resolved_ids); + kvfree(visit_states); + return -ENOMEM; +} + +static void btf_verifier_env_free(struct btf_verifier_env *env) +{ + kvfree(env->visit_states); + kfree(env); +} + +static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, + const struct btf_type *next_type) +{ + switch (env->resolve_mode) { + case RESOLVE_TBD: + /* int, enum or void is a sink */ + return !btf_type_needs_resolve(next_type); + case RESOLVE_PTR: + /* int, enum, void, struct or array is a sink for ptr */ + return !btf_type_is_modifier(next_type) && + !btf_type_is_ptr(next_type); + case RESOLVE_STRUCT_OR_ARRAY: + /* int, enum, void or ptr is a sink for struct and array */ + return !btf_type_is_modifier(next_type) && + !btf_type_is_array(next_type) && + !btf_type_is_struct(next_type); + default: + BUG(); + } +} + +static bool env_type_is_resolved(const struct btf_verifier_env *env, + u32 type_id) +{ + return env->visit_states[type_id] == RESOLVED; +} + +static int env_stack_push(struct btf_verifier_env *env, + const struct btf_type *t, u32 type_id) +{ + struct resolve_vertex *v; + + if (env->top_stack == MAX_RESOLVE_DEPTH) + return -E2BIG; + + if (env->visit_states[type_id] != NOT_VISITED) + return -EEXIST; + + env->visit_states[type_id] = VISITED; + + v = &env->stack[env->top_stack++]; + v->t = t; + v->type_id = type_id; + v->next_member = 0; + + if (env->resolve_mode == RESOLVE_TBD) { + if (btf_type_is_ptr(t)) + env->resolve_mode = RESOLVE_PTR; + else if (btf_type_is_struct(t) || btf_type_is_array(t)) + env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY; + } + + return 0; +} + +static void env_stack_set_next_member(struct btf_verifier_env *env, + u16 next_member) +{ + env->stack[env->top_stack - 1].next_member = next_member; +} + +static void env_stack_pop_resolved(struct btf_verifier_env *env, + u32 resolved_type_id, + u32 resolved_size) +{ + u32 type_id = env->stack[--(env->top_stack)].type_id; + struct btf *btf = env->btf; + + btf->resolved_sizes[type_id] = resolved_size; + btf->resolved_ids[type_id] = resolved_type_id; + env->visit_states[type_id] = RESOLVED; +} + +static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) +{ + return env->top_stack ? &env->stack[env->top_stack - 1] : NULL; +} + +/* The input param "type_id" must point to a needs_resolve type */ +static const struct btf_type *btf_type_id_resolve(const struct btf *btf, + u32 *type_id) +{ + *type_id = btf->resolved_ids[*type_id]; + return btf_type_by_id(btf, *type_id); +} + +const struct btf_type *btf_type_id_size(const struct btf *btf, + u32 *type_id, u32 *ret_size) +{ + const struct btf_type *size_type; + u32 size_type_id = *type_id; + u32 size = 0; + + size_type = btf_type_by_id(btf, size_type_id); + if (btf_type_is_void_or_null(size_type)) + return NULL; + + if (btf_type_has_size(size_type)) { + size = size_type->size; + } else if (btf_type_is_array(size_type)) { + size = btf->resolved_sizes[size_type_id]; + } else if (btf_type_is_ptr(size_type)) { + size = sizeof(void *); + } else { + if (WARN_ON_ONCE(!btf_type_is_modifier(size_type))) + return NULL; + + size = btf->resolved_sizes[size_type_id]; + size_type_id = btf->resolved_ids[size_type_id]; + size_type = btf_type_by_id(btf, size_type_id); + if (btf_type_is_void(size_type)) + return NULL; + } + + *type_id = size_type_id; + if (ret_size) + *ret_size = size; + + return size_type; +} + +static int btf_df_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + btf_verifier_log_basic(env, struct_type, + "Unsupported check_member"); + return -EINVAL; +} + +static int btf_df_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + btf_verifier_log_basic(env, v->t, "Unsupported resolve"); + return -EINVAL; +} + +static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offsets, + struct seq_file *m) +{ + seq_printf(m, "<unsupported kind:%u>", BTF_INFO_KIND(t->info)); +} + +static int btf_int_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 int_data = btf_type_int(member_type); + u32 struct_bits_off = member->offset; + u32 struct_size = struct_type->size; + u32 nr_copy_bits; + u32 bytes_offset; + + if (U32_MAX - struct_bits_off < BTF_INT_OFFSET(int_data)) { + btf_verifier_log_member(env, struct_type, member, + "bits_offset exceeds U32_MAX"); + return -EINVAL; + } + + struct_bits_off += BTF_INT_OFFSET(int_data); + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + nr_copy_bits = BTF_INT_BITS(int_data) + + BITS_PER_BYTE_MASKED(struct_bits_off); + + if (nr_copy_bits > BITS_PER_U64) { + btf_verifier_log_member(env, struct_type, member, + "nr_copy_bits exceeds 64"); + return -EINVAL; + } + + if (struct_size < bytes_offset || + struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + +static s32 btf_int_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + u32 int_data, nr_bits, meta_needed = sizeof(int_data); + u16 encoding; + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + int_data = btf_type_int(t); + if (int_data & ~BTF_INT_MASK) { + btf_verifier_log_basic(env, t, "Invalid int_data:%x", + int_data); + return -EINVAL; + } + + nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); + + if (nr_bits > BITS_PER_U64) { + btf_verifier_log_type(env, t, "nr_bits exceeds %zu", + BITS_PER_U64); + return -EINVAL; + } + + if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) { + btf_verifier_log_type(env, t, "nr_bits exceeds type_size"); + return -EINVAL; + } + + /* + * Only one of the encoding bits is allowed and it + * should be sufficient for the pretty print purpose (i.e. decoding). + * Multiple bits can be allowed later if it is found + * to be insufficient. + */ + encoding = BTF_INT_ENCODING(int_data); + if (encoding && + encoding != BTF_INT_SIGNED && + encoding != BTF_INT_CHAR && + encoding != BTF_INT_BOOL) { + btf_verifier_log_type(env, t, "Unsupported encoding"); + return -ENOTSUPP; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static void btf_int_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + int int_data = btf_type_int(t); + + btf_verifier_log(env, + "size=%u bits_offset=%u nr_bits=%u encoding=%s", + t->size, BTF_INT_OFFSET(int_data), + BTF_INT_BITS(int_data), + btf_int_encoding_str(BTF_INT_ENCODING(int_data))); +} + +static void btf_int_bits_seq_show(const struct btf *btf, + const struct btf_type *t, + void *data, u8 bits_offset, + struct seq_file *m) +{ + u32 int_data = btf_type_int(t); + u16 nr_bits = BTF_INT_BITS(int_data); + u16 total_bits_offset; + u16 nr_copy_bytes; + u16 nr_copy_bits; + u8 nr_upper_bits; + union { + u64 u64_num; + u8 u8_nums[8]; + } print_num; + + total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); + data += BITS_ROUNDDOWN_BYTES(total_bits_offset); + bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); + nr_copy_bits = nr_bits + bits_offset; + nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); + + print_num.u64_num = 0; + memcpy(&print_num.u64_num, data, nr_copy_bytes); + + /* Ditch the higher order bits */ + nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits); + if (nr_upper_bits) { + /* We need to mask out some bits of the upper byte. */ + u8 mask = (1 << nr_upper_bits) - 1; + + print_num.u8_nums[nr_copy_bytes - 1] &= mask; + } + + print_num.u64_num >>= bits_offset; + + seq_printf(m, "0x%llx", print_num.u64_num); +} + +static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + u32 int_data = btf_type_int(t); + u8 encoding = BTF_INT_ENCODING(int_data); + bool sign = encoding & BTF_INT_SIGNED; + u32 nr_bits = BTF_INT_BITS(int_data); + + if (bits_offset || BTF_INT_OFFSET(int_data) || + BITS_PER_BYTE_MASKED(nr_bits)) { + btf_int_bits_seq_show(btf, t, data, bits_offset, m); + return; + } + + switch (nr_bits) { + case 64: + if (sign) + seq_printf(m, "%lld", *(s64 *)data); + else + seq_printf(m, "%llu", *(u64 *)data); + break; + case 32: + if (sign) + seq_printf(m, "%d", *(s32 *)data); + else + seq_printf(m, "%u", *(u32 *)data); + break; + case 16: + if (sign) + seq_printf(m, "%d", *(s16 *)data); + else + seq_printf(m, "%u", *(u16 *)data); + break; + case 8: + if (sign) + seq_printf(m, "%d", *(s8 *)data); + else + seq_printf(m, "%u", *(u8 *)data); + break; + default: + btf_int_bits_seq_show(btf, t, data, bits_offset, m); + } +} + +static const struct btf_kind_operations int_ops = { + .check_meta = btf_int_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_int_check_member, + .log_details = btf_int_log, + .seq_show = btf_int_seq_show, +}; + +static int btf_modifier_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + const struct btf_type *resolved_type; + u32 resolved_type_id = member->type; + struct btf_member resolved_member; + struct btf *btf = env->btf; + + resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL); + if (!resolved_type) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member"); + return -EINVAL; + } + + resolved_member = *member; + resolved_member.type = resolved_type_id; + + return btf_type_ops(resolved_type)->check_member(env, struct_type, + &resolved_member, + resolved_type); +} + +static int btf_ptr_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_size, struct_bits_off, bytes_offset; + + struct_size = struct_type->size; + struct_bits_off = member->offset; + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Member is not byte aligned"); + return -EINVAL; + } + + if (struct_size - bytes_offset < sizeof(void *)) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + +static int btf_ref_type_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (!BTF_TYPE_ID_VALID(t->type)) { + btf_verifier_log_type(env, t, "Invalid type_id"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return 0; +} + +static int btf_modifier_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_type *t = v->t; + const struct btf_type *next_type; + u32 next_type_id = t->type; + struct btf *btf = env->btf; + u32 next_type_size = 0; + + next_type = btf_type_by_id(btf, next_type_id); + if (!next_type) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + + /* "typedef void new_void", "const void"...etc */ + if (btf_type_is_void(next_type)) + goto resolved; + + if (!env_type_is_resolve_sink(env, next_type) && + !env_type_is_resolved(env, next_type_id)) + return env_stack_push(env, next_type, next_type_id); + + /* Figure out the resolved next_type_id with size. + * They will be stored in the current modifier's + * resolved_ids and resolved_sizes such that it can + * save us a few type-following when we use it later (e.g. in + * pretty print). + */ + if (!btf_type_id_size(btf, &next_type_id, &next_type_size) && + !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + +resolved: + env_stack_pop_resolved(env, next_type_id, next_type_size); + + return 0; +} + +static int btf_ptr_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_type *next_type; + const struct btf_type *t = v->t; + u32 next_type_id = t->type; + struct btf *btf = env->btf; + u32 next_type_size = 0; + + next_type = btf_type_by_id(btf, next_type_id); + if (!next_type) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + + /* "void *" */ + if (btf_type_is_void(next_type)) + goto resolved; + + if (!env_type_is_resolve_sink(env, next_type) && + !env_type_is_resolved(env, next_type_id)) + return env_stack_push(env, next_type, next_type_id); + + /* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY, + * the modifier may have stopped resolving when it was resolved + * to a ptr (last-resolved-ptr). + * + * We now need to continue from the last-resolved-ptr to + * ensure the last-resolved-ptr will not referring back to + * the currenct ptr (t). + */ + if (btf_type_is_modifier(next_type)) { + const struct btf_type *resolved_type; + u32 resolved_type_id; + + resolved_type_id = next_type_id; + resolved_type = btf_type_id_resolve(btf, &resolved_type_id); + + if (btf_type_is_ptr(resolved_type) && + !env_type_is_resolve_sink(env, resolved_type) && + !env_type_is_resolved(env, resolved_type_id)) + return env_stack_push(env, resolved_type, + resolved_type_id); + } + + if (!btf_type_id_size(btf, &next_type_id, &next_type_size) && + !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + +resolved: + env_stack_pop_resolved(env, next_type_id, 0); + + return 0; +} + +static void btf_modifier_seq_show(const struct btf *btf, + const struct btf_type *t, + u32 type_id, void *data, + u8 bits_offset, struct seq_file *m) +{ + t = btf_type_id_resolve(btf, &type_id); + + btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); +} + +static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + /* It is a hashed value */ + seq_printf(m, "%p", *(void **)data); +} + +static void btf_ref_type_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "type_id=%u", t->type); +} + +static struct btf_kind_operations modifier_ops = { + .check_meta = btf_ref_type_check_meta, + .resolve = btf_modifier_resolve, + .check_member = btf_modifier_check_member, + .log_details = btf_ref_type_log, + .seq_show = btf_modifier_seq_show, +}; + +static struct btf_kind_operations ptr_ops = { + .check_meta = btf_ref_type_check_meta, + .resolve = btf_ptr_resolve, + .check_member = btf_ptr_check_member, + .log_details = btf_ref_type_log, + .seq_show = btf_ptr_seq_show, +}; + +static s32 btf_fwd_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (t->type) { + btf_verifier_log_type(env, t, "type != 0"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return 0; +} + +static struct btf_kind_operations fwd_ops = { + .check_meta = btf_fwd_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_df_check_member, + .log_details = btf_ref_type_log, + .seq_show = btf_df_seq_show, +}; + +static int btf_array_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_bits_off = member->offset; + u32 struct_size, bytes_offset; + u32 array_type_id, array_size; + struct btf *btf = env->btf; + + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Member is not byte aligned"); + return -EINVAL; + } + + array_type_id = member->type; + btf_type_id_size(btf, &array_type_id, &array_size); + struct_size = struct_type->size; + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + if (struct_size - bytes_offset < array_size) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + +static s32 btf_array_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_array *array = btf_type_array(t); + u32 meta_needed = sizeof(*array); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (t->size) { + btf_verifier_log_type(env, t, "size != 0"); + return -EINVAL; + } + + /* Array elem type and index type cannot be in type void, + * so !array->type and !array->index_type are not allowed. + */ + if (!array->type || !BTF_TYPE_ID_VALID(array->type)) { + btf_verifier_log_type(env, t, "Invalid elem"); + return -EINVAL; + } + + if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) { + btf_verifier_log_type(env, t, "Invalid index"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static int btf_array_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_array *array = btf_type_array(v->t); + const struct btf_type *elem_type, *index_type; + u32 elem_type_id, index_type_id; + struct btf *btf = env->btf; + u32 elem_size; + + /* Check array->index_type */ + index_type_id = array->index_type; + index_type = btf_type_by_id(btf, index_type_id); + if (btf_type_is_void_or_null(index_type)) { + btf_verifier_log_type(env, v->t, "Invalid index"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, index_type) && + !env_type_is_resolved(env, index_type_id)) + return env_stack_push(env, index_type, index_type_id); + + index_type = btf_type_id_size(btf, &index_type_id, NULL); + if (!index_type || !btf_type_is_int(index_type) || + !btf_type_int_is_regular(index_type)) { + btf_verifier_log_type(env, v->t, "Invalid index"); + return -EINVAL; + } + + /* Check array->type */ + elem_type_id = array->type; + elem_type = btf_type_by_id(btf, elem_type_id); + if (btf_type_is_void_or_null(elem_type)) { + btf_verifier_log_type(env, v->t, + "Invalid elem"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, elem_type) && + !env_type_is_resolved(env, elem_type_id)) + return env_stack_push(env, elem_type, elem_type_id); + + elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); + if (!elem_type) { + btf_verifier_log_type(env, v->t, "Invalid elem"); + return -EINVAL; + } + + if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) { + btf_verifier_log_type(env, v->t, "Invalid array of int"); + return -EINVAL; + } + + if (array->nelems && elem_size > U32_MAX / array->nelems) { + btf_verifier_log_type(env, v->t, + "Array size overflows U32_MAX"); + return -EINVAL; + } + + env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems); + + return 0; +} + +static void btf_array_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + const struct btf_array *array = btf_type_array(t); + + btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u", + array->type, array->index_type, array->nelems); +} + +static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + const struct btf_array *array = btf_type_array(t); + const struct btf_kind_operations *elem_ops; + const struct btf_type *elem_type; + u32 i, elem_size, elem_type_id; + + elem_type_id = array->type; + elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); + elem_ops = btf_type_ops(elem_type); + seq_puts(m, "["); + for (i = 0; i < array->nelems; i++) { + if (i) + seq_puts(m, ","); + + elem_ops->seq_show(btf, elem_type, elem_type_id, data, + bits_offset, m); + data += elem_size; + } + seq_puts(m, "]"); +} + +static struct btf_kind_operations array_ops = { + .check_meta = btf_array_check_meta, + .resolve = btf_array_resolve, + .check_member = btf_array_check_member, + .log_details = btf_array_log, + .seq_show = btf_array_seq_show, +}; + +static int btf_struct_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_bits_off = member->offset; + u32 struct_size, bytes_offset; + + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Member is not byte aligned"); + return -EINVAL; + } + + struct_size = struct_type->size; + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + if (struct_size - bytes_offset < member_type->size) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + +static s32 btf_struct_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION; + const struct btf_member *member; + struct btf *btf = env->btf; + u32 struct_size = t->size; + u32 meta_needed; + u16 i; + + meta_needed = btf_type_vlen(t) * sizeof(*member); + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for_each_member(i, t, member) { + if (!btf_name_offset_valid(btf, member->name_off)) { + btf_verifier_log_member(env, t, member, + "Invalid member name_offset:%u", + member->name_off); + return -EINVAL; + } + + /* A member cannot be in type void */ + if (!member->type || !BTF_TYPE_ID_VALID(member->type)) { + btf_verifier_log_member(env, t, member, + "Invalid type_id"); + return -EINVAL; + } + + if (is_union && member->offset) { + btf_verifier_log_member(env, t, member, + "Invalid member bits_offset"); + return -EINVAL; + } + + if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) { + btf_verifier_log_member(env, t, member, + "Memmber bits_offset exceeds its struct size"); + return -EINVAL; + } + + btf_verifier_log_member(env, t, member, NULL); + } + + return meta_needed; +} + +static int btf_struct_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_member *member; + int err; + u16 i; + + /* Before continue resolving the next_member, + * ensure the last member is indeed resolved to a + * type with size info. + */ + if (v->next_member) { + const struct btf_type *last_member_type; + const struct btf_member *last_member; + u16 last_member_type_id; + + last_member = btf_type_member(v->t) + v->next_member - 1; + last_member_type_id = last_member->type; + if (WARN_ON_ONCE(!env_type_is_resolved(env, + last_member_type_id))) + return -EINVAL; + + last_member_type = btf_type_by_id(env->btf, + last_member_type_id); + err = btf_type_ops(last_member_type)->check_member(env, v->t, + last_member, + last_member_type); + if (err) + return err; + } + + for_each_member_from(i, v->next_member, v->t, member) { + u32 member_type_id = member->type; + const struct btf_type *member_type = btf_type_by_id(env->btf, + member_type_id); + + if (btf_type_is_void_or_null(member_type)) { + btf_verifier_log_member(env, v->t, member, + "Invalid member"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, member_type) && + !env_type_is_resolved(env, member_type_id)) { + env_stack_set_next_member(env, i + 1); + return env_stack_push(env, member_type, member_type_id); + } + + err = btf_type_ops(member_type)->check_member(env, v->t, + member, + member_type); + if (err) + return err; + } + + env_stack_pop_resolved(env, 0, 0); + + return 0; +} + +static void btf_struct_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +} + +static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? "|" : ","; + const struct btf_member *member; + u32 i; + + seq_puts(m, "{"); + for_each_member(i, t, member) { + const struct btf_type *member_type = btf_type_by_id(btf, + member->type); + u32 member_offset = member->offset; + u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); + u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset); + const struct btf_kind_operations *ops; + + if (i) + seq_puts(m, seq); + + ops = btf_type_ops(member_type); + ops->seq_show(btf, member_type, member->type, + data + bytes_offset, bits8_offset, m); + } + seq_puts(m, "}"); +} + +static struct btf_kind_operations struct_ops = { + .check_meta = btf_struct_check_meta, + .resolve = btf_struct_resolve, + .check_member = btf_struct_check_member, + .log_details = btf_struct_log, + .seq_show = btf_struct_seq_show, +}; + +static int btf_enum_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_bits_off = member->offset; + u32 struct_size, bytes_offset; + + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Member is not byte aligned"); + return -EINVAL; + } + + struct_size = struct_type->size; + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + if (struct_size - bytes_offset < sizeof(int)) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + +static s32 btf_enum_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_enum *enums = btf_type_enum(t); + struct btf *btf = env->btf; + u16 i, nr_enums; + u32 meta_needed; + + nr_enums = btf_type_vlen(t); + meta_needed = nr_enums * sizeof(*enums); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (t->size != sizeof(int)) { + btf_verifier_log_type(env, t, "Expected size:%zu", + sizeof(int)); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for (i = 0; i < nr_enums; i++) { + if (!btf_name_offset_valid(btf, enums[i].name_off)) { + btf_verifier_log(env, "\tInvalid name_offset:%u", + enums[i].name_off); + return -EINVAL; + } + + btf_verifier_log(env, "\t%s val=%d\n", + btf_name_by_offset(btf, enums[i].name_off), + enums[i].val); + } + + return meta_needed; +} + +static void btf_enum_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +} + +static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + const struct btf_enum *enums = btf_type_enum(t); + u32 i, nr_enums = btf_type_vlen(t); + int v = *(int *)data; + + for (i = 0; i < nr_enums; i++) { + if (v == enums[i].val) { + seq_printf(m, "%s", + btf_name_by_offset(btf, enums[i].name_off)); + return; + } + } + + seq_printf(m, "%d", v); +} + +static struct btf_kind_operations enum_ops = { + .check_meta = btf_enum_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_enum_check_member, + .log_details = btf_enum_log, + .seq_show = btf_enum_seq_show, +}; + +static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { + [BTF_KIND_INT] = &int_ops, + [BTF_KIND_PTR] = &ptr_ops, + [BTF_KIND_ARRAY] = &array_ops, + [BTF_KIND_STRUCT] = &struct_ops, + [BTF_KIND_UNION] = &struct_ops, + [BTF_KIND_ENUM] = &enum_ops, + [BTF_KIND_FWD] = &fwd_ops, + [BTF_KIND_TYPEDEF] = &modifier_ops, + [BTF_KIND_VOLATILE] = &modifier_ops, + [BTF_KIND_CONST] = &modifier_ops, + [BTF_KIND_RESTRICT] = &modifier_ops, +}; + +static s32 btf_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + u32 saved_meta_left = meta_left; + s32 var_meta_size; + + if (meta_left < sizeof(*t)) { + btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu", + env->log_type_id, meta_left, sizeof(*t)); + return -EINVAL; + } + meta_left -= sizeof(*t); + + if (t->info & ~BTF_INFO_MASK) { + btf_verifier_log(env, "[%u] Invalid btf_info:%x", + env->log_type_id, t->info); + return -EINVAL; + } + + if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || + BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { + btf_verifier_log(env, "[%u] Invalid kind:%u", + env->log_type_id, BTF_INFO_KIND(t->info)); + return -EINVAL; + } + + if (!btf_name_offset_valid(env->btf, t->name_off)) { + btf_verifier_log(env, "[%u] Invalid name_offset:%u", + env->log_type_id, t->name_off); + return -EINVAL; + } + + var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left); + if (var_meta_size < 0) + return var_meta_size; + + meta_left -= var_meta_size; + + return saved_meta_left - meta_left; +} + +static int btf_check_all_metas(struct btf_verifier_env *env) +{ + struct btf *btf = env->btf; + struct btf_header *hdr; + void *cur, *end; + + hdr = &btf->hdr; + cur = btf->nohdr_data + hdr->type_off; + end = btf->nohdr_data + hdr->type_len; + + env->log_type_id = 1; + while (cur < end) { + struct btf_type *t = cur; + s32 meta_size; + + meta_size = btf_check_meta(env, t, end - cur); + if (meta_size < 0) + return meta_size; + + btf_add_type(env, t); + cur += meta_size; + env->log_type_id++; + } + + return 0; +} + +static int btf_resolve(struct btf_verifier_env *env, + const struct btf_type *t, u32 type_id) +{ + const struct resolve_vertex *v; + int err = 0; + + env->resolve_mode = RESOLVE_TBD; + env_stack_push(env, t, type_id); + while (!err && (v = env_stack_peak(env))) { + env->log_type_id = v->type_id; + err = btf_type_ops(v->t)->resolve(env, v); + } + + env->log_type_id = type_id; + if (err == -E2BIG) + btf_verifier_log_type(env, t, + "Exceeded max resolving depth:%u", + MAX_RESOLVE_DEPTH); + else if (err == -EEXIST) + btf_verifier_log_type(env, t, "Loop detected"); + + return err; +} + +static bool btf_resolve_valid(struct btf_verifier_env *env, + const struct btf_type *t, + u32 type_id) +{ + struct btf *btf = env->btf; + + if (!env_type_is_resolved(env, type_id)) + return false; + + if (btf_type_is_struct(t)) + return !btf->resolved_ids[type_id] && + !btf->resolved_sizes[type_id]; + + if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) { + t = btf_type_id_resolve(btf, &type_id); + return t && !btf_type_is_modifier(t); + } + + if (btf_type_is_array(t)) { + const struct btf_array *array = btf_type_array(t); + const struct btf_type *elem_type; + u32 elem_type_id = array->type; + u32 elem_size; + + elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); + return elem_type && !btf_type_is_modifier(elem_type) && + (array->nelems * elem_size == + btf->resolved_sizes[type_id]); + } + + return false; +} + +static int btf_check_all_types(struct btf_verifier_env *env) +{ + struct btf *btf = env->btf; + u32 type_id; + int err; + + err = env_resolve_init(env); + if (err) + return err; + + env->phase++; + for (type_id = 1; type_id <= btf->nr_types; type_id++) { + const struct btf_type *t = btf_type_by_id(btf, type_id); + + env->log_type_id = type_id; + if (btf_type_needs_resolve(t) && + !env_type_is_resolved(env, type_id)) { + err = btf_resolve(env, t, type_id); + if (err) + return err; + } + + if (btf_type_needs_resolve(t) && + !btf_resolve_valid(env, t, type_id)) { + btf_verifier_log_type(env, t, "Invalid resolve state"); + return -EINVAL; + } + } + + return 0; +} + +static int btf_parse_type_sec(struct btf_verifier_env *env) +{ + const struct btf_header *hdr = &env->btf->hdr; + int err; + + /* Type section must align to 4 bytes */ + if (hdr->type_off & (sizeof(u32) - 1)) { + btf_verifier_log(env, "Unaligned type_off"); + return -EINVAL; + } + + if (!hdr->type_len) { + btf_verifier_log(env, "No type found"); + return -EINVAL; + } + + err = btf_check_all_metas(env); + if (err) + return err; + + return btf_check_all_types(env); +} + +static int btf_parse_str_sec(struct btf_verifier_env *env) +{ + const struct btf_header *hdr; + struct btf *btf = env->btf; + const char *start, *end; + + hdr = &btf->hdr; + start = btf->nohdr_data + hdr->str_off; + end = start + hdr->str_len; + + if (end != btf->data + btf->data_size) { + btf_verifier_log(env, "String section is not at the end"); + return -EINVAL; + } + + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || + start[0] || end[-1]) { + btf_verifier_log(env, "Invalid string section"); + return -EINVAL; + } + + btf->strings = start; + + return 0; +} + +static const size_t btf_sec_info_offset[] = { + offsetof(struct btf_header, type_off), + offsetof(struct btf_header, str_off), +}; + +static int btf_sec_info_cmp(const void *a, const void *b) +{ + const struct btf_sec_info *x = a; + const struct btf_sec_info *y = b; + + return (int)(x->off - y->off) ? : (int)(x->len - y->len); +} + +static int btf_check_sec_info(struct btf_verifier_env *env, + u32 btf_data_size) +{ + struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)]; + u32 total, expected_total, i; + const struct btf_header *hdr; + const struct btf *btf; + + btf = env->btf; + hdr = &btf->hdr; + + /* Populate the secs from hdr */ + for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) + secs[i] = *(struct btf_sec_info *)((void *)hdr + + btf_sec_info_offset[i]); + + sort(secs, ARRAY_SIZE(btf_sec_info_offset), + sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL); + + /* Check for gaps and overlap among sections */ + total = 0; + expected_total = btf_data_size - hdr->hdr_len; + for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) { + if (expected_total < secs[i].off) { + btf_verifier_log(env, "Invalid section offset"); + return -EINVAL; + } + if (total < secs[i].off) { + /* gap */ + btf_verifier_log(env, "Unsupported section found"); + return -EINVAL; + } + if (total > secs[i].off) { + btf_verifier_log(env, "Section overlap found"); + return -EINVAL; + } + if (expected_total - total < secs[i].len) { + btf_verifier_log(env, + "Total section length too long"); + return -EINVAL; + } + total += secs[i].len; + } + + /* There is data other than hdr and known sections */ + if (expected_total != total) { + btf_verifier_log(env, "Unsupported section found"); + return -EINVAL; + } + + return 0; +} + +static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data, + u32 btf_data_size) +{ + const struct btf_header *hdr; + u32 hdr_len, hdr_copy; + /* + * Minimal part of the "struct btf_header" that + * contains the hdr_len. + */ + struct btf_min_header { + u16 magic; + u8 version; + u8 flags; + u32 hdr_len; + } __user *min_hdr; + struct btf *btf; + int err; + + btf = env->btf; + min_hdr = btf_data; + + if (btf_data_size < sizeof(*min_hdr)) { + btf_verifier_log(env, "hdr_len not found"); + return -EINVAL; + } + + if (get_user(hdr_len, &min_hdr->hdr_len)) + return -EFAULT; + + if (btf_data_size < hdr_len) { + btf_verifier_log(env, "btf_header not found"); + return -EINVAL; + } + + err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len); + if (err) { + if (err == -E2BIG) + btf_verifier_log(env, "Unsupported btf_header"); + return err; + } + + hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); + if (copy_from_user(&btf->hdr, btf_data, hdr_copy)) + return -EFAULT; + + hdr = &btf->hdr; + + btf_verifier_log_hdr(env, btf_data_size); + + if (hdr->magic != BTF_MAGIC) { + btf_verifier_log(env, "Invalid magic"); + return -EINVAL; + } + + if (hdr->version != BTF_VERSION) { + btf_verifier_log(env, "Unsupported version"); + return -ENOTSUPP; + } + + if (hdr->flags) { + btf_verifier_log(env, "Unsupported flags"); + return -ENOTSUPP; + } + + if (btf_data_size == hdr->hdr_len) { + btf_verifier_log(env, "No data"); + return -EINVAL; + } + + err = btf_check_sec_info(env, btf_data_size); + if (err) + return err; + + return 0; +} + +static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, + u32 log_level, char __user *log_ubuf, u32 log_size) +{ + struct btf_verifier_env *env = NULL; + struct bpf_verifier_log *log; + struct btf *btf = NULL; + u8 *data; + int err; + + if (btf_data_size > BTF_MAX_SIZE) + return ERR_PTR(-E2BIG); + + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); + if (!env) + return ERR_PTR(-ENOMEM); + + log = &env->log; + if (log_level || log_ubuf || log_size) { + /* user requested verbose verifier output + * and supplied buffer to store the verification trace + */ + log->level = log_level; + log->ubuf = log_ubuf; + log->len_total = log_size; + + /* log attributes have to be sane */ + if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + !log->level || !log->ubuf) { + err = -EINVAL; + goto errout; + } + } + + btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); + if (!btf) { + err = -ENOMEM; + goto errout; + } + env->btf = btf; + + err = btf_parse_hdr(env, btf_data, btf_data_size); + if (err) + goto errout; + + data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); + if (!data) { + err = -ENOMEM; + goto errout; + } + + btf->data = data; + btf->data_size = btf_data_size; + btf->nohdr_data = btf->data + btf->hdr.hdr_len; + + if (copy_from_user(data, btf_data, btf_data_size)) { + err = -EFAULT; + goto errout; + } + + err = btf_parse_str_sec(env); + if (err) + goto errout; + + err = btf_parse_type_sec(env); + if (err) + goto errout; + + if (log->level && bpf_verifier_log_full(log)) { + err = -ENOSPC; + goto errout; + } + + btf_verifier_env_free(env); + refcount_set(&btf->refcnt, 1); + return btf; + +errout: + btf_verifier_env_free(env); + if (btf) + btf_free(btf); + return ERR_PTR(err); +} + +void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, + struct seq_file *m) +{ + const struct btf_type *t = btf_type_by_id(btf, type_id); + + btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m); +} + +static int btf_release(struct inode *inode, struct file *filp) +{ + btf_put(filp->private_data); + return 0; +} + +const struct file_operations btf_fops = { + .release = btf_release, +}; + +static int __btf_new_fd(struct btf *btf) +{ + return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC); +} + +int btf_new_fd(const union bpf_attr *attr) +{ + struct btf *btf; + int ret; + + btf = btf_parse(u64_to_user_ptr(attr->btf), + attr->btf_size, attr->btf_log_level, + u64_to_user_ptr(attr->btf_log_buf), + attr->btf_log_size); + if (IS_ERR(btf)) + return PTR_ERR(btf); + + ret = btf_alloc_id(btf); + if (ret) { + btf_free(btf); + return ret; + } + + /* + * The BTF ID is published to the userspace. + * All BTF free must go through call_rcu() from + * now on (i.e. free by calling btf_put()). + */ + + ret = __btf_new_fd(btf); + if (ret < 0) + btf_put(btf); + + return ret; +} + +struct btf *btf_get_by_fd(int fd) +{ + struct btf *btf; + struct fd f; + + f = fdget(fd); + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &btf_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + btf = f.file->private_data; + refcount_inc(&btf->refcnt); + fdput(f); + + return btf; +} + +int btf_get_info_by_fd(const struct btf *btf, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_btf_info __user *uinfo; + struct bpf_btf_info info = {}; + u32 info_copy, btf_copy; + void __user *ubtf; + u32 uinfo_len; + + uinfo = u64_to_user_ptr(attr->info.info); + uinfo_len = attr->info.info_len; + + info_copy = min_t(u32, uinfo_len, sizeof(info)); + if (copy_from_user(&info, uinfo, info_copy)) + return -EFAULT; + + info.id = btf->id; + ubtf = u64_to_user_ptr(info.btf); + btf_copy = min_t(u32, btf->data_size, info.btf_size); + if (copy_to_user(ubtf, btf->data, btf_copy)) + return -EFAULT; + info.btf_size = btf->data_size; + + if (copy_to_user(uinfo, &info, info_copy) || + put_user(info_copy, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + +int btf_get_fd_by_id(u32 id) +{ + struct btf *btf; + int fd; + + rcu_read_lock(); + btf = idr_find(&btf_idr, id); + if (!btf || !refcount_inc_not_zero(&btf->refcnt)) + btf = ERR_PTR(-ENOENT); + rcu_read_unlock(); + + if (IS_ERR(btf)) + return PTR_ERR(btf); + + fd = __btf_new_fd(btf); + if (fd < 0) + btf_put(btf); + + return fd; +} + +u32 btf_id(const struct btf *btf) +{ + return btf->id; +} diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 43171a0bb02b..f7c00bd6f8e4 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -500,6 +500,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * @sk: sock struct that will use sockaddr * @uaddr: sockaddr struct provided by user * @type: The type of program to be exectuted + * @t_ctx: Pointer to attach type specific context * * socket is expected to be of type INET or INET6. * @@ -508,12 +509,15 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); */ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type) + enum bpf_attach_type type, + void *t_ctx) { struct bpf_sock_addr_kern ctx = { .sk = sk, .uaddr = uaddr, + .t_ctx = t_ctx, }; + struct sockaddr_storage unspec; struct cgroup *cgrp; int ret; @@ -523,6 +527,11 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) return 0; + if (!ctx.uaddr) { + memset(&unspec, 0, sizeof(unspec)); + ctx.uaddr = (struct sockaddr *)&unspec; + } + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d315b393abdd..9f1493705f40 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -31,6 +31,7 @@ #include <linux/rbtree_latch.h> #include <linux/kallsyms.h> #include <linux/rcupdate.h> +#include <linux/perf_event.h> #include <asm/unaligned.h> @@ -218,47 +219,84 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) return 0; } -static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta) +static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta, + u32 curr, const bool probe_pass) { + const s64 imm_min = S32_MIN, imm_max = S32_MAX; + s64 imm = insn->imm; + + if (curr < pos && curr + imm + 1 > pos) + imm += delta; + else if (curr > pos + delta && curr + imm + 1 <= pos + delta) + imm -= delta; + if (imm < imm_min || imm > imm_max) + return -ERANGE; + if (!probe_pass) + insn->imm = imm; + return 0; +} + +static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta, + u32 curr, const bool probe_pass) +{ + const s32 off_min = S16_MIN, off_max = S16_MAX; + s32 off = insn->off; + + if (curr < pos && curr + off + 1 > pos) + off += delta; + else if (curr > pos + delta && curr + off + 1 <= pos + delta) + off -= delta; + if (off < off_min || off > off_max) + return -ERANGE; + if (!probe_pass) + insn->off = off; + return 0; +} + +static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, + const bool probe_pass) +{ + u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0); struct bpf_insn *insn = prog->insnsi; - u32 i, insn_cnt = prog->len; - bool pseudo_call; - u8 code; - int off; + int ret = 0; for (i = 0; i < insn_cnt; i++, insn++) { + u8 code; + + /* In the probing pass we still operate on the original, + * unpatched image in order to check overflows before we + * do any other adjustments. Therefore skip the patchlet. + */ + if (probe_pass && i == pos) { + i += delta + 1; + insn++; + } code = insn->code; - if (BPF_CLASS(code) != BPF_JMP) - continue; - if (BPF_OP(code) == BPF_EXIT) + if (BPF_CLASS(code) != BPF_JMP || + BPF_OP(code) == BPF_EXIT) continue; + /* Adjust offset of jmps if we cross patch boundaries. */ if (BPF_OP(code) == BPF_CALL) { - if (insn->src_reg == BPF_PSEUDO_CALL) - pseudo_call = true; - else + if (insn->src_reg != BPF_PSEUDO_CALL) continue; + ret = bpf_adj_delta_to_imm(insn, pos, delta, i, + probe_pass); } else { - pseudo_call = false; + ret = bpf_adj_delta_to_off(insn, pos, delta, i, + probe_pass); } - off = pseudo_call ? insn->imm : insn->off; - - /* Adjust offset of jmps if we cross boundaries. */ - if (i < pos && i + off + 1 > pos) - off += delta; - else if (i > pos + delta && i + off + 1 <= pos + delta) - off -= delta; - - if (pseudo_call) - insn->imm = off; - else - insn->off = off; + if (ret) + break; } + + return ret; } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len) { u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; + const u32 cnt_max = S16_MAX; struct bpf_prog *prog_adj; /* Since our patchlet doesn't expand the image, we're done. */ @@ -269,6 +307,15 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, insn_adj_cnt = prog->len + insn_delta; + /* Reject anything that would potentially let the insn->off + * target overflow when we have excessive program expansions. + * We need to probe here before we do any reallocation where + * we afterwards may not fail anymore. + */ + if (insn_adj_cnt > cnt_max && + bpf_adj_branches(prog, off, insn_delta, true)) + return NULL; + /* Several new instructions need to be inserted. Make room * for them. Likely, there's no need for a new allocation as * last page could have large enough tailroom. @@ -294,7 +341,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, sizeof(*patch) * insn_rest); memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len); - bpf_adj_branches(prog_adj, off, insn_delta); + /* We are guaranteed to not fail at this point, otherwise + * the ship has sailed to reverse to the original state. An + * overflow cannot happen at this point. + */ + BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false)); return prog_adj; } @@ -633,23 +684,6 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; - case BPF_LD | BPF_ABS | BPF_W: - case BPF_LD | BPF_ABS | BPF_H: - case BPF_LD | BPF_ABS | BPF_B: - *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); - *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); - *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); - break; - - case BPF_LD | BPF_IND | BPF_W: - case BPF_LD | BPF_IND | BPF_H: - case BPF_LD | BPF_IND | BPF_B: - *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); - *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); - *to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg); - *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); - break; - case BPF_LD | BPF_IMM | BPF_DW: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); @@ -890,14 +924,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(LDX, MEM, W), \ INSN_3(LDX, MEM, DW), \ /* Immediate based. */ \ - INSN_3(LD, IMM, DW), \ - /* Misc (old cBPF carry-over). */ \ - INSN_3(LD, ABS, B), \ - INSN_3(LD, ABS, H), \ - INSN_3(LD, ABS, W), \ - INSN_3(LD, IND, B), \ - INSN_3(LD, IND, H), \ - INSN_3(LD, IND, W) + INSN_3(LD, IMM, DW) bool bpf_opcode_in_insntable(u8 code) { @@ -907,6 +934,13 @@ bool bpf_opcode_in_insntable(u8 code) [0 ... 255] = false, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), + /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */ + [BPF_LD | BPF_ABS | BPF_B] = true, + [BPF_LD | BPF_ABS | BPF_H] = true, + [BPF_LD | BPF_ABS | BPF_W] = true, + [BPF_LD | BPF_IND | BPF_B] = true, + [BPF_LD | BPF_IND | BPF_H] = true, + [BPF_LD | BPF_IND | BPF_W] = true, }; #undef BPF_INSN_3_TBL #undef BPF_INSN_2_TBL @@ -937,8 +971,6 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; - void *ptr; - int off; #define CONT ({ insn++; goto select_insn; }) #define CONT_JMP ({ insn++; goto select_insn; }) @@ -1265,67 +1297,6 @@ out: atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) (DST + insn->off)); CONT; - LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ - off = IMM; -load_word: - /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only - * appearing in the programs where ctx == skb - * (see may_access_skb() in the verifier). All programs - * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6, - * bpf_convert_filter() saves it in BPF_R6, internal BPF - * verifier will check that BPF_R6 == ctx. - * - * BPF_ABS and BPF_IND are wrappers of function calls, - * so they scratch BPF_R1-BPF_R5 registers, preserve - * BPF_R6-BPF_R9, and store return value into BPF_R0. - * - * Implicit input: - * ctx == skb == BPF_R6 == CTX - * - * Explicit input: - * SRC == any register - * IMM == 32-bit immediate - * - * Output: - * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness - */ - - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be32(ptr); - CONT; - } - - return 0; - LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ - off = IMM; -load_half: - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be16(ptr); - CONT; - } - - return 0; - LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ - off = IMM; -load_byte: - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = *(u8 *)ptr; - CONT; - } - - return 0; - LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_word; - LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_half; - LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ - off = IMM + SRC; - goto load_byte; default_label: /* If we ever reach this, we have a bug somewhere. Die hard here @@ -1572,13 +1543,32 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) return cnt; } +static bool bpf_prog_array_copy_core(struct bpf_prog **prog, + u32 *prog_ids, + u32 request_cnt) +{ + int i = 0; + + for (; *prog; prog++) { + if (*prog == &dummy_bpf_prog.prog) + continue; + prog_ids[i] = (*prog)->aux->id; + if (++i == request_cnt) { + prog++; + break; + } + } + + return !!(*prog); +} + int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, __u32 __user *prog_ids, u32 cnt) { struct bpf_prog **prog; unsigned long err = 0; - u32 i = 0, *ids; bool nospc; + u32 *ids; /* users of this function are doing: * cnt = bpf_prog_array_length(); @@ -1595,16 +1585,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, return -ENOMEM; rcu_read_lock(); prog = rcu_dereference(progs)->progs; - for (; *prog; prog++) { - if (*prog == &dummy_bpf_prog.prog) - continue; - ids[i] = (*prog)->aux->id; - if (++i == cnt) { - prog++; - break; - } - } - nospc = !!(*prog); + nospc = bpf_prog_array_copy_core(prog, ids, cnt); rcu_read_unlock(); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); @@ -1635,6 +1616,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, int new_prog_cnt, carry_prog_cnt = 0; struct bpf_prog **existing_prog; struct bpf_prog_array *array; + bool found_exclude = false; int new_prog_idx = 0; /* Figure out how many existing progs we need to carry over to @@ -1643,14 +1625,20 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, if (old_array) { existing_prog = old_array->progs; for (; *existing_prog; existing_prog++) { - if (*existing_prog != exclude_prog && - *existing_prog != &dummy_bpf_prog.prog) + if (*existing_prog == exclude_prog) { + found_exclude = true; + continue; + } + if (*existing_prog != &dummy_bpf_prog.prog) carry_prog_cnt++; if (*existing_prog == include_prog) return -EEXIST; } } + if (exclude_prog && !found_exclude) + return -ENOENT; + /* How many progs (not NULL) will be in the new array? */ new_prog_cnt = carry_prog_cnt; if (include_prog) @@ -1683,22 +1671,25 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, } int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, - __u32 __user *prog_ids, u32 request_cnt, - __u32 __user *prog_cnt) + u32 *prog_ids, u32 request_cnt, + u32 *prog_cnt) { + struct bpf_prog **prog; u32 cnt = 0; if (array) cnt = bpf_prog_array_length(array); - if (copy_to_user(prog_cnt, &cnt, sizeof(cnt))) - return -EFAULT; + *prog_cnt = cnt; /* return early if user requested only program count or nothing to copy */ if (!request_cnt || !cnt) return 0; - return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt); + /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ + prog = rcu_dereference_check(array, 1)->progs; + return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? -ENOSPC + : 0; } static void bpf_prog_free_deferred(struct work_struct *work) @@ -1709,6 +1700,10 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); +#ifdef CONFIG_PERF_EVENTS + if (aux->prog->has_callchain_buf) + put_callchain_buffers(); +#endif for (i = 0; i < aux->func_cnt; i++) bpf_jit_free(aux->func[i]); if (aux->func_cnt) { @@ -1769,6 +1764,8 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_sock_map_update_proto __weak; +const struct bpf_func_proto bpf_sock_hash_update_proto __weak; +const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { @@ -1781,6 +1778,7 @@ bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, { return -ENOTSUPP; } +EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { @@ -1827,9 +1825,3 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, #include <linux/bpf_trace.h> EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); - -/* These are only used within the BPF_SYSCALL code */ -#ifdef CONFIG_BPF_SYSCALL -EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type); -EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); -#endif diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a4bb0b34375a..e0918d180f08 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -19,6 +19,7 @@ #include <linux/bpf.h> #include <linux/filter.h> #include <linux/ptr_ring.h> +#include <net/xdp.h> #include <linux/sched.h> #include <linux/workqueue.h> @@ -137,27 +138,6 @@ free_cmap: return ERR_PTR(err); } -static void __cpu_map_queue_destructor(void *ptr) -{ - /* The tear-down procedure should have made sure that queue is - * empty. See __cpu_map_entry_replace() and work-queue - * invoked cpu_map_kthread_stop(). Catch any broken behaviour - * gracefully and warn once. - */ - if (WARN_ON_ONCE(ptr)) - page_frag_free(ptr); -} - -static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) -{ - if (atomic_dec_and_test(&rcpu->refcnt)) { - /* The queue should be empty at this point */ - ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor); - kfree(rcpu->queue); - kfree(rcpu); - } -} - static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) { atomic_inc(&rcpu->refcnt); @@ -179,45 +159,8 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } -/* For now, xdp_pkt is a cpumap internal data structure, with info - * carried between enqueue to dequeue. It is mapped into the top - * headroom of the packet, to avoid allocating separate mem. - */ -struct xdp_pkt { - void *data; - u16 len; - u16 headroom; - u16 metasize; - struct net_device *dev_rx; -}; - -/* Convert xdp_buff to xdp_pkt */ -static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) -{ - struct xdp_pkt *xdp_pkt; - int metasize; - int headroom; - - /* Assure headroom is available for storing info */ - headroom = xdp->data - xdp->data_hard_start; - metasize = xdp->data - xdp->data_meta; - metasize = metasize > 0 ? metasize : 0; - if (unlikely((headroom - metasize) < sizeof(*xdp_pkt))) - return NULL; - - /* Store info in top of packet */ - xdp_pkt = xdp->data_hard_start; - - xdp_pkt->data = xdp->data; - xdp_pkt->len = xdp->data_end - xdp->data; - xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); - xdp_pkt->metasize = metasize; - - return xdp_pkt; -} - static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_pkt *xdp_pkt) + struct xdp_frame *xdpf) { unsigned int frame_size; void *pkt_data_start; @@ -232,7 +175,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * would be preferred to set frame_size to 2048 or 4096 * depending on the driver. * frame_size = 2048; - * frame_len = frame_size - sizeof(*xdp_pkt); + * frame_len = frame_size - sizeof(*xdp_frame); * * Instead, with info avail, skb_shared_info in placed after * packet len. This, unfortunately fakes the truesize. @@ -240,21 +183,21 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * is not at a fixed memory location, with mixed length * packets, which is bad for cache-line hotness. */ - frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom + + frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - pkt_data_start = xdp_pkt->data - xdp_pkt->headroom; + pkt_data_start = xdpf->data - xdpf->headroom; skb = build_skb(pkt_data_start, frame_size); if (!skb) return NULL; - skb_reserve(skb, xdp_pkt->headroom); - __skb_put(skb, xdp_pkt->len); - if (xdp_pkt->metasize) - skb_metadata_set(skb, xdp_pkt->metasize); + skb_reserve(skb, xdpf->headroom); + __skb_put(skb, xdpf->len); + if (xdpf->metasize) + skb_metadata_set(skb, xdpf->metasize); /* Essential SKB info: protocol and skb->dev */ - skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx); + skb->protocol = eth_type_trans(skb, xdpf->dev_rx); /* Optional SKB info, currently missing: * - HW checksum info (skb->ip_summed) @@ -265,6 +208,31 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, return skb; } +static void __cpu_map_ring_cleanup(struct ptr_ring *ring) +{ + /* The tear-down procedure should have made sure that queue is + * empty. See __cpu_map_entry_replace() and work-queue + * invoked cpu_map_kthread_stop(). Catch any broken behaviour + * gracefully and warn once. + */ + struct xdp_frame *xdpf; + + while ((xdpf = ptr_ring_consume(ring))) + if (WARN_ON_ONCE(xdpf)) + xdp_return_frame(xdpf); +} + +static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + if (atomic_dec_and_test(&rcpu->refcnt)) { + /* The queue should be empty at this point */ + __cpu_map_ring_cleanup(rcpu->queue); + ptr_ring_cleanup(rcpu->queue, NULL); + kfree(rcpu->queue); + kfree(rcpu); + } +} + static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; @@ -278,7 +246,7 @@ static int cpu_map_kthread_run(void *data) */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { unsigned int processed = 0, drops = 0, sched = 0; - struct xdp_pkt *xdp_pkt; + struct xdp_frame *xdpf; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -301,13 +269,13 @@ static int cpu_map_kthread_run(void *data) * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) { + while ((xdpf = __ptr_ring_consume(rcpu->queue))) { struct sk_buff *skb; int ret; - skb = cpu_map_build_skb(rcpu, xdp_pkt); + skb = cpu_map_build_skb(rcpu, xdpf); if (!skb) { - page_frag_free(xdp_pkt); + xdp_return_frame(xdpf); continue; } @@ -604,13 +572,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, spin_lock(&q->producer_lock); for (i = 0; i < bq->count; i++) { - void *xdp_pkt = bq->q[i]; + struct xdp_frame *xdpf = bq->q[i]; int err; - err = __ptr_ring_produce(q, xdp_pkt); + err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - page_frag_free(xdp_pkt); /* Free xdp_pkt */ + xdp_return_frame_rx_napi(xdpf); } processed++; } @@ -625,7 +593,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); @@ -636,28 +604,28 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) * driver to code invoking us to finished, due to driver * (e.g. ixgbe) recycle tricks based on page-refcnt. * - * Thus, incoming xdp_pkt is always queued here (else we race + * Thus, incoming xdp_frame is always queued here (else we race * with another CPU on page-refcnt and remaining driver code). * Queue time is very short, as driver will invoke flush * operation, when completing napi->poll call. */ - bq->q[bq->count++] = xdp_pkt; + bq->q[bq->count++] = xdpf; return 0; } int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx) { - struct xdp_pkt *xdp_pkt; + struct xdp_frame *xdpf; - xdp_pkt = convert_to_xdp_pkt(xdp); - if (unlikely(!xdp_pkt)) + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) return -EOVERFLOW; /* Info needed when constructing SKB on remote CPU */ - xdp_pkt->dev_rx = dev_rx; + xdpf->dev_rx = dev_rx; - bq_enqueue(rcpu, xdp_pkt); + bq_enqueue(rcpu, xdpf); return 0; } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 565f9ece9115..a7cc7b3494a9 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -48,15 +48,25 @@ * calls will fail at this point. */ #include <linux/bpf.h> +#include <net/xdp.h> #include <linux/filter.h> +#include <trace/events/xdp.h> #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +#define DEV_MAP_BULK_SIZE 16 +struct xdp_bulk_queue { + struct xdp_frame *q[DEV_MAP_BULK_SIZE]; + struct net_device *dev_rx; + unsigned int count; +}; + struct bpf_dtab_netdev { - struct net_device *dev; + struct net_device *dev; /* must be first member, due to tracepoint */ struct bpf_dtab *dtab; unsigned int bit; + struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; }; @@ -206,6 +216,50 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) __set_bit(bit, bitmap); } +static int bq_xmit_all(struct bpf_dtab_netdev *obj, + struct xdp_bulk_queue *bq, u32 flags) +{ + struct net_device *dev = obj->dev; + int sent = 0, drops = 0, err = 0; + int i; + + if (unlikely(!bq->count)) + return 0; + + for (i = 0; i < bq->count; i++) { + struct xdp_frame *xdpf = bq->q[i]; + + prefetch(xdpf); + } + + sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); + if (sent < 0) { + err = sent; + sent = 0; + goto error; + } + drops = bq->count - sent; +out: + bq->count = 0; + + trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, + sent, drops, bq->dev_rx, dev, err); + bq->dev_rx = NULL; + return 0; +error: + /* If ndo_xdp_xmit fails with an errno, no frames have been + * xmit'ed and it's our responsibility to them free all. + */ + for (i = 0; i < bq->count; i++) { + struct xdp_frame *xdpf = bq->q[i]; + + /* RX path under NAPI protection, can return frames faster */ + xdp_return_frame_rx_napi(xdpf); + drops++; + } + goto out; +} + /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled @@ -221,7 +275,7 @@ void __dev_map_flush(struct bpf_map *map) for_each_set_bit(bit, bitmap, map->max_entries) { struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); - struct net_device *netdev; + struct xdp_bulk_queue *bq; /* This is possible if the dev entry is removed by user space * between xdp redirect and flush op. @@ -230,9 +284,9 @@ void __dev_map_flush(struct bpf_map *map) continue; __clear_bit(bit, bitmap); - netdev = dev->dev; - if (likely(netdev->netdev_ops->ndo_xdp_flush)) - netdev->netdev_ops->ndo_xdp_flush(netdev); + + bq = this_cpu_ptr(dev->bulkq); + bq_xmit_all(dev, bq, XDP_XMIT_FLUSH); } } @@ -240,37 +294,79 @@ void __dev_map_flush(struct bpf_map *map) * update happens in parallel here a dev_put wont happen until after reading the * ifindex. */ -struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_dtab_netdev *dev; + struct bpf_dtab_netdev *obj; if (key >= map->max_entries) return NULL; - dev = READ_ONCE(dtab->netdev_map[key]); - return dev ? dev->dev : NULL; + obj = READ_ONCE(dtab->netdev_map[key]); + return obj; +} + +/* Runs under RCU-read-side, plus in softirq under NAPI protection. + * Thus, safe percpu variable access. + */ +static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, + struct net_device *dev_rx) + +{ + struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); + + if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) + bq_xmit_all(obj, bq, 0); + + /* Ingress dev_rx will be the same for all xdp_frame's in + * bulk_queue, because bq stored per-CPU and must be flushed + * from net_device drivers NAPI func end. + */ + if (!bq->dev_rx) + bq->dev_rx = dev_rx; + + bq->q[bq->count++] = xdpf; + return 0; +} + +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + struct net_device *dev = dst->dev; + struct xdp_frame *xdpf; + + if (!dev->netdev_ops->ndo_xdp_xmit) + return -EOPNOTSUPP; + + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + return bq_enqueue(dst, xdpf, dev_rx); } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { - struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key); + struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); + struct net_device *dev = obj ? obj->dev : NULL; return dev ? &dev->ifindex : NULL; } static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { - if (dev->dev->netdev_ops->ndo_xdp_flush) { - struct net_device *fl = dev->dev; + if (dev->dev->netdev_ops->ndo_xdp_xmit) { + struct xdp_bulk_queue *bq; unsigned long *bitmap; + int cpu; for_each_online_cpu(cpu) { bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); __clear_bit(dev->bit, bitmap); - fl->netdev_ops->ndo_xdp_flush(dev->dev); + bq = per_cpu_ptr(dev->bulkq, cpu); + bq_xmit_all(dev, bq, XDP_XMIT_FLUSH); } } } @@ -281,6 +377,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu) dev = container_of(rcu, struct bpf_dtab_netdev, rcu); dev_map_flush_old(dev); + free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -313,6 +410,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct net *net = current->nsproxy->net_ns; + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev, *old_dev; u32 i = *(u32 *)key; u32 ifindex = *(u32 *)value; @@ -327,13 +425,20 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, if (!ifindex) { dev = NULL; } else { - dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, - map->numa_node); + dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node); if (!dev) return -ENOMEM; + dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), + sizeof(void *), gfp); + if (!dev->bulkq) { + kfree(dev); + return -ENOMEM; + } + dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { + free_percpu(dev->bulkq); kfree(dev); return -EINVAL; } @@ -405,6 +510,9 @@ static struct notifier_block dev_map_notifier = { static int __init dev_map_init(void) { + /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ + BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != + offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); return 0; } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b76828f23b49..3ca2198a6d22 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -503,7 +503,9 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; - *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); + BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, offsetof(struct htab_elem, key) + @@ -530,7 +532,9 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map, const int ret = BPF_REG_0; const int ref_reg = BPF_REG_1; - *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); + BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4); *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret, offsetof(struct htab_elem, lru_node) + @@ -1369,7 +1373,9 @@ static u32 htab_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; - *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); + BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, offsetof(struct htab_elem, key) + diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 3d24e238221e..73065e2d23c2 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -179,3 +179,18 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE, }; + +#ifdef CONFIG_CGROUPS +BPF_CALL_0(bpf_get_current_cgroup_id) +{ + struct cgroup *cgrp = task_dfl_cgroup(current); + + return cgrp->kn->id.id; +} + +const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { + .func = bpf_get_current_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; +#endif diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index bf6da59ae0d0..ed13645bd80c 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -150,8 +150,154 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; } +struct map_iter { + void *key; + bool done; +}; + +static struct map_iter *map_iter(struct seq_file *m) +{ + return m->private; +} + +static struct bpf_map *seq_file_to_map(struct seq_file *m) +{ + return file_inode(m->file)->i_private; +} + +static void map_iter_free(struct map_iter *iter) +{ + if (iter) { + kfree(iter->key); + kfree(iter); + } +} + +static struct map_iter *map_iter_alloc(struct bpf_map *map) +{ + struct map_iter *iter; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN); + if (!iter) + goto error; + + iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN); + if (!iter->key) + goto error; + + return iter; + +error: + map_iter_free(iter); + return NULL; +} + +static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct bpf_map *map = seq_file_to_map(m); + void *key = map_iter(m)->key; + + if (map_iter(m)->done) + return NULL; + + if (unlikely(v == SEQ_START_TOKEN)) + goto done; + + if (map->ops->map_get_next_key(map, key, key)) { + map_iter(m)->done = true; + return NULL; + } + +done: + ++(*pos); + return key; +} + +static void *map_seq_start(struct seq_file *m, loff_t *pos) +{ + if (map_iter(m)->done) + return NULL; + + return *pos ? map_iter(m)->key : SEQ_START_TOKEN; +} + +static void map_seq_stop(struct seq_file *m, void *v) +{ +} + +static int map_seq_show(struct seq_file *m, void *v) +{ + struct bpf_map *map = seq_file_to_map(m); + void *key = map_iter(m)->key; + + if (unlikely(v == SEQ_START_TOKEN)) { + seq_puts(m, "# WARNING!! The output is for debug purpose only\n"); + seq_puts(m, "# WARNING!! The output format will change\n"); + } else { + map->ops->map_seq_show_elem(map, key, m); + } + + return 0; +} + +static const struct seq_operations bpffs_map_seq_ops = { + .start = map_seq_start, + .next = map_seq_next, + .show = map_seq_show, + .stop = map_seq_stop, +}; + +static int bpffs_map_open(struct inode *inode, struct file *file) +{ + struct bpf_map *map = inode->i_private; + struct map_iter *iter; + struct seq_file *m; + int err; + + iter = map_iter_alloc(map); + if (!iter) + return -ENOMEM; + + err = seq_open(file, &bpffs_map_seq_ops); + if (err) { + map_iter_free(iter); + return err; + } + + m = file->private_data; + m->private = iter; + + return 0; +} + +static int bpffs_map_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + + map_iter_free(map_iter(m)); + + return seq_release(inode, file); +} + +/* bpffs_map_fops should only implement the basic + * read operation for a BPF map. The purpose is to + * provide a simple user intuitive way to do + * "cat bpffs/pathto/a-pinned-map". + * + * Other operations (e.g. write, lookup...) should be realized by + * the userspace tools (e.g. bpftool) through the + * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update + * interface. + */ +static const struct file_operations bpffs_map_fops = { + .open = bpffs_map_open, + .read = seq_read, + .release = bpffs_map_release, +}; + static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, - const struct inode_operations *iops) + const struct inode_operations *iops, + const struct file_operations *fops) { struct inode *dir = dentry->d_parent->d_inode; struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); @@ -159,6 +305,7 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, return PTR_ERR(inode); inode->i_op = iops; + inode->i_fop = fops; inode->i_private = raw; bpf_dentry_finalize(dentry, inode, dir); @@ -167,12 +314,15 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) { - return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops); + return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, NULL); } static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) { - return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops); + struct bpf_map *map = arg; + + return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, + map->btf ? &bpffs_map_fops : NULL); } static struct dentry * @@ -279,13 +429,6 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname) ret = bpf_obj_do_pin(pname, raw, type); if (ret != 0) bpf_any_put(raw, type); - if ((trace_bpf_obj_pin_prog_enabled() || - trace_bpf_obj_pin_map_enabled()) && !ret) { - if (type == BPF_TYPE_PROG) - trace_bpf_obj_pin_prog(raw, ufd, pname); - if (type == BPF_TYPE_MAP) - trace_bpf_obj_pin_map(raw, ufd, pname); - } out: putname(pname); return ret; @@ -352,15 +495,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags) else goto out; - if (ret < 0) { + if (ret < 0) bpf_any_put(raw, type); - } else if (trace_bpf_obj_get_prog_enabled() || - trace_bpf_obj_get_map_enabled()) { - if (type == BPF_TYPE_PROG) - trace_bpf_obj_get_prog(raw, ret, pname); - if (type == BPF_TYPE_MAP) - trace_bpf_obj_get_map(raw, ret, pname); - } out: putname(pname); return ret; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index b4b5b81e7251..1603492c9cc7 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -623,8 +623,9 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) if (!key || key->prefixlen > trie->max_prefixlen) goto find_leftmost; - node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), - GFP_ATOMIC | __GFP_NOWARN); + node_stack = kmalloc_array(trie->max_prefixlen, + sizeof(struct lpm_trie_node *), + GFP_ATOMIC | __GFP_NOWARN); if (!node_stack) return -ENOMEM; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index c9401075b58c..ac747d5cf7c6 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -474,8 +474,10 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) struct bpf_prog_offload *offload; bool ret; - if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map)) + if (!bpf_prog_is_dev_bound(prog->aux)) return false; + if (!bpf_map_is_dev_bound(map)) + return bpf_map_offload_neutral(map); down_read(&bpf_devs_lock); offload = prog->aux->offload; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 8dd9210d7db7..52a91d816c0e 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -43,18 +43,45 @@ #include <net/tcp.h> #include <linux/ptr_ring.h> #include <net/inet_common.h> +#include <linux/sched/signal.h> #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) -struct bpf_stab { - struct bpf_map map; - struct sock **sock_map; +struct bpf_sock_progs { struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; }; +struct bpf_stab { + struct bpf_map map; + struct sock **sock_map; + struct bpf_sock_progs progs; +}; + +struct bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; + +struct bpf_htab { + struct bpf_map map; + struct bucket *buckets; + atomic_t count; + u32 n_buckets; + u32 elem_size; + struct bpf_sock_progs progs; +}; + +struct htab_elem { + struct rcu_head rcu; + struct hlist_node hash_node; + u32 hash; + struct sock *sk; + char key[0]; +}; + enum smap_psock_state { SMAP_TX_RUNNING, }; @@ -62,6 +89,8 @@ enum smap_psock_state { struct smap_psock_map_entry { struct list_head list; struct sock **entry; + struct htab_elem *hash_link; + struct bpf_htab *htab; }; struct smap_psock { @@ -190,6 +219,12 @@ out: rcu_read_unlock(); } +static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +{ + atomic_dec(&htab->count); + kfree_rcu(l, rcu); +} + static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); @@ -226,10 +261,16 @@ static void bpf_tcp_close(struct sock *sk, long timeout) } list_for_each_entry_safe(e, tmp, &psock->maps, list) { - osk = cmpxchg(e->entry, sk, NULL); - if (osk == sk) { - list_del(&e->list); - smap_release_sock(psock, sk); + if (e->entry) { + osk = cmpxchg(e->entry, sk, NULL); + if (osk == sk) { + list_del(&e->list); + smap_release_sock(psock, sk); + } + } else { + hlist_del_rcu(&e->hash_link->hash_node); + smap_release_sock(psock, e->hash_link->sk); + free_htab_elem(e->htab, e->hash_link); } } write_unlock_bh(&sk->sk_callback_lock); @@ -325,6 +366,9 @@ retry: if (ret > 0) { if (apply) apply_bytes -= ret; + + sg->offset += ret; + sg->length -= ret; size -= ret; offset += ret; if (uncharge) @@ -332,8 +376,6 @@ retry: goto retry; } - sg->length = size; - sg->offset = offset; return ret; } @@ -391,7 +433,8 @@ static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) } while (i != md->sg_end); } -static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +static void free_bytes_sg(struct sock *sk, int bytes, + struct sk_msg_buff *md, bool charge) { struct scatterlist *sg = md->sg_data; int i = md->sg_start, free; @@ -401,11 +444,13 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) if (bytes < free) { sg[i].length -= bytes; sg[i].offset += bytes; - sk_mem_uncharge(sk, bytes); + if (charge) + sk_mem_uncharge(sk, bytes); break; } - sk_mem_uncharge(sk, sg[i].length); + if (charge) + sk_mem_uncharge(sk, sg[i].length); put_page(sg_page(&sg[i])); bytes -= sg[i].length; sg[i].length = 0; @@ -416,6 +461,7 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) if (i == MAX_SKB_FRAGS) i = 0; } + md->sg_start = i; } static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) @@ -455,7 +501,7 @@ static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) { return ((_rc == SK_PASS) ? - (md->map ? __SK_REDIRECT : __SK_PASS) : + (md->sk_redir ? __SK_REDIRECT : __SK_PASS) : __SK_DROP); } @@ -477,6 +523,7 @@ static unsigned int smap_do_tx_msg(struct sock *sk, } bpf_compute_data_pointers_sg(md); + md->sk = sk; rc = (*prog->bpf_func)(md, prog->insnsi); psock->apply_bytes = md->apply_bytes; @@ -523,8 +570,6 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, i = md->sg_start; do { - r->sg_data[i] = md->sg_data[i]; - size = (apply && apply_bytes < md->sg_data[i].length) ? apply_bytes : md->sg_data[i].length; @@ -535,6 +580,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, } sk_mem_charge(sk, size); + r->sg_data[i] = md->sg_data[i]; r->sg_data[i].length = size; md->sg_data[i].length -= size; md->sg_data[i].offset += size; @@ -575,10 +621,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, struct sk_msg_buff *md, int flags) { + bool ingress = !!(md->flags & BPF_F_INGRESS); struct smap_psock *psock; struct scatterlist *sg; - int i, err, free = 0; - bool ingress = !!(md->flags & BPF_F_INGRESS); + int err = 0; sg = md->sg_data; @@ -606,16 +652,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, out_rcu: rcu_read_unlock(); out: - i = md->sg_start; - while (sg[i].length) { - free += sg[i].length; - put_page(sg_page(&sg[i])); - sg[i].length = 0; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - } - return free; + free_bytes_sg(NULL, send, md, false); + return err; } static inline void bpf_md_init(struct smap_psock *psock) @@ -700,19 +738,26 @@ more_data: err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); lock_sock(sk); + if (unlikely(err < 0)) { + free_start_sg(sk, m); + psock->sg_size = 0; + if (!cork) + *copied -= send; + } else { + psock->sg_size -= send; + } + if (cork) { free_start_sg(sk, m); + psock->sg_size = 0; kfree(m); m = NULL; + err = 0; } - if (unlikely(err)) - *copied -= err; - else - psock->sg_size -= send; break; case __SK_DROP: default: - free_bytes_sg(sk, send, m); + free_bytes_sg(sk, send, m, true); apply_bytes_dec(psock, send); *copied -= send; psock->sg_size -= send; @@ -732,6 +777,26 @@ out_err: return err; } +static int bpf_wait_data(struct sock *sk, + struct smap_psock *psk, int flags, + long timeo, int *err) +{ + int rc; + + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + rc = sk_wait_event(sk, &timeo, + !list_empty(&psk->ingress) || + !skb_queue_empty(&sk->sk_receive_queue), + &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + + return rc; +} + static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -755,6 +820,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); lock_sock(sk); +bytes_ready: while (copied != len) { struct scatterlist *sg; struct sk_msg_buff *md; @@ -809,6 +875,28 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } } + if (!copied) { + long timeo; + int data; + int err = 0; + + timeo = sock_rcvtimeo(sk, nonblock); + data = bpf_wait_data(sk, psock, flags, timeo, &err); + + if (data) { + if (!skb_queue_empty(&sk->sk_receive_queue)) { + release_sock(sk); + smap_release_sock(psock, sk); + copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + return copied; + } + goto bytes_ready; + } + + if (err) + copied = err; + } + release_sock(sk); smap_release_sock(psock, sk); return copied; @@ -1045,7 +1133,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) * when we orphan the skb so that we don't have the possibility * to reference a stale map. */ - TCP_SKB_CB(skb)->bpf.map = NULL; + TCP_SKB_CB(skb)->bpf.sk_redir = NULL; skb->sk = psock->sock; bpf_compute_data_pointers(skb); preempt_disable(); @@ -1055,7 +1143,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) /* Moving return codes from UAPI namespace into internal namespace */ return rc == SK_PASS ? - (TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) : + (TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) : __SK_DROP; } @@ -1325,7 +1413,6 @@ static int smap_init_sock(struct smap_psock *psock, } static void smap_init_progs(struct smap_psock *psock, - struct bpf_stab *stab, struct bpf_prog *verdict, struct bpf_prog *parse) { @@ -1403,14 +1490,13 @@ static void smap_gc_work(struct work_struct *w) kfree(psock); } -static struct smap_psock *smap_init_psock(struct sock *sock, - struct bpf_stab *stab) +static struct smap_psock *smap_init_psock(struct sock *sock, int node) { struct smap_psock *psock; psock = kzalloc_node(sizeof(struct smap_psock), GFP_ATOMIC | __GFP_NOWARN, - stab->map.numa_node); + node); if (!psock) return ERR_PTR(-ENOMEM); @@ -1442,9 +1528,6 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); - if (attr->value_size > KMALLOC_MAX_SIZE) - return ERR_PTR(-E2BIG); - err = bpf_tcp_ulp_register(); if (err && err != -EEXIST) return ERR_PTR(err); @@ -1481,12 +1564,14 @@ free_stab: return ERR_PTR(err); } -static void smap_list_remove(struct smap_psock *psock, struct sock **entry) +static void smap_list_remove(struct smap_psock *psock, + struct sock **entry, + struct htab_elem *hash_link) { struct smap_psock_map_entry *e, *tmp; list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry) { + if (e->entry == entry || e->hash_link == hash_link) { list_del(&e->list); break; } @@ -1524,7 +1609,7 @@ static void sock_map_free(struct bpf_map *map) * to be null and queued for garbage collection. */ if (likely(psock)) { - smap_list_remove(psock, &stab->sock_map[i]); + smap_list_remove(psock, &stab->sock_map[i], NULL); smap_release_sock(psock, sock); } write_unlock_bh(&sock->sk_callback_lock); @@ -1583,7 +1668,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (psock->bpf_parse) smap_stop_sock(psock, sock); - smap_list_remove(psock, &stab->sock_map[k]); + smap_list_remove(psock, &stab->sock_map[k], NULL); smap_release_sock(psock, sock); out: write_unlock_bh(&sock->sk_callback_lock); @@ -1618,40 +1703,26 @@ out: * - sock_map must use READ_ONCE and (cmp)xchg operations * - BPF verdict/parse programs must use READ_ONCE and xchg operations */ -static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, - struct bpf_map *map, - void *key, u64 flags) + +static int __sock_map_ctx_update_elem(struct bpf_map *map, + struct bpf_sock_progs *progs, + struct sock *sock, + struct sock **map_link, + void *key) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct smap_psock_map_entry *e = NULL; struct bpf_prog *verdict, *parse, *tx_msg; - struct sock *osock, *sock; + struct smap_psock_map_entry *e = NULL; struct smap_psock *psock; - u32 i = *(u32 *)key; bool new = false; - int err; - - if (unlikely(flags > BPF_EXIST)) - return -EINVAL; - - if (unlikely(i >= stab->map.max_entries)) - return -E2BIG; - - sock = READ_ONCE(stab->sock_map[i]); - if (flags == BPF_EXIST && !sock) - return -ENOENT; - else if (flags == BPF_NOEXIST && sock) - return -EEXIST; - - sock = skops->sk; + int err = 0; /* 1. If sock map has BPF programs those will be inherited by the * sock being added. If the sock is already attached to BPF programs * this results in an error. */ - verdict = READ_ONCE(stab->bpf_verdict); - parse = READ_ONCE(stab->bpf_parse); - tx_msg = READ_ONCE(stab->bpf_tx_msg); + verdict = READ_ONCE(progs->bpf_verdict); + parse = READ_ONCE(progs->bpf_parse); + tx_msg = READ_ONCE(progs->bpf_tx_msg); if (parse && verdict) { /* bpf prog refcnt may be zero if a concurrent attach operation @@ -1659,11 +1730,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, * we increment the refcnt. If this is the case abort with an * error. */ - verdict = bpf_prog_inc_not_zero(stab->bpf_verdict); + verdict = bpf_prog_inc_not_zero(verdict); if (IS_ERR(verdict)) return PTR_ERR(verdict); - parse = bpf_prog_inc_not_zero(stab->bpf_parse); + parse = bpf_prog_inc_not_zero(parse); if (IS_ERR(parse)) { bpf_prog_put(verdict); return PTR_ERR(parse); @@ -1671,12 +1742,12 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, } if (tx_msg) { - tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg); + tx_msg = bpf_prog_inc_not_zero(tx_msg); if (IS_ERR(tx_msg)) { - if (verdict) - bpf_prog_put(verdict); - if (parse) + if (parse && verdict) { bpf_prog_put(parse); + bpf_prog_put(verdict); + } return PTR_ERR(tx_msg); } } @@ -1704,7 +1775,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto out_progs; } } else { - psock = smap_init_psock(sock, stab); + psock = smap_init_psock(sock, map->numa_node); if (IS_ERR(psock)) { err = PTR_ERR(psock); goto out_progs; @@ -1714,12 +1785,13 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, new = true; } - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) { - err = -ENOMEM; - goto out_progs; + if (map_link) { + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) { + err = -ENOMEM; + goto out_progs; + } } - e->entry = &stab->sock_map[i]; /* 3. At this point we have a reference to a valid psock that is * running. Attach any BPF programs needed. @@ -1736,7 +1808,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, err = smap_init_sock(psock, sock); if (err) goto out_free; - smap_init_progs(psock, stab, verdict, parse); + smap_init_progs(psock, verdict, parse); smap_start_sock(psock, sock); } @@ -1745,50 +1817,93 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, * it with. Because we can only have a single set of programs if * old_sock has a strp we can stop it. */ - list_add_tail(&e->list, &psock->maps); + if (map_link) { + e->entry = map_link; + list_add_tail(&e->list, &psock->maps); + } + write_unlock_bh(&sock->sk_callback_lock); + return err; +out_free: + smap_release_sock(psock, sock); +out_progs: + if (parse && verdict) { + bpf_prog_put(parse); + bpf_prog_put(verdict); + } + if (tx_msg) + bpf_prog_put(tx_msg); write_unlock_bh(&sock->sk_callback_lock); + kfree(e); + return err; +} + +static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, + struct bpf_map *map, + void *key, u64 flags) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_sock_progs *progs = &stab->progs; + struct sock *osock, *sock; + u32 i = *(u32 *)key; + int err; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + if (unlikely(i >= stab->map.max_entries)) + return -E2BIG; + + sock = READ_ONCE(stab->sock_map[i]); + if (flags == BPF_EXIST && !sock) + return -ENOENT; + else if (flags == BPF_NOEXIST && sock) + return -EEXIST; + + sock = skops->sk; + err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i], + key); + if (err) + goto out; osock = xchg(&stab->sock_map[i], sock); if (osock) { struct smap_psock *opsock = smap_psock_sk(osock); write_lock_bh(&osock->sk_callback_lock); - smap_list_remove(opsock, &stab->sock_map[i]); + smap_list_remove(opsock, &stab->sock_map[i], NULL); smap_release_sock(opsock, osock); write_unlock_bh(&osock->sk_callback_lock); } - return 0; -out_free: - smap_release_sock(psock, sock); -out_progs: - if (verdict) - bpf_prog_put(verdict); - if (parse) - bpf_prog_put(parse); - if (tx_msg) - bpf_prog_put(tx_msg); - write_unlock_bh(&sock->sk_callback_lock); - kfree(e); +out: return err; } int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_sock_progs *progs; struct bpf_prog *orig; - if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP)) + if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + progs = &stab->progs; + } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + progs = &htab->progs; + } else { return -EINVAL; + } switch (type) { case BPF_SK_MSG_VERDICT: - orig = xchg(&stab->bpf_tx_msg, prog); + orig = xchg(&progs->bpf_tx_msg, prog); break; case BPF_SK_SKB_STREAM_PARSER: - orig = xchg(&stab->bpf_parse, prog); + orig = xchg(&progs->bpf_parse, prog); break; case BPF_SK_SKB_STREAM_VERDICT: - orig = xchg(&stab->bpf_verdict, prog); + orig = xchg(&progs->bpf_verdict, prog); break; default: return -EOPNOTSUPP; @@ -1834,23 +1949,423 @@ static int sock_map_update_elem(struct bpf_map *map, return err; } -static void sock_map_release(struct bpf_map *map, struct file *map_file) +static void sock_map_release(struct bpf_map *map) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_sock_progs *progs; struct bpf_prog *orig; - orig = xchg(&stab->bpf_parse, NULL); + if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + progs = &stab->progs; + } else { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + progs = &htab->progs; + } + + orig = xchg(&progs->bpf_parse, NULL); if (orig) bpf_prog_put(orig); - orig = xchg(&stab->bpf_verdict, NULL); + orig = xchg(&progs->bpf_verdict, NULL); if (orig) bpf_prog_put(orig); - orig = xchg(&stab->bpf_tx_msg, NULL); + orig = xchg(&progs->bpf_tx_msg, NULL); if (orig) bpf_prog_put(orig); } +static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) +{ + struct bpf_htab *htab; + int i, err; + u64 cost; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->value_size != 4 || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + if (attr->key_size > MAX_BPF_STACK) + /* eBPF programs initialize keys on stack, so they cannot be + * larger than max stack size + */ + return ERR_PTR(-E2BIG); + + err = bpf_tcp_ulp_register(); + if (err && err != -EEXIST) + return ERR_PTR(err); + + htab = kzalloc(sizeof(*htab), GFP_USER); + if (!htab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&htab->map, attr); + + htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); + htab->elem_size = sizeof(struct htab_elem) + + round_up(htab->map.key_size, 8); + err = -EINVAL; + if (htab->n_buckets == 0 || + htab->n_buckets > U32_MAX / sizeof(struct bucket)) + goto free_htab; + + cost = (u64) htab->n_buckets * sizeof(struct bucket) + + (u64) htab->elem_size * htab->map.max_entries; + + if (cost >= U32_MAX - PAGE_SIZE) + goto free_htab; + + htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + err = bpf_map_precharge_memlock(htab->map.pages); + if (err) + goto free_htab; + + err = -ENOMEM; + htab->buckets = bpf_map_area_alloc( + htab->n_buckets * sizeof(struct bucket), + htab->map.numa_node); + if (!htab->buckets) + goto free_htab; + + for (i = 0; i < htab->n_buckets; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } + + return &htab->map; +free_htab: + kfree(htab); + return ERR_PTR(err); +} + +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &htab->buckets[hash & (htab->n_buckets - 1)]; +} + +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + +static void sock_hash_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + int i; + + synchronize_rcu(); + + /* At this point no update, lookup or delete operations can happen. + * However, be aware we can still get a socket state event updates, + * and data ready callabacks that reference the psock from sk_user_data + * Also psock worker threads are still in-flight. So smap_release_sock + * will only free the psock after cancel_sync on the worker threads + * and a grace period expire to ensure psock is really safe to remove. + */ + rcu_read_lock(); + for (i = 0; i < htab->n_buckets; i++) { + struct hlist_head *head = select_bucket(htab, i); + struct hlist_node *n; + struct htab_elem *l; + + hlist_for_each_entry_safe(l, n, head, hash_node) { + struct sock *sock = l->sk; + struct smap_psock *psock; + + hlist_del_rcu(&l->hash_node); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + /* This check handles a racing sock event that can get + * the sk_callback_lock before this case but after xchg + * causing the refcnt to hit zero and sock user data + * (psock) to be null and queued for garbage collection. + */ + if (likely(psock)) { + smap_list_remove(psock, NULL, l); + smap_release_sock(psock, sock); + } + write_unlock_bh(&sock->sk_callback_lock); + kfree(l); + } + } + rcu_read_unlock(); + bpf_map_area_free(htab->buckets); + kfree(htab); +} + +static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, + void *key, u32 key_size, u32 hash, + struct sock *sk, + struct htab_elem *old_elem) +{ + struct htab_elem *l_new; + + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + if (!old_elem) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + } + l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); + if (!l_new) + return ERR_PTR(-ENOMEM); + + memcpy(l_new->key, key, key_size); + l_new->sk = sk; + l_new->hash = hash; + return l_new; +} + +static struct htab_elem *lookup_elem_raw(struct hlist_head *head, + u32 hash, void *key, u32 key_size) +{ + struct htab_elem *l; + + hlist_for_each_entry_rcu(l, head, hash_node) { + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + } + + return NULL; +} + +static inline u32 htab_map_hash(const void *key, u32 key_len) +{ + return jhash(key, key_len, 0); +} + +static int sock_hash_get_next_key(struct bpf_map *map, + void *key, void *next_key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l, *next_l; + struct hlist_head *h; + u32 hash, key_size; + int i = 0; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + if (!key) + goto find_first_elem; + hash = htab_map_hash(key, key_size); + h = select_bucket(htab, hash); + + l = lookup_elem_raw(h, hash, key, key_size); + if (!l) + goto find_first_elem; + next_l = hlist_entry_safe( + rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), + struct htab_elem, hash_node); + if (next_l) { + memcpy(next_key, next_l->key, key_size); + return 0; + } + + /* no more elements in this hash list, go to the next bucket */ + i = hash & (htab->n_buckets - 1); + i++; + +find_first_elem: + /* iterate over buckets */ + for (; i < htab->n_buckets; i++) { + h = select_bucket(htab, i); + + /* pick first element in the bucket */ + next_l = hlist_entry_safe( + rcu_dereference_raw(hlist_first_rcu(h)), + struct htab_elem, hash_node); + if (next_l) { + /* if it's not empty, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + } + + /* iterated over all buckets and all elements */ + return -ENOENT; +} + +static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, + struct bpf_map *map, + void *key, u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_sock_progs *progs = &htab->progs; + struct htab_elem *l_new = NULL, *l_old; + struct smap_psock_map_entry *e = NULL; + struct hlist_head *head; + struct smap_psock *psock; + u32 key_size, hash; + struct sock *sock; + struct bucket *b; + int err; + + sock = skops->sk; + + if (sock->sk_type != SOCK_STREAM || + sock->sk_protocol != IPPROTO_TCP) + return -EOPNOTSUPP; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) + return -ENOMEM; + + WARN_ON_ONCE(!rcu_read_lock_held()); + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key); + if (err) + goto err; + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_bh(&b->lock); + l_old = lookup_elem_raw(head, hash, key, key_size); + if (l_old && map_flags == BPF_NOEXIST) { + err = -EEXIST; + goto bucket_err; + } + if (!l_old && map_flags == BPF_EXIST) { + err = -ENOENT; + goto bucket_err; + } + + l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old); + if (IS_ERR(l_new)) { + err = PTR_ERR(l_new); + goto bucket_err; + } + + psock = smap_psock_sk(sock); + if (unlikely(!psock)) { + err = -EINVAL; + goto bucket_err; + } + + e->hash_link = l_new; + e->htab = container_of(map, struct bpf_htab, map); + list_add_tail(&e->list, &psock->maps); + + /* add new element to the head of the list, so that + * concurrent search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + psock = smap_psock_sk(l_old->sk); + + hlist_del_rcu(&l_old->hash_node); + smap_list_remove(psock, NULL, l_old); + smap_release_sock(psock, l_old->sk); + free_htab_elem(htab, l_old); + } + raw_spin_unlock_bh(&b->lock); + return 0; +bucket_err: + raw_spin_unlock_bh(&b->lock); +err: + kfree(e); + psock = smap_psock_sk(sock); + if (psock) + smap_release_sock(psock, sock); + return err; +} + +static int sock_hash_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags) +{ + struct bpf_sock_ops_kern skops; + u32 fd = *(u32 *)value; + struct socket *socket; + int err; + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + skops.sk = socket->sk; + if (!skops.sk) { + fput(socket->file); + return -EINVAL; + } + + err = sock_hash_ctx_update_elem(&skops, map, key, flags); + fput(socket->file); + return err; +} + +static int sock_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct bucket *b; + struct htab_elem *l; + u32 hash, key_size; + int ret = -ENOENT; + + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_bh(&b->lock); + l = lookup_elem_raw(head, hash, key, key_size); + if (l) { + struct sock *sock = l->sk; + struct smap_psock *psock; + + hlist_del_rcu(&l->hash_node); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + /* This check handles a racing sock event that can get the + * sk_callback_lock before this case but after xchg happens + * causing the refcnt to hit zero and sock user data (psock) + * to be null and queued for garbage collection. + */ + if (likely(psock)) { + smap_list_remove(psock, NULL, l); + smap_release_sock(psock, sock); + } + write_unlock_bh(&sock->sk_callback_lock); + free_htab_elem(htab, l); + ret = 0; + } + raw_spin_unlock_bh(&b->lock); + return ret; +} + +struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + u32 key_size, hash; + struct bucket *b; + struct sock *sk; + + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_bh(&b->lock); + l = lookup_elem_raw(head, hash, key, key_size); + sk = l ? l->sk : NULL; + raw_spin_unlock_bh(&b->lock); + return sk; +} + const struct bpf_map_ops sock_map_ops = { .map_alloc = sock_map_alloc, .map_free = sock_map_free, @@ -1858,7 +2373,16 @@ const struct bpf_map_ops sock_map_ops = { .map_get_next_key = sock_map_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, - .map_release = sock_map_release, + .map_release_uref = sock_map_release, +}; + +const struct bpf_map_ops sock_hash_ops = { + .map_alloc = sock_hash_alloc, + .map_free = sock_hash_free, + .map_lookup_elem = sock_map_lookup, + .map_get_next_key = sock_hash_get_next_key, + .map_update_elem = sock_hash_update_elem, + .map_delete_elem = sock_hash_delete_elem, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, @@ -1878,3 +2402,21 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; + +BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); +} + +const struct bpf_func_proto bpf_sock_hash_update_proto = { + .func = bpf_sock_hash_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 57eeb1234b67..b675a3f3d141 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -11,6 +11,7 @@ #include <linux/perf_event.h> #include <linux/elf.h> #include <linux/pagemap.h> +#include <linux/irq_work.h> #include "percpu_freelist.h" #define STACK_CREATE_FLAG_MASK \ @@ -32,6 +33,23 @@ struct bpf_stack_map { struct stack_map_bucket *buckets[]; }; +/* irq_work to run up_read() for build_id lookup in nmi context */ +struct stack_map_irq_work { + struct irq_work irq_work; + struct rw_semaphore *sem; +}; + +static void do_up_read(struct irq_work *entry) +{ + struct stack_map_irq_work *work; + + work = container_of(entry, struct stack_map_irq_work, irq_work); + up_read(work->sem); + work->sem = NULL; +} + +static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work); + static inline bool stack_map_use_build_id(struct bpf_map *map) { return (map->map_flags & BPF_F_STACK_BUILD_ID); @@ -262,27 +280,31 @@ out: return ret; } -static void stack_map_get_build_id_offset(struct bpf_map *map, - struct stack_map_bucket *bucket, +static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u64 *ips, u32 trace_nr, bool user) { int i; struct vm_area_struct *vma; - struct bpf_stack_build_id *id_offs; - - bucket->nr = trace_nr; - id_offs = (struct bpf_stack_build_id *)bucket->data; + bool irq_work_busy = false; + struct stack_map_irq_work *work = NULL; + + if (in_nmi()) { + work = this_cpu_ptr(&up_read_work); + if (work->irq_work.flags & IRQ_WORK_BUSY) + /* cannot queue more up_read, fallback */ + irq_work_busy = true; + } /* - * We cannot do up_read() in nmi context, so build_id lookup is - * only supported for non-nmi events. If at some point, it is - * possible to run find_vma() without taking the semaphore, we - * would like to allow build_id lookup in nmi context. + * We cannot do up_read() in nmi context. To do build_id lookup + * in nmi context, we need to run up_read() in irq_work. We use + * a percpu variable to do the irq_work. If the irq_work is + * already used by another lookup, we fall back to report ips. * * Same fallback is used for kernel stack (!user) on a stackmap * with build_id. */ - if (!user || !current || !current->mm || in_nmi() || + if (!user || !current || !current->mm || irq_work_busy || down_read_trylock(¤t->mm->mmap_sem) == 0) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { @@ -304,7 +326,13 @@ static void stack_map_get_build_id_offset(struct bpf_map *map, - vma->vm_start; id_offs[i].status = BPF_STACK_BUILD_ID_VALID; } - up_read(¤t->mm->mmap_sem); + + if (!work) { + up_read(¤t->mm->mmap_sem); + } else { + work->sem = ¤t->mm->mmap_sem; + irq_work_queue(&work->irq_work); + } } BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, @@ -361,8 +389,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, pcpu_freelist_pop(&smap->freelist); if (unlikely(!new_bucket)) return -ENOMEM; - stack_map_get_build_id_offset(map, new_bucket, ips, - trace_nr, user); + new_bucket->nr = trace_nr; + stack_map_get_build_id_offset( + (struct bpf_stack_build_id *)new_bucket->data, + ips, trace_nr, user); trace_len = trace_nr * sizeof(struct bpf_stack_build_id); if (hash_matches && bucket->nr == trace_nr && memcmp(bucket->data, new_bucket->data, trace_len) == 0) { @@ -405,6 +435,73 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, + u64, flags) +{ + u32 init_nr, trace_nr, copy_len, elem_size, num_elem; + bool user_build_id = flags & BPF_F_USER_BUILD_ID; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + bool user = flags & BPF_F_USER_STACK; + struct perf_callchain_entry *trace; + bool kernel = !user; + int err = -EINVAL; + u64 *ips; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_USER_BUILD_ID))) + goto clear; + if (kernel && user_build_id) + goto clear; + + elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id) + : sizeof(u64); + if (unlikely(size % elem_size)) + goto clear; + + num_elem = size / elem_size; + if (sysctl_perf_event_max_stack < num_elem) + init_nr = 0; + else + init_nr = sysctl_perf_event_max_stack - num_elem; + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, false, false); + if (unlikely(!trace)) + goto err_fault; + + trace_nr = trace->nr - init_nr; + if (trace_nr < skip) + goto err_fault; + + trace_nr -= skip; + trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; + copy_len = trace_nr * elem_size; + ips = trace->ip + skip + init_nr; + if (user && user_build_id) + stack_map_get_build_id_offset(buf, ips, trace_nr, user); + else + memcpy(buf, ips, copy_len); + + if (size > copy_len) + memset(buf + copy_len, 0, size - copy_len); + return copy_len; + +err_fault: + err = -EFAULT; +clear: + memset(buf, 0, size); + return err; +} + +const struct bpf_func_proto bpf_get_stack_proto = { + .func = bpf_get_stack, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { @@ -511,3 +608,16 @@ const struct bpf_map_ops stack_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, }; + +static int __init stack_map_init(void) +{ + int cpu; + struct stack_map_irq_work *work; + + for_each_possible_cpu(cpu) { + work = per_cpu_ptr(&up_read_work, cpu); + init_irq_work(&work->irq_work, do_up_read); + } + return 0; +} +subsys_initcall(stack_map_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4ca46df19c9a..0fa20624707f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -11,13 +11,17 @@ */ #include <linux/bpf.h> #include <linux/bpf_trace.h> +#include <linux/bpf_lirc.h> +#include <linux/btf.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/vmalloc.h> #include <linux/mmzone.h> #include <linux/anon_inodes.h> +#include <linux/fdtable.h> #include <linux/file.h> +#include <linux/fs.h> #include <linux/license.h> #include <linux/filter.h> #include <linux/version.h> @@ -26,6 +30,8 @@ #include <linux/cred.h> #include <linux/timekeeping.h> #include <linux/ctype.h> +#include <linux/btf.h> +#include <linux/nospec.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -62,9 +68,9 @@ static const struct bpf_map_ops * const bpf_map_types[] = { * copy_from_user() call. However, this is not a concern since this function is * meant to be a future-proofing of bits. */ -static int check_uarg_tail_zero(void __user *uaddr, - size_t expected_size, - size_t actual_size) +int bpf_check_uarg_tail_zero(void __user *uaddr, + size_t expected_size, + size_t actual_size) { unsigned char __user *addr; unsigned char __user *end; @@ -102,12 +108,14 @@ const struct bpf_map_ops bpf_map_offload_ops = { static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { const struct bpf_map_ops *ops; + u32 type = attr->map_type; struct bpf_map *map; int err; - if (attr->map_type >= ARRAY_SIZE(bpf_map_types)) + if (type >= ARRAY_SIZE(bpf_map_types)) return ERR_PTR(-EINVAL); - ops = bpf_map_types[attr->map_type]; + type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types)); + ops = bpf_map_types[type]; if (!ops) return ERR_PTR(-EINVAL); @@ -122,7 +130,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) if (IS_ERR(map)) return map; map->ops = ops; - map->map_type = attr->map_type; + map->map_type = type; return map; } @@ -257,8 +265,8 @@ static void bpf_map_free_deferred(struct work_struct *work) static void bpf_map_put_uref(struct bpf_map *map) { if (atomic_dec_and_test(&map->usercnt)) { - if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) - bpf_fd_array_map_clear(map); + if (map->ops->map_release_uref) + map->ops->map_release_uref(map); } } @@ -270,6 +278,7 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) if (atomic_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ bpf_map_free_id(map, do_idr_lock); + btf_put(map->btf); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); } @@ -279,6 +288,7 @@ void bpf_map_put(struct bpf_map *map) { __bpf_map_put(map, true); } +EXPORT_SYMBOL_GPL(bpf_map_put); void bpf_map_put_with_uref(struct bpf_map *map) { @@ -317,13 +327,15 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "value_size:\t%u\n" "max_entries:\t%u\n" "map_flags:\t%#x\n" - "memlock:\t%llu\n", + "memlock:\t%llu\n" + "map_id:\t%u\n", map->map_type, map->key_size, map->value_size, map->max_entries, map->map_flags, - map->pages * 1ULL << PAGE_SHIFT); + map->pages * 1ULL << PAGE_SHIFT, + map->id); if (owner_prog_type) { seq_printf(m, "owner_prog_type:\t%u\n", @@ -415,7 +427,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD map_ifindex +#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -449,6 +461,33 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); + if (bpf_map_support_seq_show(map) && + (attr->btf_key_type_id || attr->btf_value_type_id)) { + struct btf *btf; + + if (!attr->btf_key_type_id || !attr->btf_value_type_id) { + err = -EINVAL; + goto free_map_nouncharge; + } + + btf = btf_get_by_fd(attr->btf_fd); + if (IS_ERR(btf)) { + err = PTR_ERR(btf); + goto free_map_nouncharge; + } + + err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); + if (err) { + btf_put(btf); + goto free_map_nouncharge; + } + + map->btf = btf; + map->btf_key_type_id = attr->btf_key_type_id; + map->btf_value_type_id = attr->btf_value_type_id; + } + err = security_bpf_map_alloc(map); if (err) goto free_map_nouncharge; @@ -473,7 +512,6 @@ static int map_create(union bpf_attr *attr) return err; } - trace_bpf_map_create(map, err); return err; free_map: @@ -481,6 +519,7 @@ free_map: free_map_sec: security_bpf_map_free(map); free_map_nouncharge: + btf_put(map->btf); map->ops->map_free(map); return err; } @@ -513,6 +552,7 @@ struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref) atomic_inc(&map->usercnt); return map; } +EXPORT_SYMBOL_GPL(bpf_map_inc); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { @@ -632,7 +672,6 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; - trace_bpf_map_lookup_elem(map, ufd, key, value); err = 0; free_value: @@ -729,8 +768,6 @@ static int map_update_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); out: - if (!err) - trace_bpf_map_update_elem(map, ufd, key, value); free_value: kfree(value); free_key: @@ -783,8 +820,6 @@ static int map_delete_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); out: - if (!err) - trace_bpf_map_delete_elem(map, ufd, key); kfree(key); err_put: fdput(f); @@ -848,7 +883,6 @@ out: if (copy_to_user(unext_key, next_key, map->key_size) != 0) goto free_next_key; - trace_bpf_map_next_key(map, ufd, key, next_key); err = 0; free_next_key: @@ -871,11 +905,17 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = { static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) { - if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) + const struct bpf_prog_ops *ops; + + if (type >= ARRAY_SIZE(bpf_prog_types)) + return -EINVAL; + type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types)); + ops = bpf_prog_types[type]; + if (!ops) return -EINVAL; if (!bpf_prog_is_dev_bound(prog->aux)) - prog->aux->ops = bpf_prog_types[type]; + prog->aux->ops = ops; else prog->aux->ops = &bpf_offload_prog_ops; prog->type = type; @@ -996,7 +1036,6 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) if (atomic_dec_and_test(&prog->aux->refcnt)) { int i; - trace_bpf_prog_put_rcu(prog); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); @@ -1033,11 +1072,13 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) "prog_type:\t%u\n" "prog_jited:\t%u\n" "prog_tag:\t%s\n" - "memlock:\t%llu\n", + "memlock:\t%llu\n" + "prog_id:\t%u\n", prog->type, prog->jited, prog_tag, - prog->pages * 1ULL << PAGE_SHIFT); + prog->pages * 1ULL << PAGE_SHIFT, + prog->aux->id); } #endif @@ -1163,11 +1204,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) { - struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv); - - if (!IS_ERR(prog)) - trace_bpf_prog_get_type(prog); - return prog; + return __bpf_prog_get(ufd, &type, attach_drv); } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); @@ -1217,6 +1254,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: return 0; default: return -EINVAL; @@ -1342,7 +1381,6 @@ static int bpf_prog_load(union bpf_attr *attr) } bpf_prog_kallsyms_add(prog); - trace_bpf_prog_load(prog, err); return err; free_used_maps: @@ -1534,6 +1572,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1547,6 +1587,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); + case BPF_LIRC_MODE2: + return lirc_prog_attach(attr); default: return -EINVAL; } @@ -1604,6 +1646,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1617,6 +1661,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); + case BPF_LIRC_MODE2: + return lirc_prog_detach(attr); default: return -EINVAL; } @@ -1661,9 +1707,13 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: break; + case BPF_LIRC_MODE2: + return lirc_prog_query(attr, uattr); default: return -EINVAL; } @@ -1870,7 +1920,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, u32 ulen; int err; - err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -1883,6 +1933,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.load_time = prog->aux->load_time; info.created_by_uid = from_kuid_munged(current_user_ns(), prog->aux->user->uid); + info.gpl_compatible = prog->gpl_compatible; memcpy(info.tag, prog->tag, sizeof(prog->tag)); memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); @@ -1903,6 +1954,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; info.xlated_prog_len = 0; + info.nr_jited_ksyms = 0; goto done; } @@ -1939,18 +1991,93 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, * for offload. */ ulen = info.jited_prog_len; - info.jited_prog_len = prog->jited_len; + if (prog->aux->func_cnt) { + u32 i; + + info.jited_prog_len = 0; + for (i = 0; i < prog->aux->func_cnt; i++) + info.jited_prog_len += prog->aux->func[i]->jited_len; + } else { + info.jited_prog_len = prog->jited_len; + } + if (info.jited_prog_len && ulen) { if (bpf_dump_raw_ok()) { uinsns = u64_to_user_ptr(info.jited_prog_insns); ulen = min_t(u32, info.jited_prog_len, ulen); - if (copy_to_user(uinsns, prog->bpf_func, ulen)) - return -EFAULT; + + /* for multi-function programs, copy the JITed + * instructions for all the functions + */ + if (prog->aux->func_cnt) { + u32 len, free, i; + u8 *img; + + free = ulen; + for (i = 0; i < prog->aux->func_cnt; i++) { + len = prog->aux->func[i]->jited_len; + len = min_t(u32, len, free); + img = (u8 *) prog->aux->func[i]->bpf_func; + if (copy_to_user(uinsns, img, len)) + return -EFAULT; + uinsns += len; + free -= len; + if (!free) + break; + } + } else { + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } } else { info.jited_prog_insns = 0; } } + ulen = info.nr_jited_ksyms; + info.nr_jited_ksyms = prog->aux->func_cnt; + if (info.nr_jited_ksyms && ulen) { + if (bpf_dump_raw_ok()) { + u64 __user *user_ksyms; + ulong ksym_addr; + u32 i; + + /* copy the address of the kernel symbol + * corresponding to each function + */ + ulen = min_t(u32, info.nr_jited_ksyms, ulen); + user_ksyms = u64_to_user_ptr(info.jited_ksyms); + for (i = 0; i < ulen; i++) { + ksym_addr = (ulong) prog->aux->func[i]->bpf_func; + ksym_addr &= PAGE_MASK; + if (put_user((u64) ksym_addr, &user_ksyms[i])) + return -EFAULT; + } + } else { + info.jited_ksyms = 0; + } + } + + ulen = info.nr_jited_func_lens; + info.nr_jited_func_lens = prog->aux->func_cnt; + if (info.nr_jited_func_lens && ulen) { + if (bpf_dump_raw_ok()) { + u32 __user *user_lens; + u32 func_len, i; + + /* copy the JITed image lengths for each function */ + ulen = min_t(u32, info.nr_jited_func_lens, ulen); + user_lens = u64_to_user_ptr(info.jited_func_lens); + for (i = 0; i < ulen; i++) { + func_len = prog->aux->func[i]->jited_len; + if (put_user(func_len, &user_lens[i])) + return -EFAULT; + } + } else { + info.jited_func_lens = 0; + } + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) @@ -1968,7 +2095,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, u32 info_len = attr->info.info_len; int err; - err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -1981,6 +2108,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.map_flags = map->map_flags; memcpy(info.name, map->name, sizeof(map->name)); + if (map->btf) { + info.btf_id = btf_id(map->btf); + info.btf_key_type_id = map->btf_key_type_id; + info.btf_value_type_id = map->btf_value_type_id; + } + if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_info_fill(&info, map); if (err) @@ -1994,6 +2127,21 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, return 0; } +static int bpf_btf_get_info_by_fd(struct btf *btf, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info); + u32 info_len = attr->info.info_len; + int err; + + err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len); + if (err) + return err; + + return btf_get_info_by_fd(btf, attr, uattr); +} + #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, @@ -2016,6 +2164,8 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, else if (f.file->f_op == &bpf_map_fops) err = bpf_map_get_info_by_fd(f.file->private_data, attr, uattr); + else if (f.file->f_op == &btf_fops) + err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); else err = -EINVAL; @@ -2023,6 +2173,158 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return err; } +#define BPF_BTF_LOAD_LAST_FIELD btf_log_level + +static int bpf_btf_load(const union bpf_attr *attr) +{ + if (CHECK_ATTR(BPF_BTF_LOAD)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btf_new_fd(attr); +} + +#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id + +static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) +{ + if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btf_get_fd_by_id(attr->btf_id); +} + +static int bpf_task_fd_query_copy(const union bpf_attr *attr, + union bpf_attr __user *uattr, + u32 prog_id, u32 fd_type, + const char *buf, u64 probe_offset, + u64 probe_addr) +{ + char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf); + u32 len = buf ? strlen(buf) : 0, input_len; + int err = 0; + + if (put_user(len, &uattr->task_fd_query.buf_len)) + return -EFAULT; + input_len = attr->task_fd_query.buf_len; + if (input_len && ubuf) { + if (!len) { + /* nothing to copy, just make ubuf NULL terminated */ + char zero = '\0'; + + if (put_user(zero, ubuf)) + return -EFAULT; + } else if (input_len >= len + 1) { + /* ubuf can hold the string with NULL terminator */ + if (copy_to_user(ubuf, buf, len + 1)) + return -EFAULT; + } else { + /* ubuf cannot hold the string with NULL terminator, + * do a partial copy with NULL terminator. + */ + char zero = '\0'; + + err = -ENOSPC; + if (copy_to_user(ubuf, buf, input_len - 1)) + return -EFAULT; + if (put_user(zero, ubuf + input_len - 1)) + return -EFAULT; + } + } + + if (put_user(prog_id, &uattr->task_fd_query.prog_id) || + put_user(fd_type, &uattr->task_fd_query.fd_type) || + put_user(probe_offset, &uattr->task_fd_query.probe_offset) || + put_user(probe_addr, &uattr->task_fd_query.probe_addr)) + return -EFAULT; + + return err; +} + +#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr + +static int bpf_task_fd_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + pid_t pid = attr->task_fd_query.pid; + u32 fd = attr->task_fd_query.fd; + const struct perf_event *event; + struct files_struct *files; + struct task_struct *task; + struct file *file; + int err; + + if (CHECK_ATTR(BPF_TASK_FD_QUERY)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (attr->task_fd_query.flags != 0) + return -EINVAL; + + task = get_pid_task(find_vpid(pid), PIDTYPE_PID); + if (!task) + return -ENOENT; + + files = get_files_struct(task); + put_task_struct(task); + if (!files) + return -ENOENT; + + err = 0; + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (!file) + err = -EBADF; + else + get_file(file); + spin_unlock(&files->file_lock); + put_files_struct(files); + + if (err) + goto out; + + if (file->f_op == &bpf_raw_tp_fops) { + struct bpf_raw_tracepoint *raw_tp = file->private_data; + struct bpf_raw_event_map *btp = raw_tp->btp; + + err = bpf_task_fd_query_copy(attr, uattr, + raw_tp->prog->aux->id, + BPF_FD_TYPE_RAW_TRACEPOINT, + btp->tp->name, 0, 0); + goto put_file; + } + + event = perf_get_event(file); + if (!IS_ERR(event)) { + u64 probe_offset, probe_addr; + u32 prog_id, fd_type; + const char *buf; + + err = bpf_get_perf_event_info(event, &prog_id, &fd_type, + &buf, &probe_offset, + &probe_addr); + if (!err) + err = bpf_task_fd_query_copy(attr, uattr, prog_id, + fd_type, buf, + probe_offset, + probe_addr); + goto put_file; + } + + err = -ENOTSUPP; +put_file: + fput(file); +out: + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -2031,7 +2333,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) return -EPERM; - err = check_uarg_tail_zero(uattr, sizeof(attr), size); + err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); if (err) return err; size = min_t(u32, size, sizeof(attr)); @@ -2103,6 +2405,15 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_RAW_TRACEPOINT_OPEN: err = bpf_raw_tracepoint_open(&attr); break; + case BPF_BTF_LOAD: + err = bpf_btf_load(&attr); + break; + case BPF_BTF_GET_FD_BY_ID: + err = bpf_btf_get_fd_by_id(&attr); + break; + case BPF_TASK_FD_QUERY: + err = bpf_task_fd_query(&attr, uattr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 1f4bf68c12db..938d41211be7 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -43,6 +43,16 @@ struct tnum tnum_rshift(struct tnum a, u8 shift) return TNUM(a.value >> shift, a.mask >> shift); } +struct tnum tnum_arshift(struct tnum a, u8 min_shift) +{ + /* if a.value is negative, arithmetic shifting by minimum shift + * will have larger negative offset compared to more shifting. + * If a.value is nonnegative, arithmetic shifting by minimum shift + * will have larger positive offset compare to more shifting. + */ + return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift); +} + struct tnum tnum_add(struct tnum a, struct tnum b) { u64 sm, sv, sigma, chi, mu; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5dd1dcb902bf..9e2bf834f13a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -22,6 +22,7 @@ #include <linux/stringify.h> #include <linux/bsearch.h> #include <linux/sort.h> +#include <linux/perf_event.h> #include "disasm.h" @@ -156,7 +157,29 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_INSNS 131072 #define BPF_COMPLEXITY_LIMIT_STACK 1024 -#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) +#define BPF_MAP_PTR_UNPRIV 1UL +#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ + POISON_POINTER_DELTA)) +#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) + +static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) +{ + return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON; +} + +static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) +{ + return aux->map_state & BPF_MAP_PTR_UNPRIV; +} + +static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, + const struct bpf_map *map, bool unpriv) +{ + BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV); + unpriv |= bpf_map_ptr_unpriv(aux); + aux->map_state = (unsigned long)map | + (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL); +} struct bpf_call_arg_meta { struct bpf_map *map_ptr; @@ -164,6 +187,8 @@ struct bpf_call_arg_meta { bool pkt_access; int regno; int access_size; + s64 msize_smax_value; + u64 msize_umax_value; }; static DEFINE_MUTEX(bpf_verifier_lock); @@ -738,18 +763,19 @@ enum reg_arg_type { static int cmp_subprogs(const void *a, const void *b) { - return *(int *)a - *(int *)b; + return ((struct bpf_subprog_info *)a)->start - + ((struct bpf_subprog_info *)b)->start; } static int find_subprog(struct bpf_verifier_env *env, int off) { - u32 *p; + struct bpf_subprog_info *p; - p = bsearch(&off, env->subprog_starts, env->subprog_cnt, - sizeof(env->subprog_starts[0]), cmp_subprogs); + p = bsearch(&off, env->subprog_info, env->subprog_cnt, + sizeof(env->subprog_info[0]), cmp_subprogs); if (!p) return -ENOENT; - return p - env->subprog_starts; + return p - env->subprog_info; } @@ -769,18 +795,24 @@ static int add_subprog(struct bpf_verifier_env *env, int off) verbose(env, "too many subprograms\n"); return -E2BIG; } - env->subprog_starts[env->subprog_cnt++] = off; - sort(env->subprog_starts, env->subprog_cnt, - sizeof(env->subprog_starts[0]), cmp_subprogs, NULL); + env->subprog_info[env->subprog_cnt++].start = off; + sort(env->subprog_info, env->subprog_cnt, + sizeof(env->subprog_info[0]), cmp_subprogs, NULL); return 0; } static int check_subprogs(struct bpf_verifier_env *env) { int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; + struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; + /* Add entry function. */ + ret = add_subprog(env, 0); + if (ret < 0) + return ret; + /* determine subprog starts. The end is one before the next starts */ for (i = 0; i < insn_cnt; i++) { if (insn[i].code != (BPF_JMP | BPF_CALL)) @@ -800,16 +832,18 @@ static int check_subprogs(struct bpf_verifier_env *env) return ret; } + /* Add a fake 'exit' subprog which could simplify subprog iteration + * logic. 'subprog_cnt' should not be increased. + */ + subprog[env->subprog_cnt].start = insn_cnt; + if (env->log.level > 1) for (i = 0; i < env->subprog_cnt; i++) - verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]); + verbose(env, "func#%d @%d\n", i, subprog[i].start); /* now check that all jumps are within the same subprog */ - subprog_start = 0; - if (env->subprog_cnt == cur_subprog) - subprog_end = insn_cnt; - else - subprog_end = env->subprog_starts[cur_subprog++]; + subprog_start = subprog[cur_subprog].start; + subprog_end = subprog[cur_subprog + 1].start; for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; @@ -834,10 +868,9 @@ next: return -EINVAL; } subprog_start = subprog_end; - if (env->subprog_cnt == cur_subprog) - subprog_end = insn_cnt; - else - subprog_end = env->subprog_starts[cur_subprog++]; + cur_subprog++; + if (cur_subprog < env->subprog_cnt) + subprog_end = subprog[cur_subprog + 1].start; } } return 0; @@ -978,7 +1011,7 @@ static bool register_is_null(struct bpf_reg_state *reg) */ static int check_stack_write(struct bpf_verifier_env *env, struct bpf_func_state *state, /* func where register points to */ - int off, int size, int value_regno) + int off, int size, int value_regno, int insn_idx) { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; @@ -1017,8 +1050,33 @@ static int check_stack_write(struct bpf_verifier_env *env, state->stack[spi].spilled_ptr = cur->regs[value_regno]; state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - for (i = 0; i < BPF_REG_SIZE; i++) + for (i = 0; i < BPF_REG_SIZE; i++) { + if (state->stack[spi].slot_type[i] == STACK_MISC && + !env->allow_ptr_leaks) { + int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; + int soff = (-spi - 1) * BPF_REG_SIZE; + + /* detected reuse of integer stack slot with a pointer + * which means either llvm is reusing stack slot or + * an attacker is trying to exploit CVE-2018-3639 + * (speculative store bypass) + * Have to sanitize that slot with preemptive + * store of zero. + */ + if (*poff && *poff != soff) { + /* disallow programs where single insn stores + * into two different stack slots, since verifier + * cannot sanitize them + */ + verbose(env, + "insn %d cannot access two stack slots fp%d and fp%d", + insn_idx, *poff, soff); + return -EINVAL; + } + *poff = soff; + } state->stack[spi].slot_type[i] = STACK_SPILL; + } } else { u8 type = STACK_MISC; @@ -1251,6 +1309,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, switch (env->prog->type) { case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; @@ -1470,13 +1529,13 @@ static int update_stack_depth(struct bpf_verifier_env *env, const struct bpf_func_state *func, int off) { - u16 stack = env->subprog_stack_depth[func->subprogno]; + u16 stack = env->subprog_info[func->subprogno].stack_depth; if (stack >= -off) return 0; /* update known max for given subprogram */ - env->subprog_stack_depth[func->subprogno] = -off; + env->subprog_info[func->subprogno].stack_depth = -off; return 0; } @@ -1488,9 +1547,9 @@ static int update_stack_depth(struct bpf_verifier_env *env, */ static int check_max_stack_depth(struct bpf_verifier_env *env) { - int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end; + int depth = 0, frame = 0, idx = 0, i = 0, subprog_end; + struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; int ret_insn[MAX_CALL_FRAMES]; int ret_prog[MAX_CALL_FRAMES]; @@ -1498,17 +1557,14 @@ process_func: /* round up to 32-bytes, since this is granularity * of interpreter stack size */ - depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32); if (depth > MAX_BPF_STACK) { verbose(env, "combined stack size of %d calls is %d. Too large\n", frame + 1, depth); return -EACCES; } continue_func: - if (env->subprog_cnt == subprog) - subprog_end = insn_cnt; - else - subprog_end = env->subprog_starts[subprog]; + subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { if (insn[i].code != (BPF_JMP | BPF_CALL)) continue; @@ -1516,17 +1572,16 @@ continue_func: continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; - ret_prog[frame] = subprog; + ret_prog[frame] = idx; /* find the callee */ i = i + insn[i].imm + 1; - subprog = find_subprog(env, i); - if (subprog < 0) { + idx = find_subprog(env, i); + if (idx < 0) { WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", i); return -EFAULT; } - subprog++; frame++; if (frame >= MAX_CALL_FRAMES) { WARN_ONCE(1, "verifier bug. Call stack is too deep\n"); @@ -1539,10 +1594,10 @@ continue_func: */ if (frame == 0) return 0; - depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32); frame--; i = ret_insn[frame]; - subprog = ret_prog[frame]; + idx = ret_prog[frame]; goto continue_func; } @@ -1558,11 +1613,34 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, start); return -EFAULT; } - subprog++; - return env->subprog_stack_depth[subprog]; + return env->subprog_info[subprog].stack_depth; } #endif +static int check_ctx_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + /* Access to ctx or passing it to a helper is only allowed in + * its original, unmodified form. + */ + + if (reg->off) { + verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n", + regno, reg->off); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf); + return -EACCES; + } + + return 0; +} + /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE */ @@ -1632,24 +1710,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "R%d leaks addr into ctx\n", value_regno); return -EACCES; } - /* ctx accesses must be at a fixed offset, so that we can - * determine what type of data were returned. - */ - if (reg->off) { - verbose(env, - "dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n", - regno, reg->off, off - reg->off); - return -EACCES; - } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, - "variable ctx access var_off=%s off=%d size=%d", - tn_buf, off, size); - return -EACCES; - } + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; + err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a @@ -1694,7 +1759,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (t == BPF_WRITE) err = check_stack_write(env, state, off, size, - value_regno); + value_regno, insn_idx); else err = check_stack_read(env, state, off, size, value_regno); @@ -1914,7 +1979,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && + if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || @@ -1930,6 +1995,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_CTX; if (type != expected_type) goto err_type; + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be @@ -1966,14 +2034,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } - if (type_is_pkt_pointer(type)) - err = check_packet_access(env, regno, reg->off, - meta->map_ptr->key_size, - false); - else - err = check_stack_boundary(env, regno, - meta->map_ptr->key_size, - false, NULL); + err = check_helper_mem_access(env, regno, + meta->map_ptr->key_size, false, + NULL); } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -1983,17 +2046,18 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } - if (type_is_pkt_pointer(type)) - err = check_packet_access(env, regno, reg->off, - meta->map_ptr->value_size, - false); - else - err = check_stack_boundary(env, regno, - meta->map_ptr->value_size, - false, NULL); + err = check_helper_mem_access(env, regno, + meta->map_ptr->value_size, false, + NULL); } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); + /* remember the mem_size which may be used later + * to refine return values. + */ + meta->msize_smax_value = reg->smax_value; + meta->msize_umax_value = reg->umax_value; + /* The register is SCALAR_VALUE; the access check * happens using its boundaries. */ @@ -2071,8 +2135,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_redirect_map) goto error; break; - /* Restrict bpf side of cpumap, open when use-cases appear */ + /* Restrict bpf side of cpumap and xskmap, open when use-cases + * appear. + */ case BPF_MAP_TYPE_CPUMAP: + case BPF_MAP_TYPE_XSKMAP: if (func_id != BPF_FUNC_redirect_map) goto error; break; @@ -2088,6 +2155,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_msg_redirect_map) goto error; break; + case BPF_MAP_TYPE_SOCKHASH: + if (func_id != BPF_FUNC_sk_redirect_hash && + func_id != BPF_FUNC_sock_hash_update && + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_msg_redirect_hash) + goto error; + break; default: break; } @@ -2097,7 +2171,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; - if (env->subprog_cnt) { + if (env->subprog_cnt > 1) { verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); return -EINVAL; } @@ -2119,16 +2193,20 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_FUNC_redirect_map: if (map->map_type != BPF_MAP_TYPE_DEVMAP && - map->map_type != BPF_MAP_TYPE_CPUMAP) + map->map_type != BPF_MAP_TYPE_CPUMAP && + map->map_type != BPF_MAP_TYPE_XSKMAP) goto error; break; case BPF_FUNC_sk_redirect_map: case BPF_FUNC_msg_redirect_map: + case BPF_FUNC_sock_map_update: if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; - case BPF_FUNC_sock_map_update: - if (map->map_type != BPF_MAP_TYPE_SOCKMAP) + case BPF_FUNC_sk_redirect_hash: + case BPF_FUNC_msg_redirect_hash: + case BPF_FUNC_sock_hash_update: + if (map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; default: @@ -2269,7 +2347,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* remember the callsite, it will be used by bpf_exit */ *insn_idx /* callsite */, state->curframe + 1 /* frameno within this callchain */, - subprog + 1 /* subprog number within this prog */); + subprog /* subprog number within this prog */); /* copy r1 - r5 args that callee can access */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) @@ -2333,6 +2411,49 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return 0; } +static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, + int func_id, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *ret_reg = ®s[BPF_REG_0]; + + if (ret_type != RET_INTEGER || + (func_id != BPF_FUNC_get_stack && + func_id != BPF_FUNC_probe_read_str)) + return; + + ret_reg->smax_value = meta->msize_smax_value; + ret_reg->umax_value = meta->msize_umax_value; + __reg_deduce_bounds(ret_reg); + __reg_bound_offset(ret_reg); +} + +static int +record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, + int func_id, int insn_idx) +{ + struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + + if (func_id != BPF_FUNC_tail_call && + func_id != BPF_FUNC_map_lookup_elem && + func_id != BPF_FUNC_map_update_elem && + func_id != BPF_FUNC_map_delete_elem) + return 0; + + if (meta->map_ptr == NULL) { + verbose(env, "kernel subsystem misconfigured verifier\n"); + return -EINVAL; + } + + if (!BPF_MAP_PTR(aux->map_state)) + bpf_map_ptr_store(aux, meta->map_ptr, + meta->map_ptr->unpriv_array); + else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr) + bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, + meta->map_ptr->unpriv_array); + return 0; +} + static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; @@ -2358,7 +2479,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn /* eBPF programs must be GPL compatible to use GPL-ed functions */ if (!env->prog->gpl_compatible && fn->gpl_only) { - verbose(env, "cannot call GPL only function from proprietary program\n"); + verbose(env, "cannot call GPL-restricted function from non-GPL compatible program\n"); return -EINVAL; } @@ -2387,13 +2508,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; - if (func_id == BPF_FUNC_tail_call) { - if (meta.map_ptr == NULL) { - verbose(env, "verifier bug\n"); - return -EINVAL; - } - env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; - } err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; @@ -2404,6 +2518,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; + err = record_func_map(env, &meta, func_id, insn_idx); + if (err) + return err; + /* Mark slots with STACK_MISC in case of raw mode, stack offset * is inferred from register state. */ @@ -2428,8 +2546,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { - struct bpf_insn_aux_data *insn_aux; - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); @@ -2445,21 +2561,36 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; - insn_aux = &env->insn_aux_data[insn_idx]; - if (!insn_aux->map_ptr) - insn_aux->map_ptr = meta.map_ptr; - else if (insn_aux->map_ptr != meta.map_ptr) - insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } + do_refine_retval_range(regs, fn->ret_type, func_id, &meta); + err = check_map_func_compatibility(env, meta.map_ptr, func_id); if (err) return err; + if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) { + const char *err_str; + +#ifdef CONFIG_PERF_EVENTS + err = get_callchain_buffers(sysctl_perf_event_max_stack); + err_str = "cannot get callchain buffer for func %s#%d\n"; +#else + err = -ENOTSUPP; + err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n"; +#endif + if (err) { + verbose(env, err_str, func_id_name(func_id), func_id); + return err; + } + + env->prog->has_callchain_buf = true; + } + if (changes_data) clear_all_pkt_pointers(env); return 0; @@ -2904,10 +3035,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, dst_reg->umin_value <<= umin_val; dst_reg->umax_value <<= umax_val; } - if (src_known) - dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); - else - dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val); + dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); break; @@ -2935,16 +3063,35 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, */ dst_reg->smin_value = S64_MIN; dst_reg->smax_value = S64_MAX; - if (src_known) - dst_reg->var_off = tnum_rshift(dst_reg->var_off, - umin_val); - else - dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val); + dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); dst_reg->umin_value >>= umax_val; dst_reg->umax_value >>= umin_val; /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); break; + case BPF_ARSH: + if (umax_val >= insn_bitness) { + /* Shifts greater than 31 or 63 are undefined. + * This includes shifts by a negative number. + */ + mark_reg_unknown(env, regs, insn->dst_reg); + break; + } + + /* Upon reaching here, src_known is true and + * umax_val is equal to umin_val. + */ + dst_reg->smin_value >>= umin_val; + dst_reg->smax_value >>= umin_val; + dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val); + + /* blow away the dst_reg umin_value/umax_value and rely on + * dst_reg var_off to refine the result. + */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + __update_reg_bounds(dst_reg); + break; default: mark_reg_unknown(env, regs, insn->dst_reg); break; @@ -3828,7 +3975,12 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - if (env->subprog_cnt) { + if (!env->ops->gen_ld_abs) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } + + if (env->subprog_cnt > 1) { /* when program has LD_ABS insn JITs and interpreter assume * that r1 == ctx == skb which is not the case for callees * that can have arbitrary arguments. It's problematic @@ -4859,15 +5011,15 @@ process_bpf_exit: verbose(env, "processed %d insns (limit %d), stack depth ", insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); - for (i = 0; i < env->subprog_cnt + 1; i++) { - u32 depth = env->subprog_stack_depth[i]; + for (i = 0; i < env->subprog_cnt; i++) { + u32 depth = env->subprog_info[i].stack_depth; verbose(env, "%d", depth); - if (i + 1 < env->subprog_cnt + 1) + if (i + 1 < env->subprog_cnt) verbose(env, "+"); } verbose(env, "\n"); - env->prog->aux->stack_depth = env->subprog_stack_depth[0]; + env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return 0; } @@ -4991,7 +5143,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) /* hold the map. If the program is rejected by verifier, * the map will be released by release_maps() or it * will be used by the valid program until it's unloaded - * and all maps are released in free_bpf_prog_info() + * and all maps are released in free_used_maps() */ map = bpf_map_inc(map, false); if (IS_ERR(map)) { @@ -5054,7 +5206,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, if (cnt == 1) return 0; - new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len); + new_data = vzalloc(array_size(prog_len, + sizeof(struct bpf_insn_aux_data))); if (!new_data) return -ENOMEM; memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); @@ -5073,10 +5226,11 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len if (len == 1) return; - for (i = 0; i < env->subprog_cnt; i++) { - if (env->subprog_starts[i] < off) + /* NOTE: fake 'exit' subprog should be updated as well. */ + for (i = 0; i <= env->subprog_cnt; i++) { + if (env->subprog_info[i].start < off) continue; - env->subprog_starts[i] += len - 1; + env->subprog_info[i].start += len - 1; } } @@ -5150,7 +5304,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } } - if (!ops->convert_ctx_access) + if (!ops->convert_ctx_access || bpf_prog_is_dev_bound(env->prog->aux)) return 0; insn = env->prog->insnsi + delta; @@ -5169,6 +5323,34 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) else continue; + if (type == BPF_WRITE && + env->insn_aux_data[i + delta].sanitize_stack_off) { + struct bpf_insn patch[] = { + /* Sanitize suspicious stack slot with zero. + * There are no memory dependencies for this store, + * since it's only using frame pointer and immediate + * constant of zero + */ + BPF_ST_MEM(BPF_DW, BPF_REG_FP, + env->insn_aux_data[i + delta].sanitize_stack_off, + 0), + /* the original STX instruction will immediately + * overwrite the same stack slot with appropriate value + */ + *insn, + }; + + cnt = ARRAY_SIZE(patch); + new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) continue; @@ -5182,6 +5364,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) */ is_narrower_load = size < ctx_field_size; if (is_narrower_load) { + u32 size_default = bpf_ctx_off_adjust_machine(ctx_field_size); u32 off = insn->off; u8 size_code; @@ -5196,7 +5379,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) else if (ctx_field_size == 8) size_code = BPF_DW; - insn->off = off & ~(ctx_field_size - 1); + insn->off = off & ~(size_default - 1); insn->code = BPF_LDX | BPF_MEM | size_code; } @@ -5240,7 +5423,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) void *old_bpf_func; int err = -ENOMEM; - if (env->subprog_cnt == 0) + if (env->subprog_cnt <= 1) return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { @@ -5256,7 +5439,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) /* temporarily remember subprog id inside insn instead of * aux_data, since next loop will split up all insns into funcs */ - insn->off = subprog + 1; + insn->off = subprog; /* remember original imm in case JIT fails and fallback * to interpreter will be needed */ @@ -5265,16 +5448,13 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn->imm = 1; } - func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL); + func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL); if (!func) return -ENOMEM; - for (i = 0; i <= env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { subprog_start = subprog_end; - if (env->subprog_cnt == i) - subprog_end = prog->len; - else - subprog_end = env->subprog_starts[i]; + subprog_end = env->subprog_info[i + 1].start; len = subprog_end - subprog_start; func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); @@ -5291,7 +5471,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) * Long term would need debug info to populate names */ func[i]->aux->name[0] = 'F'; - func[i]->aux->stack_depth = env->subprog_stack_depth[i]; + func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { @@ -5304,20 +5484,33 @@ static int jit_subprogs(struct bpf_verifier_env *env) * now populate all bpf_calls with correct addresses and * run last pass of JIT */ - for (i = 0; i <= env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) continue; subprog = insn->off; - insn->off = 0; insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) func[subprog]->bpf_func - __bpf_call_base; } + + /* we use the aux data to keep a list of the start addresses + * of the JITed images for each function in the program + * + * for some architectures, such as powerpc64, the imm field + * might not be large enough to hold the offset of the start + * address of the callee's JITed image from __bpf_call_base + * + * in such cases, we can lookup the start address of a callee + * by using its subprog id, available from the off field of + * the call instruction, as an index for this list + */ + func[i]->aux->func = func; + func[i]->aux->func_cnt = env->subprog_cnt; } - for (i = 0; i <= env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { old_bpf_func = func[i]->bpf_func; tmp = bpf_int_jit_compile(func[i]); if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { @@ -5331,7 +5524,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) /* finally lock prog and jit images for all functions and * populate kallsysm */ - for (i = 0; i <= env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { bpf_prog_lock_ro(func[i]); bpf_prog_kallsyms_add(func[i]); } @@ -5341,26 +5534,21 @@ static int jit_subprogs(struct bpf_verifier_env *env) * later look the same as if they were interpreted only. */ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - unsigned long addr; - if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) continue; insn->off = env->insn_aux_data[i].call_imm; subprog = find_subprog(env, i + insn->off + 1); - addr = (unsigned long)func[subprog + 1]->bpf_func; - addr &= PAGE_MASK; - insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) - addr - __bpf_call_base; + insn->imm = subprog; } prog->jited = 1; prog->bpf_func = func[0]->bpf_func; prog->aux->func = func; - prog->aux->func_cnt = env->subprog_cnt + 1; + prog->aux->func_cnt = env->subprog_cnt; return 0; out_free: - for (i = 0; i <= env->subprog_cnt; i++) + for (i = 0; i < env->subprog_cnt; i++) if (func[i]) bpf_jit_free(func[i]); kfree(func); @@ -5417,6 +5605,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; const int insn_cnt = prog->len; + const struct bpf_map_ops *ops; + struct bpf_insn_aux_data *aux; struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; struct bpf_map *map_ptr; @@ -5463,6 +5653,25 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (BPF_CLASS(insn->code) == BPF_LD && + (BPF_MODE(insn->code) == BPF_ABS || + BPF_MODE(insn->code) == BPF_IND)) { + cnt = env->ops->gen_ld_abs(insn, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; if (insn->src_reg == BPF_PSEUDO_CALL) @@ -5491,19 +5700,22 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->imm = 0; insn->code = BPF_JMP | BPF_TAIL_CALL; + aux = &env->insn_aux_data[i + delta]; + if (!bpf_map_ptr_unpriv(aux)) + continue; + /* instead of changing every JIT dealing with tail_call * emit two extra insns: * if (index >= max_entries) goto out; * index &= array->index_mask; * to avoid out-of-bounds cpu speculation */ - map_ptr = env->insn_aux_data[i + delta].map_ptr; - if (map_ptr == BPF_MAP_PTR_POISON) { + if (bpf_map_ptr_poisoned(aux)) { verbose(env, "tail_call abusing map_ptr\n"); return -EINVAL; } - if (!map_ptr->unpriv_array) - continue; + + map_ptr = BPF_MAP_PTR(aux->map_state); insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, map_ptr->max_entries, 2); insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, @@ -5523,32 +5735,61 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) } /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup - * handlers are currently limited to 64 bit only. + * and other inlining handlers are currently limited to 64 bit + * only. */ if (prog->jit_requested && BITS_PER_LONG == 64 && - insn->imm == BPF_FUNC_map_lookup_elem) { - map_ptr = env->insn_aux_data[i + delta].map_ptr; - if (map_ptr == BPF_MAP_PTR_POISON || - !map_ptr->ops->map_gen_lookup) + (insn->imm == BPF_FUNC_map_lookup_elem || + insn->imm == BPF_FUNC_map_update_elem || + insn->imm == BPF_FUNC_map_delete_elem)) { + aux = &env->insn_aux_data[i + delta]; + if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; - cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; - } + map_ptr = BPF_MAP_PTR(aux->map_state); + ops = map_ptr->ops; + if (insn->imm == BPF_FUNC_map_lookup_elem && + ops->map_gen_lookup) { + cnt = ops->map_gen_lookup(map_ptr, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, - cnt); - if (!new_prog) - return -ENOMEM; + new_prog = bpf_patch_insn_data(env, i + delta, + insn_buf, cnt); + if (!new_prog) + return -ENOMEM; - delta += cnt - 1; + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } - /* keep walking new program and skip insns we just inserted */ - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - continue; + BUILD_BUG_ON(!__same_type(ops->map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_delete_elem, + (int (*)(struct bpf_map *map, void *key))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_update_elem, + (int (*)(struct bpf_map *map, void *key, void *value, + u64 flags))NULL)); + switch (insn->imm) { + case BPF_FUNC_map_lookup_elem: + insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - + __bpf_call_base; + continue; + case BPF_FUNC_map_update_elem: + insn->imm = BPF_CAST_CALL(ops->map_update_elem) - + __bpf_call_base; + continue; + case BPF_FUNC_map_delete_elem: + insn->imm = BPF_CAST_CALL(ops->map_delete_elem) - + __bpf_call_base; + continue; + } + + goto patch_call_imm; } if (insn->imm == BPF_FUNC_redirect_map) { @@ -5630,8 +5871,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) return -ENOMEM; log = &env->log; - env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * - (*prog)->len); + env->insn_aux_data = + vzalloc(array_size(sizeof(struct bpf_insn_aux_data), + (*prog)->len)); ret = -ENOMEM; if (!env->insn_aux_data) goto err_free_env; @@ -5660,16 +5902,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; + ret = replace_map_fd_with_map_ptr(env); + if (ret < 0) + goto skip_full_check; + if (bpf_prog_is_dev_bound(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env); if (ret) - goto err_unlock; + goto skip_full_check; } - ret = replace_map_fd_with_map_ptr(env); - if (ret < 0) - goto skip_full_check; - env->explored_states = kcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), GFP_USER); @@ -5740,7 +5982,7 @@ skip_full_check: err_release_maps: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release - * them now. Otherwise free_bpf_prog_info() will release them. + * them now. Otherwise free_used_maps() will release them. */ release_maps(env); *prog = env->prog; diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c new file mode 100644 index 000000000000..b3c557476a8d --- /dev/null +++ b/kernel/bpf/xskmap.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XSKMAP used for AF_XDP sockets + * Copyright(c) 2018 Intel Corporation. + */ + +#include <linux/bpf.h> +#include <linux/capability.h> +#include <net/xdp_sock.h> +#include <linux/slab.h> +#include <linux/sched.h> + +struct xsk_map { + struct bpf_map map; + struct xdp_sock **xsk_map; + struct list_head __percpu *flush_list; +}; + +static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) +{ + int cpu, err = -EINVAL; + struct xsk_map *m; + u64 cost; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || + attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) + return ERR_PTR(-EINVAL); + + m = kzalloc(sizeof(*m), GFP_USER); + if (!m) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&m->map, attr); + + cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); + cost += sizeof(struct list_head) * num_possible_cpus(); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_m; + + m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* Notice returns -EPERM on if map size is larger than memlock limit */ + err = bpf_map_precharge_memlock(m->map.pages); + if (err) + goto free_m; + + err = -ENOMEM; + + m->flush_list = alloc_percpu(struct list_head); + if (!m->flush_list) + goto free_m; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); + + m->xsk_map = bpf_map_area_alloc(m->map.max_entries * + sizeof(struct xdp_sock *), + m->map.numa_node); + if (!m->xsk_map) + goto free_percpu; + return &m->map; + +free_percpu: + free_percpu(m->flush_list); +free_m: + kfree(m); + return ERR_PTR(err); +} + +static void xsk_map_free(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + int i; + + synchronize_net(); + + for (i = 0; i < map->max_entries; i++) { + struct xdp_sock *xs; + + xs = m->xsk_map[i]; + if (!xs) + continue; + + sock_put((struct sock *)xs); + } + + free_percpu(m->flush_list); + bpf_map_area_free(m->xsk_map); + kfree(m); +} + +static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = next_key; + + if (index >= m->map.max_entries) { + *next = 0; + return 0; + } + + if (index == m->map.max_entries - 1) + return -ENOENT; + *next = index + 1; + return 0; +} + +struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *xs; + + if (key >= map->max_entries) + return NULL; + + xs = READ_ONCE(m->xsk_map[key]); + return xs; +} + +int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct list_head *flush_list = this_cpu_ptr(m->flush_list); + int err; + + err = xsk_rcv(xs, xdp); + if (err) + return err; + + if (!xs->flush_node.prev) + list_add(&xs->flush_node, flush_list); + + return 0; +} + +void __xsk_map_flush(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct list_head *flush_list = this_cpu_ptr(m->flush_list); + struct xdp_sock *xs, *tmp; + + list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { + xsk_flush(xs); + __list_del(xs->flush_node.prev, xs->flush_node.next); + xs->flush_node.prev = NULL; + } +} + +static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + u32 i = *(u32 *)key, fd = *(u32 *)value; + struct xdp_sock *xs, *old_xs; + struct socket *sock; + int err; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(i >= m->map.max_entries)) + return -E2BIG; + if (unlikely(map_flags == BPF_NOEXIST)) + return -EEXIST; + + sock = sockfd_lookup(fd, &err); + if (!sock) + return err; + + if (sock->sk->sk_family != PF_XDP) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + xs = (struct xdp_sock *)sock->sk; + + if (!xsk_is_setup_for_bpf_map(xs)) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + sock_hold(sock->sk); + + old_xs = xchg(&m->xsk_map[i], xs); + if (old_xs) { + /* Make sure we've flushed everything. */ + synchronize_net(); + sock_put((struct sock *)old_xs); + } + + sockfd_put(sock); + return 0; +} + +static int xsk_map_delete_elem(struct bpf_map *map, void *key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *old_xs; + int k = *(u32 *)key; + + if (k >= map->max_entries) + return -EINVAL; + + old_xs = xchg(&m->xsk_map[k], NULL); + if (old_xs) { + /* Make sure we've flushed everything. */ + synchronize_net(); + sock_put((struct sock *)old_xs); + } + + return 0; +} + +const struct bpf_map_ops xsk_map_ops = { + .map_alloc = xsk_map_alloc, + .map_free = xsk_map_free, + .map_get_next_key = xsk_map_get_next_key, + .map_lookup_elem = xsk_map_lookup_elem, + .map_update_elem = xsk_map_update_elem, + .map_delete_elem = xsk_map_delete_elem, +}; + + diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 2be89a003185..bfcdae896122 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := cgroup.o stat.o namespace.o cgroup-v1.o +obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index b928b27050c6..77ff1cd6a252 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -201,13 +201,12 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, int cgroup_task_count(const struct cgroup *cgrp); /* - * stat.c + * rstat.c */ -void cgroup_stat_flush(struct cgroup *cgrp); -int cgroup_stat_init(struct cgroup *cgrp); -void cgroup_stat_exit(struct cgroup *cgrp); -void cgroup_stat_show_cputime(struct seq_file *seq); -void cgroup_stat_boot(void); +int cgroup_rstat_init(struct cgroup *cgrp); +void cgroup_rstat_exit(struct cgroup *cgrp); +void cgroup_rstat_boot(void); +void cgroup_base_stat_cputime_show(struct seq_file *seq); /* * namespace.c @@ -218,9 +217,9 @@ extern const struct proc_ns_operations cgroupns_operations; * cgroup-v1.c */ extern struct cftype cgroup1_base_files[]; -extern const struct file_operations proc_cgroupstats_operations; extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; +int proc_cgroupstats_show(struct seq_file *m, void *v); bool cgroup1_ssid_disabled(int ssid); void cgroup1_pidlist_destroy_all(struct cgroup *cgrp); void cgroup1_release_agent(struct work_struct *work); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index a2c05d2476ac..8b4f0768efd6 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -195,9 +195,9 @@ struct cgroup_pidlist { static void *pidlist_allocate(int count) { if (PIDLIST_TOO_LARGE(count)) - return vmalloc(count * sizeof(pid_t)); + return vmalloc(array_size(count, sizeof(pid_t))); else - return kmalloc(count * sizeof(pid_t), GFP_KERNEL); + return kmalloc_array(count, sizeof(pid_t), GFP_KERNEL); } static void pidlist_free(void *p) @@ -682,7 +682,7 @@ struct cftype cgroup1_base_files[] = { }; /* Display information about each subsystem and each hierarchy */ -static int proc_cgroupstats_show(struct seq_file *m, void *v) +int proc_cgroupstats_show(struct seq_file *m, void *v) { struct cgroup_subsys *ss; int i; @@ -705,18 +705,6 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) return 0; } -static int cgroupstats_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_cgroupstats_show, NULL); -} - -const struct file_operations proc_cgroupstats_operations = { - .open = cgroupstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /** * cgroupstats_build - build and fill cgroupstats * @stats: cgroupstats to fill information into diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a662bfcbea0e..077370bf8964 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -54,6 +54,7 @@ #include <linux/proc_ns.h> #include <linux/nsproxy.h> #include <linux/file.h> +#include <linux/sched/cputime.h> #include <net/sock.h> #define CREATE_TRACE_POINTS @@ -61,6 +62,8 @@ #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) +/* let's not notify more than 100 times per second */ +#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) /* * cgroup_mutex is the master lock. Any modification to cgroup or its @@ -142,14 +145,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { }; #undef SUBSYS -static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat); +static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); /* * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ -struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat }; +struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* @@ -1554,6 +1557,8 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = NULL; spin_unlock_irq(&cgroup_file_kn_lock); + + del_timer_sync(&cfile->notify_timer); } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); @@ -1573,8 +1578,17 @@ static void css_clear_dir(struct cgroup_subsys_state *css) css->flags &= ~CSS_VISIBLE; - list_for_each_entry(cfts, &css->ss->cfts, node) + if (!css->ss) { + if (cgroup_on_dfl(cgrp)) + cfts = cgroup_base_files; + else + cfts = cgroup1_base_files; + cgroup_addrm_files(css, cgrp, cfts, false); + } else { + list_for_each_entry(cfts, &css->ss->cfts, node) + cgroup_addrm_files(css, cgrp, cfts, false); + } } /** @@ -1598,14 +1612,16 @@ static int css_populate_dir(struct cgroup_subsys_state *css) else cfts = cgroup1_base_files; - return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); - } - - list_for_each_entry(cfts, &css->ss->cfts, node) { - ret = cgroup_addrm_files(css, cgrp, cfts, true); - if (ret < 0) { - failed_cfts = cfts; - goto err; + ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); + if (ret < 0) + return ret; + } else { + list_for_each_entry(cfts, &css->ss->cfts, node) { + ret = cgroup_addrm_files(css, cgrp, cfts, true); + if (ret < 0) { + failed_cfts = cfts; + goto err; + } } } @@ -1782,13 +1798,6 @@ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; - spin_lock_irq(&css_set_lock); - - if (use_task_css_set_links) - goto out_unlock; - - use_task_css_set_links = true; - /* * We need tasklist_lock because RCU is not safe against * while_each_thread(). Besides, a forking task that has passed @@ -1797,6 +1806,13 @@ static void cgroup_enable_task_cg_lists(void) * tasklist if we walk through it with RCU. */ read_lock(&tasklist_lock); + spin_lock_irq(&css_set_lock); + + if (use_task_css_set_links) + goto out_unlock; + + use_task_css_set_links = true; + do_each_thread(g, p) { WARN_ON_ONCE(!list_empty(&p->cg_list) || task_css_set(p) != &init_css_set); @@ -1824,9 +1840,9 @@ static void cgroup_enable_task_cg_lists(void) } spin_unlock(&p->sighand->siglock); } while_each_thread(g, p); - read_unlock(&tasklist_lock); out_unlock: spin_unlock_irq(&css_set_lock); + read_unlock(&tasklist_lock); } static void init_cgroup_housekeeping(struct cgroup *cgrp) @@ -1844,6 +1860,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) cgrp->dom_cgrp = cgrp; cgrp->max_descendants = INT_MAX; cgrp->max_depth = INT_MAX; + INIT_LIST_HEAD(&cgrp->rstat_css_list); + prev_cputime_init(&cgrp->prev_cputime); for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -3381,7 +3399,7 @@ static int cpu_stat_show(struct seq_file *seq, void *v) struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; int ret = 0; - cgroup_stat_show_cputime(seq); + cgroup_base_stat_cputime_show(seq); #ifdef CONFIG_CGROUP_SCHED ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); #endif @@ -3521,6 +3539,12 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn) return kernfs_setattr(kn, &iattr); } +static void cgroup_file_notify_timer(struct timer_list *timer) +{ + cgroup_file_notify(container_of(timer, struct cgroup_file, + notify_timer)); +} + static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype *cft) { @@ -3547,6 +3571,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; + timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); + spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; spin_unlock_irq(&cgroup_file_kn_lock); @@ -3796,8 +3822,17 @@ void cgroup_file_notify(struct cgroup_file *cfile) unsigned long flags; spin_lock_irqsave(&cgroup_file_kn_lock, flags); - if (cfile->kn) - kernfs_notify(cfile->kn); + if (cfile->kn) { + unsigned long last = cfile->notified_at; + unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV; + + if (time_in_range(jiffies, last, next)) { + timer_reduce(&cfile->notify_timer, next); + } else { + kernfs_notify(cfile->kn); + cfile->notified_at = jiffies; + } + } spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } @@ -4560,7 +4595,7 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); if (cgroup_on_dfl(cgrp)) - cgroup_stat_exit(cgrp); + cgroup_rstat_exit(cgrp); kfree(cgrp); } else { /* @@ -4587,6 +4622,11 @@ static void css_release_work_fn(struct work_struct *work) if (ss) { /* css release path */ + if (!list_empty(&css->rstat_css_node)) { + cgroup_rstat_flush(cgrp); + list_del_rcu(&css->rstat_css_node); + } + cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); @@ -4597,7 +4637,7 @@ static void css_release_work_fn(struct work_struct *work) trace_cgroup_release(cgrp); if (cgroup_on_dfl(cgrp)) - cgroup_stat_flush(cgrp); + cgroup_rstat_flush(cgrp); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) @@ -4648,6 +4688,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); + INIT_LIST_HEAD(&css->rstat_css_node); css->serial_nr = css_serial_nr_next++; atomic_set(&css->online_cnt, 0); @@ -4656,6 +4697,9 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css_get(css->parent); } + if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush) + list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list); + BUG_ON(cgroup_css(cgrp, ss)); } @@ -4757,6 +4801,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, err_list_del: list_del_rcu(&css->sibling); err_free_css: + list_del_rcu(&css->rstat_css_node); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); return ERR_PTR(err); @@ -4775,8 +4820,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(sizeof(*cgrp) + - sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); + cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)), + GFP_KERNEL); if (!cgrp) return ERR_PTR(-ENOMEM); @@ -4785,7 +4830,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) goto out_free_cgrp; if (cgroup_on_dfl(parent)) { - ret = cgroup_stat_init(cgrp); + ret = cgroup_rstat_init(cgrp); if (ret) goto out_cancel_ref; } @@ -4850,7 +4895,7 @@ out_idr_free: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_stat_exit: if (cgroup_on_dfl(parent)) - cgroup_stat_exit(cgrp); + cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5090,10 +5135,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) for_each_css(css, ssid, cgrp) kill_css(css); - /* - * Remove @cgrp directory along with the base files. @cgrp has an - * extra ref on its kn. - */ + /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ + css_clear_dir(&cgrp->self); kernfs_remove(cgrp->kn); if (parent && cgroup_is_threaded(cgrp)) @@ -5245,7 +5288,7 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); - cgroup_stat_boot(); + cgroup_rstat_boot(); /* * The latency of the synchronize_sched() is too high for cgroups, @@ -5335,7 +5378,7 @@ int __init cgroup_init(void) WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); - WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); + WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); return 0; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b42037e6e81d..d8b12e0d39cd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -683,7 +683,7 @@ static int generate_sched_domains(cpumask_var_t **domains, goto done; } - csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL); + csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL); if (!csa) goto done; csn = 0; @@ -753,7 +753,8 @@ restart: * The rest of the code, including the scheduler, can deal with * dattr==NULL case. No need to abort if alloc fails. */ - dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); + dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), + GFP_KERNEL); for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index defad3c5e7dc..d3bbb757ee49 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -362,35 +362,32 @@ EXPORT_SYMBOL(rdmacg_unregister_device); static int parse_resource(char *c, int *intval) { substring_t argstr; - const char **table = &rdmacg_resource_names[0]; char *name, *value = c; size_t len; - int ret, i = 0; + int ret, i; name = strsep(&value, "="); if (!name || !value) return -EINVAL; - len = strlen(value); + i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name); + if (i < 0) + return i; - for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { - if (strcmp(table[i], name)) - continue; + len = strlen(value); - argstr.from = value; - argstr.to = value + len; + argstr.from = value; + argstr.to = value + len; - ret = match_int(&argstr, intval); - if (ret >= 0) { - if (*intval < 0) - break; - return i; - } - if (strncmp(value, RDMACG_MAX_STR, len) == 0) { - *intval = S32_MAX; - return i; - } - break; + ret = match_int(&argstr, intval); + if (ret >= 0) { + if (*intval < 0) + return -EINVAL; + return i; + } + if (strncmp(value, RDMACG_MAX_STR, len) == 0) { + *intval = S32_MAX; + return i; } return -EINVAL; } diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c new file mode 100644 index 000000000000..d503d1a9007c --- /dev/null +++ b/kernel/cgroup/rstat.c @@ -0,0 +1,416 @@ +#include "cgroup-internal.h" + +#include <linux/sched/cputime.h> + +static DEFINE_SPINLOCK(cgroup_rstat_lock); +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); + +static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); + +static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) +{ + return per_cpu_ptr(cgrp->rstat_cpu, cpu); +} + +/** + * cgroup_rstat_updated - keep track of updated rstat_cpu + * @cgrp: target cgroup + * @cpu: cpu on which rstat_cpu was updated + * + * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching + * rstat_cpu->updated_children list. See the comment on top of + * cgroup_rstat_cpu definition for details. + */ +void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) +{ + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); + struct cgroup *parent; + unsigned long flags; + + /* nothing to do for root */ + if (!cgroup_parent(cgrp)) + return; + + /* + * Paired with the one in cgroup_rstat_cpu_pop_upated(). Either we + * see NULL updated_next or they see our updated stat. + */ + smp_mb(); + + /* + * Because @parent's updated_children is terminated with @parent + * instead of NULL, we can tell whether @cgrp is on the list by + * testing the next pointer for NULL. + */ + if (cgroup_rstat_cpu(cgrp, cpu)->updated_next) + return; + + raw_spin_lock_irqsave(cpu_lock, flags); + + /* put @cgrp and all ancestors on the corresponding updated lists */ + for (parent = cgroup_parent(cgrp); parent; + cgrp = parent, parent = cgroup_parent(cgrp)) { + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); + + /* + * Both additions and removals are bottom-up. If a cgroup + * is already in the tree, all ancestors are. + */ + if (rstatc->updated_next) + break; + + rstatc->updated_next = prstatc->updated_children; + prstatc->updated_children = cgrp; + } + + raw_spin_unlock_irqrestore(cpu_lock, flags); +} +EXPORT_SYMBOL_GPL(cgroup_rstat_updated); + +/** + * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree + * @pos: current position + * @root: root of the tree to traversal + * @cpu: target cpu + * + * Walks the udpated rstat_cpu tree on @cpu from @root. %NULL @pos starts + * the traversal and %NULL return indicates the end. During traversal, + * each returned cgroup is unlinked from the tree. Must be called with the + * matching cgroup_rstat_cpu_lock held. + * + * The only ordering guarantee is that, for a parent and a child pair + * covered by a given traversal, if a child is visited, its parent is + * guaranteed to be visited afterwards. + */ +static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, + struct cgroup *root, int cpu) +{ + struct cgroup_rstat_cpu *rstatc; + struct cgroup *parent; + + if (pos == root) + return NULL; + + /* + * We're gonna walk down to the first leaf and visit/remove it. We + * can pick whatever unvisited node as the starting point. + */ + if (!pos) + pos = root; + else + pos = cgroup_parent(pos); + + /* walk down to the first leaf */ + while (true) { + rstatc = cgroup_rstat_cpu(pos, cpu); + if (rstatc->updated_children == pos) + break; + pos = rstatc->updated_children; + } + + /* + * Unlink @pos from the tree. As the updated_children list is + * singly linked, we have to walk it to find the removal point. + * However, due to the way we traverse, @pos will be the first + * child in most cases. The only exception is @root. + */ + parent = cgroup_parent(pos); + if (parent && rstatc->updated_next) { + struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); + struct cgroup_rstat_cpu *nrstatc; + struct cgroup **nextp; + + nextp = &prstatc->updated_children; + while (true) { + nrstatc = cgroup_rstat_cpu(*nextp, cpu); + if (*nextp == pos) + break; + + WARN_ON_ONCE(*nextp == parent); + nextp = &nrstatc->updated_next; + } + + *nextp = rstatc->updated_next; + rstatc->updated_next = NULL; + + /* + * Paired with the one in cgroup_rstat_cpu_updated(). + * Either they see NULL updated_next or we see their + * updated stat. + */ + smp_mb(); + } + + return pos; +} + +/* see cgroup_rstat_flush() */ +static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) + __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) +{ + int cpu; + + lockdep_assert_held(&cgroup_rstat_lock); + + for_each_possible_cpu(cpu) { + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, + cpu); + struct cgroup *pos = NULL; + + raw_spin_lock(cpu_lock); + while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { + struct cgroup_subsys_state *css; + + cgroup_base_stat_flush(pos, cpu); + + rcu_read_lock(); + list_for_each_entry_rcu(css, &pos->rstat_css_list, + rstat_css_node) + css->ss->css_rstat_flush(css, cpu); + rcu_read_unlock(); + } + raw_spin_unlock(cpu_lock); + + /* if @may_sleep, play nice and yield if necessary */ + if (may_sleep && (need_resched() || + spin_needbreak(&cgroup_rstat_lock))) { + spin_unlock_irq(&cgroup_rstat_lock); + if (!cond_resched()) + cpu_relax(); + spin_lock_irq(&cgroup_rstat_lock); + } + } +} + +/** + * cgroup_rstat_flush - flush stats in @cgrp's subtree + * @cgrp: target cgroup + * + * Collect all per-cpu stats in @cgrp's subtree into the global counters + * and propagate them upwards. After this function returns, all cgroups in + * the subtree have up-to-date ->stat. + * + * This also gets all cgroups in the subtree including @cgrp off the + * ->updated_children lists. + * + * This function may block. + */ +void cgroup_rstat_flush(struct cgroup *cgrp) +{ + might_sleep(); + + spin_lock_irq(&cgroup_rstat_lock); + cgroup_rstat_flush_locked(cgrp, true); + spin_unlock_irq(&cgroup_rstat_lock); +} + +/** + * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush() + * @cgrp: target cgroup + * + * This function can be called from any context. + */ +void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp) +{ + unsigned long flags; + + spin_lock_irqsave(&cgroup_rstat_lock, flags); + cgroup_rstat_flush_locked(cgrp, false); + spin_unlock_irqrestore(&cgroup_rstat_lock, flags); +} + +/** + * cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold + * @cgrp: target cgroup + * + * Flush stats in @cgrp's subtree and prevent further flushes. Must be + * paired with cgroup_rstat_flush_release(). + * + * This function may block. + */ +void cgroup_rstat_flush_hold(struct cgroup *cgrp) + __acquires(&cgroup_rstat_lock) +{ + might_sleep(); + spin_lock_irq(&cgroup_rstat_lock); + cgroup_rstat_flush_locked(cgrp, true); +} + +/** + * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() + */ +void cgroup_rstat_flush_release(void) + __releases(&cgroup_rstat_lock) +{ + spin_unlock_irq(&cgroup_rstat_lock); +} + +int cgroup_rstat_init(struct cgroup *cgrp) +{ + int cpu; + + /* the root cgrp has rstat_cpu preallocated */ + if (!cgrp->rstat_cpu) { + cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu); + if (!cgrp->rstat_cpu) + return -ENOMEM; + } + + /* ->updated_children list is self terminated */ + for_each_possible_cpu(cpu) { + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + + rstatc->updated_children = cgrp; + u64_stats_init(&rstatc->bsync); + } + + return 0; +} + +void cgroup_rstat_exit(struct cgroup *cgrp) +{ + int cpu; + + cgroup_rstat_flush(cgrp); + + /* sanity check */ + for_each_possible_cpu(cpu) { + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + + if (WARN_ON_ONCE(rstatc->updated_children != cgrp) || + WARN_ON_ONCE(rstatc->updated_next)) + return; + } + + free_percpu(cgrp->rstat_cpu); + cgrp->rstat_cpu = NULL; +} + +void __init cgroup_rstat_boot(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); + + BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp)); +} + +/* + * Functions for cgroup basic resource statistics implemented on top of + * rstat. + */ +static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat, + struct cgroup_base_stat *src_bstat) +{ + dst_bstat->cputime.utime += src_bstat->cputime.utime; + dst_bstat->cputime.stime += src_bstat->cputime.stime; + dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; +} + +static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) +{ + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct task_cputime *last_cputime = &rstatc->last_bstat.cputime; + struct task_cputime cputime; + struct cgroup_base_stat delta; + unsigned seq; + + /* fetch the current per-cpu values */ + do { + seq = __u64_stats_fetch_begin(&rstatc->bsync); + cputime = rstatc->bstat.cputime; + } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); + + /* calculate the delta to propgate */ + delta.cputime.utime = cputime.utime - last_cputime->utime; + delta.cputime.stime = cputime.stime - last_cputime->stime; + delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - + last_cputime->sum_exec_runtime; + *last_cputime = cputime; + + /* transfer the pending stat into delta */ + cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat); + memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat)); + + /* propagate delta into the global stat and the parent's pending */ + cgroup_base_stat_accumulate(&cgrp->bstat, &delta); + if (parent) + cgroup_base_stat_accumulate(&parent->pending_bstat, &delta); +} + +static struct cgroup_rstat_cpu * +cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp) +{ + struct cgroup_rstat_cpu *rstatc; + + rstatc = get_cpu_ptr(cgrp->rstat_cpu); + u64_stats_update_begin(&rstatc->bsync); + return rstatc; +} + +static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, + struct cgroup_rstat_cpu *rstatc) +{ + u64_stats_update_end(&rstatc->bsync); + cgroup_rstat_updated(cgrp, smp_processor_id()); + put_cpu_ptr(rstatc); +} + +void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) +{ + struct cgroup_rstat_cpu *rstatc; + + rstatc = cgroup_base_stat_cputime_account_begin(cgrp); + rstatc->bstat.cputime.sum_exec_runtime += delta_exec; + cgroup_base_stat_cputime_account_end(cgrp, rstatc); +} + +void __cgroup_account_cputime_field(struct cgroup *cgrp, + enum cpu_usage_stat index, u64 delta_exec) +{ + struct cgroup_rstat_cpu *rstatc; + + rstatc = cgroup_base_stat_cputime_account_begin(cgrp); + + switch (index) { + case CPUTIME_USER: + case CPUTIME_NICE: + rstatc->bstat.cputime.utime += delta_exec; + break; + case CPUTIME_SYSTEM: + case CPUTIME_IRQ: + case CPUTIME_SOFTIRQ: + rstatc->bstat.cputime.stime += delta_exec; + break; + default: + break; + } + + cgroup_base_stat_cputime_account_end(cgrp, rstatc); +} + +void cgroup_base_stat_cputime_show(struct seq_file *seq) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + u64 usage, utime, stime; + + if (!cgroup_parent(cgrp)) + return; + + cgroup_rstat_flush_hold(cgrp); + usage = cgrp->bstat.cputime.sum_exec_runtime; + cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); + cgroup_rstat_flush_release(); + + do_div(usage, NSEC_PER_USEC); + do_div(utime, NSEC_PER_USEC); + do_div(stime, NSEC_PER_USEC); + + seq_printf(seq, "usage_usec %llu\n" + "user_usec %llu\n" + "system_usec %llu\n", + usage, utime, stime); +} diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c deleted file mode 100644 index 1e111dd455c4..000000000000 --- a/kernel/cgroup/stat.c +++ /dev/null @@ -1,338 +0,0 @@ -#include "cgroup-internal.h" - -#include <linux/sched/cputime.h> - -static DEFINE_MUTEX(cgroup_stat_mutex); -static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock); - -static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu) -{ - return per_cpu_ptr(cgrp->cpu_stat, cpu); -} - -/** - * cgroup_cpu_stat_updated - keep track of updated cpu_stat - * @cgrp: target cgroup - * @cpu: cpu on which cpu_stat was updated - * - * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching - * cpu_stat->updated_children list. See the comment on top of - * cgroup_cpu_stat definition for details. - */ -static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu) -{ - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); - struct cgroup *parent; - unsigned long flags; - - /* - * Speculative already-on-list test. This may race leading to - * temporary inaccuracies, which is fine. - * - * Because @parent's updated_children is terminated with @parent - * instead of NULL, we can tell whether @cgrp is on the list by - * testing the next pointer for NULL. - */ - if (cgroup_cpu_stat(cgrp, cpu)->updated_next) - return; - - raw_spin_lock_irqsave(cpu_lock, flags); - - /* put @cgrp and all ancestors on the corresponding updated lists */ - for (parent = cgroup_parent(cgrp); parent; - cgrp = parent, parent = cgroup_parent(cgrp)) { - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); - - /* - * Both additions and removals are bottom-up. If a cgroup - * is already in the tree, all ancestors are. - */ - if (cstat->updated_next) - break; - - cstat->updated_next = pcstat->updated_children; - pcstat->updated_children = cgrp; - } - - raw_spin_unlock_irqrestore(cpu_lock, flags); -} - -/** - * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree - * @pos: current position - * @root: root of the tree to traversal - * @cpu: target cpu - * - * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts - * the traversal and %NULL return indicates the end. During traversal, - * each returned cgroup is unlinked from the tree. Must be called with the - * matching cgroup_cpu_stat_lock held. - * - * The only ordering guarantee is that, for a parent and a child pair - * covered by a given traversal, if a child is visited, its parent is - * guaranteed to be visited afterwards. - */ -static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos, - struct cgroup *root, int cpu) -{ - struct cgroup_cpu_stat *cstat; - struct cgroup *parent; - - if (pos == root) - return NULL; - - /* - * We're gonna walk down to the first leaf and visit/remove it. We - * can pick whatever unvisited node as the starting point. - */ - if (!pos) - pos = root; - else - pos = cgroup_parent(pos); - - /* walk down to the first leaf */ - while (true) { - cstat = cgroup_cpu_stat(pos, cpu); - if (cstat->updated_children == pos) - break; - pos = cstat->updated_children; - } - - /* - * Unlink @pos from the tree. As the updated_children list is - * singly linked, we have to walk it to find the removal point. - * However, due to the way we traverse, @pos will be the first - * child in most cases. The only exception is @root. - */ - parent = cgroup_parent(pos); - if (parent && cstat->updated_next) { - struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); - struct cgroup_cpu_stat *ncstat; - struct cgroup **nextp; - - nextp = &pcstat->updated_children; - while (true) { - ncstat = cgroup_cpu_stat(*nextp, cpu); - if (*nextp == pos) - break; - - WARN_ON_ONCE(*nextp == parent); - nextp = &ncstat->updated_next; - } - - *nextp = cstat->updated_next; - cstat->updated_next = NULL; - } - - return pos; -} - -static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat, - struct cgroup_stat *src_stat) -{ - dst_stat->cputime.utime += src_stat->cputime.utime; - dst_stat->cputime.stime += src_stat->cputime.stime; - dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime; -} - -static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu) -{ - struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - struct task_cputime *last_cputime = &cstat->last_cputime; - struct task_cputime cputime; - struct cgroup_stat delta; - unsigned seq; - - lockdep_assert_held(&cgroup_stat_mutex); - - /* fetch the current per-cpu values */ - do { - seq = __u64_stats_fetch_begin(&cstat->sync); - cputime = cstat->cputime; - } while (__u64_stats_fetch_retry(&cstat->sync, seq)); - - /* accumulate the deltas to propgate */ - delta.cputime.utime = cputime.utime - last_cputime->utime; - delta.cputime.stime = cputime.stime - last_cputime->stime; - delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - - last_cputime->sum_exec_runtime; - *last_cputime = cputime; - - /* transfer the pending stat into delta */ - cgroup_stat_accumulate(&delta, &cgrp->pending_stat); - memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat)); - - /* propagate delta into the global stat and the parent's pending */ - cgroup_stat_accumulate(&cgrp->stat, &delta); - if (parent) - cgroup_stat_accumulate(&parent->pending_stat, &delta); -} - -/* see cgroup_stat_flush() */ -static void cgroup_stat_flush_locked(struct cgroup *cgrp) -{ - int cpu; - - lockdep_assert_held(&cgroup_stat_mutex); - - for_each_possible_cpu(cpu) { - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); - struct cgroup *pos = NULL; - - raw_spin_lock_irq(cpu_lock); - while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu))) - cgroup_cpu_stat_flush_one(pos, cpu); - raw_spin_unlock_irq(cpu_lock); - } -} - -/** - * cgroup_stat_flush - flush stats in @cgrp's subtree - * @cgrp: target cgroup - * - * Collect all per-cpu stats in @cgrp's subtree into the global counters - * and propagate them upwards. After this function returns, all cgroups in - * the subtree have up-to-date ->stat. - * - * This also gets all cgroups in the subtree including @cgrp off the - * ->updated_children lists. - */ -void cgroup_stat_flush(struct cgroup *cgrp) -{ - mutex_lock(&cgroup_stat_mutex); - cgroup_stat_flush_locked(cgrp); - mutex_unlock(&cgroup_stat_mutex); -} - -static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp) -{ - struct cgroup_cpu_stat *cstat; - - cstat = get_cpu_ptr(cgrp->cpu_stat); - u64_stats_update_begin(&cstat->sync); - return cstat; -} - -static void cgroup_cpu_stat_account_end(struct cgroup *cgrp, - struct cgroup_cpu_stat *cstat) -{ - u64_stats_update_end(&cstat->sync); - cgroup_cpu_stat_updated(cgrp, smp_processor_id()); - put_cpu_ptr(cstat); -} - -void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) -{ - struct cgroup_cpu_stat *cstat; - - cstat = cgroup_cpu_stat_account_begin(cgrp); - cstat->cputime.sum_exec_runtime += delta_exec; - cgroup_cpu_stat_account_end(cgrp, cstat); -} - -void __cgroup_account_cputime_field(struct cgroup *cgrp, - enum cpu_usage_stat index, u64 delta_exec) -{ - struct cgroup_cpu_stat *cstat; - - cstat = cgroup_cpu_stat_account_begin(cgrp); - - switch (index) { - case CPUTIME_USER: - case CPUTIME_NICE: - cstat->cputime.utime += delta_exec; - break; - case CPUTIME_SYSTEM: - case CPUTIME_IRQ: - case CPUTIME_SOFTIRQ: - cstat->cputime.stime += delta_exec; - break; - default: - break; - } - - cgroup_cpu_stat_account_end(cgrp, cstat); -} - -void cgroup_stat_show_cputime(struct seq_file *seq) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - u64 usage, utime, stime; - - if (!cgroup_parent(cgrp)) - return; - - mutex_lock(&cgroup_stat_mutex); - - cgroup_stat_flush_locked(cgrp); - - usage = cgrp->stat.cputime.sum_exec_runtime; - cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime, - &utime, &stime); - - mutex_unlock(&cgroup_stat_mutex); - - do_div(usage, NSEC_PER_USEC); - do_div(utime, NSEC_PER_USEC); - do_div(stime, NSEC_PER_USEC); - - seq_printf(seq, "usage_usec %llu\n" - "user_usec %llu\n" - "system_usec %llu\n", - usage, utime, stime); -} - -int cgroup_stat_init(struct cgroup *cgrp) -{ - int cpu; - - /* the root cgrp has cpu_stat preallocated */ - if (!cgrp->cpu_stat) { - cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat); - if (!cgrp->cpu_stat) - return -ENOMEM; - } - - /* ->updated_children list is self terminated */ - for_each_possible_cpu(cpu) { - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - - cstat->updated_children = cgrp; - u64_stats_init(&cstat->sync); - } - - prev_cputime_init(&cgrp->stat.prev_cputime); - - return 0; -} - -void cgroup_stat_exit(struct cgroup *cgrp) -{ - int cpu; - - cgroup_stat_flush(cgrp); - - /* sanity check */ - for_each_possible_cpu(cpu) { - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - - if (WARN_ON_ONCE(cstat->updated_children != cgrp) || - WARN_ON_ONCE(cstat->updated_next)) - return; - } - - free_percpu(cgrp->cpu_stat); - cgrp->cpu_stat = NULL; -} - -void __init cgroup_stat_boot(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu)); - - BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp)); -} diff --git a/kernel/compat.c b/kernel/compat.c index 6d21894806b4..702aa846ddac 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -34,6 +34,7 @@ int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp) { struct compat_timex tx32; + memset(txc, 0, sizeof(struct timex)); if (copy_from_user(&tx32, utp, sizeof(struct compat_timex))) return -EFAULT; @@ -120,50 +121,6 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -static int __compat_get_timespec64(struct timespec64 *ts64, - const struct compat_timespec __user *cts) -{ - struct compat_timespec ts; - int ret; - - ret = copy_from_user(&ts, cts, sizeof(ts)); - if (ret) - return -EFAULT; - - ts64->tv_sec = ts.tv_sec; - ts64->tv_nsec = ts.tv_nsec; - - return 0; -} - -static int __compat_put_timespec64(const struct timespec64 *ts64, - struct compat_timespec __user *cts) -{ - struct compat_timespec ts = { - .tv_sec = ts64->tv_sec, - .tv_nsec = ts64->tv_nsec - }; - return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; -} - -int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_get_timespec64(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_get_timespec64); - -int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_put_timespec64(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_put_timespec64); - int compat_get_timeval(struct timeval *tv, const void __user *utv) { if (COMPAT_USE_64BIT_TIME) @@ -367,6 +324,14 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, return ret; } +/* Todo: Delete these extern declarations when get/put_compat_itimerspec64() + * are moved to kernel/time/time.c . + */ +extern int __compat_get_timespec64(struct timespec64 *ts64, + const struct compat_timespec __user *cts); +extern int __compat_put_timespec64(const struct timespec64 *ts64, + struct compat_timespec __user *cts); + int get_compat_itimerspec64(struct itimerspec64 *its, const struct compat_itimerspec __user *uits) { diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 946fb92418f7..81e9af7dcec2 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -12,7 +12,7 @@ CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 -CONFIG_CC_STACKPROTECTOR_STRONG=y +CONFIG_STACKPROTECTOR_STRONG=y CONFIG_COMPACTION=y CONFIG_CPU_SW_DOMAIN_PAN=y CONFIG_DM_CRYPT=y diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index 9bfdffc100da..7fa0c4ae6394 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -10,7 +10,3 @@ CONFIG_OPTIMIZE_INLINING=y # CONFIG_SLAB is not set # CONFIG_SLUB is not set CONFIG_SLOB=y -CONFIG_CC_STACKPROTECTOR_NONE=y -# CONFIG_CC_STACKPROTECTOR_REGULAR is not set -# CONFIG_CC_STACKPROTECTOR_STRONG is not set -# CONFIG_CC_STACKPROTECTOR_AUTO is not set diff --git a/kernel/crash_core.c b/kernel/crash_core.c index f7674d676889..b66aced5e8c2 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -460,6 +460,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_hwpoison); #endif VMCOREINFO_NUMBER(PG_head_mask); +#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index e405677ee08d..2ddfce8f1e8f 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -691,7 +691,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) } if (!s->usable) return KDB_NOTIMP; - s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); + s->command = kcalloc(s->count + 1, sizeof(*(s->command)), GFP_KDB); if (!s->command) { kdb_printf("Could not allocate new kdb_defcmd table for %s\n", cmdstr); @@ -729,8 +729,8 @@ static int kdb_defcmd(int argc, const char **argv) kdb_printf("Command only available during kdb_init()\n"); return KDB_NOTIMP; } - defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), - GFP_KDB); + defcmd_set = kmalloc_array(defcmd_set_count + 1, sizeof(*defcmd_set), + GFP_KDB); if (!defcmd_set) goto fail_defcmd; memcpy(defcmd_set, save_defcmd_set, @@ -2706,8 +2706,11 @@ int kdb_register_flags(char *cmd, } if (i >= kdb_max_commands) { - kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX + - kdb_command_extend) * sizeof(*new), GFP_KDB); + kdbtab_t *new = kmalloc_array(kdb_max_commands - + KDB_BASE_CMD_MAX + + kdb_command_extend, + sizeof(*new), + GFP_KDB); if (!new) { kdb_printf("Could not allocate new kdb_command " "table\n"); diff --git a/kernel/delayacct.c b/kernel/delayacct.c index e2764d767f18..ca8ac2824f0b 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -44,23 +44,24 @@ void __delayacct_tsk_init(struct task_struct *tsk) { tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL); if (tsk->delays) - spin_lock_init(&tsk->delays->lock); + raw_spin_lock_init(&tsk->delays->lock); } /* * Finish delay accounting for a statistic using its timestamps (@start), * accumalator (@total) and @count */ -static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count) +static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, + u32 *count) { s64 ns = ktime_get_ns() - *start; unsigned long flags; if (ns > 0) { - spin_lock_irqsave(lock, flags); + raw_spin_lock_irqsave(lock, flags); *total += ns; (*count)++; - spin_unlock_irqrestore(lock, flags); + raw_spin_unlock_irqrestore(lock, flags); } } @@ -127,7 +128,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ - spin_lock_irqsave(&tsk->delays->lock, flags); + raw_spin_lock_irqsave(&tsk->delays->lock, flags); tmp = d->blkio_delay_total + tsk->delays->blkio_delay; d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; tmp = d->swapin_delay_total + tsk->delays->swapin_delay; @@ -137,7 +138,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; - spin_unlock_irqrestore(&tsk->delays->lock, flags); + raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; } @@ -147,10 +148,10 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) __u64 ret; unsigned long flags; - spin_lock_irqsave(&tsk->delays->lock, flags); + raw_spin_lock_irqsave(&tsk->delays->lock, flags); ret = nsec_to_clock_t(tsk->delays->blkio_delay + tsk->delays->swapin_delay); - spin_unlock_irqrestore(&tsk->delays->lock, flags); + raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return ret; } diff --git a/kernel/dma.c b/kernel/dma.c index 3506fc34a712..40f152936316 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -135,21 +135,9 @@ static int proc_dma_show(struct seq_file *m, void *v) } #endif /* MAX_DMA_CHANNELS */ -static int proc_dma_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_dma_show, NULL); -} - -static const struct file_operations proc_dma_operations = { - .open = proc_dma_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_dma_init(void) { - proc_create("dma", 0, NULL, &proc_dma_operations); + proc_create_single("dma", 0, NULL, proc_dma_show); return 0; } diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 772a43fea825..c187aa3df3c8 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -119,23 +119,20 @@ int get_callchain_buffers(int event_max_stack) goto exit; } - if (count > 1) { - /* If the allocation failed, give up */ - if (!callchain_cpus_entries) - err = -ENOMEM; - /* - * If requesting per event more than the global cap, - * return a different error to help userspace figure - * this out. - * - * And also do it here so that we have &callchain_mutex held. - */ - if (event_max_stack > sysctl_perf_event_max_stack) - err = -EOVERFLOW; + /* + * If requesting per event more than the global cap, + * return a different error to help userspace figure + * this out. + * + * And also do it here so that we have &callchain_mutex held. + */ + if (event_max_stack > sysctl_perf_event_max_stack) { + err = -EOVERFLOW; goto exit; } - err = alloc_callchain_buffers(); + if (count == 1) + err = alloc_callchain_buffers(); exit: if (err) atomic_dec(&nr_callchain_events); diff --git a/kernel/events/core.c b/kernel/events/core.c index 2d5fe26551f8..80cca2b30c4f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5120,6 +5120,8 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd, switch (_IOC_NR(cmd)) { case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): case _IOC_NR(PERF_EVENT_IOC_ID): + case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF): + case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES): /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { cmd &= ~IOCSIZE_MASK; @@ -6668,7 +6670,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { - if (filter->inode) { + if (filter->path.dentry) { event->addr_filters_offs[count] = 0; restart++; } @@ -7333,7 +7335,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { - if (filter->inode != file_inode(file)) + if (d_inode(filter->path.dentry) != file_inode(file)) return false; if (filter->offset > offset + size) @@ -7587,6 +7589,10 @@ static void perf_event_switch(struct task_struct *task, }, }; + if (!sched_in && task->state == TASK_RUNNING) + switch_event.event_id.header.misc |= + PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; + perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); @@ -8682,8 +8688,7 @@ static void free_filters_list(struct list_head *filters) struct perf_addr_filter *filter, *iter; list_for_each_entry_safe(filter, iter, filters, entry) { - if (filter->inode) - iput(filter->inode); + path_put(&filter->path); list_del(&filter->entry); kfree(filter); } @@ -8780,7 +8785,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) * Adjust base offset if the filter is associated to a binary * that needs to be mapped: */ - if (filter->inode) + if (filter->path.dentry) event->addr_filters_offs[count] = perf_addr_filter_apply(filter, mm); @@ -8854,7 +8859,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, { struct perf_addr_filter *filter = NULL; char *start, *orig, *filename = NULL; - struct path path; substring_t args[MAX_OPT_ARGS]; int state = IF_STATE_ACTION, token; unsigned int kernel = 0; @@ -8967,19 +8971,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, goto fail_free_name; /* look up the path and grab its inode */ - ret = kern_path(filename, LOOKUP_FOLLOW, &path); + ret = kern_path(filename, LOOKUP_FOLLOW, + &filter->path); if (ret) goto fail_free_name; - filter->inode = igrab(d_inode(path.dentry)); - path_put(&path); kfree(filename); filename = NULL; ret = -EINVAL; - if (!filter->inode || - !S_ISREG(filter->inode->i_mode)) - /* free_filters_list() will iput() */ + if (!filter->path.dentry || + !S_ISREG(d_inode(filter->path.dentry) + ->i_mode)) goto fail; event->addr_filters.nr_file_filters++; @@ -10205,9 +10208,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, * __u16 sample size limit. */ if (attr->sample_stack_user >= USHRT_MAX) - ret = -EINVAL; + return -EINVAL; else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) - ret = -EINVAL; + return -EINVAL; } if (!attr->sample_max_stack) @@ -10517,19 +10520,20 @@ SYSCALL_DEFINE5(perf_event_open, if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; - if (group_leader && - (is_software_event(event) != is_software_event(group_leader))) { - if (is_software_event(event)) { + if (group_leader) { + if (is_software_event(event) && + !in_software_context(group_leader)) { /* - * If event and group_leader are not both a software - * event, and event is, then group leader is not. + * If the event is a sw event, but the group_leader + * is on hw context. * - * Allow the addition of software events to !software - * groups, this is safe because software events never - * fail to schedule. + * Allow the addition of software events to hw + * groups, this is safe because software events + * never fail to schedule. */ - pmu = group_leader->pmu; - } else if (is_software_event(group_leader) && + pmu = group_leader->ctx->pmu; + } else if (!is_software_event(event) && + is_software_event(group_leader) && (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { /* * In case the group is a pure software group, and we @@ -11208,6 +11212,14 @@ struct file *perf_event_get(unsigned int fd) return file; } +const struct perf_event *perf_get_event(struct file *file) +{ + if (file->f_op != &perf_fops) + return ERR_PTR(-EINVAL); + + return file->private_data; +} + const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { if (!event) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 6c6b3c48db71..045a37e9ddee 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/circ_buf.h> #include <linux/poll.h> +#include <linux/nospec.h> #include "internal.h" @@ -613,7 +614,8 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, } } - rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node); + rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL, + node); if (!rb->aux_pages) return -ENOMEM; @@ -867,8 +869,10 @@ perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) return NULL; /* AUX space */ - if (pgoff >= rb->aux_pgoff) - return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]); + if (pgoff >= rb->aux_pgoff) { + int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages); + return virt_to_page(rb->aux_pages[aux_pgoff]); + } } return __perf_mmap_to_page(rb, pgoff); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ce6848e46e94..ccc579a7d32e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -491,7 +491,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) if (!uprobe) return NULL; - uprobe->inode = igrab(inode); + uprobe->inode = inode; uprobe->offset = offset; init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); @@ -502,7 +502,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) if (cur_uprobe) { kfree(uprobe); uprobe = cur_uprobe; - iput(inode); } return uprobe; @@ -701,7 +700,6 @@ static void delete_uprobe(struct uprobe *uprobe) rb_erase(&uprobe->rb_node, &uprobes_tree); spin_unlock(&uprobes_treelock); RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ - iput(uprobe->inode); put_uprobe(uprobe); } @@ -873,7 +871,8 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe - * unregisters. + * unregisters. Caller of uprobe_register() is required to keep @inode + * (and the containing mount) referenced. * * Return errno if it cannot successully install probes * else return 0 (success) @@ -1185,7 +1184,8 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (unlikely(!area)) goto out; - area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); + area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long), + GFP_KERNEL); if (!area->bitmap) goto free_area; diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index a5697119290e..33f07c5f2515 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -27,21 +27,9 @@ static int execdomains_proc_show(struct seq_file *m, void *v) return 0; } -static int execdomains_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, execdomains_proc_show, NULL); -} - -static const struct file_operations execdomains_proc_fops = { - .open = execdomains_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_execdomains_init(void) { - proc_create("execdomains", 0, NULL, &execdomains_proc_fops); + proc_create_single("execdomains", 0, NULL, execdomains_proc_show); return 0; } module_init(proc_execdomains_init); diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 1d5632d8bbcc..5349c91c2298 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -258,7 +258,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, /* cut off if it is too long */ if (count > KSYM_NAME_LEN) count = KSYM_NAME_LEN; - buf = kmalloc(sizeof(char) * (count + 1), GFP_KERNEL); + buf = kmalloc(count + 1, GFP_KERNEL); if (!buf) return -ENOMEM; diff --git a/kernel/fork.c b/kernel/fork.c index 242c8c93d285..9440d61b925c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -216,10 +216,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; -#ifdef CONFIG_DEBUG_KMEMLEAK /* Clear stale pointers from reused stack. */ memset(s->addr, 0, THREAD_SIZE); -#endif + tsk->stack_vm_area = s; return s->addr; } @@ -441,6 +440,14 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, continue; } charge = 0; + /* + * Don't duplicate many vmas if we've been oom-killed (for + * example) + */ + if (fatal_signal_pending(current)) { + retval = -EINTR; + goto out; + } if (mpnt->vm_flags & VM_ACCOUNT) { unsigned long len = vma_pages(mpnt); @@ -812,7 +819,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) clear_tsk_need_resched(tsk); set_task_stack_end_magic(tsk); -#ifdef CONFIG_CC_STACKPROTECTOR +#ifdef CONFIG_STACKPROTECTOR tsk->stack_canary = get_random_canary(); #endif @@ -900,6 +907,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + spin_lock_init(&mm->arg_lock); mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); @@ -1713,7 +1721,7 @@ static __latent_entropy struct task_struct *copy_process( p->start_time = ktime_get_ns(); p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; - p->audit_context = NULL; + audit_set_context(p, NULL); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1900,6 +1908,8 @@ static __latent_entropy struct task_struct *copy_process( */ copy_seccomp(p); + rseq_fork(p, clone_flags); + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 1276aabaab55..1e3823fa799b 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -53,23 +53,16 @@ config GCOV_PROFILE_ALL choice prompt "Specify GCOV format" depends on GCOV_KERNEL - default GCOV_FORMAT_AUTODETECT ---help--- - The gcov format is usually determined by the GCC version, but there are + The gcov format is usually determined by the GCC version, and the + default is chosen according to your GCC version. However, there are exceptions where format changes are integrated in lower-version GCCs. - In such a case use this option to adjust the format used in the kernel - accordingly. - - If unsure, choose "Autodetect". - -config GCOV_FORMAT_AUTODETECT - bool "Autodetect" - ---help--- - Select this option to use the format that corresponds to your GCC - version. + In such a case, change this option to adjust the format used in the + kernel accordingly. config GCOV_FORMAT_3_4 bool "GCC 3.4 format" + depends on CC_IS_GCC && GCC_VERSION < 40700 ---help--- Select this option to use the format defined by GCC 3.4. diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index c6c50e5c680e..ff06d64df397 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -4,5 +4,3 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-y := base.o fs.o obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o -obj-$(CONFIG_GCOV_FORMAT_AUTODETECT) += $(call cc-ifversion, -lt, 0407, \ - gcc_3_4.o, gcc_4_7.o) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 751593ed7c0b..32b479468e4d 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -44,6 +44,7 @@ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; +static bool hung_task_call_panic; static struct task_struct *watchdog_task; @@ -127,10 +128,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) touch_nmi_watchdog(); if (sysctl_hung_task_panic) { - if (hung_task_show_lock) - debug_show_all_locks(); - trigger_all_cpu_backtrace(); - panic("hung_task: blocked tasks"); + hung_task_show_lock = true; + hung_task_call_panic = true; } } @@ -193,6 +192,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); if (hung_task_show_lock) debug_show_all_locks(); + if (hung_task_call_panic) { + trigger_all_cpu_backtrace(); + panic("hung_task: blocked tasks"); + } } static long hung_timeout_jiffies(unsigned long last_checked, diff --git a/kernel/iomem.c b/kernel/iomem.c new file mode 100644 index 000000000000..f7525e14ebc6 --- /dev/null +++ b/kernel/iomem.c @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/device.h> +#include <linux/types.h> +#include <linux/io.h> +#include <linux/mm.h> + +#ifndef ioremap_cache +/* temporary while we convert existing ioremap_cache users to memremap */ +__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) +{ + return ioremap(offset, size); +} +#endif + +#ifndef arch_memremap_wb +static void *arch_memremap_wb(resource_size_t offset, unsigned long size) +{ + return (__force void *)ioremap_cache(offset, size); +} +#endif + +#ifndef arch_memremap_can_ram_remap +static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + return true; +} +#endif + +static void *try_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + unsigned long pfn = PHYS_PFN(offset); + + /* In the simple case just return the existing linear address */ + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) && + arch_memremap_can_ram_remap(offset, size, flags)) + return __va(offset); + + return NULL; /* fallback to arch_memremap_wb */ +} + +/** + * memremap() - remap an iomem_resource as cacheable memory + * @offset: iomem resource start address + * @size: size of remap + * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC, + * MEMREMAP_ENC, MEMREMAP_DEC + * + * memremap() is "ioremap" for cases where it is known that the resource + * being mapped does not have i/o side effects and the __iomem + * annotation is not applicable. In the case of multiple flags, the different + * mapping types will be attempted in the order listed below until one of + * them succeeds. + * + * MEMREMAP_WB - matches the default mapping for System RAM on + * the architecture. This is usually a read-allocate write-back cache. + * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM + * memremap() will bypass establishing a new mapping and instead return + * a pointer into the direct map. + * + * MEMREMAP_WT - establish a mapping whereby writes either bypass the + * cache or are written through to memory and never exist in a + * cache-dirty state with respect to program visibility. Attempts to + * map System RAM with this mapping type will fail. + * + * MEMREMAP_WC - establish a writecombine mapping, whereby writes may + * be coalesced together (e.g. in the CPU's write buffers), but is otherwise + * uncached. Attempts to map System RAM with this mapping type will fail. + */ +void *memremap(resource_size_t offset, size_t size, unsigned long flags) +{ + int is_ram = region_intersects(offset, size, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); + void *addr = NULL; + + if (!flags) + return NULL; + + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + /* Try all mapping types requested until one returns non-NULL */ + if (flags & MEMREMAP_WB) { + /* + * MEMREMAP_WB is special in that it can be satisifed + * from the direct map. Some archs depend on the + * capability of memremap() to autodetect cases where + * the requested range is potentially in System RAM. + */ + if (is_ram == REGION_INTERSECTS) + addr = try_ram_remap(offset, size, flags); + if (!addr) + addr = arch_memremap_wb(offset, size); + } + + /* + * If we don't have a mapping yet and other request flags are + * present then we will be attempting to establish a new virtual + * address mapping. Enforce that this mapping is not aliasing + * System RAM. + */ + if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) { + WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + if (!addr && (flags & MEMREMAP_WT)) + addr = ioremap_wt(offset, size); + + if (!addr && (flags & MEMREMAP_WC)) + addr = ioremap_wc(offset, size); + + return addr; +} +EXPORT_SYMBOL(memremap); + +void memunmap(void *addr) +{ + if (is_vmalloc_addr(addr)) + iounmap((void __iomem *) addr); +} +EXPORT_SYMBOL(memunmap); + +static void devm_memremap_release(struct device *dev, void *res) +{ + memunmap(*(void **)res); +} + +static int devm_memremap_match(struct device *dev, void *res, void *match_data) +{ + return *(void **)res == match_data; +} + +void *devm_memremap(struct device *dev, resource_size_t offset, + size_t size, unsigned long flags) +{ + void **ptr, *addr; + + ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL, + dev_to_node(dev)); + if (!ptr) + return ERR_PTR(-ENOMEM); + + addr = memremap(offset, size, flags); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else { + devres_free(ptr); + return ERR_PTR(-ENXIO); + } + + return addr; +} +EXPORT_SYMBOL(devm_memremap); + +void devm_memunmap(struct device *dev, void *addr) +{ + WARN_ON(devres_release(dev, devm_memremap_release, + devm_memremap_match, addr)); +} +EXPORT_SYMBOL(devm_memunmap); diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index fc4f361a86bb..dd20d0d528d4 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -1,11 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Copyright (C) 2017 Bartosz Golaszewski <brgl@bgdev.pl> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. + * Copyright (C) 2017-2018 Bartosz Golaszewski <brgl@bgdev.pl> */ #include <linux/slab.h> diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e3336d904f64..daeabd791d58 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -24,6 +24,7 @@ #ifdef CONFIG_IRQ_FORCED_THREADING __read_mostly bool force_irqthreads; +EXPORT_SYMBOL_GPL(force_irqthreads); static int __init setup_forced_irqthreads(char *arg) { @@ -204,6 +205,39 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } +#ifdef CONFIG_GENERIC_PENDING_IRQ +static inline int irq_set_affinity_pending(struct irq_data *data, + const struct cpumask *dest) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + irqd_set_move_pending(data); + irq_copy_pending(desc, dest); + return 0; +} +#else +static inline int irq_set_affinity_pending(struct irq_data *data, + const struct cpumask *dest) +{ + return -EBUSY; +} +#endif + +static int irq_try_set_affinity(struct irq_data *data, + const struct cpumask *dest, bool force) +{ + int ret = irq_do_set_affinity(data, dest, force); + + /* + * In case that the underlying vector management is busy and the + * architecture supports the generic pending mechanism then utilize + * this to avoid returning an error to user space. + */ + if (ret == -EBUSY && !force) + ret = irq_set_affinity_pending(data, dest); + return ret; +} + int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -214,8 +248,8 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (!chip || !chip->irq_set_affinity) return -EINVAL; - if (irq_can_move_pcntxt(data)) { - ret = irq_do_set_affinity(data, mask, force); + if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { + ret = irq_try_set_affinity(data, mask, force); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 86ae0eb80b53..def48589ea48 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -38,17 +38,18 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear) void irq_move_masked_irq(struct irq_data *idata) { struct irq_desc *desc = irq_data_to_desc(idata); - struct irq_chip *chip = desc->irq_data.chip; + struct irq_data *data = &desc->irq_data; + struct irq_chip *chip = data->chip; - if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) + if (likely(!irqd_is_setaffinity_pending(data))) return; - irqd_clr_move_pending(&desc->irq_data); + irqd_clr_move_pending(data); /* * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. */ - if (irqd_is_per_cpu(&desc->irq_data)) { + if (irqd_is_per_cpu(data)) { WARN_ON(1); return; } @@ -73,13 +74,24 @@ void irq_move_masked_irq(struct irq_data *idata) * For correct operation this depends on the caller * masking the irqs. */ - if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) - irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false); - + if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) { + int ret; + + ret = irq_do_set_affinity(data, desc->pending_mask, false); + /* + * If the there is a cleanup pending in the underlying + * vector management, reschedule the move for the next + * interrupt. Leave desc->pending_mask intact. + */ + if (ret == -EBUSY) { + irqd_set_move_pending(data); + return; + } + } cpumask_clear(desc->pending_mask); } -void irq_move_irq(struct irq_data *idata) +void __irq_move_irq(struct irq_data *idata) { bool masked; @@ -90,9 +102,6 @@ void irq_move_irq(struct irq_data *idata) */ idata = irq_desc_get_irq_data(irq_data_to_desc(idata)); - if (likely(!irqd_is_setaffinity_pending(idata))) - return; - if (unlikely(irqd_irq_disabled(idata))) return; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 2a8571f72b17..4ca2fd46645d 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -76,6 +76,19 @@ static inline void irq_chip_write_msi_msg(struct irq_data *data, data->chip->irq_write_msi_msg(data, msg); } +static void msi_check_level(struct irq_domain *domain, struct msi_msg *msg) +{ + struct msi_domain_info *info = domain->host_data; + + /* + * If the MSI provider has messed with the second message and + * not advertized that it is level-capable, signal the breakage. + */ + WARN_ON(!((info->flags & MSI_FLAG_LEVEL_CAPABLE) && + (info->chip->flags & IRQCHIP_SUPPORTS_LEVEL_MSI)) && + (msg[1].address_lo || msg[1].address_hi || msg[1].data)); +} + /** * msi_domain_set_affinity - Generic affinity setter function for MSI domains * @irq_data: The irq data associated to the interrupt @@ -89,13 +102,14 @@ int msi_domain_set_affinity(struct irq_data *irq_data, const struct cpumask *mask, bool force) { struct irq_data *parent = irq_data->parent_data; - struct msi_msg msg; + struct msi_msg msg[2] = { [1] = { }, }; int ret; ret = parent->chip->irq_set_affinity(parent, mask, force); if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { - BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); - irq_chip_write_msi_msg(irq_data, &msg); + BUG_ON(irq_chip_compose_msi_msg(irq_data, msg)); + msi_check_level(irq_data->domain, msg); + irq_chip_write_msi_msg(irq_data, msg); } return ret; @@ -104,20 +118,21 @@ int msi_domain_set_affinity(struct irq_data *irq_data, static int msi_domain_activate(struct irq_domain *domain, struct irq_data *irq_data, bool early) { - struct msi_msg msg; + struct msi_msg msg[2] = { [1] = { }, }; - BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); - irq_chip_write_msi_msg(irq_data, &msg); + BUG_ON(irq_chip_compose_msi_msg(irq_data, msg)); + msi_check_level(irq_data->domain, msg); + irq_chip_write_msi_msg(irq_data, msg); return 0; } static void msi_domain_deactivate(struct irq_domain *domain, struct irq_data *irq_data) { - struct msi_msg msg; + struct msi_msg msg[2]; - memset(&msg, 0, sizeof(msg)); - irq_chip_write_msi_msg(irq_data, &msg); + memset(msg, 0, sizeof(msg)); + irq_chip_write_msi_msg(irq_data, msg); } static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 7cb091d81d91..37eda10f5c36 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -185,11 +185,6 @@ static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode)); } -static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode)); -} - static const struct file_operations irq_affinity_proc_fops = { .open = irq_affinity_proc_open, .read = seq_read, @@ -198,13 +193,6 @@ static const struct file_operations irq_affinity_proc_fops = { .write = irq_affinity_proc_write, }; -static const struct file_operations irq_affinity_hint_proc_fops = { - .open = irq_affinity_hint_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static const struct file_operations irq_affinity_list_proc_fops = { .open = irq_affinity_list_proc_open, .read = seq_read, @@ -223,32 +211,6 @@ static int irq_effective_aff_list_proc_show(struct seq_file *m, void *v) { return show_irq_affinity(EFFECTIVE_LIST, m); } - -static int irq_effective_aff_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_effective_aff_proc_show, PDE_DATA(inode)); -} - -static int irq_effective_aff_list_proc_open(struct inode *inode, - struct file *file) -{ - return single_open(file, irq_effective_aff_list_proc_show, - PDE_DATA(inode)); -} - -static const struct file_operations irq_effective_aff_proc_fops = { - .open = irq_effective_aff_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations irq_effective_aff_list_proc_fops = { - .open = irq_effective_aff_list_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif static int default_affinity_show(struct seq_file *m, void *v) @@ -313,18 +275,6 @@ static int irq_node_proc_show(struct seq_file *m, void *v) seq_printf(m, "%d\n", irq_desc_get_node(desc)); return 0; } - -static int irq_node_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_node_proc_show, PDE_DATA(inode)); -} - -static const struct file_operations irq_node_proc_fops = { - .open = irq_node_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif static int irq_spurious_proc_show(struct seq_file *m, void *v) @@ -337,18 +287,6 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) return 0; } -static int irq_spurious_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_spurious_proc_show, PDE_DATA(inode)); -} - -static const struct file_operations irq_spurious_proc_fops = { - .open = irq_spurious_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - #define MAX_NAMELEN 128 static int name_unique(unsigned int irq, struct irqaction *new_action) @@ -421,24 +359,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) &irq_affinity_proc_fops, irqp); /* create /proc/irq/<irq>/affinity_hint */ - proc_create_data("affinity_hint", 0444, desc->dir, - &irq_affinity_hint_proc_fops, irqp); + proc_create_single_data("affinity_hint", 0444, desc->dir, + irq_affinity_hint_proc_show, irqp); /* create /proc/irq/<irq>/smp_affinity_list */ proc_create_data("smp_affinity_list", 0644, desc->dir, &irq_affinity_list_proc_fops, irqp); - proc_create_data("node", 0444, desc->dir, - &irq_node_proc_fops, irqp); + proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, + irqp); # ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK - proc_create_data("effective_affinity", 0444, desc->dir, - &irq_effective_aff_proc_fops, irqp); - proc_create_data("effective_affinity_list", 0444, desc->dir, - &irq_effective_aff_list_proc_fops, irqp); + proc_create_single_data("effective_affinity", 0444, desc->dir, + irq_effective_aff_proc_show, irqp); + proc_create_single_data("effective_affinity_list", 0444, desc->dir, + irq_effective_aff_list_proc_show, irqp); # endif #endif - proc_create_data("spurious", 0444, desc->dir, - &irq_spurious_proc_fops, (void *)(long)irq); + proc_create_single_data("spurious", 0444, desc->dir, + irq_spurious_proc_show, (void *)(long)irq); out_unlock: mutex_unlock(®ister_lock); diff --git a/kernel/kcov.c b/kernel/kcov.c index 2c16f1ab5e10..3ebd09efe72a 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -58,7 +58,7 @@ struct kcov { static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) { - enum kcov_mode mode; + unsigned int mode; /* * We are interested in code coverage as a function of a syscall inputs, @@ -241,7 +241,8 @@ static void kcov_put(struct kcov *kcov) void kcov_task_init(struct task_struct *t) { - t->kcov_mode = KCOV_MODE_DISABLED; + WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED); + barrier(); t->kcov_size = 0; t->kcov_area = NULL; t->kcov = NULL; @@ -323,6 +324,21 @@ static int kcov_close(struct inode *inode, struct file *filep) return 0; } +/* + * Fault in a lazily-faulted vmalloc area before it can be used by + * __santizer_cov_trace_pc(), to avoid recursion issues if any code on the + * vmalloc fault handling path is instrumented. + */ +static void kcov_fault_in_area(struct kcov *kcov) +{ + unsigned long stride = PAGE_SIZE / sizeof(unsigned long); + unsigned long *area = kcov->area; + unsigned long offset; + + for (offset = 0; offset < kcov->size; offset += stride) + READ_ONCE(area[offset]); +} + static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, unsigned long arg) { @@ -371,6 +387,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, #endif else return -EINVAL; + kcov_fault_in_area(kcov); /* Cache in task struct for performance. */ t->kcov_size = kcov->size; t->kcov_area = kcov->area; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 20fef1a38602..23a83a4da38a 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -829,6 +829,8 @@ static int kimage_load_normal_segment(struct kimage *image, else buf += mchunk; mbytes -= mchunk; + + cond_resched(); } out: return result; @@ -893,6 +895,8 @@ static int kimage_load_crash_segment(struct kimage *image, else buf += mchunk; mbytes -= mchunk; + + cond_resched(); } out: return result; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 75d8e7cf040e..c6a3b6851372 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -793,7 +793,7 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, * The section headers in kexec_purgatory are read-only. In order to * have them modifiable make a temporary copy. */ - sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + sechdrs = vzalloc(array_size(sizeof(Elf_Shdr), pi->ehdr->e_shnum)); if (!sechdrs) return -ENOMEM; memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 102160ff5c66..ea619021d901 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2428,7 +2428,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) struct kprobe_blacklist_entry *ent = list_entry(v, struct kprobe_blacklist_entry, list); - seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr, + seq_printf(m, "0x%px-0x%px\t%ps\n", (void *)ent->start_addr, (void *)ent->end_addr, (void *)ent->start_addr); return 0; } diff --git a/kernel/kthread.c b/kernel/kthread.c index cd50e99202b0..481951bf091d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -55,7 +55,6 @@ enum KTHREAD_BITS { KTHREAD_IS_PER_CPU = 0, KTHREAD_SHOULD_STOP, KTHREAD_SHOULD_PARK, - KTHREAD_IS_PARKED, }; static inline void set_kthread_struct(void *kthread) @@ -177,14 +176,12 @@ void *kthread_probe_data(struct task_struct *task) static void __kthread_parkme(struct kthread *self) { - __set_current_state(TASK_PARKED); - while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { - if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) - complete(&self->parked); + for (;;) { + set_current_state(TASK_PARKED); + if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) + break; schedule(); - __set_current_state(TASK_PARKED); } - clear_bit(KTHREAD_IS_PARKED, &self->flags); __set_current_state(TASK_RUNNING); } @@ -194,6 +191,11 @@ void kthread_parkme(void) } EXPORT_SYMBOL_GPL(kthread_parkme); +void kthread_park_complete(struct task_struct *k) +{ + complete_all(&to_kthread(k)->parked); +} + static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -450,22 +452,16 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); - clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* - * We clear the IS_PARKED bit here as we don't wait - * until the task has left the park code. So if we'd - * park before that happens we'd see the IS_PARKED bit - * which might be about to be cleared. + * Newly created kthread was parked when the CPU was offline. + * The binding was lost and we need to set it again. */ - if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { - /* - * Newly created kthread was parked when the CPU was offline. - * The binding was lost and we need to set it again. - */ - if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) - __kthread_bind(k, kthread->cpu, TASK_PARKED); - wake_up_state(k, TASK_PARKED); - } + if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) + __kthread_bind(k, kthread->cpu, TASK_PARKED); + + reinit_completion(&kthread->parked); + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); @@ -488,12 +484,10 @@ int kthread_park(struct task_struct *k) if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; - if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { - set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); - if (k != current) { - wake_up_process(k); - wait_for_completion(&kthread->parked); - } + set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + if (k != current) { + wake_up_process(k); + wait_for_completion(&kthread->parked); } return 0; diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index fdac27588d60..83958c814439 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c @@ -113,8 +113,10 @@ void *klp_shadow_get(void *obj, unsigned long id) } EXPORT_SYMBOL_GPL(klp_shadow_get); -static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, - size_t size, gfp_t gfp_flags, bool warn_on_exist) +static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, + size_t size, gfp_t gfp_flags, + klp_shadow_ctor_t ctor, void *ctor_data, + bool warn_on_exist) { struct klp_shadow *new_shadow; void *shadow_data; @@ -125,18 +127,15 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, if (shadow_data) goto exists; - /* Allocate a new shadow variable for use inside the lock below */ + /* + * Allocate a new shadow variable. Fill it with zeroes by default. + * More complex setting can be done by @ctor function. But it is + * called only when the buffer is really used (under klp_shadow_lock). + */ new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags); if (!new_shadow) return NULL; - new_shadow->obj = obj; - new_shadow->id = id; - - /* Initialize the shadow variable if data provided */ - if (data) - memcpy(new_shadow->data, data, size); - /* Look for <obj, id> again under the lock */ spin_lock_irqsave(&klp_shadow_lock, flags); shadow_data = klp_shadow_get(obj, id); @@ -150,6 +149,22 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, goto exists; } + new_shadow->obj = obj; + new_shadow->id = id; + + if (ctor) { + int err; + + err = ctor(obj, new_shadow->data, ctor_data); + if (err) { + spin_unlock_irqrestore(&klp_shadow_lock, flags); + kfree(new_shadow); + pr_err("Failed to construct shadow variable <%p, %lx> (%d)\n", + obj, id, err); + return NULL; + } + } + /* No <obj, id> found, so attach the newly allocated one */ hash_add_rcu(klp_shadow_hash, &new_shadow->node, (unsigned long)new_shadow->obj); @@ -170,26 +185,32 @@ exists: * klp_shadow_alloc() - allocate and add a new shadow variable * @obj: pointer to parent object * @id: data identifier - * @data: pointer to data to attach to parent * @size: size of attached data * @gfp_flags: GFP mask for allocation + * @ctor: custom constructor to initialize the shadow data (optional) + * @ctor_data: pointer to any data needed by @ctor (optional) + * + * Allocates @size bytes for new shadow variable data using @gfp_flags. + * The data are zeroed by default. They are further initialized by @ctor + * function if it is not NULL. The new shadow variable is then added + * to the global hashtable. * - * Allocates @size bytes for new shadow variable data using @gfp_flags - * and copies @size bytes from @data into the new shadow variable's own - * data space. If @data is NULL, @size bytes are still allocated, but - * no copy is performed. The new shadow variable is then added to the - * global hashtable. + * If an existing <obj, id> shadow variable can be found, this routine will + * issue a WARN, exit early and return NULL. * - * If an existing <obj, id> shadow variable can be found, this routine - * will issue a WARN, exit early and return NULL. + * This function guarantees that the constructor function is called only when + * the variable did not exist before. The cost is that @ctor is called + * in atomic context under a spin lock. * * Return: the shadow variable data element, NULL on duplicate or * failure. */ -void *klp_shadow_alloc(void *obj, unsigned long id, void *data, - size_t size, gfp_t gfp_flags) +void *klp_shadow_alloc(void *obj, unsigned long id, + size_t size, gfp_t gfp_flags, + klp_shadow_ctor_t ctor, void *ctor_data) { - return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true); + return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags, + ctor, ctor_data, true); } EXPORT_SYMBOL_GPL(klp_shadow_alloc); @@ -197,37 +218,51 @@ EXPORT_SYMBOL_GPL(klp_shadow_alloc); * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable * @obj: pointer to parent object * @id: data identifier - * @data: pointer to data to attach to parent * @size: size of attached data * @gfp_flags: GFP mask for allocation + * @ctor: custom constructor to initialize the shadow data (optional) + * @ctor_data: pointer to any data needed by @ctor (optional) * * Returns a pointer to existing shadow data if an <obj, id> shadow * variable is already present. Otherwise, it creates a new shadow * variable like klp_shadow_alloc(). * - * This function guarantees that only one shadow variable exists with - * the given @id for the given @obj. It also guarantees that the shadow - * variable will be initialized by the given @data only when it did not - * exist before. + * This function guarantees that only one shadow variable exists with the given + * @id for the given @obj. It also guarantees that the constructor function + * will be called only when the variable did not exist before. The cost is + * that @ctor is called in atomic context under a spin lock. * * Return: the shadow variable data element, NULL on failure. */ -void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, - size_t size, gfp_t gfp_flags) +void *klp_shadow_get_or_alloc(void *obj, unsigned long id, + size_t size, gfp_t gfp_flags, + klp_shadow_ctor_t ctor, void *ctor_data) { - return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false); + return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags, + ctor, ctor_data, false); } EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc); +static void klp_shadow_free_struct(struct klp_shadow *shadow, + klp_shadow_dtor_t dtor) +{ + hash_del_rcu(&shadow->node); + if (dtor) + dtor(shadow->obj, shadow->data); + kfree_rcu(shadow, rcu_head); +} + /** * klp_shadow_free() - detach and free a <obj, id> shadow variable * @obj: pointer to parent object * @id: data identifier + * @dtor: custom callback that can be used to unregister the variable + * and/or free data that the shadow variable points to (optional) * * This function releases the memory for this <obj, id> shadow variable * instance, callers should stop referencing it accordingly. */ -void klp_shadow_free(void *obj, unsigned long id) +void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor) { struct klp_shadow *shadow; unsigned long flags; @@ -239,8 +274,7 @@ void klp_shadow_free(void *obj, unsigned long id) (unsigned long)obj) { if (klp_shadow_match(shadow, obj, id)) { - hash_del_rcu(&shadow->node); - kfree_rcu(shadow, rcu_head); + klp_shadow_free_struct(shadow, dtor); break; } } @@ -252,11 +286,13 @@ EXPORT_SYMBOL_GPL(klp_shadow_free); /** * klp_shadow_free_all() - detach and free all <*, id> shadow variables * @id: data identifier + * @dtor: custom callback that can be used to unregister the variable + * and/or free data that the shadow variable points to (optional) * * This function releases the memory for all <*, id> shadow variable * instances, callers should stop referencing them accordingly. */ -void klp_shadow_free_all(unsigned long id) +void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor) { struct klp_shadow *shadow; unsigned long flags; @@ -266,10 +302,8 @@ void klp_shadow_free_all(unsigned long id) /* Delete all <*, id> from hash */ hash_for_each(klp_shadow_hash, i, shadow, node) { - if (klp_shadow_match(shadow, shadow->obj, id)) { - hash_del_rcu(&shadow->node); - kfree_rcu(shadow, rcu_head); - } + if (klp_shadow_match(shadow, shadow->obj, id)) + klp_shadow_free_struct(shadow, dtor); } spin_unlock_irqrestore(&klp_shadow_lock, flags); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 023386338269..edcac5de7ebc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -561,20 +561,24 @@ static void print_lock(struct held_lock *hlock) printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } -static void lockdep_print_held_locks(struct task_struct *curr) +static void lockdep_print_held_locks(struct task_struct *p) { - int i, depth = curr->lockdep_depth; + int i, depth = READ_ONCE(p->lockdep_depth); - if (!depth) { - printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr)); + if (!depth) + printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p)); + else + printk("%d lock%s held by %s/%d:\n", depth, + depth > 1 ? "s" : "", p->comm, task_pid_nr(p)); + /* + * It's not reliable to print a task's held locks if it's not sleeping + * and it's not the current task. + */ + if (p->state == TASK_RUNNING && p != current) return; - } - printk("%d lock%s held by %s/%d:\n", - depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr)); - for (i = 0; i < depth; i++) { printk(" #%d: ", i); - print_lock(curr->held_locks + i); + print_lock(p->held_locks + i); } } @@ -4451,8 +4455,6 @@ EXPORT_SYMBOL_GPL(debug_check_no_locks_held); void debug_show_all_locks(void) { struct task_struct *g, *p; - int count = 10; - int unlock = 1; if (unlikely(!debug_locks)) { pr_warn("INFO: lockdep is turned off.\n"); @@ -4460,50 +4462,18 @@ void debug_show_all_locks(void) } pr_warn("\nShowing all locks held in the system:\n"); - /* - * Here we try to get the tasklist_lock as hard as possible, - * if not successful after 2 seconds we ignore it (but keep - * trying). This is to enable a debug printout even if a - * tasklist_lock-holding task deadlocks or crashes. - */ -retry: - if (!read_trylock(&tasklist_lock)) { - if (count == 10) - pr_warn("hm, tasklist_lock locked, retrying... "); - if (count) { - count--; - pr_cont(" #%d", 10-count); - mdelay(200); - goto retry; - } - pr_cont(" ignoring it.\n"); - unlock = 0; - } else { - if (count != 10) - pr_cont(" locked it.\n"); - } - - do_each_thread(g, p) { - /* - * It's not reliable to print a task's held locks - * if it's not sleeping (or if it's not the current - * task): - */ - if (p->state == TASK_RUNNING && p != current) + rcu_read_lock(); + for_each_process_thread(g, p) { + if (!p->lockdep_depth) continue; - if (p->lockdep_depth) - lockdep_print_held_locks(p); - if (!unlock) - if (read_trylock(&tasklist_lock)) - unlock = 1; + lockdep_print_held_locks(p); touch_nmi_watchdog(); - } while_each_thread(g, p); + touch_all_softlockup_watchdogs(); + } + rcu_read_unlock(); pr_warn("\n"); pr_warn("=============================================\n\n"); - - if (unlock) - read_unlock(&tasklist_lock); } EXPORT_SYMBOL_GPL(debug_show_all_locks); #endif diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index ad69bbc9bd28..3dd980dfba2d 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -101,18 +101,6 @@ static const struct seq_operations lockdep_ops = { .show = l_show, }; -static int lockdep_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &lockdep_ops); -} - -static const struct file_operations proc_lockdep_operations = { - .open = lockdep_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - #ifdef CONFIG_PROVE_LOCKING static void *lc_start(struct seq_file *m, loff_t *pos) { @@ -170,18 +158,6 @@ static const struct seq_operations lockdep_chains_ops = { .stop = lc_stop, .show = lc_show, }; - -static int lockdep_chains_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &lockdep_chains_ops); -} - -static const struct file_operations proc_lockdep_chains_operations = { - .open = lockdep_chains_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; #endif /* CONFIG_PROVE_LOCKING */ static void lockdep_stats_debug_show(struct seq_file *m) @@ -355,18 +331,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v) return 0; } -static int lockdep_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, lockdep_stats_show, NULL); -} - -static const struct file_operations proc_lockdep_stats_operations = { - .open = lockdep_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - #ifdef CONFIG_LOCK_STAT struct lock_stat_data { @@ -682,14 +646,11 @@ static const struct file_operations proc_lock_stat_operations = { static int __init lockdep_proc_init(void) { - proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); + proc_create_seq("lockdep", S_IRUSR, NULL, &lockdep_ops); #ifdef CONFIG_PROVE_LOCKING - proc_create("lockdep_chains", S_IRUSR, NULL, - &proc_lockdep_chains_operations); + proc_create_seq("lockdep_chains", S_IRUSR, NULL, &lockdep_chains_ops); #endif - proc_create("lockdep_stats", S_IRUSR, NULL, - &proc_lockdep_stats_operations); - + proc_create_single("lockdep_stats", S_IRUSR, NULL, lockdep_stats_show); #ifdef CONFIG_LOCK_STAT proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, &proc_lock_stat_operations); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 6850ffd69125..8402b3349dca 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -913,7 +913,9 @@ static int __init lock_torture_init(void) /* Initialize the statistics so that each run gets its own numbers. */ if (nwriters_stress) { lock_is_write_held = 0; - cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL); + cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress, + sizeof(*cxt.lwsa), + GFP_KERNEL); if (cxt.lwsa == NULL) { VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); firsterr = -ENOMEM; @@ -942,7 +944,9 @@ static int __init lock_torture_init(void) if (nreaders_stress) { lock_is_read_held = 0; - cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL); + cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress, + sizeof(*cxt.lrsa), + GFP_KERNEL); if (cxt.lrsa == NULL) { VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); firsterr = -ENOMEM; @@ -985,7 +989,8 @@ static int __init lock_torture_init(void) } if (nwriters_stress) { - writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]), + writer_tasks = kcalloc(cxt.nrealwriters_stress, + sizeof(writer_tasks[0]), GFP_KERNEL); if (writer_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); @@ -995,7 +1000,8 @@ static int __init lock_torture_init(void) } if (cxt.cur_ops->readlock) { - reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]), + reader_tasks = kcalloc(cxt.nrealreaders_stress, + sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index f046b7ce9dd6..5e10153b4d3c 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -23,13 +23,15 @@ struct mcs_spinlock { #ifndef arch_mcs_spin_lock_contended /* - * Using smp_load_acquire() provides a memory barrier that ensures - * subsequent operations happen after the lock is acquired. + * Using smp_cond_load_acquire() provides the acquire semantics + * required so that subsequent operations happen after the + * lock is acquired. Additionally, some architectures such as + * ARM64 would like to do spin-waiting instead of purely + * spinning, and smp_cond_load_acquire() provides that behavior. */ #define arch_mcs_spin_lock_contended(l) \ do { \ - while (!(smp_load_acquire(l))) \ - cpu_relax(); \ + smp_cond_load_acquire(l, VAL); \ } while (0) #endif diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 2048359f33d2..f44f658ae629 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -139,8 +139,9 @@ static inline bool __mutex_trylock(struct mutex *lock) static __always_inline bool __mutex_trylock_fast(struct mutex *lock) { unsigned long curr = (unsigned long)current; + unsigned long zero = 0UL; - if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr)) + if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr)) return true; return false; diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index d880296245c5..bfaeb05123ff 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -12,11 +12,11 @@ * GNU General Public License for more details. * * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. - * (C) Copyright 2013-2014 Red Hat, Inc. + * (C) Copyright 2013-2014,2018 Red Hat, Inc. * (C) Copyright 2015 Intel Corp. * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP * - * Authors: Waiman Long <waiman.long@hpe.com> + * Authors: Waiman Long <longman@redhat.com> * Peter Zijlstra <peterz@infradead.org> */ @@ -33,6 +33,11 @@ #include <asm/qspinlock.h> /* + * Include queued spinlock statistics code + */ +#include "qspinlock_stat.h" + +/* * The basic principle of a queue-based spinlock can best be understood * by studying a classic queue-based spinlock implementation called the * MCS lock. The paper below provides a good description for this kind @@ -77,6 +82,18 @@ #endif /* + * The pending bit spinning loop count. + * This heuristic is used to limit the number of lockword accesses + * made by atomic_cond_read_relaxed when waiting for the lock to + * transition out of the "== _Q_PENDING_VAL" state. We don't spin + * indefinitely because there's no guarantee that we'll make forward + * progress. + */ +#ifndef _Q_PENDING_LOOPS +#define _Q_PENDING_LOOPS 1 +#endif + +/* * Per-CPU queue node structures; we can never have more than 4 nested * contexts: task, softirq, hardirq, nmi. * @@ -114,41 +131,18 @@ static inline __pure struct mcs_spinlock *decode_tail(u32 tail) #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) -/* - * By using the whole 2nd least significant byte for the pending bit, we - * can allow better optimization of the lock acquisition for the pending - * bit holder. +#if _Q_PENDING_BITS == 8 +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure * - * This internal structure is also used by the set_locked function which - * is not restricted to _Q_PENDING_BITS == 8. + * *,1,* -> *,0,* */ -struct __qspinlock { - union { - atomic_t val; -#ifdef __LITTLE_ENDIAN - struct { - u8 locked; - u8 pending; - }; - struct { - u16 locked_pending; - u16 tail; - }; -#else - struct { - u16 tail; - u16 locked_pending; - }; - struct { - u8 reserved[2]; - u8 pending; - u8 locked; - }; -#endif - }; -}; +static __always_inline void clear_pending(struct qspinlock *lock) +{ + WRITE_ONCE(lock->pending, 0); +} -#if _Q_PENDING_BITS == 8 /** * clear_pending_set_locked - take ownership and clear the pending bit. * @lock: Pointer to queued spinlock structure @@ -159,9 +153,7 @@ struct __qspinlock { */ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL); + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); } /* @@ -176,19 +168,28 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) */ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) { - struct __qspinlock *l = (void *)lock; - /* - * Use release semantics to make sure that the MCS node is properly - * initialized before changing the tail code. + * We can use relaxed semantics since the caller ensures that the + * MCS node is properly initialized before updating the tail. */ - return (u32)xchg_release(&l->tail, + return (u32)xchg_relaxed(&lock->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; } #else /* _Q_PENDING_BITS == 8 */ /** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(struct qspinlock *lock) +{ + atomic_andnot(_Q_PENDING_VAL, &lock->val); +} + +/** * clear_pending_set_locked - take ownership and clear the pending bit. * @lock: Pointer to queued spinlock structure * @@ -216,10 +217,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) for (;;) { new = (val & _Q_LOCKED_PENDING_MASK) | tail; /* - * Use release semantics to make sure that the MCS node is - * properly initialized before changing the tail code. + * We can use relaxed semantics since the caller ensures that + * the MCS node is properly initialized before updating the + * tail. */ - old = atomic_cmpxchg_release(&lock->val, val, new); + old = atomic_cmpxchg_relaxed(&lock->val, val, new); if (old == val) break; @@ -237,9 +239,7 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) */ static __always_inline void set_locked(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->locked, _Q_LOCKED_VAL); + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); } @@ -294,86 +294,83 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { struct mcs_spinlock *prev, *next, *node; - u32 new, old, tail; + u32 old, tail; int idx; BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); if (pv_enabled()) - goto queue; + goto pv_queue; if (virt_spin_lock(lock)) return; /* - * wait for in-progress pending->locked hand-overs + * Wait for in-progress pending->locked hand-overs with a bounded + * number of spins so that we guarantee forward progress. * * 0,1,0 -> 0,0,1 */ if (val == _Q_PENDING_VAL) { - while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL) - cpu_relax(); + int cnt = _Q_PENDING_LOOPS; + val = atomic_cond_read_relaxed(&lock->val, + (VAL != _Q_PENDING_VAL) || !cnt--); } /* + * If we observe any contention; queue. + */ + if (val & ~_Q_LOCKED_MASK) + goto queue; + + /* * trylock || pending * * 0,0,0 -> 0,0,1 ; trylock * 0,0,1 -> 0,1,1 ; pending */ - for (;;) { + val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); + if (!(val & ~_Q_LOCKED_MASK)) { /* - * If we observe any contention; queue. + * We're pending, wait for the owner to go away. + * + * *,1,1 -> *,1,0 + * + * this wait loop must be a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because not all + * clear_pending_set_locked() implementations imply full + * barriers. */ - if (val & ~_Q_LOCKED_MASK) - goto queue; - - new = _Q_LOCKED_VAL; - if (val == new) - new |= _Q_PENDING_VAL; + if (val & _Q_LOCKED_MASK) { + atomic_cond_read_acquire(&lock->val, + !(VAL & _Q_LOCKED_MASK)); + } /* - * Acquire semantic is required here as the function may - * return immediately if the lock was free. + * take ownership and clear the pending bit. + * + * *,1,0 -> *,0,1 */ - old = atomic_cmpxchg_acquire(&lock->val, val, new); - if (old == val) - break; - - val = old; - } - - /* - * we won the trylock - */ - if (new == _Q_LOCKED_VAL) + clear_pending_set_locked(lock); + qstat_inc(qstat_lock_pending, true); return; + } /* - * we're pending, wait for the owner to go away. - * - * *,1,1 -> *,1,0 - * - * this wait loop must be a load-acquire such that we match the - * store-release that clears the locked bit and create lock - * sequentiality; this is because not all clear_pending_set_locked() - * implementations imply full barriers. - */ - smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK)); - - /* - * take ownership and clear the pending bit. - * - * *,1,0 -> *,0,1 + * If pending was clear but there are waiters in the queue, then + * we need to undo our setting of pending before we queue ourselves. */ - clear_pending_set_locked(lock); - return; + if (!(val & _Q_PENDING_MASK)) + clear_pending(lock); /* * End of pending bit optimistic spinning and beginning of MCS * queuing. */ queue: + qstat_inc(qstat_lock_slowpath, true); +pv_queue: node = this_cpu_ptr(&mcs_nodes[0]); idx = node->count++; tail = encode_tail(smp_processor_id(), idx); @@ -400,12 +397,18 @@ queue: goto release; /* + * Ensure that the initialisation of @node is complete before we + * publish the updated tail via xchg_tail() and potentially link + * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. + */ + smp_wmb(); + + /* + * Publish the updated tail. * We have already touched the queueing cacheline; don't bother with * pending stuff. * * p,*,* -> n,*,* - * - * RELEASE, such that the stores to @node must be complete. */ old = xchg_tail(lock, tail); next = NULL; @@ -417,14 +420,8 @@ queue: if (old & _Q_TAIL_MASK) { prev = decode_tail(old); - /* - * We must ensure that the stores to @node are observed before - * the write to prev->next. The address dependency from - * xchg_tail is not sufficient to ensure this because the read - * component of xchg_tail is unordered with respect to the - * initialisation of @node. - */ - smp_store_release(&prev->next, node); + /* Link @node into the waitqueue. */ + WRITE_ONCE(prev->next, node); pv_wait_node(node, prev); arch_mcs_spin_lock_contended(&node->locked); @@ -453,8 +450,8 @@ queue: * * The PV pv_wait_head_or_lock function, if active, will acquire * the lock and return a non-zero value. So we have to skip the - * smp_cond_load_acquire() call. As the next PV queue head hasn't been - * designated yet, there is no way for the locked value to become + * atomic_cond_read_acquire() call. As the next PV queue head hasn't + * been designated yet, there is no way for the locked value to become * _Q_SLOW_VAL. So both the set_locked() and the * atomic_cmpxchg_relaxed() calls will be safe. * @@ -464,44 +461,38 @@ queue: if ((val = pv_wait_head_or_lock(lock, node))) goto locked; - val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK)); + val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK)); locked: /* * claim the lock: * * n,0,0 -> 0,0,1 : lock, uncontended - * *,0,0 -> *,0,1 : lock, contended + * *,*,0 -> *,*,1 : lock, contended * - * If the queue head is the only one in the queue (lock value == tail), - * clear the tail code and grab the lock. Otherwise, we only need - * to grab the lock. + * If the queue head is the only one in the queue (lock value == tail) + * and nobody is pending, clear the tail code and grab the lock. + * Otherwise, we only need to grab the lock. */ - for (;;) { - /* In the PV case we might already have _Q_LOCKED_VAL set */ - if ((val & _Q_TAIL_MASK) != tail) { - set_locked(lock); - break; - } - /* - * The smp_cond_load_acquire() call above has provided the - * necessary acquire semantics required for locking. At most - * two iterations of this loop may be ran. - */ - old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); - if (old == val) - goto release; /* No contention */ - val = old; - } + /* + * In the PV case we might already have _Q_LOCKED_VAL set. + * + * The atomic_cond_read_acquire() call above has provided the + * necessary acquire semantics required for locking. + */ + if (((val & _Q_TAIL_MASK) == tail) && + atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) + goto release; /* No contention */ + + /* Either somebody is queued behind us or _Q_PENDING_VAL is set */ + set_locked(lock); /* * contended path; wait for next if not observed yet, release. */ - if (!next) { - while (!(next = READ_ONCE(node->next))) - cpu_relax(); - } + if (!next) + next = smp_cond_load_relaxed(&node->next, (VAL)); arch_mcs_spin_unlock_contended(&next->locked); pv_kick_node(lock, next); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 6ee477765e6c..5a0cf5f9008c 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -56,11 +56,6 @@ struct pv_node { }; /* - * Include queued spinlock statistics code - */ -#include "qspinlock_stat.h" - -/* * Hybrid PV queued/unfair lock * * By replacing the regular queued_spin_trylock() with the function below, @@ -87,8 +82,6 @@ struct pv_node { #define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l) static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - /* * Stay in unfair lock mode as long as queued mode waiters are * present in the MCS wait queue but the pending bit isn't set. @@ -97,7 +90,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) int val = atomic_read(&lock->val); if (!(val & _Q_LOCKED_PENDING_MASK) && - (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { + (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { qstat_inc(qstat_pv_lock_stealing, true); return true; } @@ -117,16 +110,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) #if _Q_PENDING_BITS == 8 static __always_inline void set_pending(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->pending, 1); -} - -static __always_inline void clear_pending(struct qspinlock *lock) -{ - struct __qspinlock *l = (void *)lock; - - WRITE_ONCE(l->pending, 0); + WRITE_ONCE(lock->pending, 1); } /* @@ -136,10 +120,8 @@ static __always_inline void clear_pending(struct qspinlock *lock) */ static __always_inline int trylock_clear_pending(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; - - return !READ_ONCE(l->locked) && - (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL, + return !READ_ONCE(lock->locked) && + (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) == _Q_PENDING_VAL); } #else /* _Q_PENDING_BITS == 8 */ @@ -148,11 +130,6 @@ static __always_inline void set_pending(struct qspinlock *lock) atomic_or(_Q_PENDING_VAL, &lock->val); } -static __always_inline void clear_pending(struct qspinlock *lock) -{ - atomic_andnot(_Q_PENDING_VAL, &lock->val); -} - static __always_inline int trylock_clear_pending(struct qspinlock *lock) { int val = atomic_read(&lock->val); @@ -384,7 +361,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - struct __qspinlock *l = (void *)lock; /* * If the vCPU is indeed halted, advance its state to match that of @@ -413,7 +389,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) * the hash table later on at unlock time, no atomic instruction is * needed. */ - WRITE_ONCE(l->locked, _Q_SLOW_VAL); + WRITE_ONCE(lock->locked, _Q_SLOW_VAL); (void)pv_hash(lock, pn); } @@ -428,7 +404,6 @@ static u32 pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - struct __qspinlock *l = (void *)lock; struct qspinlock **lp = NULL; int waitcnt = 0; int loop; @@ -443,7 +418,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) /* * Tracking # of slowpath locking operations */ - qstat_inc(qstat_pv_lock_slowpath, true); + qstat_inc(qstat_lock_slowpath, true); for (;; waitcnt++) { /* @@ -479,13 +454,13 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) * * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ - if (xchg(&l->locked, _Q_SLOW_VAL) == 0) { + if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) { /* * The lock was free and now we own the lock. * Change the lock value back to _Q_LOCKED_VAL * and unhash the table. */ - WRITE_ONCE(l->locked, _Q_LOCKED_VAL); + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); WRITE_ONCE(*lp, NULL); goto gotlock; } @@ -493,7 +468,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) WRITE_ONCE(pn->state, vcpu_hashed); qstat_inc(qstat_pv_wait_head, true); qstat_inc(qstat_pv_wait_again, waitcnt); - pv_wait(&l->locked, _Q_SLOW_VAL); + pv_wait(&lock->locked, _Q_SLOW_VAL); /* * Because of lock stealing, the queue head vCPU may not be @@ -518,7 +493,6 @@ gotlock: __visible void __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) { - struct __qspinlock *l = (void *)lock; struct pv_node *node; if (unlikely(locked != _Q_SLOW_VAL)) { @@ -547,7 +521,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) * Now that we have a reference to the (likely) blocked pv_node, * release the lock. */ - smp_store_release(&l->locked, 0); + smp_store_release(&lock->locked, 0); /* * At this point the memory pointed at by lock can be freed/reused, @@ -573,7 +547,6 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) #ifndef __pv_queued_spin_unlock __visible void __pv_queued_spin_unlock(struct qspinlock *lock) { - struct __qspinlock *l = (void *)lock; u8 locked; /* @@ -581,7 +554,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. */ - locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0); + locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0); if (likely(locked == _Q_LOCKED_VAL)) return; diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 4a30ef63c607..6bd78c0740fc 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -22,13 +22,14 @@ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake * pv_latency_kick - average latency (ns) of vCPU kick operation * pv_latency_wake - average latency (ns) from vCPU kick to wakeup - * pv_lock_slowpath - # of locking operations via the slowpath * pv_lock_stealing - # of lock stealing operations * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs * pv_wait_again - # of wait's after a queue head vCPU kick * pv_wait_early - # of early vCPU wait's * pv_wait_head - # of vCPU wait's at the queue head * pv_wait_node - # of vCPU wait's at a non-head queue node + * lock_pending - # of locking operations via pending code + * lock_slowpath - # of locking operations via MCS lock queue * * Writing to the "reset_counters" file will reset all the above counter * values. @@ -46,13 +47,14 @@ enum qlock_stats { qstat_pv_kick_wake, qstat_pv_latency_kick, qstat_pv_latency_wake, - qstat_pv_lock_slowpath, qstat_pv_lock_stealing, qstat_pv_spurious_wakeup, qstat_pv_wait_again, qstat_pv_wait_early, qstat_pv_wait_head, qstat_pv_wait_node, + qstat_lock_pending, + qstat_lock_slowpath, qstat_num, /* Total number of statistical counters */ qstat_reset_cnts = qstat_num, }; @@ -73,12 +75,13 @@ static const char * const qstat_names[qstat_num + 1] = { [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", [qstat_pv_latency_kick] = "pv_latency_kick", [qstat_pv_latency_wake] = "pv_latency_wake", - [qstat_pv_lock_slowpath] = "pv_lock_slowpath", [qstat_pv_lock_stealing] = "pv_lock_stealing", [qstat_pv_wait_again] = "pv_wait_again", [qstat_pv_wait_early] = "pv_wait_early", [qstat_pv_wait_head] = "pv_wait_head", [qstat_pv_wait_node] = "pv_wait_node", + [qstat_lock_pending] = "lock_pending", + [qstat_lock_slowpath] = "lock_slowpath", [qstat_reset_cnts] = "reset_counters", }; diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index e795908f3607..3064c50e181e 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -347,30 +347,31 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) } } +static inline bool owner_on_cpu(struct task_struct *owner) +{ + /* + * As lock holder preemption issue, we both skip spinning if + * task is not on cpu or its cpu is preempted + */ + return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); +} + static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner; bool ret = true; + BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN)); + if (need_resched()) return false; rcu_read_lock(); owner = READ_ONCE(sem->owner); - if (!rwsem_owner_is_writer(owner)) { - /* - * Don't spin if the rwsem is readers owned. - */ - ret = !rwsem_owner_is_reader(owner); - goto done; + if (owner) { + ret = is_rwsem_owner_spinnable(owner) && + owner_on_cpu(owner); } - - /* - * As lock holder preemption issue, we both skip spinning if task is not - * on cpu or its cpu is preempted - */ - ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); -done: rcu_read_unlock(); return ret; } @@ -382,11 +383,11 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner = READ_ONCE(sem->owner); - if (!rwsem_owner_is_writer(owner)) - goto out; + if (!is_rwsem_owner_spinnable(owner)) + return false; rcu_read_lock(); - while (sem->owner == owner) { + while (owner && (READ_ONCE(sem->owner) == owner)) { /* * Ensure we emit the owner->on_cpu, dereference _after_ * checking sem->owner still matches owner, if that fails, @@ -399,8 +400,7 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) * abort spinning when need_resched or owner is not running or * owner's cpu is preempted. */ - if (!owner->on_cpu || need_resched() || - vcpu_is_preempted(task_cpu(owner))) { + if (need_resched() || !owner_on_cpu(owner)) { rcu_read_unlock(); return false; } @@ -408,12 +408,12 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) cpu_relax(); } rcu_read_unlock(); -out: + /* * If there is a new owner or the owner is not set, we continue * spinning. */ - return !rwsem_owner_is_reader(READ_ONCE(sem->owner)); + return is_rwsem_owner_spinnable(READ_ONCE(sem->owner)); } static bool rwsem_optimistic_spin(struct rw_semaphore *sem) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 30465a2f2b6c..bc1e507be9ff 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -221,5 +221,3 @@ void up_read_non_owner(struct rw_semaphore *sem) EXPORT_SYMBOL(up_read_non_owner); #endif - - diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index a17cba8d94bb..b9d0e72aa80f 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,20 +1,24 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * The owner field of the rw_semaphore structure will be set to - * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear + * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear * the owner field when it unlocks. A reader, on the other hand, will * not touch the owner field when it unlocks. * - * In essence, the owner field now has the following 3 states: + * In essence, the owner field now has the following 4 states: * 1) 0 * - lock is free or the owner hasn't set the field yet * 2) RWSEM_READER_OWNED * - lock is currently or previously owned by readers (lock is free * or not set by owner yet) - * 3) Other non-zero value - * - a writer owns the lock + * 3) RWSEM_ANONYMOUSLY_OWNED bit set with some other bits set as well + * - lock is owned by an anonymous writer, so spinning on the lock + * owner should be disabled. + * 4) Other non-zero value + * - a writer owns the lock and other writers can spin on the lock owner. */ -#define RWSEM_READER_OWNED ((struct task_struct *)1UL) +#define RWSEM_ANONYMOUSLY_OWNED (1UL << 0) +#define RWSEM_READER_OWNED ((struct task_struct *)RWSEM_ANONYMOUSLY_OWNED) #ifdef CONFIG_DEBUG_RWSEMS # define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) @@ -51,14 +55,22 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); } -static inline bool rwsem_owner_is_writer(struct task_struct *owner) +/* + * Return true if the a rwsem waiter can spin on the rwsem's owner + * and steal the lock, i.e. the lock is not anonymously owned. + * N.B. !owner is considered spinnable. + */ +static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) { - return owner && owner != RWSEM_READER_OWNED; + return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED); } -static inline bool rwsem_owner_is_reader(struct task_struct *owner) +/* + * Return true if rwsem is owned by an anonymous writer or readers. + */ +static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) { - return owner == RWSEM_READER_OWNED; + return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED; } #else static inline void rwsem_set_owner(struct rw_semaphore *sem) diff --git a/kernel/memremap.c b/kernel/memremap.c index 895e6b76b25e..5857267a4af5 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -1,15 +1,5 @@ -/* - * Copyright(c) 2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2015 Intel Corporation. All rights reserved. */ #include <linux/radix-tree.h> #include <linux/device.h> #include <linux/types.h> @@ -19,170 +9,8 @@ #include <linux/memory_hotplug.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/wait_bit.h> -#ifndef ioremap_cache -/* temporary while we convert existing ioremap_cache users to memremap */ -__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) -{ - return ioremap(offset, size); -} -#endif - -#ifndef arch_memremap_wb -static void *arch_memremap_wb(resource_size_t offset, unsigned long size) -{ - return (__force void *)ioremap_cache(offset, size); -} -#endif - -#ifndef arch_memremap_can_ram_remap -static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, - unsigned long flags) -{ - return true; -} -#endif - -static void *try_ram_remap(resource_size_t offset, size_t size, - unsigned long flags) -{ - unsigned long pfn = PHYS_PFN(offset); - - /* In the simple case just return the existing linear address */ - if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) && - arch_memremap_can_ram_remap(offset, size, flags)) - return __va(offset); - - return NULL; /* fallback to arch_memremap_wb */ -} - -/** - * memremap() - remap an iomem_resource as cacheable memory - * @offset: iomem resource start address - * @size: size of remap - * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC, - * MEMREMAP_ENC, MEMREMAP_DEC - * - * memremap() is "ioremap" for cases where it is known that the resource - * being mapped does not have i/o side effects and the __iomem - * annotation is not applicable. In the case of multiple flags, the different - * mapping types will be attempted in the order listed below until one of - * them succeeds. - * - * MEMREMAP_WB - matches the default mapping for System RAM on - * the architecture. This is usually a read-allocate write-back cache. - * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM - * memremap() will bypass establishing a new mapping and instead return - * a pointer into the direct map. - * - * MEMREMAP_WT - establish a mapping whereby writes either bypass the - * cache or are written through to memory and never exist in a - * cache-dirty state with respect to program visibility. Attempts to - * map System RAM with this mapping type will fail. - * - * MEMREMAP_WC - establish a writecombine mapping, whereby writes may - * be coalesced together (e.g. in the CPU's write buffers), but is otherwise - * uncached. Attempts to map System RAM with this mapping type will fail. - */ -void *memremap(resource_size_t offset, size_t size, unsigned long flags) -{ - int is_ram = region_intersects(offset, size, - IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); - void *addr = NULL; - - if (!flags) - return NULL; - - if (is_ram == REGION_MIXED) { - WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", - &offset, (unsigned long) size); - return NULL; - } - - /* Try all mapping types requested until one returns non-NULL */ - if (flags & MEMREMAP_WB) { - /* - * MEMREMAP_WB is special in that it can be satisifed - * from the direct map. Some archs depend on the - * capability of memremap() to autodetect cases where - * the requested range is potentially in System RAM. - */ - if (is_ram == REGION_INTERSECTS) - addr = try_ram_remap(offset, size, flags); - if (!addr) - addr = arch_memremap_wb(offset, size); - } - - /* - * If we don't have a mapping yet and other request flags are - * present then we will be attempting to establish a new virtual - * address mapping. Enforce that this mapping is not aliasing - * System RAM. - */ - if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) { - WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", - &offset, (unsigned long) size); - return NULL; - } - - if (!addr && (flags & MEMREMAP_WT)) - addr = ioremap_wt(offset, size); - - if (!addr && (flags & MEMREMAP_WC)) - addr = ioremap_wc(offset, size); - - return addr; -} -EXPORT_SYMBOL(memremap); - -void memunmap(void *addr) -{ - if (is_vmalloc_addr(addr)) - iounmap((void __iomem *) addr); -} -EXPORT_SYMBOL(memunmap); - -static void devm_memremap_release(struct device *dev, void *res) -{ - memunmap(*(void **)res); -} - -static int devm_memremap_match(struct device *dev, void *res, void *match_data) -{ - return *(void **)res == match_data; -} - -void *devm_memremap(struct device *dev, resource_size_t offset, - size_t size, unsigned long flags) -{ - void **ptr, *addr; - - ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL, - dev_to_node(dev)); - if (!ptr) - return ERR_PTR(-ENOMEM); - - addr = memremap(offset, size, flags); - if (addr) { - *ptr = addr; - devres_add(dev, ptr); - } else { - devres_free(ptr); - return ERR_PTR(-ENXIO); - } - - return addr; -} -EXPORT_SYMBOL(devm_memremap); - -void devm_memunmap(struct device *dev, void *addr) -{ - WARN_ON(devres_release(dev, devm_memremap_release, - devm_memremap_match, addr)); -} -EXPORT_SYMBOL(devm_memunmap); - -#ifdef CONFIG_ZONE_DEVICE static DEFINE_MUTEX(pgmap_lock); static RADIX_TREE(pgmap_radix, GFP_KERNEL); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) @@ -473,10 +301,32 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, return pgmap; } -#endif /* CONFIG_ZONE_DEVICE */ +EXPORT_SYMBOL_GPL(get_dev_pagemap); + +#ifdef CONFIG_DEV_PAGEMAP_OPS +DEFINE_STATIC_KEY_FALSE(devmap_managed_key); +EXPORT_SYMBOL_GPL(devmap_managed_key); +static atomic_t devmap_enable; + +/* + * Toggle the static key for ->page_free() callbacks when dev_pagemap + * pages go idle. + */ +void dev_pagemap_get_ops(void) +{ + if (atomic_inc_return(&devmap_enable) == 1) + static_branch_enable(&devmap_managed_key); +} +EXPORT_SYMBOL_GPL(dev_pagemap_get_ops); + +void dev_pagemap_put_ops(void) +{ + if (atomic_dec_and_test(&devmap_enable)) + static_branch_disable(&devmap_managed_key); +} +EXPORT_SYMBOL_GPL(dev_pagemap_put_ops); -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) -void put_zone_device_private_or_public_page(struct page *page) +void __put_devmap_managed_page(struct page *page) { int count = page_ref_dec_return(page); @@ -496,5 +346,5 @@ void put_zone_device_private_or_public_page(struct page *page) } else if (!count) __put_page(page); } -EXPORT_SYMBOL(put_zone_device_private_or_public_page); -#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ +EXPORT_SYMBOL_GPL(__put_devmap_managed_page); +#endif /* CONFIG_DEV_PAGEMAP_OPS */ diff --git a/kernel/module.c b/kernel/module.c index 1e3337bcf1e7..f475f30eed8c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1470,7 +1470,8 @@ static ssize_t module_sect_show(struct module_attribute *mattr, { struct module_sect_attr *sattr = container_of(mattr, struct module_sect_attr, mattr); - return sprintf(buf, "0x%pK\n", (void *)sattr->address); + return sprintf(buf, "0x%px\n", kptr_restrict < 2 ? + (void *)sattr->address : NULL); } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -1601,8 +1602,7 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info) if (notes == 0) return; - notes_attrs = kzalloc(sizeof(*notes_attrs) - + notes * sizeof(notes_attrs->attrs[0]), + notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes), GFP_KERNEL); if (notes_attrs == NULL) return; @@ -3514,6 +3514,11 @@ static noinline int do_init_module(struct module *mod) * walking this with preempt disabled. In all the failure paths, we * call synchronize_sched(), but we don't want to slow down the success * path, so use actual RCU here. + * Note that module_alloc() on most architectures creates W+X page + * mappings which won't be cleaned up until do_free_init() runs. Any + * code such as mark_rodata_ro() which depends on those mappings to + * be cleaned up needs to sync with the queued work - ie + * rcu_barrier_sched() */ call_rcu_sched(&freeinit->rcu, do_free_init); mutex_unlock(&module_mutex); diff --git a/kernel/panic.c b/kernel/panic.c index 42e487488554..8b2e002d52eb 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -623,7 +623,7 @@ static __init int register_warn_debugfs(void) device_initcall(register_warn_debugfs); #endif -#ifdef CONFIG_CC_STACKPROTECTOR +#ifdef CONFIG_STACKPROTECTOR /* * Called when gcc's -fstack-protector feature is used, and diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 5454cc639a8d..9c85c7822383 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -287,6 +287,8 @@ static int create_image(int platform_mode) local_irq_disable(); + system_state = SYSTEM_SUSPEND; + error = syscore_suspend(); if (error) { pr_err("Some system devices failed to power down, aborting hibernation\n"); @@ -317,6 +319,7 @@ static int create_image(int platform_mode) syscore_resume(); Enable_irqs: + system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: @@ -445,6 +448,7 @@ static int resume_target_kernel(bool platform_mode) goto Enable_cpus; local_irq_disable(); + system_state = SYSTEM_SUSPEND; error = syscore_suspend(); if (error) @@ -478,6 +482,7 @@ static int resume_target_kernel(bool platform_mode) syscore_resume(); Enable_irqs: + system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: @@ -563,6 +568,7 @@ int hibernation_platform_enter(void) goto Enable_cpus; local_irq_disable(); + system_state = SYSTEM_SUSPEND; syscore_suspend(); if (pm_wakeup_pending()) { error = -EAGAIN; @@ -575,6 +581,7 @@ int hibernation_platform_enter(void) Power_up: syscore_resume(); + system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: diff --git a/kernel/power/qos.c b/kernel/power/qos.c index fa39092b7aea..86d72ffb811b 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -184,7 +184,6 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) c->target_value = value; } -static inline int pm_qos_get_value(struct pm_qos_constraints *c); static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused) { struct pm_qos_object *qos = (struct pm_qos_object *)s->private; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4c10be0f4843..87331565e505 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -27,6 +27,7 @@ #include <linux/export.h> #include <linux/suspend.h> #include <linux/syscore_ops.h> +#include <linux/swait.h> #include <linux/ftrace.h> #include <trace/events/power.h> #include <linux/compiler.h> @@ -57,10 +58,10 @@ EXPORT_SYMBOL_GPL(pm_suspend_global_flags); static const struct platform_suspend_ops *suspend_ops; static const struct platform_s2idle_ops *s2idle_ops; -static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head); +static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head); enum s2idle_states __read_mostly s2idle_state; -static DEFINE_SPINLOCK(s2idle_lock); +static DEFINE_RAW_SPINLOCK(s2idle_lock); void s2idle_set_ops(const struct platform_s2idle_ops *ops) { @@ -78,12 +79,12 @@ static void s2idle_enter(void) { trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true); - spin_lock_irq(&s2idle_lock); + raw_spin_lock_irq(&s2idle_lock); if (pm_wakeup_pending()) goto out; s2idle_state = S2IDLE_STATE_ENTER; - spin_unlock_irq(&s2idle_lock); + raw_spin_unlock_irq(&s2idle_lock); get_online_cpus(); cpuidle_resume(); @@ -91,17 +92,17 @@ static void s2idle_enter(void) /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); /* Make the current CPU wait so it can enter the idle loop too. */ - wait_event(s2idle_wait_head, - s2idle_state == S2IDLE_STATE_WAKE); + swait_event(s2idle_wait_head, + s2idle_state == S2IDLE_STATE_WAKE); cpuidle_pause(); put_online_cpus(); - spin_lock_irq(&s2idle_lock); + raw_spin_lock_irq(&s2idle_lock); out: s2idle_state = S2IDLE_STATE_NONE; - spin_unlock_irq(&s2idle_lock); + raw_spin_unlock_irq(&s2idle_lock); trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false); } @@ -156,12 +157,12 @@ void s2idle_wake(void) { unsigned long flags; - spin_lock_irqsave(&s2idle_lock, flags); + raw_spin_lock_irqsave(&s2idle_lock, flags); if (s2idle_state > S2IDLE_STATE_NONE) { s2idle_state = S2IDLE_STATE_WAKE; - wake_up(&s2idle_wait_head); + swake_up(&s2idle_wait_head); } - spin_unlock_irqrestore(&s2idle_lock, flags); + raw_spin_unlock_irqrestore(&s2idle_lock, flags); } EXPORT_SYMBOL_GPL(s2idle_wake); @@ -428,6 +429,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); + system_state = SYSTEM_SUSPEND; + error = syscore_suspend(); if (!error) { *wakeup = pm_wakeup_pending(); @@ -443,6 +446,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) syscore_resume(); } + system_state = SYSTEM_RUNNING; + arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 11b4282c2d20..c2bcf97d24c8 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -269,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); + bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); bio_set_dev(bio, hib_resume_bdev); bio_set_op_attrs(bio, op, op_flags); @@ -376,7 +376,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) return -ENOSPC; if (hb) { - src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | + src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); @@ -384,7 +384,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) ret = hib_wait_io(hb); /* Free pages */ if (ret) return ret; - src = (void *)__get_free_page(__GFP_RECLAIM | + src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (src) { @@ -691,14 +691,14 @@ static int save_image_lzo(struct swap_map_handle *handle, nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH); + page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH); if (!page) { pr_err("Failed to allocate LZO page\n"); ret = -ENOMEM; goto out_clean; } - data = vmalloc(sizeof(*data) * nr_threads); + data = vmalloc(array_size(nr_threads, sizeof(*data))); if (!data) { pr_err("Failed to allocate LZO data\n"); ret = -ENOMEM; @@ -989,7 +989,7 @@ static int get_swap_reader(struct swap_map_handle *handle, last = tmp; tmp->map = (struct swap_map_page *) - __get_free_page(__GFP_RECLAIM | __GFP_HIGH); + __get_free_page(GFP_NOIO | __GFP_HIGH); if (!tmp->map) { release_swap_reader(handle); return -ENOMEM; @@ -1183,14 +1183,14 @@ static int load_image_lzo(struct swap_map_handle *handle, nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES); + page = vmalloc(array_size(LZO_MAX_RD_PAGES, sizeof(*page))); if (!page) { pr_err("Failed to allocate LZO page\n"); ret = -ENOMEM; goto out_clean; } - data = vmalloc(sizeof(*data) * nr_threads); + data = vmalloc(array_size(nr_threads, sizeof(*data))); if (!data) { pr_err("Failed to allocate LZO data\n"); ret = -ENOMEM; @@ -1261,8 +1261,8 @@ static int load_image_lzo(struct swap_map_handle *handle, for (i = 0; i < read_pages; i++) { page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? - __GFP_RECLAIM | __GFP_HIGH : - __GFP_RECLAIM | __GFP_NOWARN | + GFP_NOIO | __GFP_HIGH : + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (!page[i]) { diff --git a/kernel/power/user.c b/kernel/power/user.c index 75c959de4b29..abd225550271 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -186,6 +186,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, res = PAGE_SIZE - pg_offp; } + if (!data_of(data->handle)) { + res = -EINVAL; + goto unlock; + } + res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp, buf, count); if (res > 0) diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index dfba59be190b..4210152e56f0 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -188,6 +188,7 @@ static struct wakelock *wakelock_lookup_add(const char *name, size_t len, return ERR_PTR(-ENOMEM); } wl->ws.name = wl->name; + wl->ws.last_time = ktime_get(); wakeup_source_add(&wl->ws); rb_link_node(&wl->node, parent, node); rb_insert_color(&wl->node, &wakelocks_tree); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2f4af216bd6e..247808333ba4 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1908,6 +1908,7 @@ asmlinkage int vprintk_emit(int facility, int level, preempt_enable(); } + wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); @@ -2289,9 +2290,7 @@ void console_unlock(void) { static char ext_text[CONSOLE_EXT_LOG_MAX]; static char text[LOG_LINE_MAX + PREFIX_MAX]; - static u64 seen_seq; unsigned long flags; - bool wake_klogd = false; bool do_cond_resched, retry; if (console_suspended) { @@ -2335,11 +2334,6 @@ again: printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); - if (seen_seq != log_next_seq) { - wake_klogd = true; - seen_seq = log_next_seq; - } - if (console_seq < log_first_seq) { len = sprintf(text, "** %u printk messages dropped **\n", (unsigned)(log_first_seq - console_seq)); @@ -2397,7 +2391,7 @@ skip: if (console_lock_spinning_disable_and_check()) { printk_safe_exit_irqrestore(flags); - goto out; + return; } printk_safe_exit_irqrestore(flags); @@ -2429,10 +2423,6 @@ skip: if (retry && console_trylock()) goto again; - -out: - if (wake_klogd) - wake_up_klogd(); } EXPORT_SYMBOL(console_unlock); diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 3e3c2004bb23..d7d091309054 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -82,6 +82,7 @@ static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s, { int add; size_t len; + va_list ap; again: len = atomic_read(&s->len); @@ -100,7 +101,9 @@ again: if (!len) smp_rmb(); - add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + va_copy(ap, args); + add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap); + va_end(ap); if (!add) return 0; @@ -278,7 +281,7 @@ void printk_safe_flush_on_panic(void) * Make sure that we could access the main ring buffer. * Do not risk a double release when more CPUs are up. */ - if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) { + if (raw_spin_is_locked(&logbuf_lock)) { if (num_online_cpus() > 1) return; diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 7a693e31184a..40cea6735c2d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -270,6 +270,12 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) } } +/* Returns first leaf rcu_node of the specified RCU flavor. */ +#define rcu_first_leaf_node(rsp) ((rsp)->level[rcu_num_lvls - 1]) + +/* Is this rcu_node a leaf? */ +#define rcu_is_leaf_node(rnp) ((rnp)->level == rcu_num_lvls - 1) + /* * Do a full breadth-first scan of the rcu_node structures for the * specified rcu_state structure. @@ -284,8 +290,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * rcu_node tree with but one rcu_node structure, this loop is a no-op. */ #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) + for ((rnp) = &(rsp)->node[0]; !rcu_is_leaf_node(rsp, rnp); (rnp)++) /* * Scan the leaves of the rcu_node hierarchy for the specified rcu_state @@ -294,7 +299,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * It is still a leaf node, even if it is also the root node. */ #define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ + for ((rnp) = rcu_first_leaf_node(rsp); \ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* @@ -486,6 +491,7 @@ void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; +extern struct workqueue_struct *rcu_par_gp_wq; #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 88cba7c2956c..5aff271adf1e 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -404,24 +404,6 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) } /* - * Scan the specified rcu_segcblist structure for callbacks that need - * a grace period later than the one specified by "seq". We don't look - * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't - * have a grace-period sequence number. - */ -bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i; - - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) - if (rsclp->tails[i - 1] != rsclp->tails[i] && - ULONG_CMP_LT(seq, rsclp->gp_seq[i])) - return true; - return false; -} - -/* * Merge the source rcu_segcblist structure into the destination * rcu_segcblist structure, then initialize the source. Any pending * callbacks from the source get to start over. It is best to diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 581c12b63544..948470cef385 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -134,7 +134,5 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp); void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq); bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); -bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, - unsigned long seq); void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, struct rcu_segcblist *src_rsclp); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 777e7a6a0292..e232846516b3 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -369,7 +369,7 @@ static bool __maybe_unused torturing_tasks(void) */ static void rcu_perf_wait_shutdown(void) { - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) return; while (!torture_must_stop()) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 680c96d8c00f..42fcb7f05fac 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -593,7 +593,12 @@ static void srcu_torture_init(void) static void srcu_torture_cleanup(void) { - cleanup_srcu_struct(&srcu_ctld); + static DEFINE_TORTURE_RANDOM(rand); + + if (torture_random(&rand) & 0x800) + cleanup_srcu_struct(&srcu_ctld); + else + cleanup_srcu_struct_quiesced(&srcu_ctld); srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */ } @@ -826,8 +831,9 @@ rcu_torture_cbflood(void *arg) cbflood_intra_holdoff > 0 && cur_ops->call && cur_ops->cb_barrier) { - rhp = vmalloc(sizeof(*rhp) * - cbflood_n_burst * cbflood_n_per_burst); + rhp = vmalloc(array3_size(cbflood_n_burst, + cbflood_n_per_burst, + sizeof(*rhp))); err = !rhp; } if (err) { @@ -1609,6 +1615,9 @@ static enum cpuhp_state rcutor_hp; static void rcu_torture_cleanup(void) { + int flags = 0; + unsigned long gpnum = 0; + unsigned long completed = 0; int i; rcutorture_record_test_transition(); @@ -1639,6 +1648,11 @@ rcu_torture_cleanup(void) fakewriter_tasks = NULL; } + rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, + &flags, &gpnum, &completed); + pr_alert("%s: End-test grace-period state: g%lu c%lu f%#x\n", + cur_ops->name, gpnum, completed, flags); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); for (i = 0; i < ncbflooders; i++) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 76ac5f50b2c7..622792abe41a 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -86,16 +86,19 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); * Must invoke this after you are finished using a given srcu_struct that * was initialized via init_srcu_struct(), else you leak memory. */ -void cleanup_srcu_struct(struct srcu_struct *sp) +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) { WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); - flush_work(&sp->srcu_work); + if (quiesced) + WARN_ON(work_pending(&sp->srcu_work)); + else + flush_work(&sp->srcu_work); WARN_ON(sp->srcu_gp_running); WARN_ON(sp->srcu_gp_waiting); WARN_ON(sp->srcu_cb_head); WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail); } -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); /* * Removes the count for the old reader from the appropriate element of diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index fb560fca9ef4..b4123d7a2cec 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -366,24 +366,28 @@ static unsigned long srcu_get_delay(struct srcu_struct *sp) return SRCU_INTERVAL; } -/** - * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. - * - * Must invoke this after you are finished using a given srcu_struct that - * was initialized via init_srcu_struct(), else you leak memory. - */ -void cleanup_srcu_struct(struct srcu_struct *sp) +/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */ +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) { int cpu; if (WARN_ON(!srcu_get_delay(sp))) - return; /* Leakage unless caller handles error. */ + return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(sp))) - return; /* Leakage unless caller handles error. */ - flush_delayed_work(&sp->work); + return; /* Just leak it! */ + if (quiesced) { + if (WARN_ON(delayed_work_pending(&sp->work))) + return; /* Just leak it! */ + } else { + flush_delayed_work(&sp->work); + } for_each_possible_cpu(cpu) - flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + if (quiesced) { + if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work))) + return; /* Just leak it! */ + } else { + flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + } if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(sp))) { pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); @@ -392,7 +396,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) free_percpu(sp->sda); sp->sda = NULL; } -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); /* * Counts the new reader in the appropriate per-CPU element of the diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2a734692a581..aa7cade1b9f3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -524,8 +524,6 @@ module_param(rcu_kick_kthreads, bool, 0644); static ulong jiffies_till_sched_qs = HZ / 10; module_param(jiffies_till_sched_qs, ulong, 0444); -static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp); static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); @@ -711,44 +709,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) } /* - * Is there any need for future grace periods? - * Interrupts must be disabled. If the caller does not hold the root - * rnp_node structure's ->lock, the results are advisory only. - */ -static int rcu_future_needs_gp(struct rcu_state *rsp) -{ - struct rcu_node *rnp = rcu_get_root(rsp); - int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; - int *fp = &rnp->need_future_gp[idx]; - - lockdep_assert_irqs_disabled(); - return READ_ONCE(*fp); -} - -/* - * Does the current CPU require a not-yet-started grace period? - * The caller must have disabled interrupts to prevent races with - * normal callback registry. - */ -static bool -cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); - if (rcu_gp_in_progress(rsp)) - return false; /* No, a grace period is already in progress. */ - if (rcu_future_needs_gp(rsp)) - return true; /* Yes, a no-CBs CPU needs one. */ - if (!rcu_segcblist_is_enabled(&rdp->cblist)) - return false; /* No, this is a no-CBs (or offline) CPU. */ - if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) - return true; /* Yes, CPU has newly registered callbacks. */ - if (rcu_segcblist_future_gp_needed(&rdp->cblist, - READ_ONCE(rsp->completed))) - return true; /* Yes, CBs for future grace period. */ - return false; /* No grace period needed. */ -} - -/* * Enter an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. * @@ -1234,10 +1194,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) } /* - * Has this CPU encountered a cond_resched_rcu_qs() since the - * beginning of the grace period? For this to be the case, - * the CPU has to have noticed the current grace period. This - * might not be the case for nohz_full CPUs looping in the kernel. + * Has this CPU encountered a cond_resched() since the beginning + * of the grace period? For this to be the case, the CPU has to + * have noticed the current grace period. This might not be the + * case for nohz_full CPUs looping in the kernel. */ jtsq = jiffies_till_sched_qs; ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); @@ -1642,18 +1602,30 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, return rnp->completed + 1; /* + * If the current rcu_node structure believes that RCU is + * idle, and if the rcu_state structure does not yet reflect + * the start of a new grace period, then the next grace period + * will suffice. The memory barrier is needed to accurately + * sample the rsp->gpnum, and pairs with the second lock + * acquisition in rcu_gp_init(), which is augmented with + * smp_mb__after_unlock_lock() for this purpose. + */ + if (rnp->gpnum == rnp->completed) { + smp_mb(); /* See above block comment. */ + if (READ_ONCE(rsp->gpnum) == rnp->completed) + return rnp->completed + 1; + } + + /* * Otherwise, wait for a possible partial grace period and * then the subsequent full grace period. */ return rnp->completed + 2; } -/* - * Trace-event helper function for rcu_start_future_gp() and - * rcu_nocb_wait_gp(). - */ -static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c, const char *s) +/* Trace-event wrapper function for trace_rcu_future_grace_period. */ +static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c, const char *s) { trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, rnp->completed, c, rnp->level, @@ -1661,96 +1633,67 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, } /* - * Start some future grace period, as needed to handle newly arrived + * Start the specified grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each - * rcu_node structure's ->need_future_gp field. Returns true if there + * rcu_node structure's ->need_future_gp[] field. Returns true if there * is reason to awaken the grace-period kthread. * - * The caller must hold the specified rcu_node structure's ->lock. + * The caller must hold the specified rcu_node structure's ->lock, which + * is why the caller is responsible for waking the grace-period kthread. */ -static bool __maybe_unused -rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long *c_out) +static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c) { - unsigned long c; bool ret = false; - struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); - - raw_lockdep_assert_held_rcu_node(rnp); - - /* - * Pick up grace-period number for new callbacks. If this - * grace period is already marked as needed, return to the caller. - */ - c = rcu_cbs_completed(rdp->rsp, rnp); - trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); - if (rnp->need_future_gp[c & 0x1]) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); - goto out; - } + struct rcu_state *rsp = rdp->rsp; + struct rcu_node *rnp_root; /* - * If either this rcu_node structure or the root rcu_node structure - * believe that a grace period is in progress, then we must wait - * for the one following, which is in "c". Because our request - * will be noticed at the end of the current grace period, we don't - * need to explicitly start one. We only do the lockless check - * of rnp_root's fields if the current rcu_node structure thinks - * there is no grace period in flight, and because we hold rnp->lock, - * the only possible change is when rnp_root's two fields are - * equal, in which case rnp_root->gpnum might be concurrently - * incremented. But that is OK, as it will just result in our - * doing some extra useless work. + * Use funnel locking to either acquire the root rcu_node + * structure's lock or bail out if the need for this grace period + * has already been recorded -- or has already started. If there + * is already a grace period in progress in a non-leaf node, no + * recording is needed because the end of the grace period will + * scan the leaf rcu_node structures. Note that rnp->lock must + * not be released. */ - if (rnp->gpnum != rnp->completed || - READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) { - rnp->need_future_gp[c & 0x1]++; - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); - goto out; + raw_lockdep_assert_held_rcu_node(rnp); + trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); + for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { + if (rnp_root != rnp) + raw_spin_lock_rcu_node(rnp_root); + WARN_ON_ONCE(ULONG_CMP_LT(rnp_root->gpnum + + need_future_gp_mask(), c)); + if (need_future_gp_element(rnp_root, c) || + ULONG_CMP_GE(rnp_root->gpnum, c) || + (rnp != rnp_root && + rnp_root->gpnum != rnp_root->completed)) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); + goto unlock_out; + } + need_future_gp_element(rnp_root, c) = true; + if (rnp_root != rnp && rnp_root->parent != NULL) + raw_spin_unlock_rcu_node(rnp_root); + if (!rnp_root->parent) + break; /* At root, and perhaps also leaf. */ } - /* - * There might be no grace period in progress. If we don't already - * hold it, acquire the root rcu_node structure's lock in order to - * start one (if needed). - */ - if (rnp != rnp_root) - raw_spin_lock_rcu_node(rnp_root); - - /* - * Get a new grace-period number. If there really is no grace - * period in progress, it will be smaller than the one we obtained - * earlier. Adjust callbacks as needed. - */ - c = rcu_cbs_completed(rdp->rsp, rnp_root); - if (!rcu_is_nocb_cpu(rdp->cpu)) - (void)rcu_segcblist_accelerate(&rdp->cblist, c); - - /* - * If the needed for the required grace period is already - * recorded, trace and leave. - */ - if (rnp_root->need_future_gp[c & 0x1]) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); + /* If GP already in progress, just leave, otherwise start one. */ + if (rnp_root->gpnum != rnp_root->completed) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot")); goto unlock_out; } - - /* Record the need for the future grace period. */ - rnp_root->need_future_gp[c & 0x1]++; - - /* If a grace period is not already in progress, start one. */ - if (rnp_root->gpnum != rnp_root->completed) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); - } else { - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); - ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot")); + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); + if (!rsp->gp_kthread) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread")); + goto unlock_out; } + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); + ret = true; /* Caller must wake GP kthread. */ unlock_out: if (rnp != rnp_root) raw_spin_unlock_rcu_node(rnp_root); -out: - if (c_out != NULL) - *c_out = c; return ret; } @@ -1758,16 +1701,16 @@ out: * Clean up any old requests for the just-ended grace period. Also return * whether any additional grace periods have been requested. */ -static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - int c = rnp->completed; - int needmore; + unsigned long c = rnp->completed; + bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - rnp->need_future_gp[c & 0x1] = 0; - needmore = rnp->need_future_gp[(c + 1) & 0x1]; - trace_rcu_future_gp(rnp, rdp, c, - needmore ? TPS("CleanupMore") : TPS("Cleanup")); + need_future_gp_element(rnp, c) = false; + needmore = need_any_future_gp(rnp); + trace_rcu_this_gp(rnp, rdp, c, + needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; } @@ -1802,6 +1745,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + unsigned long c; bool ret = false; raw_lockdep_assert_held_rcu_node(rnp); @@ -1820,8 +1764,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * accelerating callback invocation to an earlier grace-period * number. */ - if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp))) - ret = rcu_start_future_gp(rnp, rdp, NULL); + c = rcu_cbs_completed(rsp, rnp); + if (rcu_segcblist_accelerate(&rdp->cblist, c)) + ret = rcu_start_this_gp(rnp, rdp, c); /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) @@ -2049,7 +1994,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq_rcu_node(rnp); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); } @@ -2108,7 +2053,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; bool needgp = false; - int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); struct swait_queue_head *sq; @@ -2147,31 +2091,35 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) if (rnp == rdp->mynode) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ - nocb += rcu_future_gp_cleanup(rsp, rnp); + needgp = rcu_future_gp_cleanup(rsp, rnp) || needgp; sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq_rcu_node(rnp); rcu_nocb_gp_cleanup(sq); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); rcu_gp_slow(rsp, gp_cleanup_delay); } rnp = rcu_get_root(rsp); raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ - rcu_nocb_gp_set(rnp, nocb); /* Declare grace period done. */ WRITE_ONCE(rsp->completed, rsp->gpnum); trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->gp_state = RCU_GP_IDLE; + /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); + if (need_any_future_gp(rnp)) { + trace_rcu_this_gp(rnp, rdp, rsp->completed - 1, + TPS("CleanupMore")); + needgp = true; + } /* Advance CBs to reduce false positives below. */ - needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; - if (needgp || cpu_needs_another_gp(rsp, rdp)) { + if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) { WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); } + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); raw_spin_unlock_irq_rcu_node(rnp); } @@ -2202,7 +2150,7 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) break; - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, @@ -2247,7 +2195,7 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqsend")); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); ret = 0; /* Force full wait till next FQS. */ j = jiffies_till_next_fqs; @@ -2260,7 +2208,7 @@ static int __noreturn rcu_gp_kthread(void *arg) } } else { /* Deal with stray signal. */ - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, @@ -2283,71 +2231,6 @@ static int __noreturn rcu_gp_kthread(void *arg) } /* - * Start a new RCU grace period if warranted, re-initializing the hierarchy - * in preparation for detecting the next grace period. The caller must hold - * the root node's ->lock and hard irqs must be disabled. - * - * Note that it is legal for a dying CPU (which is marked as offline) to - * invoke this function. This can happen when the dying CPU reports its - * quiescent state. - * - * Returns true if the grace-period kthread must be awakened. - */ -static bool -rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp) -{ - raw_lockdep_assert_held_rcu_node(rnp); - if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { - /* - * Either we have not yet spawned the grace-period - * task, this CPU does not need another grace period, - * or a grace period is already in progress. - * Either way, don't start a new grace period. - */ - return false; - } - WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), - TPS("newreq")); - - /* - * We can't do wakeups while holding the rnp->lock, as that - * could cause possible deadlocks with the rq->lock. Defer - * the wakeup to our caller. - */ - return true; -} - -/* - * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's - * callbacks. Note that rcu_start_gp_advanced() cannot do this because it - * is invoked indirectly from rcu_advance_cbs(), which would result in - * endless recursion -- or would do so if it wasn't for the self-deadlock - * that is encountered beforehand. - * - * Returns true if the grace-period kthread needs to be awakened. - */ -static bool rcu_start_gp(struct rcu_state *rsp) -{ - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_node *rnp = rcu_get_root(rsp); - bool ret = false; - - /* - * If there is no grace period in progress right now, any - * callbacks we have up to this point will be satisfied by the - * next grace period. Also, advancing the callbacks reduces the - * probability of false positives from cpu_needs_another_gp() - * resulting in pointless grace periods. So, advance callbacks - * then start the grace period! - */ - ret = rcu_advance_cbs(rsp, rnp, rdp) || ret; - ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret; - return ret; -} - -/* * Report a full set of quiescent states to the specified rcu_state data * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period * kthread if another grace period is required. Whether we wake @@ -2398,7 +2281,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ - WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 && + WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && rcu_preempt_blocked_readers_cgp(rnp)); rnp->qsmask &= ~mask; trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, @@ -2782,7 +2665,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { @@ -2874,22 +2757,27 @@ __rcu_process_callbacks(struct rcu_state *rsp) unsigned long flags; bool needwake; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + struct rcu_node *rnp; WARN_ON_ONCE(!rdp->beenonline); /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); - /* Does this CPU require a not-yet-started grace period? */ - local_irq_save(flags); - if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ - needwake = rcu_start_gp(rsp); - raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); - if (needwake) - rcu_gp_kthread_wake(rsp); - } else { - local_irq_restore(flags); + /* No grace period and unregistered callbacks? */ + if (!rcu_gp_in_progress(rsp) && + rcu_segcblist_is_enabled(&rdp->cblist)) { + local_irq_save(flags); + if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) { + local_irq_restore(flags); + } else { + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); + } } /* If there are callbacks ready, invoke them. */ @@ -2973,11 +2861,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Start a new grace period if one not already started. */ if (!rcu_gp_in_progress(rsp)) { - struct rcu_node *rnp_root = rcu_get_root(rsp); + struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_rcu_node(rnp_root); - needwake = rcu_start_gp(rsp); - raw_spin_unlock_rcu_node(rnp_root); + raw_spin_lock_rcu_node(rnp); + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock_rcu_node(rnp); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3368,7 +3256,9 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 1; /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) + if (!rcu_gp_in_progress(rsp) && + rcu_segcblist_is_enabled(&rdp->cblist) && + !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; /* Has another RCU grace period completed? */ @@ -3775,6 +3665,8 @@ int rcutree_dead_cpu(unsigned int cpu) return 0; } +static DEFINE_PER_CPU(int, rcu_cpu_started); + /* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that @@ -3796,6 +3688,11 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_node *rnp; struct rcu_state *rsp; + if (per_cpu(rcu_cpu_started, cpu)) + return; + + per_cpu(rcu_cpu_started, cpu) = 1; + for_each_rcu_flavor(rsp) { rdp = per_cpu_ptr(rsp->rda, cpu); rnp = rdp->mynode; @@ -3852,6 +3749,8 @@ void rcu_report_dead(unsigned int cpu) preempt_enable(); for_each_rcu_flavor(rsp) rcu_cleanup_dying_idle_cpu(cpu, rsp); + + per_cpu(rcu_cpu_started, cpu) = 0; } /* Migrate the dead CPU's callbacks to the current CPU. */ @@ -3861,6 +3760,7 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) struct rcu_data *my_rdp; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + bool needwake; if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ @@ -3872,12 +3772,15 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) return; } raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ - rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ - rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */ + /* Leverage recent GPs and set GP for new callbacks. */ + needwake = rcu_advance_cbs(rsp, rnp_root, rdp) || + rcu_advance_cbs(rsp, rnp_root, my_rdp); rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", @@ -4056,7 +3959,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) init_swait_queue_head(&rsp->gp_wq); init_swait_queue_head(&rsp->expedited_wq); - rnp = rsp->level[rcu_num_lvls - 1]; + rnp = rcu_first_leaf_node(rsp); for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; @@ -4168,6 +4071,7 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) } struct workqueue_struct *rcu_gp_wq; +struct workqueue_struct *rcu_par_gp_wq; void __init rcu_init(void) { @@ -4199,6 +4103,8 @@ void __init rcu_init(void) /* Create workqueue for expedited GPs and for Tree SRCU. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); + rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_par_gp_wq); } #include "tree_exp.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f491ab4f2e8e..78e051dffc5b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -58,6 +58,14 @@ struct rcu_dynticks { #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; +/* Communicate arguments to a workqueue handler. */ +struct rcu_exp_work { + smp_call_func_t rew_func; + struct rcu_state *rew_rsp; + unsigned long rew_s; + struct work_struct rew_work; +}; + /* RCU's kthread states for tracing. */ #define RCU_KTHREAD_STOPPED 0 #define RCU_KTHREAD_RUNNING 1 @@ -150,15 +158,32 @@ struct rcu_node { struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - int need_future_gp[2]; - /* Counts of upcoming no-CB GP requests. */ + u8 need_future_gp[4]; /* Counts of upcoming GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; spinlock_t exp_lock ____cacheline_internodealigned_in_smp; unsigned long exp_seq_rq; wait_queue_head_t exp_wq[4]; + struct rcu_exp_work rew; + bool exp_need_flush; /* Need to flush workitem? */ } ____cacheline_internodealigned_in_smp; +/* Accessors for ->need_future_gp[] array. */ +#define need_future_gp_mask() \ + (ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1) +#define need_future_gp_element(rnp, c) \ + ((rnp)->need_future_gp[(c) & need_future_gp_mask()]) +#define need_any_future_gp(rnp) \ +({ \ + int __i; \ + bool __nonzero = false; \ + \ + for (__i = 0; __i < ARRAY_SIZE((rnp)->need_future_gp); __i++) \ + __nonzero = __nonzero || \ + READ_ONCE((rnp)->need_future_gp[__i]); \ + __nonzero; \ +}) + /* * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and * are indexed relative to this interval rather than the global CPU ID space. @@ -224,10 +249,6 @@ struct rcu_data { #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - atomic_long_t exp_workdone0; /* # done by workqueue. */ - atomic_long_t exp_workdone1; /* # done by others #1. */ - atomic_long_t exp_workdone2; /* # done by others #2. */ - atomic_long_t exp_workdone3; /* # done by others #3. */ int exp_dynticks_snap; /* Double-check need for IPI. */ /* 6) Callback offloading. */ @@ -408,7 +429,6 @@ extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ int rcu_dynticks_snap(struct rcu_dynticks *rdtp); -bool rcu_eqs_special_set(int cpu); #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); @@ -438,7 +458,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); #ifdef CONFIG_RCU_BOOST -static void rcu_preempt_do_callbacks(void); static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ @@ -454,7 +473,6 @@ static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index f72eefab8543..d40708e8c5d6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -20,6 +20,8 @@ * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ +#include <linux/lockdep.h> + /* * Record the start of an expedited grace period. */ @@ -154,15 +156,35 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * for the current expedited grace period. Works only for preemptible * RCU -- other RCU implementation use other means. * - * Caller must hold the rcu_state's exp_mutex. + * Caller must hold the specificed rcu_node structure's ->lock */ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) { + raw_lockdep_assert_held_rcu_node(rnp); + return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; } /* + * Like sync_rcu_preempt_exp_done(), but this function assumes the caller + * doesn't hold the rcu_node's ->lock, and will acquire and release the lock + * itself + */ +static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp) +{ + unsigned long flags; + bool ret; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + ret = sync_rcu_preempt_exp_done(rnp); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + return ret; +} + + +/* * Report the exit from RCU read-side critical section for the last task * that queued itself during or before the current expedited preemptible-RCU * grace period. This event is reported either to the rcu_node structure on @@ -170,8 +192,7 @@ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold the rcu_state's exp_mutex and the specified rcu_node - * structure's ->lock. + * Caller must hold the specified rcu_node structure's ->lock. */ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake, unsigned long flags) @@ -207,8 +228,6 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, /* * Report expedited quiescent state for specified node. This is a * lock-acquisition wrapper function for __rcu_report_exp_rnp(). - * - * Caller must hold the rcu_state's exp_mutex. */ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) @@ -221,8 +240,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the rcu_state's - * exp_mutex. + * specified leaf rcu_node structure. */ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long mask, bool wake) @@ -248,14 +266,12 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, } /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, - unsigned long s) +static bool sync_exp_work_done(struct rcu_state *rsp, unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(stat); return true; } return false; @@ -289,7 +305,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) * promoting locality and is not strictly needed for correctness. */ for (; rnp != NULL; rnp = rnp->parent) { - if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + if (sync_exp_work_done(rsp, s)) return true; /* Work not done, either wait here or go up. */ @@ -302,8 +318,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp->grplo, rnp->grphi, TPS("wait")); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], - sync_exp_work_done(rsp, - &rdp->exp_workdone2, s)); + sync_exp_work_done(rsp, s)); return true; } rnp->exp_seq_rq = s; /* Followers can wait on us. */ @@ -313,7 +328,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) } mutex_lock(&rsp->exp_mutex); fastpath: - if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + if (sync_exp_work_done(rsp, s)) { mutex_unlock(&rsp->exp_mutex); return true; } @@ -362,93 +377,129 @@ static void sync_sched_exp_online_cleanup(int cpu) } /* - * Select the nodes that the upcoming expedited grace period needs - * to wait for. + * Select the CPUs within the specified rcu_node that the upcoming + * expedited grace period needs to wait for. */ -static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, - smp_call_func_t func) +static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) { int cpu; unsigned long flags; + smp_call_func_t func; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; - struct rcu_node *rnp; - - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); - sync_exp_reset_tree(rsp); - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); + struct rcu_state *rsp = rewp->rew_rsp; - /* Each pass checks a CPU for identity, offline, and idle. */ - mask_ofl_test = 0; - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); - int snap; + func = rewp->rew_func; + raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (raw_smp_processor_id() == cpu || - !(rnp->qsmaskinitnext & mask)) { + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); + int snap; + + if (raw_smp_processor_id() == cpu || + !(rnp->qsmaskinitnext & mask)) { + mask_ofl_test |= mask; + } else { + snap = rcu_dynticks_snap(rdtp); + if (rcu_dynticks_in_eqs(snap)) mask_ofl_test |= mask; - } else { - snap = rcu_dynticks_snap(rdtp); - if (rcu_dynticks_in_eqs(snap)) - mask_ofl_test |= mask; - else - rdp->exp_dynticks_snap = snap; - } + else + rdp->exp_dynticks_snap = snap; } - mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited - * GP until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - /* IPI the remaining CPUs for expedited quiescent state. */ - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited GP + * until such time as the ->expmask bits are cleared. + */ + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (!(mask_ofl_ipi & mask)) - continue; + /* IPI the remaining CPUs for expedited quiescent state. */ + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + + if (!(mask_ofl_ipi & mask)) + continue; retry_ipi: - if (rcu_dynticks_in_eqs_since(rdp->dynticks, - rdp->exp_dynticks_snap)) { - mask_ofl_test |= mask; - continue; - } - ret = smp_call_function_single(cpu, func, rsp, 0); - if (!ret) { - mask_ofl_ipi &= ~mask; - continue; - } - /* Failed, raced with CPU hotplug operation. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if ((rnp->qsmaskinitnext & mask) && - (rnp->expmask & mask)) { - /* Online, so delay for a bit and try again. */ - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); - schedule_timeout_uninterruptible(1); - goto retry_ipi; - } - /* CPU really is offline, so we can ignore it. */ - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; + if (rcu_dynticks_in_eqs_since(rdp->dynticks, + rdp->exp_dynticks_snap)) { + mask_ofl_test |= mask; + continue; + } + ret = smp_call_function_single(cpu, func, rsp, 0); + if (!ret) { + mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with CPU hotplug operation. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if ((rnp->qsmaskinitnext & mask) && + (rnp->expmask & mask)) { + /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); + schedule_timeout_uninterruptible(1); + goto retry_ipi; + } + /* CPU really is offline, so we can ignore it. */ + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); +} + +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, + smp_call_func_t func) +{ + struct rcu_node *rnp; + + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); + sync_exp_reset_tree(rsp); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); + + /* Schedule work for each leaf rcu_node structure. */ + rcu_for_each_leaf_node(rsp, rnp) { + rnp->exp_need_flush = false; + if (!READ_ONCE(rnp->expmask)) + continue; /* Avoid early boot non-existent wq. */ + rnp->rew.rew_func = func; + rnp->rew.rew_rsp = rsp; + if (!READ_ONCE(rcu_par_gp_wq) || + rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { + /* No workqueues yet. */ + sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); + continue; } - /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); + INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work); + rnp->exp_need_flush = true; } + + /* Wait for workqueue jobs (if any) to complete. */ + rcu_for_each_leaf_node(rsp, rnp) + if (rnp->exp_need_flush) + flush_work(&rnp->rew.rew_work); } static void synchronize_sched_expedited_wait(struct rcu_state *rsp) @@ -469,9 +520,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) for (;;) { ret = swait_event_timeout( rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root), + sync_rcu_preempt_exp_done_unlocked(rnp_root), jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) + if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root)) return; WARN_ON(ret < 0); /* workqueues should not be signaled. */ if (rcu_cpu_stall_suppress) @@ -504,7 +555,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rcu_for_each_node_breadth_first(rsp, rnp) { if (rnp == rnp_root) continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done(rnp)) + if (sync_rcu_preempt_exp_done_unlocked(rnp)) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, @@ -560,14 +611,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) mutex_unlock(&rsp->exp_wake_mutex); } -/* Let the workqueue handler know what it is supposed to do. */ -struct rcu_exp_work { - smp_call_func_t rew_func; - struct rcu_state *rew_rsp; - unsigned long rew_s; - struct work_struct rew_work; -}; - /* * Common code to drive an expedited grace period forward, used by * workqueues and mid-boot-time tasks. @@ -633,7 +676,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); rnp = rcu_get_root(rsp); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], - sync_exp_work_done(rsp, &rdp->exp_workdone0, s)); + sync_exp_work_done(rsp, s)); smp_mb(); /* Workqueue actions happen before return. */ /* Let the next expedited grace period start. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 84fbee4686d3..7fd12039e512 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -182,7 +182,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) raw_lockdep_assert_held_rcu_node(rnp); WARN_ON_ONCE(rdp->mynode != rnp); - WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); + WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); /* * Decide where to queue the newly blocked task. In theory, @@ -384,6 +384,50 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) } /* + * Preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* critical section after entry code. */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting != 1) { + --t->rcu_read_lock_nesting; + } else { + barrier(); /* critical section before exit code. */ + t->rcu_read_lock_nesting = INT_MIN; + barrier(); /* assign before ->rcu_read_unlock_special load */ + if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) + rcu_read_unlock_special(t); + barrier(); /* ->rcu_read_unlock_special load before assign */ + t->rcu_read_lock_nesting = 0; + } +#ifdef CONFIG_PROVE_LOCKING + { + int rrln = READ_ONCE(t->rcu_read_lock_nesting); + + WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + } +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +/* * Advance a ->blkd_tasks-list pointer to the next entry, instead * returning NULL if at the end of the list. */ @@ -489,7 +533,7 @@ void rcu_read_unlock_special(struct task_struct *t) rnp = t->rcu_blocked_node; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ WARN_ON_ONCE(rnp != t->rcu_blocked_node); - WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); + WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -685,15 +729,6 @@ static void rcu_preempt_check_callbacks(void) t->rcu_read_unlock_special.b.need_qs = true; } -#ifdef CONFIG_RCU_BOOST - -static void rcu_preempt_do_callbacks(void) -{ - rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p)); -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - /** * call_rcu() - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -1140,7 +1175,7 @@ static void rcu_kthread_do_work(void) { rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); - rcu_preempt_do_callbacks(); + rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); } static void rcu_cpu_kthread_setup(unsigned int cpu) @@ -1607,7 +1642,7 @@ static int rcu_oom_notify(struct notifier_block *self, for_each_online_cpu(cpu) { smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); } /* Unconditionally decrement: no need to wake ourselves up. */ @@ -1780,19 +1815,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) swake_up_all(sq); } -/* - * Set the root rcu_node structure's ->need_future_gp field - * based on the sum of those of all rcu_node structures. This does - * double-count the root rcu_node structure's requests, but this - * is necessary to handle the possibility of a rcu_nocb_kthread() - * having awakened during the time that the rcu_node structures - * were being updated for the end of the previous grace period. - */ -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) -{ - rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; -} - static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { return &rnp->nocb_gp_wq[rnp->completed & 0x1]; @@ -1966,7 +1988,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, + wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); } rdp->qlen_last_fqs_check = LONG_MAX / 2; @@ -2048,7 +2070,8 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) struct rcu_node *rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - needwake = rcu_start_future_gp(rnp, rdp, &c); + c = rcu_cbs_completed(rdp->rsp, rnp); + needwake = rcu_start_this_gp(rnp, rdp, c); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rdp->rsp); @@ -2057,7 +2080,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) * Wait for the grace period. Do so interruptibly to avoid messing * up the load average. */ - trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { swait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], @@ -2065,9 +2088,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) if (likely(d)) break; WARN_ON(signal_pending(current)); - trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait")); } - trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait")); smp_mb(); /* Ensure that CB invocation happens after GP end. */ } @@ -2236,7 +2259,7 @@ static int rcu_nocb_kthread(void *arg) cl++; c++; local_bh_enable(); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); @@ -2292,7 +2315,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) void __init rcu_init_nohz(void) { int cpu; - bool need_rcu_nocb_mask = true; + bool need_rcu_nocb_mask = false; struct rcu_state *rsp; #if defined(CONFIG_NO_HZ_FULL) @@ -2315,7 +2338,7 @@ void __init rcu_init_nohz(void) #endif /* #if defined(CONFIG_NO_HZ_FULL) */ if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { - pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); + pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); cpumask_and(rcu_nocb_mask, cpu_possible_mask, rcu_nocb_mask); } @@ -2495,10 +2518,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { } -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) -{ -} - static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { return NULL; @@ -2587,8 +2606,7 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) } /* - * Bind the grace-period kthread for the sysidle flavor of RCU to the - * timekeeping CPU. + * Bind the RCU grace-period kthreads to the housekeeping CPU. */ static void rcu_bind_gp_kthread(void) { diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 68fa19a5e7bd..4c230a60ece4 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -226,54 +226,6 @@ core_initcall(rcu_set_runtime_mode); #endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */ -#ifdef CONFIG_PREEMPT_RCU - -/* - * Preemptible RCU implementation for rcu_read_lock(). - * Just increment ->rcu_read_lock_nesting, shared state will be updated - * if we block. - */ -void __rcu_read_lock(void) -{ - current->rcu_read_lock_nesting++; - barrier(); /* critical section after entry code. */ -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - -/* - * Preemptible RCU implementation for rcu_read_unlock(). - * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost - * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then - * invoke rcu_read_unlock_special() to clean up after a context switch - * in an RCU read-side critical section and other special cases. - */ -void __rcu_read_unlock(void) -{ - struct task_struct *t = current; - - if (t->rcu_read_lock_nesting != 1) { - --t->rcu_read_lock_nesting; - } else { - barrier(); /* critical section before exit code. */ - t->rcu_read_lock_nesting = INT_MIN; - barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) - rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; - } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = READ_ONCE(t->rcu_read_lock_nesting); - - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); - } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - -#endif /* #ifdef CONFIG_PREEMPT_RCU */ - #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; struct lockdep_map rcu_lock_map = @@ -624,7 +576,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks); * grace period has elapsed, in other words after all currently * executing rcu-tasks read-side critical sections have elapsed. These * read-side critical sections are delimited by calls to schedule(), - * cond_resched_rcu_qs(), idle execution, userspace execution, calls + * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). * * This is a very specialized primitive, intended only for a few uses in diff --git a/kernel/relay.c b/kernel/relay.c index c955b10c973c..04f248644e06 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -39,7 +39,7 @@ static void relay_file_mmap_close(struct vm_area_struct *vma) /* * fault() vm_op implementation for relay file mapping. */ -static int relay_buf_fault(struct vm_fault *vmf) +static vm_fault_t relay_buf_fault(struct vm_fault *vmf) { struct page *page; struct rchan_buf *buf = vmf->vma->vm_private_data; @@ -169,7 +169,8 @@ static struct rchan_buf *relay_create_buf(struct rchan *chan) buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); if (!buf) return NULL; - buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); + buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t *), + GFP_KERNEL); if (!buf->padding) goto free_buf; diff --git a/kernel/resource.c b/kernel/resource.c index 2af6c03858b9..30e1bc68503b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -87,7 +87,7 @@ enum { MAX_IORES_LEVEL = 5 }; static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { - struct resource *p = m->private; + struct resource *p = PDE_DATA(file_inode(m->file)); loff_t l = 0; read_lock(&resource_lock); for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) @@ -103,7 +103,7 @@ static void r_stop(struct seq_file *m, void *v) static int r_show(struct seq_file *m, void *v) { - struct resource *root = m->private; + struct resource *root = PDE_DATA(file_inode(m->file)); struct resource *r = v, *p; unsigned long long start, end; int width = root->end < 0x10000 ? 4 : 8; @@ -135,44 +135,11 @@ static const struct seq_operations resource_op = { .show = r_show, }; -static int ioports_open(struct inode *inode, struct file *file) -{ - int res = seq_open(file, &resource_op); - if (!res) { - struct seq_file *m = file->private_data; - m->private = &ioport_resource; - } - return res; -} - -static int iomem_open(struct inode *inode, struct file *file) -{ - int res = seq_open(file, &resource_op); - if (!res) { - struct seq_file *m = file->private_data; - m->private = &iomem_resource; - } - return res; -} - -static const struct file_operations proc_ioports_operations = { - .open = ioports_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const struct file_operations proc_iomem_operations = { - .open = iomem_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init ioresources_init(void) { - proc_create("ioports", 0, NULL, &proc_ioports_operations); - proc_create("iomem", 0, NULL, &proc_iomem_operations); + proc_create_seq_data("ioports", 0, NULL, &resource_op, + &ioport_resource); + proc_create_seq_data("iomem", 0, NULL, &resource_op, &iomem_resource); return 0; } __initcall(ioresources_init); @@ -448,6 +415,7 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, return __walk_iomem_res_desc(&res, desc, false, arg, func); } +EXPORT_SYMBOL_GPL(walk_iomem_res_desc); /* * This function calls the @func callback against all memory ranges of type diff --git a/kernel/rseq.c b/kernel/rseq.c new file mode 100644 index 000000000000..ae306f90c514 --- /dev/null +++ b/kernel/rseq.c @@ -0,0 +1,357 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Restartable sequences system call + * + * Copyright (C) 2015, Google, Inc., + * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com> + * Copyright (C) 2015-2018, EfficiOS Inc., + * Mathieu Desnoyers <mathieu.desnoyers@efficios.com> + */ + +#include <linux/sched.h> +#include <linux/uaccess.h> +#include <linux/syscalls.h> +#include <linux/rseq.h> +#include <linux/types.h> +#include <asm/ptrace.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/rseq.h> + +#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \ + RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT) + +/* + * + * Restartable sequences are a lightweight interface that allows + * user-level code to be executed atomically relative to scheduler + * preemption and signal delivery. Typically used for implementing + * per-cpu operations. + * + * It allows user-space to perform update operations on per-cpu data + * without requiring heavy-weight atomic operations. + * + * Detailed algorithm of rseq user-space assembly sequences: + * + * init(rseq_cs) + * cpu = TLS->rseq::cpu_id_start + * [1] TLS->rseq::rseq_cs = rseq_cs + * [start_ip] ---------------------------- + * [2] if (cpu != TLS->rseq::cpu_id) + * goto abort_ip; + * [3] <last_instruction_in_cs> + * [post_commit_ip] ---------------------------- + * + * The address of jump target abort_ip must be outside the critical + * region, i.e.: + * + * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip] + * + * Steps [2]-[3] (inclusive) need to be a sequence of instructions in + * userspace that can handle being interrupted between any of those + * instructions, and then resumed to the abort_ip. + * + * 1. Userspace stores the address of the struct rseq_cs assembly + * block descriptor into the rseq_cs field of the registered + * struct rseq TLS area. This update is performed through a single + * store within the inline assembly instruction sequence. + * [start_ip] + * + * 2. Userspace tests to check whether the current cpu_id field match + * the cpu number loaded before start_ip, branching to abort_ip + * in case of a mismatch. + * + * If the sequence is preempted or interrupted by a signal + * at or after start_ip and before post_commit_ip, then the kernel + * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return + * ip to abort_ip before returning to user-space, so the preempted + * execution resumes at abort_ip. + * + * 3. Userspace critical section final instruction before + * post_commit_ip is the commit. The critical section is + * self-terminating. + * [post_commit_ip] + * + * 4. <success> + * + * On failure at [2], or if interrupted by preempt or signal delivery + * between [1] and [3]: + * + * [abort_ip] + * F1. <failure> + */ + +static int rseq_update_cpu_id(struct task_struct *t) +{ + u32 cpu_id = raw_smp_processor_id(); + + if (__put_user(cpu_id, &t->rseq->cpu_id_start)) + return -EFAULT; + if (__put_user(cpu_id, &t->rseq->cpu_id)) + return -EFAULT; + trace_rseq_update(t); + return 0; +} + +static int rseq_reset_rseq_cpu_id(struct task_struct *t) +{ + u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED; + + /* + * Reset cpu_id_start to its initial state (0). + */ + if (__put_user(cpu_id_start, &t->rseq->cpu_id_start)) + return -EFAULT; + /* + * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming + * in after unregistration can figure out that rseq needs to be + * registered again. + */ + if (__put_user(cpu_id, &t->rseq->cpu_id)) + return -EFAULT; + return 0; +} + +static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) +{ + struct rseq_cs __user *urseq_cs; + unsigned long ptr; + u32 __user *usig; + u32 sig; + int ret; + + ret = __get_user(ptr, &t->rseq->rseq_cs); + if (ret) + return ret; + if (!ptr) { + memset(rseq_cs, 0, sizeof(*rseq_cs)); + return 0; + } + urseq_cs = (struct rseq_cs __user *)ptr; + if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) + return -EFAULT; + if (rseq_cs->version > 0) + return -EINVAL; + + /* Ensure that abort_ip is not in the critical section. */ + if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) + return -EINVAL; + + usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32)); + ret = get_user(sig, usig); + if (ret) + return ret; + + if (current->rseq_sig != sig) { + printk_ratelimited(KERN_WARNING + "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", + sig, current->rseq_sig, current->pid, usig); + return -EPERM; + } + return 0; +} + +static int rseq_need_restart(struct task_struct *t, u32 cs_flags) +{ + u32 flags, event_mask; + int ret; + + /* Get thread flags. */ + ret = __get_user(flags, &t->rseq->flags); + if (ret) + return ret; + + /* Take critical section flags into account. */ + flags |= cs_flags; + + /* + * Restart on signal can only be inhibited when restart on + * preempt and restart on migrate are inhibited too. Otherwise, + * a preempted signal handler could fail to restart the prior + * execution context on sigreturn. + */ + if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) && + (flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) != + RSEQ_CS_PREEMPT_MIGRATE_FLAGS)) + return -EINVAL; + + /* + * Load and clear event mask atomically with respect to + * scheduler preemption. + */ + preempt_disable(); + event_mask = t->rseq_event_mask; + t->rseq_event_mask = 0; + preempt_enable(); + + return !!(event_mask & ~flags); +} + +static int clear_rseq_cs(struct task_struct *t) +{ + /* + * The rseq_cs field is set to NULL on preemption or signal + * delivery on top of rseq assembly block, as well as on top + * of code outside of the rseq assembly block. This performs + * a lazy clear of the rseq_cs field. + * + * Set rseq_cs to NULL with single-copy atomicity. + */ + return __put_user(0UL, &t->rseq->rseq_cs); +} + +/* + * Unsigned comparison will be true when ip >= start_ip, and when + * ip < start_ip + post_commit_offset. + */ +static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) +{ + return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; +} + +static int rseq_ip_fixup(struct pt_regs *regs) +{ + unsigned long ip = instruction_pointer(regs); + struct task_struct *t = current; + struct rseq_cs rseq_cs; + int ret; + + ret = rseq_get_rseq_cs(t, &rseq_cs); + if (ret) + return ret; + + /* + * Handle potentially not being within a critical section. + * If not nested over a rseq critical section, restart is useless. + * Clear the rseq_cs pointer and return. + */ + if (!in_rseq_cs(ip, &rseq_cs)) + return clear_rseq_cs(t); + ret = rseq_need_restart(t, rseq_cs.flags); + if (ret <= 0) + return ret; + ret = clear_rseq_cs(t); + if (ret) + return ret; + trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, + rseq_cs.abort_ip); + instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); + return 0; +} + +/* + * This resume handler must always be executed between any of: + * - preemption, + * - signal delivery, + * and return to user-space. + * + * This is how we can ensure that the entire rseq critical section, + * consisting of both the C part and the assembly instruction sequence, + * will issue the commit instruction only if executed atomically with + * respect to other threads scheduled on the same CPU, and with respect + * to signal handlers. + */ +void __rseq_handle_notify_resume(struct pt_regs *regs) +{ + struct task_struct *t = current; + int ret; + + if (unlikely(t->flags & PF_EXITING)) + return; + if (unlikely(!access_ok(VERIFY_WRITE, t->rseq, sizeof(*t->rseq)))) + goto error; + ret = rseq_ip_fixup(regs); + if (unlikely(ret < 0)) + goto error; + if (unlikely(rseq_update_cpu_id(t))) + goto error; + return; + +error: + force_sig(SIGSEGV, t); +} + +#ifdef CONFIG_DEBUG_RSEQ + +/* + * Terminate the process if a syscall is issued within a restartable + * sequence. + */ +void rseq_syscall(struct pt_regs *regs) +{ + unsigned long ip = instruction_pointer(regs); + struct task_struct *t = current; + struct rseq_cs rseq_cs; + + if (!t->rseq) + return; + if (!access_ok(VERIFY_READ, t->rseq, sizeof(*t->rseq)) || + rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) + force_sig(SIGSEGV, t); +} + +#endif + +/* + * sys_rseq - setup restartable sequences for caller thread. + */ +SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, + int, flags, u32, sig) +{ + int ret; + + if (flags & RSEQ_FLAG_UNREGISTER) { + /* Unregister rseq for current thread. */ + if (current->rseq != rseq || !current->rseq) + return -EINVAL; + if (current->rseq_len != rseq_len) + return -EINVAL; + if (current->rseq_sig != sig) + return -EPERM; + ret = rseq_reset_rseq_cpu_id(current); + if (ret) + return ret; + current->rseq = NULL; + current->rseq_len = 0; + current->rseq_sig = 0; + return 0; + } + + if (unlikely(flags)) + return -EINVAL; + + if (current->rseq) { + /* + * If rseq is already registered, check whether + * the provided address differs from the prior + * one. + */ + if (current->rseq != rseq || current->rseq_len != rseq_len) + return -EINVAL; + if (current->rseq_sig != sig) + return -EPERM; + /* Already registered. */ + return -EBUSY; + } + + /* + * If there was no rseq previously registered, + * ensure the provided rseq is properly aligned and valid. + */ + if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || + rseq_len != sizeof(*rseq)) + return -EINVAL; + if (!access_ok(VERIFY_WRITE, rseq, rseq_len)) + return -EFAULT; + current->rseq = rseq; + current->rseq_len = rseq_len; + current->rseq_sig = sig; + /* + * If rseq was previously inactive, and has just been + * registered, ensure the cpu_id_start and cpu_id fields + * are updated before returning to user-space. + */ + rseq_set_notify_resume(current); + + return 0; +} diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 6be6c575b6cd..2d4ff5353ded 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -2,6 +2,7 @@ /* * Auto-group scheduling implementation: */ +#include <linux/nospec.h> #include "sched.h" unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; @@ -209,7 +210,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) static unsigned long next = INITIAL_JIFFIES; struct autogroup *ag; unsigned long shares; - int err; + int err, idx; if (nice < MIN_NICE || nice > MAX_NICE) return -EINVAL; @@ -227,7 +228,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) next = HZ / 10 + jiffies; ag = autogroup_task_get(p); - shares = scale_load(sched_prio_to_weight[nice + 20]); + + idx = array_index_nospec(nice + 20, 40); + shares = scale_load(sched_prio_to_weight[idx]); down_write(&ag->lock); err = sched_group_set_shares(ag->tg, shares); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e10aaeebfcc..78d8facba456 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,6 +7,11 @@ */ #include "sched.h" +#include <linux/kthread.h> +#include <linux/nospec.h> + +#include <linux/kcov.h> + #include <asm/switch_to.h> #include <asm/tlb.h> @@ -878,6 +883,33 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP + +static inline bool is_per_cpu_kthread(struct task_struct *p) +{ + if (!(p->flags & PF_KTHREAD)) + return false; + + if (p->nr_cpus_allowed != 1) + return false; + + return true; +} + +/* + * Per-CPU kthreads are allowed to run on !actie && online CPUs, see + * __set_cpus_allowed_ptr() and select_fallback_rq(). + */ +static inline bool is_cpu_allowed(struct task_struct *p, int cpu) +{ + if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + return false; + + if (is_per_cpu_kthread(p)) + return cpu_online(cpu); + + return cpu_active(cpu); +} + /* * This is how migration works: * @@ -935,16 +967,8 @@ struct migration_arg { static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int dest_cpu) { - if (p->flags & PF_KTHREAD) { - if (unlikely(!cpu_online(dest_cpu))) - return rq; - } else { - if (unlikely(!cpu_active(dest_cpu))) - return rq; - } - /* Affinity changed (again). */ - if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (!is_cpu_allowed(p, dest_cpu)) return rq; update_rq_clock(rq); @@ -1169,6 +1193,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p); p->se.nr_migrations++; + rseq_migrate(p); perf_event_task_migrate(p); } @@ -1473,10 +1498,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, &p->cpus_allowed) { - if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) - continue; - if (!cpu_online(dest_cpu)) + if (!is_cpu_allowed(p, dest_cpu)) continue; + goto out; } @@ -1539,8 +1563,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) * [ this allows ->select_task() to simply return task_cpu(p) and * not worry about this generic constraint ] */ - if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || - !cpu_online(cpu))) + if (unlikely(!is_cpu_allowed(p, cpu))) cpu = select_fallback_rq(task_cpu(p), p); return cpu; @@ -2174,27 +2197,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_HLIST_HEAD(&p->preempt_notifiers); #endif -#ifdef CONFIG_NUMA_BALANCING - if (p->mm && atomic_read(&p->mm->mm_users) == 1) { - p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); - p->mm->numa_scan_seq = 0; - } - - if (clone_flags & CLONE_VM) - p->numa_preferred_nid = current->numa_preferred_nid; - else - p->numa_preferred_nid = -1; - - p->node_stamp = 0ULL; - p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; - p->numa_scan_period = sysctl_numa_balancing_scan_delay; - p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - p->last_task_numa_placement = 0; - p->last_sum_exec_runtime = 0; - - p->numa_group = NULL; -#endif /* CONFIG_NUMA_BALANCING */ + init_numa_balancing(clone_flags, p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -2632,8 +2635,10 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { + kcov_prepare_switch(prev); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); + rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); prepare_task(next); prepare_arch_switch(next); @@ -2700,6 +2705,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) finish_task(prev); finish_lock_switch(rq); finish_arch_post_lock_switch(); + kcov_finish_switch(current); fire_sched_in_preempt_notifiers(current); /* @@ -2718,20 +2724,28 @@ static struct rq *finish_task_switch(struct task_struct *prev) membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); } - if (unlikely(prev_state == TASK_DEAD)) { - if (prev->sched_class->task_dead) - prev->sched_class->task_dead(prev); + if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) { + switch (prev_state) { + case TASK_DEAD: + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(prev); - /* Task is done with its stack. */ - put_task_stack(prev); + /* Task is done with its stack. */ + put_task_stack(prev); - put_task_struct(prev); + put_task_struct(prev); + break; + + case TASK_PARKED: + kthread_park_complete(prev); + break; + } } tick_nohz_task_switch(); @@ -3498,23 +3512,8 @@ static void __sched notrace __schedule(bool preempt) void __noreturn do_task_dead(void) { - /* - * The setting of TASK_RUNNING by try_to_wake_up() may be delayed - * when the following two conditions become true. - * - There is race condition of mmap_sem (It is acquired by - * exit_mm()), and - * - SMI occurs before setting TASK_RUNINNG. - * (or hypervisor of virtual machine switches to other guest) - * As a result, we may become TASK_RUNNING after becoming TASK_DEAD - * - * To avoid it, we have to wait for releasing tsk->pi_lock which - * is held by try_to_wake_up() - */ - raw_spin_lock_irq(¤t->pi_lock); - raw_spin_unlock_irq(¤t->pi_lock); - /* Causes final put_task_struct in finish_task_switch(): */ - __set_current_state(TASK_DEAD); + set_special_state(TASK_DEAD); /* Tell freezer to ignore us: */ current->flags |= PF_NOFREEZE; @@ -4037,6 +4036,23 @@ int idle_cpu(int cpu) } /** + * available_idle_cpu - is a given CPU idle for enqueuing work. + * @cpu: the CPU in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. + */ +int available_idle_cpu(int cpu) +{ + if (!idle_cpu(cpu)) + return 0; + + if (vcpu_is_preempted(cpu)) + return 0; + + return 1; +} + +/** * idle_task - return the idle task for a given CPU. * @cpu: the processor in question. * @@ -5012,20 +5028,6 @@ int __cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(__cond_resched_lock); -int __sched __cond_resched_softirq(void) -{ - BUG_ON(!in_softirq()); - - if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { - local_bh_enable(); - preempt_schedule_common(); - local_bh_disable(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(__cond_resched_softirq); - /** * yield - yield the current processor to other threads. * @@ -6928,11 +6930,15 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 nice) { unsigned long weight; + int idx; if (nice < MIN_NICE || nice > MAX_NICE) return -ERANGE; - weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO]; + idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO; + idx = array_index_nospec(idx, 40); + weight = sched_prio_to_weight[idx]; + return sched_group_set_shares(css_tg(css), scale_load(weight)); } #endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index d2c6083304b4..3cde46483f0a 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -51,7 +51,7 @@ struct sugov_cpu { bool iowait_boost_pending; unsigned int iowait_boost; unsigned int iowait_boost_max; - u64 last_update; + u64 last_update; /* The fields below are only needed when sharing a policy: */ unsigned long util_cfs; @@ -89,46 +89,52 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) * schedule the kthread. */ if (sg_policy->policy->fast_switch_enabled && - !cpufreq_can_do_remote_dvfs(sg_policy->policy)) + !cpufreq_this_cpu_can_update(sg_policy->policy)) return false; - if (sg_policy->work_in_progress) - return false; - - if (unlikely(sg_policy->need_freq_update)) { - sg_policy->need_freq_update = false; - /* - * This happens when limits change, so forget the previous - * next_freq value and force an update. - */ - sg_policy->next_freq = UINT_MAX; + if (unlikely(sg_policy->need_freq_update)) return true; - } delta_ns = time - sg_policy->last_freq_update_time; return delta_ns >= sg_policy->freq_update_delay_ns; } -static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, - unsigned int next_freq) +static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) { - struct cpufreq_policy *policy = sg_policy->policy; - if (sg_policy->next_freq == next_freq) - return; + return false; sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; - if (policy->fast_switch_enabled) { - next_freq = cpufreq_driver_fast_switch(policy, next_freq); - if (!next_freq) - return; + return true; +} - policy->cur = next_freq; - trace_cpu_frequency(next_freq, smp_processor_id()); - } else { +static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + struct cpufreq_policy *policy = sg_policy->policy; + + if (!sugov_update_next_freq(sg_policy, time, next_freq)) + return; + + next_freq = cpufreq_driver_fast_switch(policy, next_freq); + if (!next_freq) + return; + + policy->cur = next_freq; + trace_cpu_frequency(next_freq, smp_processor_id()); +} + +static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + if (!sugov_update_next_freq(sg_policy, time, next_freq)) + return; + + if (!sg_policy->work_in_progress) { sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); } @@ -165,8 +171,10 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, freq = (freq + (freq >> 2)) * util / max; - if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) return sg_policy->next_freq; + + sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = freq; return cpufreq_driver_resolve_freq(policy, freq); } @@ -183,61 +191,137 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util; - if (rq->rt.rt_nr_running) { - util = sg_cpu->max; - } else { - util = sg_cpu->util_dl; - if (rq->cfs.h_nr_running) - util += sg_cpu->util_cfs; - } + if (rq->rt.rt_nr_running) + return sg_cpu->max; /* + * Utilization required by DEADLINE must always be granted while, for + * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to + * gracefully reduce the frequency when no tasks show up for longer + * periods of time. + * * Ideally we would like to set util_dl as min/guaranteed freq and * util_cfs + util_dl as requested freq. However, cpufreq is not yet * ready for such an interface. So, we only do the latter for now. */ - return min(util, sg_cpu->max); + return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs)); } -static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) +/** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. + * @sg_cpu: the sugov data for the CPU to boost + * @time: the update time from the caller + * @set_iowait_boost: true if an IO boost has been requested + * + * The IO wait boost of a task is disabled after a tick since the last update + * of a CPU. If a new IO wait boost is requested after more then a tick, then + * we enable the boost starting from the minimum frequency, which improves + * energy efficiency by ignoring sporadic wakeups from IO. + */ +static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, + bool set_iowait_boost) { - if (flags & SCHED_CPUFREQ_IOWAIT) { - if (sg_cpu->iowait_boost_pending) - return; + s64 delta_ns = time - sg_cpu->last_update; - sg_cpu->iowait_boost_pending = true; + /* Reset boost only if a tick has elapsed since last request */ + if (delta_ns <= TICK_NSEC) + return false; - if (sg_cpu->iowait_boost) { - sg_cpu->iowait_boost <<= 1; - if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; - } else { - sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; - } - } else if (sg_cpu->iowait_boost) { - s64 delta_ns = time - sg_cpu->last_update; + sg_cpu->iowait_boost = set_iowait_boost + ? sg_cpu->sg_policy->policy->min : 0; + sg_cpu->iowait_boost_pending = set_iowait_boost; - /* Clear iowait_boost if the CPU apprears to have been idle. */ - if (delta_ns > TICK_NSEC) { - sg_cpu->iowait_boost = 0; - sg_cpu->iowait_boost_pending = false; - } + return true; +} + +/** + * sugov_iowait_boost() - Updates the IO boost status of a CPU. + * @sg_cpu: the sugov data for the CPU to boost + * @time: the update time from the caller + * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait + * + * Each time a task wakes up after an IO operation, the CPU utilization can be + * boosted to a certain utilization which doubles at each "frequent and + * successive" wakeup from IO, ranging from the utilization of the minimum + * OPP to the utilization of the maximum OPP. + * To keep doubling, an IO boost has to be requested at least once per tick, + * otherwise we restart from the utilization of the minimum OPP. + */ +static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, + unsigned int flags) +{ + bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT; + + /* Reset boost if the CPU appears to have been idle enough */ + if (sg_cpu->iowait_boost && + sugov_iowait_reset(sg_cpu, time, set_iowait_boost)) + return; + + /* Boost only tasks waking up after IO */ + if (!set_iowait_boost) + return; + + /* Ensure boost doubles only one time at each request */ + if (sg_cpu->iowait_boost_pending) + return; + sg_cpu->iowait_boost_pending = true; + + /* Double the boost at each request */ + if (sg_cpu->iowait_boost) { + sg_cpu->iowait_boost <<= 1; + if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + return; } + + /* First wakeup after IO: start with minimum boost */ + sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; } -static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, - unsigned long *max) +/** + * sugov_iowait_apply() - Apply the IO boost to a CPU. + * @sg_cpu: the sugov data for the cpu to boost + * @time: the update time from the caller + * @util: the utilization to (eventually) boost + * @max: the maximum value the utilization can be boosted to + * + * A CPU running a task which woken up after an IO operation can have its + * utilization boosted to speed up the completion of those IO operations. + * The IO boost value is increased each time a task wakes up from IO, in + * sugov_iowait_apply(), and it's instead decreased by this function, + * each time an increase has not been requested (!iowait_boost_pending). + * + * A CPU which also appears to have been idle for at least one tick has also + * its IO boost utilization reset. + * + * This mechanism is designed to boost high frequently IO waiting tasks, while + * being more conservative on tasks which does sporadic IO operations. + */ +static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, + unsigned long *util, unsigned long *max) { unsigned int boost_util, boost_max; + /* No boost currently required */ if (!sg_cpu->iowait_boost) return; + /* Reset boost if the CPU appears to have been idle enough */ + if (sugov_iowait_reset(sg_cpu, time, false)) + return; + + /* + * An IO waiting task has just woken up: + * allow to further double the boost value + */ if (sg_cpu->iowait_boost_pending) { sg_cpu->iowait_boost_pending = false; } else { + /* + * Otherwise: reduce the boost value and disable it when we + * reach the minimum. + */ sg_cpu->iowait_boost >>= 1; if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { sg_cpu->iowait_boost = 0; @@ -245,9 +329,12 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, } } + /* + * Apply the current boost value: a CPU is boosted only if its current + * utilization is smaller then the current IO boost level. + */ boost_util = sg_cpu->iowait_boost; boost_max = sg_cpu->iowait_boost_max; - if (*util * boost_max < *max * boost_util) { *util = boost_util; *max = boost_max; @@ -286,7 +373,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int next_f; bool busy; - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; ignore_dl_rate_limit(sg_cpu, sg_policy); @@ -299,7 +386,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sugov_get_util(sg_cpu); max = sg_cpu->max; util = sugov_aggregate_util(sg_cpu); - sugov_iowait_boost(sg_cpu, &util, &max); + sugov_iowait_apply(sg_cpu, time, &util, &max); next_f = get_next_freq(sg_policy, util, max); /* * Do not reduce the frequency if the CPU has not been idle @@ -312,7 +399,18 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sg_policy->cached_raw_freq = 0; } - sugov_update_commit(sg_policy, time, next_f); + /* + * This code runs under rq->lock for the target CPU, so it won't run + * concurrently on two different CPUs for the same target and it is not + * necessary to acquire the lock in the fast switch case. + */ + if (sg_policy->policy->fast_switch_enabled) { + sugov_fast_switch(sg_policy, time, next_f); + } else { + raw_spin_lock(&sg_policy->update_lock); + sugov_deferred_update(sg_policy, time, next_f); + raw_spin_unlock(&sg_policy->update_lock); + } } static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) @@ -325,28 +423,12 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long j_util, j_max; - s64 delta_ns; sugov_get_util(j_sg_cpu); - - /* - * If the CFS CPU utilization was last updated before the - * previous frequency update and the time elapsed between the - * last update of the CPU utilization and the last frequency - * update is long enough, reset iowait_boost and util_cfs, as - * they are now probably stale. However, still consider the - * CPU contribution if it has some DEADLINE utilization - * (util_dl). - */ - delta_ns = time - j_sg_cpu->last_update; - if (delta_ns > TICK_NSEC) { - j_sg_cpu->iowait_boost = 0; - j_sg_cpu->iowait_boost_pending = false; - } - j_max = j_sg_cpu->max; j_util = sugov_aggregate_util(j_sg_cpu); - sugov_iowait_boost(j_sg_cpu, &j_util, &j_max); + sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); + if (j_util * max > j_max * util) { util = j_util; max = j_max; @@ -365,14 +447,18 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) raw_spin_lock(&sg_policy->update_lock); - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; ignore_dl_rate_limit(sg_cpu, sg_policy); if (sugov_should_update_freq(sg_policy, time)) { next_f = sugov_next_freq_shared(sg_cpu, time); - sugov_update_commit(sg_policy, time, next_f); + + if (sg_policy->policy->fast_switch_enabled) + sugov_fast_switch(sg_policy, time, next_f); + else + sugov_deferred_update(sg_policy, time, next_f); } raw_spin_unlock(&sg_policy->update_lock); @@ -381,13 +467,27 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) static void sugov_work(struct kthread_work *work) { struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); + unsigned int freq; + unsigned long flags; + + /* + * Hold sg_policy->update_lock shortly to handle the case where: + * incase sg_policy->next_freq is read here, and then updated by + * sugov_deferred_update() just before work_in_progress is set to false + * here, we may miss queueing the new update. + * + * Note: If a work was queued after the update_lock is released, + * sugov_work() will just be called again by kthread_work code; and the + * request will be proceed before the sugov thread sleeps. + */ + raw_spin_lock_irqsave(&sg_policy->update_lock, flags); + freq = sg_policy->next_freq; + sg_policy->work_in_progress = false; + raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags); mutex_lock(&sg_policy->work_lock); - __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq, - CPUFREQ_RELATION_L); + __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L); mutex_unlock(&sg_policy->work_lock); - - sg_policy->work_in_progress = false; } static void sugov_irq_work(struct irq_work *irq_work) @@ -396,19 +496,6 @@ static void sugov_irq_work(struct irq_work *irq_work) sg_policy = container_of(irq_work, struct sugov_policy, irq_work); - /* - * For RT tasks, the schedutil governor shoots the frequency to maximum. - * Special care must be taken to ensure that this kthread doesn't result - * in the same behavior. - * - * This is (mostly) guaranteed by the work_in_progress flag. The flag is - * updated only at the end of the sugov_work() function and before that - * the schedutil governor rejects all other frequency scaling requests. - * - * There is a very rare case though, where the RT thread yields right - * after the work_in_progress flag is cleared. The effects of that are - * neglected for now. - */ kthread_queue_work(&sg_policy->worker, &sg_policy->work); } @@ -523,11 +610,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) } sg_policy->thread = thread; - - /* Kthread is bound to all CPUs by default */ - if (!policy->dvfs_possible_from_any_cpu) - kthread_bind_mask(thread, policy->related_cpus); - + kthread_bind_mask(thread, policy->related_cpus); init_irq_work(&sg_policy->irq_work, sugov_irq_work); mutex_init(&sg_policy->work_lock); @@ -670,7 +753,7 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; sg_policy->last_freq_update_time = 0; - sg_policy->next_freq = UINT_MAX; + sg_policy->next_freq = 0; sg_policy->work_in_progress = false; sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = 0; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e7b3008b85bb..fbfc3f1d368a 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1117,7 +1117,7 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. * So, overflow is not an issue here. */ -u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) +static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) { u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ u64 u_act; @@ -1259,6 +1259,9 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) rq = task_rq_lock(p, &rf); + sched_clock_tick(); + update_rq_clock(rq); + if (!dl_task(p) || p->state == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); @@ -1278,9 +1281,6 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) if (dl_se->dl_non_contending == 0) goto unlock; - sched_clock_tick(); - update_rq_clock(rq); - sub_running_bw(dl_se, &rq->dl); dl_se->dl_non_contending = 0; unlock: @@ -2731,8 +2731,6 @@ bool dl_cpu_busy(unsigned int cpu) #endif #ifdef CONFIG_SCHED_DEBUG -extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); - void print_dl_stats(struct seq_file *m, int cpu) { print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 15b10e210a6b..e593b4118578 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -823,35 +823,9 @@ static const struct seq_operations sched_debug_sops = { .show = sched_debug_show, }; -static int sched_debug_release(struct inode *inode, struct file *file) -{ - seq_release(inode, file); - - return 0; -} - -static int sched_debug_open(struct inode *inode, struct file *filp) -{ - int ret = 0; - - ret = seq_open(filp, &sched_debug_sops); - - return ret; -} - -static const struct file_operations sched_debug_fops = { - .open = sched_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = sched_debug_release, -}; - static int __init init_sched_debug_procfs(void) { - struct proc_dir_entry *pe; - - pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops); - if (!pe) + if (!proc_create_seq("sched_debug", 0444, NULL, &sched_debug_sops)) return -ENOMEM; return 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 54dc31e7ab9b..1866e64792a7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1139,6 +1139,47 @@ static unsigned int task_scan_max(struct task_struct *p) return max(smin, smax); } +void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ + int mm_users = 0; + struct mm_struct *mm = p->mm; + + if (mm) { + mm_users = atomic_read(&mm->mm_users); + if (mm_users == 1) { + mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + mm->numa_scan_seq = 0; + } + } + p->node_stamp = 0; + p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_work.next = &p->numa_work; + p->numa_faults = NULL; + p->numa_group = NULL; + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; + + /* New address space, reset the preferred nid */ + if (!(clone_flags & CLONE_VM)) { + p->numa_preferred_nid = -1; + return; + } + + /* + * New thread, keep existing numa_preferred_nid which should be copied + * already by arch_dup_task_struct but stagger when scans start. + */ + if (mm) { + unsigned int delay; + + delay = min_t(unsigned int, task_scan_max(current), + current->numa_scan_period * mm_users * NSEC_PER_MSEC); + delay += 2 * TICK_NSEC; + p->node_stamp = delay; + } +} + static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { rq->nr_numa_running += (p->numa_preferred_nid != -1); @@ -1854,7 +1895,6 @@ static int task_numa_migrate(struct task_struct *p) static void numa_migrate_preferred(struct task_struct *p) { unsigned long interval = HZ; - unsigned long numa_migrate_retry; /* This task has no NUMA fault statistics yet */ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) @@ -1862,18 +1902,7 @@ static void numa_migrate_preferred(struct task_struct *p) /* Periodically retry migrating the task to the preferred node */ interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); - numa_migrate_retry = jiffies + interval; - - /* - * Check that the new retry threshold is after the current one. If - * the retry is in the future, it implies that wake_affine has - * temporarily asked NUMA balancing to backoff from placement. - */ - if (numa_migrate_retry > p->numa_migrate_retry) - return; - - /* Safe to try placing the task on the preferred node */ - p->numa_migrate_retry = numa_migrate_retry; + p->numa_migrate_retry = jiffies + interval; /* Success if task is already running on preferred CPU */ if (task_node(p) == p->numa_preferred_nid) @@ -5357,6 +5386,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; /* + * The code below (indirectly) updates schedutil which looks at + * the cfs_rq utilization to select a frequency. + * Let's add the task's estimated utilization to the cfs_rq's + * estimated utilization, before we update schedutil. + */ + util_est_enqueue(&rq->cfs, p); + + /* * If in_iowait is set, the code below may not trigger any cpufreq * utilization updates, so do it here explicitly with the IOWAIT flag * passed. @@ -5397,7 +5434,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) add_nr_running(rq, 1); - util_est_enqueue(&rq->cfs, p); hrtick_update(rq); } @@ -5870,8 +5906,8 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) * a cpufreq perspective, it's better to have higher utilisation * on one CPU. */ - if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) - return idle_cpu(prev_cpu) ? prev_cpu : this_cpu; + if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) + return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu; if (sync && cpu_rq(this_cpu)->nr_running == 1) return this_cpu; @@ -5922,48 +5958,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits; } -#ifdef CONFIG_NUMA_BALANCING -static void -update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) -{ - unsigned long interval; - - if (!static_branch_likely(&sched_numa_balancing)) - return; - - /* If balancing has no preference then continue gathering data */ - if (p->numa_preferred_nid == -1) - return; - - /* - * If the wakeup is not affecting locality then it is neutral from - * the perspective of NUMA balacing so continue gathering data. - */ - if (cpu_to_node(prev_cpu) == cpu_to_node(target)) - return; - - /* - * Temporarily prevent NUMA balancing trying to place waker/wakee after - * wakee has been moved by wake_affine. This will potentially allow - * related tasks to converge and update their data placement. The - * 4 * numa_scan_period is to allow the two-pass filter to migrate - * hot data to the wakers node. - */ - interval = max(sysctl_numa_balancing_scan_delay, - p->numa_scan_period << 2); - p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); - - interval = max(sysctl_numa_balancing_scan_delay, - current->numa_scan_period << 2); - current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); -} -#else -static void -update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) -{ -} -#endif - static int wake_affine(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { @@ -5979,7 +5973,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, if (target == nr_cpumask_bits) return prev_cpu; - update_wa_numa_placement(p, prev_cpu, target); schedstat_inc(sd->ttwu_move_affine); schedstat_inc(p->se.statistics.nr_wakeups_affine); return target; @@ -6157,7 +6150,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { - if (idle_cpu(i)) { + if (available_idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { @@ -6199,6 +6192,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) return prev_cpu; + /* + * We need task's util for capacity_spare_wake, sync it up to prev_cpu's + * last_update_time. + */ + if (!(sd_flag & SD_BALANCE_FORK)) + sync_entity_load_avg(&p->se); + while (sd) { struct sched_group *group; struct sched_domain *tmp; @@ -6279,7 +6279,7 @@ void __update_idle_core(struct rq *rq) if (cpu == core) continue; - if (!idle_cpu(cpu)) + if (!available_idle_cpu(cpu)) goto unlock; } @@ -6311,7 +6311,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int for_each_cpu(cpu, cpu_smt_mask(core)) { cpumask_clear_cpu(cpu, cpus); - if (!idle_cpu(cpu)) + if (!available_idle_cpu(cpu)) idle = false; } @@ -6340,7 +6340,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t for_each_cpu(cpu, cpu_smt_mask(target)) { if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; - if (idle_cpu(cpu)) + if (available_idle_cpu(cpu)) return cpu; } @@ -6403,7 +6403,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t return -1; if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; - if (idle_cpu(cpu)) + if (available_idle_cpu(cpu)) break; } @@ -6423,13 +6423,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) struct sched_domain *sd; int i, recent_used_cpu; - if (idle_cpu(target)) + if (available_idle_cpu(target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ - if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) + if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev)) return prev; /* Check a recently used CPU as a potential idle candidate: */ @@ -6437,7 +6437,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && - idle_cpu(recent_used_cpu) && + available_idle_cpu(recent_used_cpu) && cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { /* * Replace recent_used_cpu with prev as it is a potential @@ -6613,7 +6613,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) static int select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) { - struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; + struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; int want_affine = 0; @@ -6636,7 +6636,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { - affine_sd = tmp; + if (cpu != prev_cpu) + new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync); + + sd = NULL; /* Prefer wake_affine over balance flags */ break; } @@ -6646,33 +6649,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f break; } - if (affine_sd) { - sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu == prev_cpu) - goto pick_cpu; - - new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync); - } - - if (sd && !(sd_flag & SD_BALANCE_FORK)) { - /* - * We're going to need the task's util for capacity_spare_wake - * in find_idlest_group. Sync it up to prev_cpu's - * last_update_time. - */ - sync_entity_load_avg(&p->se); - } + if (unlikely(sd)) { + /* Slow path */ + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); + } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ + /* Fast path */ - if (!sd) { -pick_cpu: - if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ - new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - if (want_affine) - current->recent_used_cpu = cpu; - } - } else { - new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); + if (want_affine) + current->recent_used_cpu = cpu; } rcu_read_unlock(); @@ -9847,6 +9833,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) if (curr_cost > this_rq->max_idle_balance_cost) this_rq->max_idle_balance_cost = curr_cost; +out: /* * While browsing the domains, we released the rq lock, a task could * have been enqueued in the meantime. Since we're not going idle, @@ -9855,7 +9842,6 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) if (this_rq->cfs.h_nr_running && !pulled_task) pulled_task = 1; -out: /* Move the next balance forward */ if (time_after(this_rq->next_balance, next_balance)) this_rq->next_balance = next_balance; @@ -10229,10 +10215,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) struct cfs_rq *cfs_rq; int i; - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); + tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL); if (!tg->cfs_rq) goto err; - tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); + tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL); if (!tg->se) goto err; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7aef6b4e885a..47556b0c9a95 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -183,10 +183,10 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) struct sched_rt_entity *rt_se; int i; - tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); + tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL); if (!tg->rt_rq) goto err; - tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); + tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL); if (!tg->rt_se) goto err; @@ -2701,8 +2701,6 @@ int sched_rr_handler(struct ctl_table *table, int write, } #ifdef CONFIG_SCHED_DEBUG -extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); - void print_rt_stats(struct seq_file *m, int cpu) { rt_rq_iter_t iter; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 15750c222ca2..6601baf2361c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -983,7 +983,7 @@ static inline void rq_clock_skip_update(struct rq *rq) } /* - * See rt task throttoling, which is the only time a skip + * See rt task throttling, which is the only time a skip * request is cancelled. */ static inline void rq_clock_cancel_skipupdate(struct rq *rq) @@ -1069,6 +1069,12 @@ enum numa_faults_stats { extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *, struct task_struct *); +extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); +#else +static inline void +init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ +} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_SMP @@ -2025,8 +2031,9 @@ extern bool sched_debug_enabled; extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); extern void print_dl_stats(struct seq_file *m, int cpu); -extern void -print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); +extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); +extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); +extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); #ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index ab112cbfd7c8..750fb3c67eed 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -120,22 +120,9 @@ static const struct seq_operations schedstat_sops = { .show = show_schedstat, }; -static int schedstat_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &schedstat_sops); -} - -static const struct file_operations proc_schedstat_operations = { - .open = schedstat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_schedstat_init(void) { - proc_create("schedstat", 0, NULL, &proc_schedstat_operations); - + proc_create_seq("schedstat", 0, NULL, &schedstat_sops); return 0; } subsys_initcall(proc_schedstat_init); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 64cc564f5255..05a831427bc7 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1708,7 +1708,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att rcu_read_unlock(); if (rq && sched_debug_enabled) { - pr_info("span: %*pbl (max cpu_capacity = %lu)\n", + pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); } @@ -1750,7 +1750,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms) int i; cpumask_var_t *doms; - doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); + doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL); if (!doms) return NULL; for (i = 0; i < ndoms; i++) { diff --git a/kernel/seccomp.c b/kernel/seccomp.c index dc77548167ef..fd023ac24e10 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -19,6 +19,8 @@ #include <linux/compat.h> #include <linux/coredump.h> #include <linux/kmemleak.h> +#include <linux/nospec.h> +#include <linux/prctl.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/seccomp.h> @@ -227,8 +229,11 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) return true; } +void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { } + static inline void seccomp_assign_mode(struct task_struct *task, - unsigned long seccomp_mode) + unsigned long seccomp_mode, + unsigned long flags) { assert_spin_locked(&task->sighand->siglock); @@ -238,6 +243,9 @@ static inline void seccomp_assign_mode(struct task_struct *task, * filter) is set. */ smp_mb__before_atomic(); + /* Assume default seccomp processes want spec flaw mitigation. */ + if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0) + arch_seccomp_spec_mitigate(task); set_tsk_thread_flag(task, TIF_SECCOMP); } @@ -305,7 +313,7 @@ static inline pid_t seccomp_can_sync_threads(void) * without dropping the locks. * */ -static inline void seccomp_sync_threads(void) +static inline void seccomp_sync_threads(unsigned long flags) { struct task_struct *thread, *caller; @@ -346,7 +354,8 @@ static inline void seccomp_sync_threads(void) * allow one thread to transition the other. */ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) - seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); + seccomp_assign_mode(thread, SECCOMP_MODE_FILTER, + flags); } } @@ -469,7 +478,7 @@ static long seccomp_attach_filter(unsigned int flags, /* Now that the new filter is in place, synchronize to all threads. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC) - seccomp_sync_threads(); + seccomp_sync_threads(flags); return 0; } @@ -584,18 +593,15 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, } /* - * Force an audit message to be emitted when the action is RET_KILL_*, - * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is - * allowed to be logged by the admin. + * Emit an audit message when the action is RET_KILL_*, RET_LOG, or the + * FILTER_FLAG_LOG bit was set. The admin has the ability to silence + * any action from being logged by removing the action name from the + * seccomp_actions_logged sysctl. */ - if (log) - return __audit_seccomp(syscall, signr, action); + if (!log) + return; - /* - * Let the audit subsystem decide if the action should be audited based - * on whether the current task itself is being audited. - */ - return audit_seccomp(syscall, signr, action); + audit_seccomp(syscall, signr, action); } /* @@ -818,7 +824,7 @@ static long seccomp_set_mode_strict(void) #ifdef TIF_NOTSC disable_TSC(); #endif - seccomp_assign_mode(current, seccomp_mode); + seccomp_assign_mode(current, seccomp_mode, 0); ret = 0; out: @@ -876,7 +882,7 @@ static long seccomp_set_mode_filter(unsigned int flags, /* Do not free the successfully attached filter. */ prepared = NULL; - seccomp_assign_mode(current, seccomp_mode); + seccomp_assign_mode(current, seccomp_mode, flags); out: spin_unlock_irq(¤t->sighand->siglock); if (flags & SECCOMP_FILTER_FLAG_TSYNC) @@ -1135,10 +1141,11 @@ static const struct seccomp_log_name seccomp_log_names[] = { }; static bool seccomp_names_from_actions_logged(char *names, size_t size, - u32 actions_logged) + u32 actions_logged, + const char *sep) { const struct seccomp_log_name *cur; - bool append_space = false; + bool append_sep = false; for (cur = seccomp_log_names; cur->name && size; cur++) { ssize_t ret; @@ -1146,15 +1153,15 @@ static bool seccomp_names_from_actions_logged(char *names, size_t size, if (!(actions_logged & cur->log)) continue; - if (append_space) { - ret = strscpy(names, " ", size); + if (append_sep) { + ret = strscpy(names, sep, size); if (ret < 0) return false; names += ret; size -= ret; } else - append_space = true; + append_sep = true; ret = strscpy(names, cur->name, size); if (ret < 0) @@ -1199,46 +1206,102 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names) return true; } -static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + char names[sizeof(seccomp_actions_avail)]; + struct ctl_table table; + + memset(names, 0, sizeof(names)); + + if (!seccomp_names_from_actions_logged(names, sizeof(names), + seccomp_actions_logged, " ")) + return -EINVAL; + + table = *ro_table; + table.data = names; + table.maxlen = sizeof(names); + return proc_dostring(&table, 0, buffer, lenp, ppos); +} + +static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer, + size_t *lenp, loff_t *ppos, u32 *actions_logged) { char names[sizeof(seccomp_actions_avail)]; struct ctl_table table; int ret; - if (write && !capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) return -EPERM; memset(names, 0, sizeof(names)); - if (!write) { - if (!seccomp_names_from_actions_logged(names, sizeof(names), - seccomp_actions_logged)) - return -EINVAL; - } - table = *ro_table; table.data = names; table.maxlen = sizeof(names); - ret = proc_dostring(&table, write, buffer, lenp, ppos); + ret = proc_dostring(&table, 1, buffer, lenp, ppos); if (ret) return ret; - if (write) { - u32 actions_logged; + if (!seccomp_actions_logged_from_names(actions_logged, table.data)) + return -EINVAL; - if (!seccomp_actions_logged_from_names(&actions_logged, - table.data)) - return -EINVAL; + if (*actions_logged & SECCOMP_LOG_ALLOW) + return -EINVAL; - if (actions_logged & SECCOMP_LOG_ALLOW) - return -EINVAL; + seccomp_actions_logged = *actions_logged; + return 0; +} - seccomp_actions_logged = actions_logged; - } +static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged, + int ret) +{ + char names[sizeof(seccomp_actions_avail)]; + char old_names[sizeof(seccomp_actions_avail)]; + const char *new = names; + const char *old = old_names; - return 0; + if (!audit_enabled) + return; + + memset(names, 0, sizeof(names)); + memset(old_names, 0, sizeof(old_names)); + + if (ret) + new = "?"; + else if (!actions_logged) + new = "(none)"; + else if (!seccomp_names_from_actions_logged(names, sizeof(names), + actions_logged, ",")) + new = "?"; + + if (!old_actions_logged) + old = "(none)"; + else if (!seccomp_names_from_actions_logged(old_names, + sizeof(old_names), + old_actions_logged, ",")) + old = "?"; + + return audit_seccomp_actions_logged(new, old, !ret); +} + +static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + if (write) { + u32 actions_logged = 0; + u32 old_actions_logged = seccomp_actions_logged; + + ret = write_actions_logged(ro_table, buffer, lenp, ppos, + &actions_logged); + audit_actions_logged(actions_logged, old_actions_logged, ret); + } else + ret = read_actions_logged(ro_table, buffer, lenp, ppos); + + return ret; } static struct ctl_path seccomp_sysctl_path[] = { diff --git a/kernel/signal.c b/kernel/signal.c index d4ccea599692..8d8a940422a8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1244,19 +1244,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, { struct sighand_struct *sighand; + rcu_read_lock(); for (;;) { - /* - * Disable interrupts early to avoid deadlocks. - * See rcu_read_unlock() comment header for details. - */ - local_irq_save(*flags); - rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); - if (unlikely(sighand == NULL)) { - rcu_read_unlock(); - local_irq_restore(*flags); + if (unlikely(sighand == NULL)) break; - } + /* * This sighand can be already freed and even reused, but * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which @@ -1268,15 +1261,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, * __exit_signal(). In the latter case the next iteration * must see ->sighand == NULL. */ - spin_lock(&sighand->siglock); - if (likely(sighand == tsk->sighand)) { - rcu_read_unlock(); + spin_lock_irqsave(&sighand->siglock, *flags); + if (likely(sighand == tsk->sighand)) break; - } - spin_unlock(&sighand->siglock); - rcu_read_unlock(); - local_irq_restore(*flags); + spin_unlock_irqrestore(&sighand->siglock, *flags); } + rcu_read_unlock(); return sighand; } @@ -1539,7 +1529,6 @@ int send_sig_fault(int sig, int code, void __user *addr return send_sig_info(info.si_signo, &info, t); } -#if defined(BUS_MCEERR_AO) && defined(BUS_MCEERR_AR) int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) { struct siginfo info; @@ -1568,9 +1557,7 @@ int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct * return send_sig_info(info.si_signo, &info, t); } EXPORT_SYMBOL(send_sig_mceerr); -#endif -#ifdef SEGV_BNDERR int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) { struct siginfo info; @@ -1584,7 +1571,6 @@ int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) info.si_upper = upper; return force_sig_info(info.si_signo, &info, current); } -#endif #ifdef SEGV_PKUERR int force_sig_pkuerr(void __user *addr, u32 pkey) @@ -1961,14 +1947,27 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) return; } + set_special_state(TASK_TRACED); + /* * We're committing to trapping. TRACED should be visible before * TRAPPING is cleared; otherwise, the tracer might fail do_wait(). * Also, transition to TRACED and updates to ->jobctl should be * atomic with respect to siglock and should be done after the arch * hook as siglock is released and regrabbed across it. + * + * TRACER TRACEE + * + * ptrace_attach() + * [L] wait_on_bit(JOBCTL_TRAPPING) [S] set_special_state(TRACED) + * do_wait() + * set_current_state() smp_wmb(); + * ptrace_do_wait() + * wait_task_stopped() + * task_stopped_code() + * [L] task_is_traced() [S] task_clear_jobctl_trapping(); */ - set_current_state(TASK_TRACED); + smp_wmb(); current->last_siginfo = info; current->exit_code = exit_code; @@ -2176,7 +2175,7 @@ static bool do_signal_stop(int signr) if (task_participate_group_stop(current)) notify = CLD_STOPPED; - __set_current_state(TASK_STOPPED); + set_special_state(TASK_STOPPED); spin_unlock_irq(¤t->sighand->siglock); /* @@ -2824,8 +2823,19 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) [SIGPOLL] = { NSIGPOLL, SIL_POLL }, [SIGSYS] = { NSIGSYS, SIL_SYS }, }; - if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) + if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) { layout = filter[sig].layout; + /* Handle the exceptions */ + if ((sig == SIGBUS) && + (si_code >= BUS_MCEERR_AR) && (si_code <= BUS_MCEERR_AO)) + layout = SIL_FAULT_MCEERR; + else if ((sig == SIGSEGV) && (si_code == SEGV_BNDERR)) + layout = SIL_FAULT_BNDERR; +#ifdef SEGV_PKUERR + else if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR)) + layout = SIL_FAULT_PKUERR; +#endif + } else if (si_code <= NSIGPOLL) layout = SIL_POLL; } else { @@ -2835,104 +2845,15 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) layout = SIL_POLL; else if (si_code < 0) layout = SIL_RT; - /* Tests to support buggy kernel ABIs */ -#ifdef TRAP_FIXME - if ((sig == SIGTRAP) && (si_code == TRAP_FIXME)) - layout = SIL_FAULT; -#endif -#ifdef FPE_FIXME - if ((sig == SIGFPE) && (si_code == FPE_FIXME)) - layout = SIL_FAULT; -#endif } return layout; } int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) { - int err; - - if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t))) + if (copy_to_user(to, from , sizeof(struct siginfo))) return -EFAULT; - if (from->si_code < 0) - return __copy_to_user(to, from, sizeof(siginfo_t)) - ? -EFAULT : 0; - /* - * If you change siginfo_t structure, please be sure - * this code is fixed accordingly. - * Please remember to update the signalfd_copyinfo() function - * inside fs/signalfd.c too, in case siginfo_t changes. - * It should never copy any pad contained in the structure - * to avoid security leaks, but must copy the generic - * 3 ints plus the relevant union member. - */ - err = __put_user(from->si_signo, &to->si_signo); - err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user(from->si_code, &to->si_code); - switch (siginfo_layout(from->si_signo, from->si_code)) { - case SIL_KILL: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; - case SIL_TIMER: - /* Unreached SI_TIMER is negative */ - break; - case SIL_POLL: - err |= __put_user(from->si_band, &to->si_band); - err |= __put_user(from->si_fd, &to->si_fd); - break; - case SIL_FAULT: - err |= __put_user(from->si_addr, &to->si_addr); -#ifdef __ARCH_SI_TRAPNO - err |= __put_user(from->si_trapno, &to->si_trapno); -#endif -#ifdef __ia64__ - err |= __put_user(from->si_imm, &to->si_imm); - err |= __put_user(from->si_flags, &to->si_flags); - err |= __put_user(from->si_isr, &to->si_isr); -#endif - /* - * Other callers might not initialize the si_lsb field, - * so check explicitly for the right codes here. - */ -#ifdef BUS_MCEERR_AR - if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AR) - err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); -#endif -#ifdef BUS_MCEERR_AO - if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AO) - err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); -#endif -#ifdef SEGV_BNDERR - if (from->si_signo == SIGSEGV && from->si_code == SEGV_BNDERR) { - err |= __put_user(from->si_lower, &to->si_lower); - err |= __put_user(from->si_upper, &to->si_upper); - } -#endif -#ifdef SEGV_PKUERR - if (from->si_signo == SIGSEGV && from->si_code == SEGV_PKUERR) - err |= __put_user(from->si_pkey, &to->si_pkey); -#endif - break; - case SIL_CHLD: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_status, &to->si_status); - err |= __put_user(from->si_utime, &to->si_utime); - err |= __put_user(from->si_stime, &to->si_stime); - break; - case SIL_RT: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_ptr, &to->si_ptr); - break; - case SIL_SYS: - err |= __put_user(from->si_call_addr, &to->si_call_addr); - err |= __put_user(from->si_syscall, &to->si_syscall); - err |= __put_user(from->si_arch, &to->si_arch); - break; - } - return err; + return 0; } #ifdef CONFIG_COMPAT @@ -2971,27 +2892,28 @@ int __copy_siginfo_to_user32(struct compat_siginfo __user *to, #ifdef __ARCH_SI_TRAPNO new.si_trapno = from->si_trapno; #endif -#ifdef BUS_MCEERR_AR - if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AR)) - new.si_addr_lsb = from->si_addr_lsb; -#endif -#ifdef BUS_MCEERR_AO - if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AO)) - new.si_addr_lsb = from->si_addr_lsb; + break; + case SIL_FAULT_MCEERR: + new.si_addr = ptr_to_compat(from->si_addr); +#ifdef __ARCH_SI_TRAPNO + new.si_trapno = from->si_trapno; #endif -#ifdef SEGV_BNDERR - if ((from->si_signo == SIGSEGV) && - (from->si_code == SEGV_BNDERR)) { - new.si_lower = ptr_to_compat(from->si_lower); - new.si_upper = ptr_to_compat(from->si_upper); - } + new.si_addr_lsb = from->si_addr_lsb; + break; + case SIL_FAULT_BNDERR: + new.si_addr = ptr_to_compat(from->si_addr); +#ifdef __ARCH_SI_TRAPNO + new.si_trapno = from->si_trapno; #endif -#ifdef SEGV_PKUERR - if ((from->si_signo == SIGSEGV) && - (from->si_code == SEGV_PKUERR)) - new.si_pkey = from->si_pkey; + new.si_lower = ptr_to_compat(from->si_lower); + new.si_upper = ptr_to_compat(from->si_upper); + break; + case SIL_FAULT_PKUERR: + new.si_addr = ptr_to_compat(from->si_addr); +#ifdef __ARCH_SI_TRAPNO + new.si_trapno = from->si_trapno; #endif - + new.si_pkey = from->si_pkey; break; case SIL_CHLD: new.si_pid = from->si_pid; @@ -3057,24 +2979,28 @@ int copy_siginfo_from_user32(struct siginfo *to, #ifdef __ARCH_SI_TRAPNO to->si_trapno = from.si_trapno; #endif -#ifdef BUS_MCEERR_AR - if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AR)) - to->si_addr_lsb = from.si_addr_lsb; -#endif -#ifdef BUS_MCEER_AO - if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AO)) - to->si_addr_lsb = from.si_addr_lsb; + break; + case SIL_FAULT_MCEERR: + to->si_addr = compat_ptr(from.si_addr); +#ifdef __ARCH_SI_TRAPNO + to->si_trapno = from.si_trapno; #endif -#ifdef SEGV_BNDERR - if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_BNDERR)) { - to->si_lower = compat_ptr(from.si_lower); - to->si_upper = compat_ptr(from.si_upper); - } + to->si_addr_lsb = from.si_addr_lsb; + break; + case SIL_FAULT_BNDERR: + to->si_addr = compat_ptr(from.si_addr); +#ifdef __ARCH_SI_TRAPNO + to->si_trapno = from.si_trapno; #endif -#ifdef SEGV_PKUERR - if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_PKUERR)) - to->si_pkey = from.si_pkey; + to->si_lower = compat_ptr(from.si_lower); + to->si_upper = compat_ptr(from.si_upper); + break; + case SIL_FAULT_PKUERR: + to->si_addr = compat_ptr(from.si_addr); +#ifdef __ARCH_SI_TRAPNO + to->si_trapno = from.si_trapno; #endif + to->si_pkey = from.si_pkey; break; case SIL_CHLD: to->si_pid = from.si_pid; diff --git a/kernel/softirq.c b/kernel/softirq.c index 177de3640c78..de2f57fddc04 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -49,8 +49,8 @@ */ #ifndef __ARCH_IRQ_STAT -irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; -EXPORT_SYMBOL(irq_stat); +DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); +EXPORT_PER_CPU_SYMBOL(irq_stat); #endif static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; @@ -145,8 +145,7 @@ static void __local_bh_enable(unsigned int cnt) } /* - * Special-case - softirqs can safely be enabled in - * cond_resched_softirq(), or by __do_softirq(), + * Special-case - softirqs can safely be enabled by __do_softirq(), * without processing still-pending softirqs: */ void _local_bh_enable(void) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b7591261652d..f89014a2c238 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -21,6 +21,7 @@ #include <linux/smpboot.h> #include <linux/atomic.h> #include <linux/nmi.h> +#include <linux/sched/wake_q.h> /* * Structure to determine completion condition and record errors. May @@ -36,7 +37,7 @@ struct cpu_stop_done { struct cpu_stopper { struct task_struct *thread; - spinlock_t lock; + raw_spinlock_t lock; bool enabled; /* is this stopper enabled? */ struct list_head works; /* list of pending works */ @@ -65,26 +66,30 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done) } static void __cpu_stop_queue_work(struct cpu_stopper *stopper, - struct cpu_stop_work *work) + struct cpu_stop_work *work, + struct wake_q_head *wakeq) { list_add_tail(&work->list, &stopper->works); - wake_up_process(stopper->thread); + wake_q_add(wakeq, stopper->thread); } /* queue @work to @stopper. if offline, @work is completed immediately */ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + DEFINE_WAKE_Q(wakeq); unsigned long flags; bool enabled; - spin_lock_irqsave(&stopper->lock, flags); + raw_spin_lock_irqsave(&stopper->lock, flags); enabled = stopper->enabled; if (enabled) - __cpu_stop_queue_work(stopper, work); + __cpu_stop_queue_work(stopper, work, &wakeq); else if (work->done) cpu_stop_signal_done(work->done); - spin_unlock_irqrestore(&stopper->lock, flags); + raw_spin_unlock_irqrestore(&stopper->lock, flags); + + wake_up_q(&wakeq); return enabled; } @@ -229,10 +234,11 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, { struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); + DEFINE_WAKE_Q(wakeq); int err; retry: - spin_lock_irq(&stopper1->lock); - spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock_irq(&stopper1->lock); + raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); err = -ENOENT; if (!stopper1->enabled || !stopper2->enabled) @@ -252,17 +258,20 @@ retry: goto unlock; err = 0; - __cpu_stop_queue_work(stopper1, work1); - __cpu_stop_queue_work(stopper2, work2); + __cpu_stop_queue_work(stopper1, work1, &wakeq); + __cpu_stop_queue_work(stopper2, work2, &wakeq); unlock: - spin_unlock(&stopper2->lock); - spin_unlock_irq(&stopper1->lock); + raw_spin_unlock(&stopper2->lock); + raw_spin_unlock_irq(&stopper1->lock); if (unlikely(err == -EDEADLK)) { while (stop_cpus_in_progress) cpu_relax(); goto retry; } + + wake_up_q(&wakeq); + return err; } /** @@ -448,9 +457,9 @@ static int cpu_stop_should_run(unsigned int cpu) unsigned long flags; int run; - spin_lock_irqsave(&stopper->lock, flags); + raw_spin_lock_irqsave(&stopper->lock, flags); run = !list_empty(&stopper->works); - spin_unlock_irqrestore(&stopper->lock, flags); + raw_spin_unlock_irqrestore(&stopper->lock, flags); return run; } @@ -461,13 +470,13 @@ static void cpu_stopper_thread(unsigned int cpu) repeat: work = NULL; - spin_lock_irq(&stopper->lock); + raw_spin_lock_irq(&stopper->lock); if (!list_empty(&stopper->works)) { work = list_first_entry(&stopper->works, struct cpu_stop_work, list); list_del_init(&work->list); } - spin_unlock_irq(&stopper->lock); + raw_spin_unlock_irq(&stopper->lock); if (work) { cpu_stop_fn_t fn = work->fn; @@ -541,7 +550,7 @@ static int __init cpu_stop_init(void) for_each_possible_cpu(cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - spin_lock_init(&stopper->lock); + raw_spin_lock_init(&stopper->lock); INIT_LIST_HEAD(&stopper->works); } diff --git a/kernel/sys.c b/kernel/sys.c index ad692183dfe9..38509dc1f77b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -61,6 +61,8 @@ #include <linux/uidgid.h> #include <linux/cred.h> +#include <linux/nospec.h> + #include <linux/kmsg_dump.h> /* Move somewhere else to avoid recompiling? */ #include <generated/utsrelease.h> @@ -69,6 +71,9 @@ #include <asm/io.h> #include <asm/unistd.h> +/* Hardening for Spectre-v1 */ +#include <linux/nospec.h> + #include "uid16.h" #ifndef SET_UNALIGN_CTL @@ -1451,6 +1456,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, if (resource >= RLIM_NLIMITS) return -EINVAL; + resource = array_index_nospec(resource, RLIM_NLIMITS); task_lock(current->group_leader); x = current->signal->rlim[resource]; task_unlock(current->group_leader); @@ -1470,6 +1476,7 @@ COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, if (resource >= RLIM_NLIMITS) return -EINVAL; + resource = array_index_nospec(resource, RLIM_NLIMITS); task_lock(current->group_leader); r = current->signal->rlim[resource]; task_unlock(current->group_leader); @@ -2011,7 +2018,11 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data return error; } - down_write(&mm->mmap_sem); + /* + * arg_lock protects concurent updates but we still need mmap_sem for + * read to exclude races with sys_brk. + */ + down_read(&mm->mmap_sem); /* * We don't validate if these members are pointing to @@ -2025,6 +2036,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * to any problem in kernel itself */ + spin_lock(&mm->arg_lock); mm->start_code = prctl_map.start_code; mm->end_code = prctl_map.end_code; mm->start_data = prctl_map.start_data; @@ -2036,6 +2048,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data mm->arg_end = prctl_map.arg_end; mm->env_start = prctl_map.env_start; mm->env_end = prctl_map.env_end; + spin_unlock(&mm->arg_lock); /* * Note this update of @saved_auxv is lockless thus @@ -2048,7 +2061,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - up_write(&mm->mmap_sem); + up_read(&mm->mmap_sem); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ @@ -2242,6 +2255,17 @@ static int propagate_has_child_subreaper(struct task_struct *p, void *data) return 1; } +int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which) +{ + return -EINVAL; +} + +int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, + unsigned long ctrl) +{ + return -EINVAL; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2450,6 +2474,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SVE_GET_VL: error = SVE_GET_VL(); break; + case PR_GET_SPECULATION_CTRL: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_prctl_spec_ctrl_get(me, arg2); + break; + case PR_SET_SPECULATION_CTRL: + if (arg4 || arg5) + return -EINVAL; + error = arch_prctl_spec_ctrl_set(me, arg2, arg3); + break; default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 9791364925dc..df556175be50 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -43,7 +43,9 @@ COND_SYSCALL(io_submit); COND_SYSCALL_COMPAT(io_submit); COND_SYSCALL(io_cancel); COND_SYSCALL(io_getevents); +COND_SYSCALL(io_pgetevents); COND_SYSCALL_COMPAT(io_getevents); +COND_SYSCALL_COMPAT(io_pgetevents); /* fs/xattr.c */ @@ -365,7 +367,7 @@ COND_SYSCALL(s390_pci_mmio_write); COND_SYSCALL_COMPAT(s390_ipc); /* powerpc */ -cond_syscall(ppc_rtas); +COND_SYSCALL(rtas); COND_SYSCALL(spu_run); COND_SYSCALL(spu_create); COND_SYSCALL(subpage_prot); @@ -430,3 +432,6 @@ COND_SYSCALL(setresgid16); COND_SYSCALL(setresuid16); COND_SYSCALL(setreuid16); COND_SYSCALL(setuid16); + +/* restartable sequence */ +COND_SYSCALL(rseq); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6a78cf70761d..2d9837c0aff4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3047,7 +3047,8 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long), + tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len), + sizeof(unsigned long), GFP_KERNEL); if (!tmp_bitmap) { kfree(kbuf); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index e8c0dab4fd65..07148b497451 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -704,24 +704,6 @@ static const struct bin_table bin_net_netfilter_table[] = { {} }; -static const struct bin_table bin_net_irda_table[] = { - { CTL_INT, NET_IRDA_DISCOVERY, "discovery" }, - { CTL_STR, NET_IRDA_DEVNAME, "devname" }, - { CTL_INT, NET_IRDA_DEBUG, "debug" }, - { CTL_INT, NET_IRDA_FAST_POLL, "fast_poll_increase" }, - { CTL_INT, NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" }, - { CTL_INT, NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" }, - { CTL_INT, NET_IRDA_SLOT_TIMEOUT, "slot_timeout" }, - { CTL_INT, NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" }, - { CTL_INT, NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" }, - { CTL_INT, NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" }, - { CTL_INT, NET_IRDA_MAX_TX_WINDOW, "max_tx_window" }, - { CTL_INT, NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" }, - { CTL_INT, NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" }, - { CTL_INT, NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" }, - {} -}; - static const struct bin_table bin_net_table[] = { { CTL_DIR, NET_CORE, "core", bin_net_core_table }, /* NET_ETHER not used */ @@ -743,7 +725,7 @@ static const struct bin_table bin_net_table[] = { { CTL_DIR, NET_LLC, "llc", bin_net_llc_table }, { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table }, /* NET_DCCP "dccp" no longer used */ - { CTL_DIR, NET_IRDA, "irda", bin_net_irda_table }, + /* NET_IRDA "irda" no longer used */ { CTL_INT, 2089, "nf_conntrack_max" }, {} }; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 0e974cface0b..f89a78e2792b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -119,8 +119,15 @@ static DEFINE_SPINLOCK(watchdog_lock); static int watchdog_running; static atomic_t watchdog_reset_pending; -static int clocksource_watchdog_kthread(void *data); -static void __clocksource_change_rating(struct clocksource *cs, int rating); +static void inline clocksource_watchdog_lock(unsigned long *flags) +{ + spin_lock_irqsave(&watchdog_lock, *flags); +} + +static void inline clocksource_watchdog_unlock(unsigned long *flags) +{ + spin_unlock_irqrestore(&watchdog_lock, *flags); +} /* * Interval: 0.5sec Threshold: 0.0625s @@ -128,23 +135,24 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating); #define WATCHDOG_INTERVAL (HZ >> 1) #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) -static void clocksource_watchdog_work(struct work_struct *work) -{ - /* - * If kthread_run fails the next watchdog scan over the - * watchdog_list will find the unstable clock again. - */ - kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); -} - static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; + /* + * If the clocksource is registered clocksource_watchdog_work() will + * re-rate and re-select. + */ + if (list_empty(&cs->list)) { + cs->rating = 0; + return; + } + if (cs->mark_unstable) cs->mark_unstable(cs); + /* kick clocksource_watchdog_work() */ if (finished_booting) schedule_work(&watchdog_work); } @@ -153,10 +161,8 @@ static void __clocksource_unstable(struct clocksource *cs) * clocksource_mark_unstable - mark clocksource unstable via watchdog * @cs: clocksource to be marked unstable * - * This function is called instead of clocksource_change_rating from - * cpu hotplug code to avoid a deadlock between the clocksource mutex - * and the cpu hotplug mutex. It defers the update of the clocksource - * to the watchdog thread. + * This function is called by the x86 TSC code to mark clocksources as unstable; + * it defers demotion and re-selection to a work. */ void clocksource_mark_unstable(struct clocksource *cs) { @@ -164,7 +170,7 @@ void clocksource_mark_unstable(struct clocksource *cs) spin_lock_irqsave(&watchdog_lock, flags); if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) { - if (list_empty(&cs->wd_list)) + if (!list_empty(&cs->list) && list_empty(&cs->wd_list)) list_add(&cs->wd_list, &watchdog_list); __clocksource_unstable(cs); } @@ -319,9 +325,8 @@ static void clocksource_resume_watchdog(void) static void clocksource_enqueue_watchdog(struct clocksource *cs) { - unsigned long flags; + INIT_LIST_HEAD(&cs->wd_list); - spin_lock_irqsave(&watchdog_lock, flags); if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { /* cs is a clocksource to be watched. */ list_add(&cs->wd_list, &watchdog_list); @@ -331,7 +336,6 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } - spin_unlock_irqrestore(&watchdog_lock, flags); } static void clocksource_select_watchdog(bool fallback) @@ -373,9 +377,6 @@ static void clocksource_select_watchdog(bool fallback) static void clocksource_dequeue_watchdog(struct clocksource *cs) { - unsigned long flags; - - spin_lock_irqsave(&watchdog_lock, flags); if (cs != watchdog) { if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { /* cs is a watched clocksource. */ @@ -384,21 +385,21 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs) clocksource_stop_watchdog(); } } - spin_unlock_irqrestore(&watchdog_lock, flags); } -static int __clocksource_watchdog_kthread(void) +static void __clocksource_change_rating(struct clocksource *cs, int rating); + +static int __clocksource_watchdog_work(void) { struct clocksource *cs, *tmp; unsigned long flags; - LIST_HEAD(unstable); int select = 0; spin_lock_irqsave(&watchdog_lock, flags); list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { list_del_init(&cs->wd_list); - list_add(&cs->wd_list, &unstable); + __clocksource_change_rating(cs, 0); select = 1; } if (cs->flags & CLOCK_SOURCE_RESELECT) { @@ -410,21 +411,15 @@ static int __clocksource_watchdog_kthread(void) clocksource_stop_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); - /* Needs to be done outside of watchdog lock */ - list_for_each_entry_safe(cs, tmp, &unstable, wd_list) { - list_del_init(&cs->wd_list); - __clocksource_change_rating(cs, 0); - } return select; } -static int clocksource_watchdog_kthread(void *data) +static void clocksource_watchdog_work(struct work_struct *work) { mutex_lock(&clocksource_mutex); - if (__clocksource_watchdog_kthread()) + if (__clocksource_watchdog_work()) clocksource_select(); mutex_unlock(&clocksource_mutex); - return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) @@ -443,10 +438,13 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -static inline int __clocksource_watchdog_kthread(void) { return 0; } +static inline int __clocksource_watchdog_work(void) { return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } void clocksource_mark_unstable(struct clocksource *cs) { } +static inline void clocksource_watchdog_lock(unsigned long *flags) { } +static inline void clocksource_watchdog_unlock(unsigned long *flags) { } + #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ /** @@ -674,7 +672,7 @@ static int __init clocksource_done_booting(void) /* * Run the watchdog first to eliminate unstable clock sources */ - __clocksource_watchdog_kthread(); + __clocksource_watchdog_work(); clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; @@ -779,14 +777,19 @@ EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); */ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) { + unsigned long flags; /* Initialize mult/shift and max_idle_ns */ __clocksource_update_freq_scale(cs, scale, freq); /* Add clocksource to the clocksource list */ mutex_lock(&clocksource_mutex); + + clocksource_watchdog_lock(&flags); clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); + clocksource_watchdog_unlock(&flags); + clocksource_select(); clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); @@ -808,8 +811,13 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating) */ void clocksource_change_rating(struct clocksource *cs, int rating) { + unsigned long flags; + mutex_lock(&clocksource_mutex); + clocksource_watchdog_lock(&flags); __clocksource_change_rating(cs, rating); + clocksource_watchdog_unlock(&flags); + clocksource_select(); clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); @@ -821,6 +829,8 @@ EXPORT_SYMBOL(clocksource_change_rating); */ static int clocksource_unbind(struct clocksource *cs) { + unsigned long flags; + if (clocksource_is_watchdog(cs)) { /* Select and try to install a replacement watchdog. */ clocksource_select_watchdog(true); @@ -834,8 +844,12 @@ static int clocksource_unbind(struct clocksource *cs) if (curr_clocksource == cs) return -EBUSY; } + + clocksource_watchdog_lock(&flags); clocksource_dequeue_watchdog(cs); list_del_init(&cs->list); + clocksource_watchdog_unlock(&flags); + return 0; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index eda1210ce50f..055a4a728c00 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -91,6 +91,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get_real, }, { + .index = HRTIMER_BASE_BOOTTIME, + .clockid = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + }, + { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, @@ -106,6 +111,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get_real, }, { + .index = HRTIMER_BASE_BOOTTIME_SOFT, + .clockid = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + }, + { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, @@ -119,7 +129,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_MONOTONIC, + [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; @@ -571,12 +581,14 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, - offs_real, offs_tai); + offs_real, offs_boot, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; + base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; return now; @@ -1747,8 +1759,10 @@ out: return ret; } -SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, - struct timespec __user *, rmtp) +#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) + +SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) { struct timespec64 tu; @@ -1763,7 +1777,9 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } -#ifdef CONFIG_COMPAT +#endif + +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 2541bd89f20e..5a6251ac6f7a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1205,10 +1205,12 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, u64 *newval, u64 *oldval) { u64 now; + int ret; WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); + ret = cpu_timer_sample_group(clock_idx, tsk, &now); - if (oldval && cpu_timer_sample_group(clock_idx, tsk, &now) != -EINVAL) { + if (oldval && ret != -EINVAL) { /* * We are setting itimer. The *oldval is absolute and we update * it to be relative, *newval argument is relative and we update diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index e0dbae98db9d..26aa9569e24a 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -59,7 +59,7 @@ SYS_NI(alarm); */ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - const struct timespec __user *, tp) + const struct __kernel_timespec __user *, tp) { struct timespec64 new_tp; @@ -83,8 +83,6 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) case CLOCK_BOOTTIME: get_monotonic_boottime64(tp); break; - case CLOCK_MONOTONIC_ACTIVE: - ktime_get_active_ts64(tp); default: return -EINVAL; } @@ -92,7 +90,7 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) return 0; } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct timespec __user *, tp) + struct __kernel_timespec __user *, tp) { int ret; struct timespec64 kernel_tp; @@ -106,7 +104,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, return 0; } -SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) { struct timespec64 rtn_tp = { .tv_sec = 0, @@ -126,8 +124,8 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __us } SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, - const struct timespec __user *, rqtp, - struct timespec __user *, rmtp) + const struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) { struct timespec64 t; @@ -160,7 +158,9 @@ COMPAT_SYS_NI(timer_settime); COMPAT_SYS_NI(timer_gettime); COMPAT_SYS_NI(getitimer); COMPAT_SYS_NI(setitimer); +#endif +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, struct compat_timespec __user *, tp) { diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index b6899b5060bd..e08ce3f27447 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -252,16 +252,15 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * return 0; } -static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) { - timekeeping_clocktai64(tp); + get_monotonic_boottime64(tp); return 0; } -static int posix_get_monotonic_active(clockid_t which_clock, - struct timespec64 *tp) +static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) { - ktime_get_active_ts64(tp); + timekeeping_clocktai64(tp); return 0; } @@ -1041,7 +1040,7 @@ void exit_itimers(struct signal_struct *sig) } SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - const struct timespec __user *, tp) + const struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 new_tp; @@ -1056,7 +1055,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct timespec __user *,tp) + struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 kernel_tp; @@ -1097,7 +1096,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, } SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, - struct timespec __user *, tp) + struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 rtn_tp; @@ -1114,7 +1113,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, return error; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, struct compat_timespec __user *, tp) @@ -1149,6 +1148,10 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, return err; } +#endif + +#ifdef CONFIG_COMPAT + COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, struct compat_timex __user *, utp) { @@ -1173,6 +1176,10 @@ COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, return err; } +#endif + +#ifdef CONFIG_COMPAT_32BIT_TIME + COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, struct compat_timespec __user *, tp) { @@ -1204,8 +1211,8 @@ static int common_nsleep(const clockid_t which_clock, int flags, } SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, - const struct timespec __user *, rqtp, - struct timespec __user *, rmtp) + const struct __kernel_timespec __user *, rqtp, + struct __kernel_timespec __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; @@ -1228,7 +1235,8 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return kc->nsleep(which_clock, flags, &t); } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME + COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) @@ -1253,6 +1261,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return kc->nsleep(which_clock, flags, &t); } + #endif static const struct k_clock clock_realtime = { @@ -1317,9 +1326,19 @@ static const struct k_clock clock_tai = { .timer_arm = common_hrtimer_arm, }; -static const struct k_clock clock_monotonic_active = { +static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_monotonic_active, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, }; static const struct k_clock * const posix_clocks[] = { @@ -1330,11 +1349,10 @@ static const struct k_clock * const posix_clocks[] = { [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, - [CLOCK_BOOTTIME] = &clock_monotonic, + [CLOCK_BOOTTIME] = &clock_boottime, [CLOCK_REALTIME_ALARM] = &alarm_clock, [CLOCK_BOOTTIME_ALARM] = &alarm_clock, [CLOCK_TAI] = &clock_tai, - [CLOCK_MONOTONIC_ACTIVE] = &clock_monotonic_active, }; static const struct k_clock *clockid_to_kclock(const clockid_t id) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index b398c2ea69b2..aa2094d5dd27 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -612,6 +612,14 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) now = ktime_get(); /* Find all expired events */ for_each_cpu(cpu, tick_broadcast_oneshot_mask) { + /* + * Required for !SMP because for_each_cpu() reports + * unconditionally CPU0 as set on UP kernels. + */ + if (!IS_ENABLED(CONFIG_SMP) && + cpumask_empty(tick_broadcast_oneshot_mask)) + break; + td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev->next_event <= now) { cpumask_set_cpu(cpu, tmpmask); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 099572ca4a8f..b7005dd21ec1 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -277,7 +277,8 @@ static bool tick_check_preferred(struct clock_event_device *curdev, */ return !curdev || newdev->rating > curdev->rating || - !cpumask_equal(curdev->cpumask, newdev->cpumask); + (!cpumask_equal(curdev->cpumask, newdev->cpumask) && + !tick_check_percpu(curdev, newdev, smp_processor_id())); } /* @@ -419,19 +420,6 @@ void tick_suspend_local(void) clockevents_shutdown(td->evtdev); } -static void tick_forward_next_period(void) -{ - ktime_t delta, now = ktime_get(); - u64 n; - - delta = ktime_sub(now, tick_next_period); - n = ktime_divns(delta, tick_period); - tick_next_period += n * tick_period; - if (tick_next_period < now) - tick_next_period += tick_period; - tick_sched_forward_next_period(); -} - /** * tick_resume_local - Resume the local tick device * @@ -444,8 +432,6 @@ void tick_resume_local(void) struct tick_device *td = this_cpu_ptr(&tick_cpu_device); bool broadcast = tick_resume_check_broadcast(); - tick_forward_next_period(); - clockevents_tick_resume(td->evtdev); if (!broadcast) { if (td->mode == TICKDEV_MODE_PERIODIC) @@ -505,6 +491,7 @@ void tick_freeze(void) if (tick_freeze_depth == num_online_cpus()) { trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), true); + system_state = SYSTEM_SUSPEND; timekeeping_suspend(); } else { tick_suspend_local(); @@ -528,6 +515,7 @@ void tick_unfreeze(void) if (tick_freeze_depth == num_online_cpus()) { timekeeping_resume(); + system_state = SYSTEM_RUNNING; trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); } else { diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 21efab7485ca..e277284c2831 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -141,12 +141,6 @@ static inline void tick_check_oneshot_broadcast_this_cpu(void) { } static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } #endif /* !(BROADCAST && ONESHOT) */ -#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) -extern void tick_sched_forward_next_period(void); -#else -static inline void tick_sched_forward_next_period(void) { } -#endif - /* NO_HZ_FULL internal */ #ifdef CONFIG_NO_HZ_FULL extern void tick_nohz_init(void); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index c1f518e7aa80..6fe615d57ebb 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -82,16 +82,15 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || !tick_device_is_functional(dev)) { - printk(KERN_INFO "Clockevents: " - "could not switch to one-shot mode:"); + pr_info("Clockevents: could not switch to one-shot mode:"); if (!dev) { - printk(" no tick device\n"); + pr_cont(" no tick device\n"); } else { if (!tick_device_is_functional(dev)) - printk(" %s is not functional.\n", dev->name); + pr_cont(" %s is not functional.\n", dev->name); else - printk(" %s does not support one-shot mode.\n", - dev->name); + pr_cont(" %s does not support one-shot mode.\n", + dev->name); } return -EINVAL; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 646645e981f9..da9455a6b42b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -52,15 +52,6 @@ struct tick_sched *tick_get_tick_sched(int cpu) static ktime_t last_jiffies_update; /* - * Called after resume. Make sure that jiffies are not fast forwarded due to - * clock monotonic being forwarded by the suspended time. - */ -void tick_sched_forward_next_period(void) -{ - last_jiffies_update = tick_next_period; -} - -/* * Must be called with interrupts disabled ! */ static void tick_do_update_jiffies64(ktime_t now) @@ -804,12 +795,12 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) return; } - hrtimer_set_expires(&ts->sched_timer, tick); - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); - else + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); + } else { + hrtimer_set_expires(&ts->sched_timer, tick); tick_program_event(tick, 1); + } } static void tick_nohz_retain_tick(struct tick_sched *ts) diff --git a/kernel/time/time.c b/kernel/time/time.c index 3044d48ebe56..6fa99213fc72 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -407,7 +407,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); -#if __BITS_PER_LONG == 32 /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * @@ -468,7 +467,6 @@ struct timespec ns_to_timespec(const s64 nsec) return ts; } EXPORT_SYMBOL(ns_to_timespec); -#endif /** * ns_to_timeval - Convert nanoseconds to timeval @@ -853,9 +851,9 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, } int get_timespec64(struct timespec64 *ts, - const struct timespec __user *uts) + const struct __kernel_timespec __user *uts) { - struct timespec kts; + struct __kernel_timespec kts; int ret; ret = copy_from_user(&kts, uts, sizeof(kts)); @@ -863,6 +861,11 @@ int get_timespec64(struct timespec64 *ts, return -EFAULT; ts->tv_sec = kts.tv_sec; + + /* Zero out the padding for 32 bit systems or in compat mode */ + if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())) + kts.tv_nsec &= 0xFFFFFFFFUL; + ts->tv_nsec = kts.tv_nsec; return 0; @@ -870,16 +873,61 @@ int get_timespec64(struct timespec64 *ts, EXPORT_SYMBOL_GPL(get_timespec64); int put_timespec64(const struct timespec64 *ts, - struct timespec __user *uts) + struct __kernel_timespec __user *uts) { - struct timespec kts = { + struct __kernel_timespec kts = { .tv_sec = ts->tv_sec, .tv_nsec = ts->tv_nsec }; + return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0; } EXPORT_SYMBOL_GPL(put_timespec64); +int __compat_get_timespec64(struct timespec64 *ts64, + const struct compat_timespec __user *cts) +{ + struct compat_timespec ts; + int ret; + + ret = copy_from_user(&ts, cts, sizeof(ts)); + if (ret) + return -EFAULT; + + ts64->tv_sec = ts.tv_sec; + ts64->tv_nsec = ts.tv_nsec; + + return 0; +} + +int __compat_put_timespec64(const struct timespec64 *ts64, + struct compat_timespec __user *cts) +{ + struct compat_timespec ts = { + .tv_sec = ts64->tv_sec, + .tv_nsec = ts64->tv_nsec + }; + return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; +} + +int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; + else + return __compat_get_timespec64(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_get_timespec64); + +int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; + else + return __compat_put_timespec64(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_put_timespec64); + int get_itimerspec64(struct itimerspec64 *it, const struct itimerspec __user *uit) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ca90219a1e73..4786df904c22 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -138,12 +138,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { - /* Update both bases so mono and raw stay coupled. */ - tk->tkr_mono.base += delta; - tk->tkr_raw.base += delta; - - /* Accumulate time spent in suspend */ - tk->time_suspended += delta; + tk->offs_boot = ktime_add(tk->offs_boot, delta); } /* @@ -473,6 +468,36 @@ u64 ktime_get_raw_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); +/** + * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. + * + * To keep it NMI safe since we're accessing from tracing, we're not using a + * separate timekeeper with updates to monotonic clock and boot offset + * protected with seqlocks. This has the following minor side effects: + * + * (1) Its possible that a timestamp be taken after the boot offset is updated + * but before the timekeeper is updated. If this happens, the new boot offset + * is added to the old timekeeping making the clock appear to update slightly + * earlier: + * CPU 0 CPU 1 + * timekeeping_inject_sleeptime64() + * __timekeeping_inject_sleeptime(tk, delta); + * timestamp(); + * timekeeping_update(tk, TK_CLEAR_NTP...); + * + * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be + * partially updated. Since the tk->offs_boot update is a rare event, this + * should be a rare occurrence which postprocessing should be able to handle. + */ +u64 notrace ktime_get_boot_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); +} +EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + + /* * See comment for __ktime_get_fast_ns() vs. timestamp ordering */ @@ -680,18 +705,19 @@ static void timekeeping_forward_now(struct timekeeper *tk) } /** - * __getnstimeofday64 - Returns the time of day in a timespec64. + * ktime_get_real_ts64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * - * Updates the time of day in the timespec. - * Returns 0 on success, or -ve when suspended (timespec will be undefined). + * Returns the time of day in a timespec64 (WARN if suspended). */ -int __getnstimeofday64(struct timespec64 *ts) +void ktime_get_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; u64 nsecs; + WARN_ON(timekeeping_suspended); + do { seq = read_seqcount_begin(&tk_core.seq); @@ -702,28 +728,8 @@ int __getnstimeofday64(struct timespec64 *ts) ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); - - /* - * Do not bail out early, in case there were callers still using - * the value, even in the face of the WARN_ON. - */ - if (unlikely(timekeeping_suspended)) - return -EAGAIN; - return 0; } -EXPORT_SYMBOL(__getnstimeofday64); - -/** - * getnstimeofday64 - Returns the time of day in a timespec64. - * @ts: pointer to the timespec64 to be set - * - * Returns the time of day in a timespec64 (WARN if suspended). - */ -void getnstimeofday64(struct timespec64 *ts) -{ - WARN_ON(__getnstimeofday64(ts)); -} -EXPORT_SYMBOL(getnstimeofday64); +EXPORT_SYMBOL(ktime_get_real_ts64); ktime_t ktime_get(void) { @@ -764,6 +770,7 @@ EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); static ktime_t *offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, + [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, }; @@ -788,6 +795,25 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs) } EXPORT_SYMBOL_GPL(ktime_get_with_offset); +ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base, *offset = offsets[offs]; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = ktime_add(tk->tkr_mono.base, *offset); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return base; + +} +EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); + /** * ktime_mono_to_any() - convert mononotic time to any other time * @tmono: time to convert. @@ -861,39 +887,6 @@ void ktime_get_ts64(struct timespec64 *ts) EXPORT_SYMBOL_GPL(ktime_get_ts64); /** - * ktime_get_active_ts64 - Get the active non-suspended monotonic clock - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime clock and - * the wall_to_monotonic offset, subtracts the accumulated suspend time and - * stores the result in normalized timespec64 format in the variable - * pointed to by @ts. - */ -void ktime_get_active_ts64(struct timespec64 *ts) -{ - struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 tomono, tsusp; - u64 nsec, nssusp; - unsigned int seq; - - WARN_ON(timekeeping_suspended); - - do { - seq = read_seqcount_begin(&tk_core.seq); - ts->tv_sec = tk->xtime_sec; - nsec = timekeeping_get_ns(&tk->tkr_mono); - tomono = tk->wall_to_monotonic; - nssusp = tk->time_suspended; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - ts->tv_sec += tomono.tv_sec; - ts->tv_nsec = 0; - timespec64_add_ns(ts, nsec + tomono.tv_nsec); - tsusp = ns_to_timespec64(nssusp); - *ts = timespec64_sub(*ts, tsusp); -} - -/** * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC * * Returns the seconds portion of CLOCK_MONOTONIC with a single non @@ -1417,12 +1410,12 @@ int timekeeping_notify(struct clocksource *clock) } /** - * getrawmonotonic64 - Returns the raw monotonic time in a timespec + * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec * @ts: pointer to the timespec64 to be set * * Returns the raw monotonic time (completely un-modified by ntp) */ -void getrawmonotonic64(struct timespec64 *ts) +void ktime_get_raw_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; @@ -1438,7 +1431,7 @@ void getrawmonotonic64(struct timespec64 *ts) ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); } -EXPORT_SYMBOL(getrawmonotonic64); +EXPORT_SYMBOL(ktime_get_raw_ts64); /** @@ -1593,6 +1586,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, return; } tk_xtime_add(tk, delta); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } @@ -2125,7 +2119,7 @@ out: void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - ktime_t t = ktime_sub(tk->offs_real, tk->time_suspended); + ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); *ts = ktime_to_timespec64(t); } @@ -2139,30 +2133,20 @@ unsigned long get_seconds(void) } EXPORT_SYMBOL(get_seconds); -struct timespec __current_kernel_time(void) +void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - - return timespec64_to_timespec(tk_xtime(tk)); -} - -struct timespec64 current_kernel_time64(void) -{ - struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 now; unsigned long seq; do { seq = read_seqcount_begin(&tk_core.seq); - now = tk_xtime(tk); + *ts = tk_xtime(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); - - return now; } -EXPORT_SYMBOL(current_kernel_time64); +EXPORT_SYMBOL(ktime_get_coarse_real_ts64); -struct timespec64 get_monotonic_coarse64(void) +void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; @@ -2175,12 +2159,10 @@ struct timespec64 get_monotonic_coarse64(void) mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); - set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, + set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); - - return now; } -EXPORT_SYMBOL(get_monotonic_coarse64); +EXPORT_SYMBOL(ktime_get_coarse_ts64); /* * Must hold jiffies_lock @@ -2195,6 +2177,7 @@ void do_timer(unsigned long ticks) * ktime_get_update_offsets_now - hrtimer helper * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * * Returns current monotonic time and updates the offsets if the @@ -2204,7 +2187,7 @@ void do_timer(unsigned long ticks) * Called from hrtimer_interrupt() or retrigger_next_event() */ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, - ktime_t *offs_tai) + ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; @@ -2221,6 +2204,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, if (*cwsseq != tk->clock_was_set_seq) { *cwsseq = tk->clock_was_set_seq; *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; } diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 79b67f5e0343..7a9b4eb7a1d5 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -6,6 +6,7 @@ */ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, + ktime_t *offs_boot, ktime_t *offs_tai); extern int timekeeping_valid_for_hres(void); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 4a4fd567fb26..cc2d23e6ff61 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1251,18 +1251,18 @@ EXPORT_SYMBOL(try_to_del_timer_sync); * * Note: For !irqsafe timers, you must not hold locks that are held in * interrupt context while calling this function. Even if the lock has - * nothing to do with the timer in question. Here's why: + * nothing to do with the timer in question. Here's why:: * * CPU0 CPU1 * ---- ---- - * <SOFTIRQ> - * call_timer_fn(); - * base->running_timer = mytimer; - * spin_lock_irq(somelock); + * <SOFTIRQ> + * call_timer_fn(); + * base->running_timer = mytimer; + * spin_lock_irq(somelock); * <IRQ> * spin_lock(somelock); - * del_timer_sync(mytimer); - * while (base->running_timer == mytimer); + * del_timer_sync(mytimer); + * while (base->running_timer == mytimer); * * Now del_timer_sync() will never return and never release somelock. * The interrupt on the other CPU is waiting to grab somelock but diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 0ed768b56c60..d647dabdac97 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -28,8 +28,6 @@ struct timer_list_iter { u64 now; }; -typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); - /* * This allows printing both to /proc/timer_list and * to the console (on SysRq-Q): @@ -372,24 +370,12 @@ static const struct seq_operations timer_list_sops = { .show = timer_list_show, }; -static int timer_list_open(struct inode *inode, struct file *filp) -{ - return seq_open_private(filp, &timer_list_sops, - sizeof(struct timer_list_iter)); -} - -static const struct file_operations timer_list_fops = { - .open = timer_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - static int __init init_timer_list_procfs(void) { struct proc_dir_entry *pe; - pe = proc_create("timer_list", 0400, NULL, &timer_list_fops); + pe = proc_create_seq_private("timer_list", 0400, NULL, &timer_list_sops, + sizeof(struct timer_list_iter), NULL); if (!pe) return -ENOMEM; return 0; diff --git a/kernel/torture.c b/kernel/torture.c index 37b94012a3f8..3de1efbecd6a 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -574,7 +574,7 @@ void stutter_wait(const char *title) { int spt; - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); spt = READ_ONCE(stutter_pause_test); for (; spt; spt = READ_ONCE(stutter_pause_test)) { if (spt == 1) { diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index c4f0f2e4126e..dd6c0a2ad969 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -110,11 +110,7 @@ config GENERIC_TRACER # config TRACING_SUPPORT bool - # PPC32 has no irqflags tracing support, but it can use most of the - # tracers anyway, they were tested to build and work. Note that new - # exceptions to this list aren't welcomed, better implement the - # irqflags tracing for your architecture. - depends on TRACE_IRQFLAGS_SUPPORT || PPC32 + depends on TRACE_IRQFLAGS_SUPPORT depends on STACKTRACE_SUPPORT default y diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d88e96d4e12c..0ae6829804bc 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -14,12 +14,14 @@ #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/kprobes.h> +#include <linux/syscalls.h> #include <linux/error-injection.h> #include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); /** * trace_call_bpf - invoke BPF program @@ -474,8 +476,6 @@ BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; - if (unlikely(in_interrupt())) - return -EINVAL; if (unlikely(idx >= array->map.max_entries)) return -E2BIG; @@ -564,6 +564,10 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_prandom_u32_proto; case BPF_FUNC_probe_read_str: return &bpf_probe_read_str_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; +#endif default: return NULL; } @@ -577,6 +581,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; #ifdef CONFIG_BPF_KPROBE_OVERRIDE @@ -664,6 +670,25 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size, + u64, flags) +{ + struct pt_regs *regs = *(struct pt_regs **)tp_buff; + + return bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); +} + +static const struct bpf_func_proto bpf_get_stack_proto_tp = { + .func = bpf_get_stack_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -672,6 +697,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_tp; default: return tracing_func_proto(func_id, prog); } @@ -734,6 +761,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_tp; case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; default: @@ -744,7 +773,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) /* * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp * to avoid potential recursive reuse issue when/if tracepoints are added - * inside bpf_*_event_output and/or bpf_get_stack_id + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack */ static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, @@ -787,6 +816,26 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, + void *, buf, u32, size, u64, flags) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + + perf_fetch_caller_regs(regs); + return bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); +} + +static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { + .func = bpf_get_stack_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -795,6 +844,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto_raw_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_raw_tp; default: return tracing_func_proto(func_id, prog); } @@ -833,8 +884,14 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type return false; if (type != BPF_READ) return false; - if (off % size != 0) - return false; + if (off % size != 0) { + if (sizeof(unsigned long) != 4) + return false; + if (size != 8) + return false; + if (off % size != 4) + return false; + } switch (off) { case bpf_ctx_range(struct bpf_perf_event_data, sample_period): @@ -959,6 +1016,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event) old_array = event->tp_event->prog_array; ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); + if (ret == -ENOENT) + goto unlock; if (ret < 0) { bpf_prog_array_delete_safe(old_array, event->prog); } else { @@ -977,6 +1036,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) { struct perf_event_query_bpf __user *uquery = info; struct perf_event_query_bpf query = {}; + u32 *ids, prog_cnt, ids_len; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -985,16 +1045,32 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) return -EINVAL; if (copy_from_user(&query, uquery, sizeof(query))) return -EFAULT; - if (query.ids_len > BPF_TRACE_MAX_PROGS) + + ids_len = query.ids_len; + if (ids_len > BPF_TRACE_MAX_PROGS) return -E2BIG; + ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN); + if (!ids) + return -ENOMEM; + /* + * The above kcalloc returns ZERO_SIZE_PTR when ids_len = 0, which + * is required when user only wants to check for uquery->prog_cnt. + * There is no need to check for it since the case is handled + * gracefully in bpf_prog_array_copy_info. + */ mutex_lock(&bpf_event_mutex); ret = bpf_prog_array_copy_info(event->tp_event->prog_array, - uquery->ids, - query.ids_len, - &uquery->prog_cnt); + ids, + ids_len, + &prog_cnt); mutex_unlock(&bpf_event_mutex); + if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || + copy_to_user(uquery->ids, ids, ids_len * sizeof(u32))) + ret = -EFAULT; + + kfree(ids); return ret; } @@ -1100,3 +1176,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) mutex_unlock(&bpf_event_mutex); return err; } + +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, + u32 *fd_type, const char **buf, + u64 *probe_offset, u64 *probe_addr) +{ + bool is_tracepoint, is_syscall_tp; + struct bpf_prog *prog; + int flags, err = 0; + + prog = event->prog; + if (!prog) + return -ENOENT; + + /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) + return -EOPNOTSUPP; + + *prog_id = prog->aux->id; + flags = event->tp_event->flags; + is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT; + is_syscall_tp = is_syscall_trace_event(event->tp_event); + + if (is_tracepoint || is_syscall_tp) { + *buf = is_tracepoint ? event->tp_event->tp->name + : event->tp_event->name; + *fd_type = BPF_FD_TYPE_TRACEPOINT; + *probe_offset = 0x0; + *probe_addr = 0x0; + } else { + /* kprobe/uprobe */ + err = -EOPNOTSUPP; +#ifdef CONFIG_KPROBE_EVENTS + if (flags & TRACE_EVENT_FL_KPROBE) + err = bpf_get_kprobe_info(event, fd_type, buf, + probe_offset, probe_addr, + event->attr.type == PERF_TYPE_TRACEPOINT); +#endif +#ifdef CONFIG_UPROBE_EVENTS + if (flags & TRACE_EVENT_FL_UPROBE) + err = bpf_get_uprobe_info(event, fd_type, buf, + probe_offset, + event->attr.type == PERF_TYPE_TRACEPOINT); +#endif + } + + return err; +} diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 16bbf062018f..efed9c1cfb7e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -728,7 +728,7 @@ static int ftrace_profile_init_cpu(int cpu) */ size = FTRACE_PROFILE_HASH_SIZE; - stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); + stat->hash = kcalloc(size, sizeof(struct hlist_head), GFP_KERNEL); if (!stat->hash) return -ENOMEM; @@ -5514,10 +5514,10 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) ftrace_create_filter_files(&global_ops, d_tracer); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - trace_create_file("set_graph_function", 0444, d_tracer, + trace_create_file("set_graph_function", 0644, d_tracer, NULL, &ftrace_graph_fops); - trace_create_file("set_graph_notrace", 0444, d_tracer, + trace_create_file("set_graph_notrace", 0644, d_tracer, NULL, &ftrace_graph_notrace_fops); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -6830,9 +6830,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) struct task_struct *g, *t; for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { - ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH - * sizeof(struct ftrace_ret_stack), - GFP_KERNEL); + ret_stack_list[i] = + kmalloc_array(FTRACE_RETFUNC_DEPTH, + sizeof(struct ftrace_ret_stack), + GFP_KERNEL); if (!ret_stack_list[i]) { start = 0; end = i; @@ -6904,9 +6905,9 @@ static int start_graph_tracing(void) struct ftrace_ret_stack **ret_stack_list; int ret, cpu; - ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE * - sizeof(struct ftrace_ret_stack *), - GFP_KERNEL); + ret_stack_list = kmalloc_array(FTRACE_RETSTACK_ALLOC_SIZE, + sizeof(struct ftrace_ret_stack *), + GFP_KERNEL); if (!ret_stack_list) return -ENOMEM; @@ -7088,9 +7089,10 @@ void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) ret_stack = per_cpu(idle_ret_stack, cpu); if (!ret_stack) { - ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH - * sizeof(struct ftrace_ret_stack), - GFP_KERNEL); + ret_stack = + kmalloc_array(FTRACE_RETFUNC_DEPTH, + sizeof(struct ftrace_ret_stack), + GFP_KERNEL); if (!ret_stack) return; per_cpu(idle_ret_stack, cpu) = ret_stack; @@ -7109,9 +7111,9 @@ void ftrace_graph_init_task(struct task_struct *t) if (ftrace_graph_active) { struct ftrace_ret_stack *ret_stack; - ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH - * sizeof(struct ftrace_ret_stack), - GFP_KERNEL); + ret_stack = kmalloc_array(FTRACE_RETFUNC_DEPTH, + sizeof(struct ftrace_ret_stack), + GFP_KERNEL); if (!ret_stack) return; graph_init_task(t, ret_stack); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c9cb9767d49b..6a46af21765c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -809,7 +809,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); * * You can see, it is legitimate for the previous pointer of * the head (or any page) not to point back to itself. But only - * temporarially. + * temporarily. */ #define RB_PAGE_NORMAL 0UL @@ -906,7 +906,7 @@ static void rb_list_head_clear(struct list_head *list) } /* - * rb_head_page_dactivate - clears head page ptr (for free list) + * rb_head_page_deactivate - clears head page ptr (for free list) */ static void rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer) @@ -1780,7 +1780,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, put_online_cpus(); } else { - /* Make sure this CPU has been intitialized */ + /* Make sure this CPU has been initialized */ if (!cpumask_test_cpu(cpu_id, buffer->cpumask)) goto out; @@ -2325,7 +2325,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, /* * If we need to add a timestamp, then we - * add it to the start of the resevered space. + * add it to the start of the reserved space. */ if (unlikely(info->add_timestamp)) { bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer); @@ -2681,7 +2681,7 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) * ring_buffer_nest_start - Allow to trace while nested * @buffer: The ring buffer to modify * - * The ring buffer has a safty mechanism to prevent recursion. + * The ring buffer has a safety mechanism to prevent recursion. * But there may be a case where a trace needs to be done while * tracing something else. In this case, calling this function * will allow this function to nest within a currently active @@ -2699,7 +2699,7 @@ void ring_buffer_nest_start(struct ring_buffer *buffer) preempt_disable_notrace(); cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; - /* This is the shift value for the above recusive locking */ + /* This is the shift value for the above recursive locking */ cpu_buffer->nest += NESTED_BITS; } @@ -2718,7 +2718,7 @@ void ring_buffer_nest_end(struct ring_buffer *buffer) /* disabled by ring_buffer_nest_start() */ cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; - /* This is the shift value for the above recusive locking */ + /* This is the shift value for the above recursive locking */ cpu_buffer->nest -= NESTED_BITS; preempt_enable_notrace(); } @@ -2907,7 +2907,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, * @buffer: the ring buffer to reserve from * @length: the length of the data to reserve (excluding event header) * - * Returns a reseverd event on the ring buffer to copy directly to. + * Returns a reserved event on the ring buffer to copy directly to. * The user of this interface will need to get the body to write into * and can use the ring_buffer_event_data() interface. * @@ -3009,7 +3009,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, * This function lets the user discard an event in the ring buffer * and then that event will not be read later. * - * This function only works if it is called before the the item has been + * This function only works if it is called before the item has been * committed. It will try to free the event from the ring buffer * if another event has not been added behind it. * @@ -4127,7 +4127,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * through the buffer. Memory is allocated, buffer recording * is disabled, and the iterator pointer is returned to the caller. * - * Disabling buffer recordng prevents the reading from being + * Disabling buffer recording prevents the reading from being * corrupted. This is not a consuming read, so a producer is not * expected. * diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dfbcf9ee1447..c9336e98ac59 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -893,7 +893,7 @@ int __trace_bputs(unsigned long ip, const char *str) EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT -static void tracing_snapshot_instance(struct trace_array *tr) +void tracing_snapshot_instance(struct trace_array *tr) { struct tracer *tracer = tr->current_trace; unsigned long flags; @@ -949,7 +949,7 @@ static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, struct trace_buffer *size_buf, int cpu_id); static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); -static int alloc_snapshot(struct trace_array *tr) +int tracing_alloc_snapshot_instance(struct trace_array *tr) { int ret; @@ -995,7 +995,7 @@ int tracing_alloc_snapshot(void) struct trace_array *tr = &global_trace; int ret; - ret = alloc_snapshot(tr); + ret = tracing_alloc_snapshot_instance(tr); WARN_ON(ret < 0); return ret; @@ -1165,7 +1165,7 @@ static struct { { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, - { ktime_get_mono_fast_ns, "boot", 1 }, + { ktime_get_boot_fast_ns, "boot", 1 }, ARCH_TRACE_CLOCKS }; @@ -1751,12 +1751,13 @@ static inline void set_cmdline(int idx, const char *cmdline) static int allocate_cmdlines_buffer(unsigned int val, struct saved_cmdlines_buffer *s) { - s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid), - GFP_KERNEL); + s->map_cmdline_to_pid = kmalloc_array(val, + sizeof(*s->map_cmdline_to_pid), + GFP_KERNEL); if (!s->map_cmdline_to_pid) return -ENOMEM; - s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL); + s->saved_cmdlines = kmalloc_array(TASK_COMM_LEN, val, GFP_KERNEL); if (!s->saved_cmdlines) { kfree(s->map_cmdline_to_pid); return -ENOMEM; @@ -4360,7 +4361,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_TGID) { if (!tgid_map) - tgid_map = kzalloc((PID_MAX_DEFAULT + 1) * sizeof(*tgid_map), + tgid_map = kcalloc(PID_MAX_DEFAULT + 1, + sizeof(*tgid_map), GFP_KERNEL); if (!tgid_map) { tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; @@ -4395,8 +4397,7 @@ static int trace_set_options(struct trace_array *tr, char *option) { char *cmp; int neg = 0; - int ret = -ENODEV; - int i; + int ret; size_t orig_len = strlen(option); cmp = strstrip(option); @@ -4408,16 +4409,12 @@ static int trace_set_options(struct trace_array *tr, char *option) mutex_lock(&trace_types_lock); - for (i = 0; trace_options[i]; i++) { - if (strcmp(cmp, trace_options[i]) == 0) { - ret = set_tracer_flag(tr, 1 << i, !neg); - break; - } - } - + ret = match_string(trace_options, -1, cmp); /* If no option could be set, test the specific tracer options */ - if (!trace_options[i]) + if (ret < 0) ret = set_tracer_option(tr, cmp, neg); + else + ret = set_tracer_flag(tr, 1 << ret, !neg); mutex_unlock(&trace_types_lock); @@ -5068,7 +5065,7 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, * where the head holds the module and length of array, and the * tail holds a pointer to the next list. */ - map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); + map_array = kmalloc_array(len + 2, sizeof(*map_array), GFP_KERNEL); if (!map_array) { pr_warn("Unable to allocate trace eval mapping\n"); return; @@ -5408,7 +5405,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) #ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { - ret = alloc_snapshot(tr); + ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) goto out; } @@ -6074,6 +6071,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, { struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; + enum event_trigger_type tt = ETT_NONE; struct ring_buffer *buffer; struct print_entry *entry; unsigned long irq_flags; @@ -6122,6 +6120,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, written = cnt; len = cnt; + if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) { + /* do not add \n before testing triggers, but add \0 */ + entry->buf[cnt] = '\0'; + tt = event_triggers_call(tr->trace_marker_file, entry, event); + } + if (entry->buf[cnt - 1] != '\n') { entry->buf[cnt] = '\n'; entry->buf[cnt + 1] = '\0'; @@ -6130,6 +6134,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, __buffer_unlock_commit(buffer, event); + if (tt) + event_triggers_post_call(tr->trace_marker_file, tt); + if (written > 0) *fpos += written; @@ -6451,7 +6458,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, } #endif if (!tr->allocated_snapshot) { - ret = alloc_snapshot(tr); + ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) break; } @@ -7179,7 +7186,7 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, return ret; out_reg: - ret = alloc_snapshot(tr); + ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) goto out; @@ -7896,6 +7903,7 @@ static __init void create_trace_instances(struct dentry *d_tracer) static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) { + struct trace_event_file *file; int cpu; trace_create_file("available_tracers", 0444, d_tracer, @@ -7928,6 +7936,12 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_marker", 0220, d_tracer, tr, &tracing_mark_fops); + file = __find_event_file(tr, "ftrace", "print"); + if (file && file->dir) + trace_create_file("trigger", 0644, file->dir, file, + &event_trigger_fops); + tr->trace_marker_file = file; + trace_create_file("trace_marker_raw", 0220, d_tracer, tr, &tracing_mark_raw_fops); @@ -8111,6 +8125,8 @@ static __init int tracer_init_tracefs(void) if (IS_ERR(d_tracer)) return 0; + event_trace_init(); + init_tracer_tracefs(&global_trace, d_tracer); ftrace_init_tracefs_toplevel(&global_trace, d_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6fb46a06c9dc..630c5a24b2b2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -259,6 +259,7 @@ struct trace_array { struct trace_options *topts; struct list_head systems; struct list_head events; + struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ int ref; #ifdef CONFIG_FUNCTION_TRACER @@ -1334,7 +1335,7 @@ event_trigger_unlock_commit(struct trace_event_file *file, trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); if (tt) - event_triggers_post_call(file, tt, entry, event); + event_triggers_post_call(file, tt); } /** @@ -1367,7 +1368,7 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file, irq_flags, pc, regs); if (tt) - event_triggers_post_call(file, tt, entry, event); + event_triggers_post_call(file, tt); } #define FILTER_PRED_INVALID ((unsigned short)-1) @@ -1451,9 +1452,13 @@ trace_find_event_field(struct trace_event_call *call, char *name); extern void trace_event_enable_cmd_record(bool enable); extern void trace_event_enable_tgid_record(bool enable); +extern int event_trace_init(void); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); +extern struct trace_event_file *__find_event_file(struct trace_array *tr, + const char *system, + const char *event); extern struct trace_event_file *find_event_file(struct trace_array *tr, const char *system, const char *event); @@ -1817,6 +1822,17 @@ static inline void __init trace_event_init(void) { } static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } #endif +#ifdef CONFIG_TRACER_SNAPSHOT +void tracing_snapshot_instance(struct trace_array *tr); +int tracing_alloc_snapshot_instance(struct trace_array *tr); +#else +static inline void tracing_snapshot_instance(struct trace_array *tr) { } +static inline int tracing_alloc_snapshot_instance(struct trace_array *tr) +{ + return 0; +} +#endif + extern struct trace_iterator *tracepoint_print_iter; #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 22fee766081b..80e0b2aca703 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -159,13 +159,13 @@ static int benchmark_event_kthread(void *arg) * wants to run, schedule in, but if the CPU is idle, * we'll keep burning cycles. * - * Note the _rcu_qs() version of cond_resched() will + * Note the tasks_rcu_qs() version of cond_resched() will * notify synchronize_rcu_tasks() that this thread has * passed a quiescent state for rcu_tasks. Otherwise * this thread will never voluntarily schedule which would * block synchronize_rcu_tasks() indefinitely. */ - cond_resched(); + cond_resched_tasks_rcu_qs(); } return 0; diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e954ae3d82c0..1d67464ed95e 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -230,7 +230,7 @@ FTRACE_ENTRY(bprint, bprint_entry, FILTER_OTHER ); -FTRACE_ENTRY(print, print_entry, +FTRACE_ENTRY_REG(print, print_entry, TRACE_PRINT, @@ -242,7 +242,9 @@ FTRACE_ENTRY(print, print_entry, F_printk("%ps: %s", (void *)__entry->ip, __entry->buf), - FILTER_OTHER + FILTER_OTHER, + + ftrace_event_register ); FTRACE_ENTRY(raw_data, raw_data_entry, @@ -356,7 +358,7 @@ FTRACE_ENTRY(hwlat, hwlat_entry, __field( unsigned int, seqnum ) ), - F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n", + F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llu\tnmi-ts:%llu\tnmi-count:%u\n", __entry->seqnum, __entry->tv_sec, __entry->tv_nsec, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 05c7172c6667..14ff4ff3caab 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2007,16 +2007,18 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) return -1; } } - trace_create_file("filter", 0644, file->dir, file, - &ftrace_event_filter_fops); /* * Only event directories that can be enabled should have - * triggers. + * triggers or filters. */ - if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) + if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { + trace_create_file("filter", 0644, file->dir, file, + &ftrace_event_filter_fops); + trace_create_file("trigger", 0644, file->dir, file, &event_trigger_fops); + } #ifdef CONFIG_HIST_TRIGGERS trace_create_file("hist", 0444, file->dir, file, @@ -2473,8 +2475,9 @@ __trace_add_event_dirs(struct trace_array *tr) } } +/* Returns any file that matches the system and event */ struct trace_event_file * -find_event_file(struct trace_array *tr, const char *system, const char *event) +__find_event_file(struct trace_array *tr, const char *system, const char *event) { struct trace_event_file *file; struct trace_event_call *call; @@ -2485,10 +2488,7 @@ find_event_file(struct trace_array *tr, const char *system, const char *event) call = file->event_call; name = trace_event_name(call); - if (!name || !call->class || !call->class->reg) - continue; - - if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) + if (!name || !call->class) continue; if (strcmp(event, name) == 0 && @@ -2498,6 +2498,20 @@ find_event_file(struct trace_array *tr, const char *system, const char *event) return NULL; } +/* Returns valid trace event files that match system and event */ +struct trace_event_file * +find_event_file(struct trace_array *tr, const char *system, const char *event) +{ + struct trace_event_file *file; + + file = __find_event_file(tr, system, event); + if (!file || !file->event_call->class->reg || + file->event_call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) + return NULL; + + return file; +} + #ifdef CONFIG_DYNAMIC_FTRACE /* Avoid typos */ @@ -3132,7 +3146,7 @@ static __init int event_trace_enable_again(void) early_initcall(event_trace_enable_again); -static __init int event_trace_init(void) +__init int event_trace_init(void) { struct trace_array *tr; struct dentry *d_tracer; @@ -3177,8 +3191,6 @@ void __init trace_event_init(void) event_trace_enable(); } -fs_initcall(event_trace_init); - #ifdef CONFIG_FTRACE_STARTUP_TEST static DEFINE_SPINLOCK(test_spinlock); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9b4716bb8bb0..e1c818dbc0d7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -436,15 +436,15 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, nr_preds += 2; /* For TRUE and FALSE */ - op_stack = kmalloc(sizeof(*op_stack) * nr_parens, GFP_KERNEL); + op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL); if (!op_stack) return ERR_PTR(-ENOMEM); - prog_stack = kmalloc(sizeof(*prog_stack) * nr_preds, GFP_KERNEL); + prog_stack = kmalloc_array(nr_preds, sizeof(*prog_stack), GFP_KERNEL); if (!prog_stack) { parse_error(pe, -ENOMEM, 0); goto out_free; } - inverts = kmalloc(sizeof(*inverts) * nr_preds, GFP_KERNEL); + inverts = kmalloc_array(nr_preds, sizeof(*inverts), GFP_KERNEL); if (!inverts) { parse_error(pe, -ENOMEM, 0); goto out_free; @@ -750,28 +750,32 @@ static int filter_pred_none(struct filter_pred *pred, void *event) * * Note: * - @str might not be NULL-terminated if it's of type DYN_STRING - * or STATIC_STRING + * or STATIC_STRING, unless @len is zero. */ static int regex_match_full(char *str, struct regex *r, int len) { - if (strncmp(str, r->pattern, len) == 0) - return 1; - return 0; + /* len of zero means str is dynamic and ends with '\0' */ + if (!len) + return strcmp(str, r->pattern) == 0; + + return strncmp(str, r->pattern, len) == 0; } static int regex_match_front(char *str, struct regex *r, int len) { - if (strncmp(str, r->pattern, r->len) == 0) - return 1; - return 0; + if (len && len < r->len) + return 0; + + return strncmp(str, r->pattern, r->len) == 0; } static int regex_match_middle(char *str, struct regex *r, int len) { - if (strnstr(str, r->pattern, len)) - return 1; - return 0; + if (!len) + return strstr(str, r->pattern) != NULL; + + return strnstr(str, r->pattern, len) != NULL; } static int regex_match_end(char *str, struct regex *r, int len) @@ -1499,14 +1503,14 @@ static int process_preds(struct trace_event_call *call, return ret; } - if (!nr_preds) { - prog = NULL; - } else { - prog = predicate_parse(filter_string, nr_parens, nr_preds, + if (!nr_preds) + return -EINVAL; + + prog = predicate_parse(filter_string, nr_parens, nr_preds, parse_pred, call, pe); - if (IS_ERR(prog)) - return PTR_ERR(prog); - } + if (IS_ERR(prog)) + return PTR_ERR(prog); + rcu_assign_pointer(filter->prog, prog); return 0; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0d7b3ffbecc2..046c716a6536 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2466,6 +2466,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else if (strcmp(modifier, "usecs") == 0) *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; else { + hist_err("Invalid field modifier: ", modifier); field = ERR_PTR(-EINVAL); goto out; } @@ -2481,6 +2482,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { + hist_err("Couldn't find field: ", field_name); field = ERR_PTR(-EINVAL); goto out; } @@ -2863,7 +2865,7 @@ static struct trace_event_file *event_file(struct trace_array *tr, { struct trace_event_file *file; - file = find_event_file(tr, system, event_name); + file = __find_event_file(tr, system, event_name); if (!file) return ERR_PTR(-EINVAL); @@ -4913,6 +4915,16 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) seq_printf(m, "%s", field_name); } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP) seq_puts(m, "common_timestamp"); + + if (hist_field->flags) { + if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) && + !(hist_field->flags & HIST_FIELD_FL_EXPR)) { + const char *flags = get_hist_field_flags(hist_field); + + if (flags) + seq_printf(m, ".%s", flags); + } + } } static int event_hist_trigger_print(struct seq_file *m, diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d251cabcf69a..d18249683682 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -97,7 +97,6 @@ EXPORT_SYMBOL_GPL(event_triggers_call); * event_triggers_post_call - Call 'post_triggers' for a trace event * @file: The trace_event_file associated with the event * @tt: enum event_trigger_type containing a set bit for each trigger to invoke - * @rec: The trace entry for the event * * For each trigger associated with an event, invoke the trigger * function registered with the associated trigger command, if the @@ -108,8 +107,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call); */ void event_triggers_post_call(struct trace_event_file *file, - enum event_trigger_type tt, - void *rec, struct ring_buffer_event *event) + enum event_trigger_type tt) { struct event_trigger_data *data; @@ -117,7 +115,7 @@ event_triggers_post_call(struct trace_event_file *file, if (data->paused) continue; if (data->cmd_ops->trigger_type & tt) - data->ops->func(data, rec, event); + data->ops->func(data, NULL, NULL); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -483,9 +481,10 @@ clear_event_triggers(struct trace_array *tr) struct trace_event_file *file; list_for_each_entry(file, &tr->events, list) { - struct event_trigger_data *data; - list_for_each_entry_rcu(data, &file->triggers, list) { + struct event_trigger_data *data, *n; + list_for_each_entry_safe(data, n, &file->triggers, list) { trace_event_trigger_enable_disable(file, 0); + list_del_rcu(&data->list); if (data->ops->free) data->ops->free(data->ops, data); } @@ -642,6 +641,7 @@ event_trigger_callback(struct event_command *cmd_ops, trigger_data->count = -1; trigger_data->ops = trigger_ops; trigger_data->cmd_ops = cmd_ops; + trigger_data->private_data = file; INIT_LIST_HEAD(&trigger_data->list); INIT_LIST_HEAD(&trigger_data->named_list); @@ -1053,7 +1053,12 @@ static void snapshot_trigger(struct event_trigger_data *data, void *rec, struct ring_buffer_event *event) { - tracing_snapshot(); + struct trace_event_file *file = data->private_data; + + if (file) + tracing_snapshot_instance(file->tr); + else + tracing_snapshot(); } static void @@ -1076,7 +1081,7 @@ register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, { int ret = register_trigger(glob, ops, data, file); - if (ret > 0 && tracing_alloc_snapshot() != 0) { + if (ret > 0 && tracing_alloc_snapshot_instance(file->tr) != 0) { unregister_trigger(glob, ops, data, file); ret = 0; } diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 548e62eb5c46..45630a76ed3a 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -14,6 +14,13 @@ #include "trace_output.h" +/* Stub function for events with triggers */ +static int ftrace_event_register(struct trace_event_call *call, + enum trace_reg type, void *data) +{ + return 0; +} + #undef TRACE_SYSTEM #define TRACE_SYSTEM ftrace @@ -117,7 +124,7 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __dynamic_array #define __dynamic_array(type, item) \ - ret = trace_define_field(event_call, #type, #item, \ + ret = trace_define_field(event_call, #type "[]", #item, \ offsetof(typeof(field), item), \ 0, is_signed_type(type), filter_type);\ if (ret) \ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1cd3fb4d70f8..daa81571b22a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -512,8 +512,6 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) if (ret == 0) tk->tp.flags |= TP_FLAG_REGISTERED; else { - pr_warn("Could not insert probe at %s+%lu: %d\n", - trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); ret = 0; @@ -1289,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, head, NULL); } NOKPROBE_SYMBOL(kretprobe_perf_func); + +int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, + const char **symbol, u64 *probe_offset, + u64 *probe_addr, bool perf_type_tracepoint) +{ + const char *pevent = trace_event_name(event->tp_event); + const char *group = event->tp_event->class->system; + struct trace_kprobe *tk; + + if (perf_type_tracepoint) + tk = find_trace_kprobe(pevent, group); + else + tk = event->tp_event->data; + if (!tk) + return -EINVAL; + + *fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE + : BPF_FD_TYPE_KPROBE; + if (tk->symbol) { + *symbol = tk->symbol; + *probe_offset = tk->rp.kp.offset; + *probe_addr = 0; + } else { + *symbol = NULL; + *probe_offset = 0; + *probe_addr = (unsigned long)tk->rp.kp.addr; + } + return 0; +} #endif /* CONFIG_PERF_EVENTS */ /* diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 3c7bfc4bf5e9..4237eba4ef20 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -472,7 +472,7 @@ static __init int stack_trace_init(void) NULL, &stack_trace_fops); #ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("stack_trace_filter", 0444, d_tracer, + trace_create_file("stack_trace_filter", 0644, d_tracer, &trace_ops, &stack_trace_filter_fops); #endif diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 34fd0e0ec51d..bf89a51e740d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -55,6 +55,7 @@ struct trace_uprobe { struct list_head list; struct trace_uprobe_filter filter; struct uprobe_consumer consumer; + struct path path; struct inode *inode; char *filename; unsigned long offset; @@ -289,7 +290,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu) for (i = 0; i < tu->tp.nr_args; i++) traceprobe_free_probe_arg(&tu->tp.args[i]); - iput(tu->inode); + path_put(&tu->path); kfree(tu->tp.call.class->system); kfree(tu->tp.call.name); kfree(tu->filename); @@ -363,7 +364,6 @@ end: static int create_trace_uprobe(int argc, char **argv) { struct trace_uprobe *tu; - struct inode *inode; char *arg, *event, *group, *filename; char buf[MAX_EVENT_NAME_LEN]; struct path path; @@ -371,7 +371,6 @@ static int create_trace_uprobe(int argc, char **argv) bool is_delete, is_return; int i, ret; - inode = NULL; ret = 0; is_delete = false; is_return = false; @@ -437,21 +436,16 @@ static int create_trace_uprobe(int argc, char **argv) } /* Find the last occurrence, in case the path contains ':' too. */ arg = strrchr(argv[1], ':'); - if (!arg) { - ret = -EINVAL; - goto fail_address_parse; - } + if (!arg) + return -EINVAL; *arg++ = '\0'; filename = argv[1]; ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) - goto fail_address_parse; - - inode = igrab(d_real_inode(path.dentry)); - path_put(&path); + return ret; - if (!inode || !S_ISREG(inode->i_mode)) { + if (!d_is_reg(path.dentry)) { ret = -EINVAL; goto fail_address_parse; } @@ -490,7 +484,7 @@ static int create_trace_uprobe(int argc, char **argv) goto fail_address_parse; } tu->offset = offset; - tu->inode = inode; + tu->path = path; tu->filename = kstrdup(filename, GFP_KERNEL); if (!tu->filename) { @@ -558,7 +552,7 @@ error: return ret; fail_address_parse: - iput(inode); + path_put(&path); pr_info("Failed to parse address or file.\n"); @@ -922,6 +916,7 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file, goto err_flags; tu->consumer.filter = filter; + tu->inode = d_real_inode(tu->path.dentry); ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); if (ret) goto err_buffer; @@ -967,6 +962,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) WARN_ON(!uprobe_filter_is_empty(&tu->filter)); uprobe_unregister(tu->inode, tu->offset, &tu->consumer); + tu->inode = NULL; tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE; uprobe_buffer_disable(); @@ -1165,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, { __uprobe_perf_func(tu, func, regs, ucb, dsize); } + +int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, + const char **filename, u64 *probe_offset, + bool perf_type_tracepoint) +{ + const char *pevent = trace_event_name(event->tp_event); + const char *group = event->tp_event->class->system; + struct trace_uprobe *tu; + + if (perf_type_tracepoint) + tu = find_probe_event(pevent, group); + else + tu = event->tp_event->data; + if (!tu) + return -EINVAL; + + *fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE + : BPF_FD_TYPE_UPROBE; + *filename = tu->filename; + *probe_offset = tu->offset; + return 0; +} #endif /* CONFIG_PERF_EVENTS */ static int @@ -1337,7 +1355,6 @@ struct trace_event_call * create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) { struct trace_uprobe *tu; - struct inode *inode; struct path path; int ret; @@ -1345,11 +1362,8 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) if (ret) return ERR_PTR(ret); - inode = igrab(d_inode(path.dentry)); - path_put(&path); - - if (!inode || !S_ISREG(inode->i_mode)) { - iput(inode); + if (!d_is_reg(path.dentry)) { + path_put(&path); return ERR_PTR(-EINVAL); } @@ -1364,11 +1378,12 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return) if (IS_ERR(tu)) { pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); + path_put(&path); return ERR_CAST(tu); } tu->offset = offs; - tu->inode = inode; + tu->path = path; tu->filename = kstrdup(name, GFP_KERNEL); init_trace_event_call(tu, &tu->tp.call); diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 5cadb1b8b5fe..752d8042bad4 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -1075,7 +1075,7 @@ int tracing_map_sort_entries(struct tracing_map *map, struct tracing_map_sort_entry *sort_entry, **entries; int i, n_entries, ret; - entries = vmalloc(map->max_elts * sizeof(sort_entry)); + entries = vmalloc(array_size(sizeof(sort_entry), map->max_elts)); if (!entries) return -ENOMEM; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 671b13457387..6dc6356c3327 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -207,7 +207,7 @@ static int tracepoint_add_func(struct tracepoint *tp, lockdep_is_held(&tracepoints_mutex)); old = func_add(&tp_funcs, func, prio); if (IS_ERR(old)) { - WARN_ON_ONCE(1); + WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM); return PTR_ERR(old); } @@ -239,7 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, lockdep_is_held(&tracepoints_mutex)); old = func_remove(&tp_funcs, func); if (IS_ERR(old)) { - WARN_ON_ONCE(1); + WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM); return PTR_ERR(old); } @@ -257,7 +257,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, } /** - * tracepoint_probe_register - Connect a probe to a tracepoint + * tracepoint_probe_register_prio - Connect a probe to a tracepoint with priority * @tp: tracepoint * @probe: probe handler * @data: tracepoint data @@ -290,7 +290,6 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio); * @tp: tracepoint * @probe: probe handler * @data: tracepoint data - * @prio: priority of this function over other registered functions * * Returns 0 if ok, error value on error. * Note: if @tp is within a module, the caller is responsible for diff --git a/kernel/umh.c b/kernel/umh.c index f76b3ff876cf..c449858946af 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -25,6 +25,8 @@ #include <linux/ptrace.h> #include <linux/async.h> #include <linux/uaccess.h> +#include <linux/shmem_fs.h> +#include <linux/pipe_fs_i.h> #include <trace/events/module.h> @@ -97,9 +99,14 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); - retval = do_execve(getname_kernel(sub_info->path), - (const char __user *const __user *)sub_info->argv, - (const char __user *const __user *)sub_info->envp); + sub_info->pid = task_pid_nr(current); + if (sub_info->file) + retval = do_execve_file(sub_info->file, + sub_info->argv, sub_info->envp); + else + retval = do_execve(getname_kernel(sub_info->path), + (const char __user *const __user *)sub_info->argv, + (const char __user *const __user *)sub_info->envp); out: sub_info->retval = retval; /* @@ -393,6 +400,117 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, } EXPORT_SYMBOL(call_usermodehelper_setup); +struct subprocess_info *call_usermodehelper_setup_file(struct file *file, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *info), void *data) +{ + struct subprocess_info *sub_info; + + sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL); + if (!sub_info) + return NULL; + + INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); + sub_info->path = "none"; + sub_info->file = file; + sub_info->init = init; + sub_info->cleanup = cleanup; + sub_info->data = data; + return sub_info; +} + +static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) +{ + struct umh_info *umh_info = info->data; + struct file *from_umh[2]; + struct file *to_umh[2]; + int err; + + /* create pipe to send data to umh */ + err = create_pipe_files(to_umh, 0); + if (err) + return err; + err = replace_fd(0, to_umh[0], 0); + fput(to_umh[0]); + if (err < 0) { + fput(to_umh[1]); + return err; + } + + /* create pipe to receive data from umh */ + err = create_pipe_files(from_umh, 0); + if (err) { + fput(to_umh[1]); + replace_fd(0, NULL, 0); + return err; + } + err = replace_fd(1, from_umh[1], 0); + fput(from_umh[1]); + if (err < 0) { + fput(to_umh[1]); + replace_fd(0, NULL, 0); + fput(from_umh[0]); + return err; + } + + umh_info->pipe_to_umh = to_umh[1]; + umh_info->pipe_from_umh = from_umh[0]; + return 0; +} + +static void umh_save_pid(struct subprocess_info *info) +{ + struct umh_info *umh_info = info->data; + + umh_info->pid = info->pid; +} + +/** + * fork_usermode_blob - fork a blob of bytes as a usermode process + * @data: a blob of bytes that can be do_execv-ed as a file + * @len: length of the blob + * @info: information about usermode process (shouldn't be NULL) + * + * Returns either negative error or zero which indicates success + * in executing a blob of bytes as a usermode process. In such + * case 'struct umh_info *info' is populated with two pipes + * and a pid of the process. The caller is responsible for health + * check of the user process, killing it via pid, and closing the + * pipes when user process is no longer needed. + */ +int fork_usermode_blob(void *data, size_t len, struct umh_info *info) +{ + struct subprocess_info *sub_info; + struct file *file; + ssize_t written; + loff_t pos = 0; + int err; + + file = shmem_kernel_file_setup("", len, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + + written = kernel_write(file, data, len, &pos); + if (written != len) { + err = written; + if (err >= 0) + err = -ENOMEM; + goto out; + } + + err = -ENOMEM; + sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup, + umh_save_pid, info); + if (!sub_info) + goto out; + + err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); +out: + fput(file); + return err; +} +EXPORT_SYMBOL_GPL(fork_usermode_blob); + /** * call_usermodehelper_exec - start a usermode application * @sub_info: information about the subprocessa diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 246d4d4ce5c7..c3d7583fcd21 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -764,8 +764,9 @@ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) struct uid_gid_extent *forward; /* Allocate memory for 340 mappings. */ - forward = kmalloc(sizeof(struct uid_gid_extent) * - UID_GID_MAP_MAX_EXTENTS, GFP_KERNEL); + forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS, + sizeof(struct uid_gid_extent), + GFP_KERNEL); if (!forward) return -ENOMEM; @@ -1235,6 +1236,7 @@ bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } +EXPORT_SYMBOL(current_in_userns); static inline struct user_namespace *to_user_ns(struct ns_common *ns) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ca7959be8aaa..78b192071ef7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -66,7 +66,7 @@ enum { * be executing on any CPU. The pool behaves as an unbound one. * * Note that DISASSOCIATED should be flipped only while holding - * attach_mutex to avoid changing binding state while + * wq_pool_attach_mutex to avoid changing binding state while * worker_attach_to_pool() is in progress. */ POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */ @@ -123,7 +123,7 @@ enum { * cpu or grabbing pool->lock is enough for read access. If * POOL_DISASSOCIATED is set, it's identical to L. * - * A: pool->attach_mutex protected. + * A: wq_pool_attach_mutex protected. * * PL: wq_pool_mutex protected. * @@ -166,7 +166,6 @@ struct worker_pool { /* L: hash of busy workers */ struct worker *manager; /* L: purely informational */ - struct mutex attach_mutex; /* attach/detach exclusion */ struct list_head workers; /* A: attached workers */ struct completion *detach_completion; /* all workers detached */ @@ -297,6 +296,7 @@ static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ +static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */ @@ -399,14 +399,14 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * @worker: iteration cursor * @pool: worker_pool to iterate workers of * - * This must be called with @pool->attach_mutex. + * This must be called with wq_pool_attach_mutex. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool_worker(worker, pool) \ list_for_each_entry((worker), &(pool)->workers, node) \ - if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \ + if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \ else /** @@ -1724,7 +1724,7 @@ static struct worker *alloc_worker(int node) static void worker_attach_to_pool(struct worker *worker, struct worker_pool *pool) { - mutex_lock(&pool->attach_mutex); + mutex_lock(&wq_pool_attach_mutex); /* * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any @@ -1733,37 +1733,40 @@ static void worker_attach_to_pool(struct worker *worker, set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); /* - * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains - * stable across this function. See the comments above the - * flag definition for details. + * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains + * stable across this function. See the comments above the flag + * definition for details. */ if (pool->flags & POOL_DISASSOCIATED) worker->flags |= WORKER_UNBOUND; list_add_tail(&worker->node, &pool->workers); + worker->pool = pool; - mutex_unlock(&pool->attach_mutex); + mutex_unlock(&wq_pool_attach_mutex); } /** * worker_detach_from_pool() - detach a worker from its pool * @worker: worker which is attached to its pool - * @pool: the pool @worker is attached to * * Undo the attaching which had been done in worker_attach_to_pool(). The * caller worker shouldn't access to the pool after detached except it has * other reference to the pool. */ -static void worker_detach_from_pool(struct worker *worker, - struct worker_pool *pool) +static void worker_detach_from_pool(struct worker *worker) { + struct worker_pool *pool = worker->pool; struct completion *detach_completion = NULL; - mutex_lock(&pool->attach_mutex); + mutex_lock(&wq_pool_attach_mutex); + list_del(&worker->node); + worker->pool = NULL; + if (list_empty(&pool->workers)) detach_completion = pool->detach_completion; - mutex_unlock(&pool->attach_mutex); + mutex_unlock(&wq_pool_attach_mutex); /* clear leftover flags without pool->lock after it is detached */ worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); @@ -1799,7 +1802,6 @@ static struct worker *create_worker(struct worker_pool *pool) if (!worker) goto fail; - worker->pool = pool; worker->id = id; if (pool->cpu >= 0) @@ -2086,6 +2088,12 @@ __acquires(&pool->lock) worker->current_pwq = pwq; work_color = get_work_color(work); + /* + * Record wq name for cmdline and debug reporting, may get + * overridden through set_worker_desc(). + */ + strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN); + list_del_init(&work->entry); /* @@ -2181,7 +2189,6 @@ __acquires(&pool->lock) worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; - worker->desc_valid = false; pwq_dec_nr_in_flight(pwq, work_color); } @@ -2206,6 +2213,16 @@ static void process_scheduled_works(struct worker *worker) } } +static void set_pf_worker(bool val) +{ + mutex_lock(&wq_pool_attach_mutex); + if (val) + current->flags |= PF_WQ_WORKER; + else + current->flags &= ~PF_WQ_WORKER; + mutex_unlock(&wq_pool_attach_mutex); +} + /** * worker_thread - the worker thread function * @__worker: self @@ -2224,7 +2241,7 @@ static int worker_thread(void *__worker) struct worker_pool *pool = worker->pool; /* tell the scheduler that this is a workqueue worker */ - worker->task->flags |= PF_WQ_WORKER; + set_pf_worker(true); woke_up: spin_lock_irq(&pool->lock); @@ -2232,11 +2249,11 @@ woke_up: if (unlikely(worker->flags & WORKER_DIE)) { spin_unlock_irq(&pool->lock); WARN_ON_ONCE(!list_empty(&worker->entry)); - worker->task->flags &= ~PF_WQ_WORKER; + set_pf_worker(false); set_task_comm(worker->task, "kworker/dying"); ida_simple_remove(&pool->worker_ida, worker->id); - worker_detach_from_pool(worker, pool); + worker_detach_from_pool(worker); kfree(worker); return 0; } @@ -2335,7 +2352,7 @@ static int rescuer_thread(void *__rescuer) * Mark rescuer as worker too. As WORKER_PREP is never cleared, it * doesn't participate in concurrency management. */ - rescuer->task->flags |= PF_WQ_WORKER; + set_pf_worker(true); repeat: set_current_state(TASK_IDLE); @@ -2367,7 +2384,6 @@ repeat: worker_attach_to_pool(rescuer, pool); spin_lock_irq(&pool->lock); - rescuer->pool = pool; /* * Slurp in all works issued via this workqueue and @@ -2417,10 +2433,9 @@ repeat: if (need_more_worker(pool)) wake_up_worker(pool); - rescuer->pool = NULL; spin_unlock_irq(&pool->lock); - worker_detach_from_pool(rescuer, pool); + worker_detach_from_pool(rescuer); spin_lock_irq(&wq_mayday_lock); } @@ -2429,7 +2444,7 @@ repeat: if (should_stop) { __set_current_state(TASK_RUNNING); - rescuer->task->flags &= ~PF_WQ_WORKER; + set_pf_worker(false); return 0; } @@ -3271,7 +3286,6 @@ static int init_worker_pool(struct worker_pool *pool) timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); - mutex_init(&pool->attach_mutex); INIT_LIST_HEAD(&pool->workers); ida_init(&pool->worker_ida); @@ -3354,10 +3368,10 @@ static void put_unbound_pool(struct worker_pool *pool) WARN_ON(pool->nr_workers || pool->nr_idle); spin_unlock_irq(&pool->lock); - mutex_lock(&pool->attach_mutex); + mutex_lock(&wq_pool_attach_mutex); if (!list_empty(&pool->workers)) pool->detach_completion = &detach_completion; - mutex_unlock(&pool->attach_mutex); + mutex_unlock(&wq_pool_attach_mutex); if (pool->detach_completion) wait_for_completion(pool->detach_completion); @@ -3700,8 +3714,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, lockdep_assert_held(&wq_pool_mutex); - ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]), - GFP_KERNEL); + ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(GFP_KERNEL); tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); @@ -4347,9 +4360,9 @@ void set_worker_desc(const char *fmt, ...) va_start(args, fmt); vsnprintf(worker->desc, sizeof(worker->desc), fmt, args); va_end(args); - worker->desc_valid = true; } } +EXPORT_SYMBOL_GPL(set_worker_desc); /** * print_worker_info - print out worker information and description @@ -4371,7 +4384,6 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) char desc[WORKER_DESC_LEN] = { }; struct pool_workqueue *pwq = NULL; struct workqueue_struct *wq = NULL; - bool desc_valid = false; struct worker *worker; if (!(task->flags & PF_WQ_WORKER)) @@ -4384,22 +4396,18 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) worker = kthread_probe_data(task); /* - * Carefully copy the associated workqueue's workfn and name. Keep - * the original last '\0' in case the original contains garbage. + * Carefully copy the associated workqueue's workfn, name and desc. + * Keep the original last '\0' in case the original is garbage. */ probe_kernel_read(&fn, &worker->current_func, sizeof(fn)); probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq)); probe_kernel_read(&wq, &pwq->wq, sizeof(wq)); probe_kernel_read(name, wq->name, sizeof(name) - 1); - - /* copy worker description */ - probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid)); - if (desc_valid) - probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); + probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { printk("%sWorkqueue: %s %pf", log_lvl, name, fn); - if (desc[0]) + if (strcmp(name, desc)) pr_cont(" (%s)", desc); pr_cont("\n"); } @@ -4579,6 +4587,47 @@ void show_workqueue_state(void) rcu_read_unlock_sched(); } +/* used to show worker information through /proc/PID/{comm,stat,status} */ +void wq_worker_comm(char *buf, size_t size, struct task_struct *task) +{ + int off; + + /* always show the actual comm */ + off = strscpy(buf, task->comm, size); + if (off < 0) + return; + + /* stabilize PF_WQ_WORKER and worker pool association */ + mutex_lock(&wq_pool_attach_mutex); + + if (task->flags & PF_WQ_WORKER) { + struct worker *worker = kthread_data(task); + struct worker_pool *pool = worker->pool; + + if (pool) { + spin_lock_irq(&pool->lock); + /* + * ->desc tracks information (wq name or + * set_worker_desc()) for the latest execution. If + * current, prepend '+', otherwise '-'. + */ + if (worker->desc[0] != '\0') { + if (worker->current_work) + scnprintf(buf + off, size - off, "+%s", + worker->desc); + else + scnprintf(buf + off, size - off, "-%s", + worker->desc); + } + spin_unlock_irq(&pool->lock); + } + } + + mutex_unlock(&wq_pool_attach_mutex); +} + +#ifdef CONFIG_SMP + /* * CPU hotplug. * @@ -4600,7 +4649,7 @@ static void unbind_workers(int cpu) struct worker *worker; for_each_cpu_worker_pool(pool, cpu) { - mutex_lock(&pool->attach_mutex); + mutex_lock(&wq_pool_attach_mutex); spin_lock_irq(&pool->lock); /* @@ -4616,7 +4665,7 @@ static void unbind_workers(int cpu) pool->flags |= POOL_DISASSOCIATED; spin_unlock_irq(&pool->lock); - mutex_unlock(&pool->attach_mutex); + mutex_unlock(&wq_pool_attach_mutex); /* * Call schedule() so that we cross rq->lock and thus can @@ -4657,7 +4706,7 @@ static void rebind_workers(struct worker_pool *pool) { struct worker *worker; - lockdep_assert_held(&pool->attach_mutex); + lockdep_assert_held(&wq_pool_attach_mutex); /* * Restore CPU affinity of all workers. As all idle workers should @@ -4727,7 +4776,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) static cpumask_t cpumask; struct worker *worker; - lockdep_assert_held(&pool->attach_mutex); + lockdep_assert_held(&wq_pool_attach_mutex); /* is @cpu allowed for @pool? */ if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) @@ -4762,14 +4811,14 @@ int workqueue_online_cpu(unsigned int cpu) mutex_lock(&wq_pool_mutex); for_each_pool(pool, pi) { - mutex_lock(&pool->attach_mutex); + mutex_lock(&wq_pool_attach_mutex); if (pool->cpu == cpu) rebind_workers(pool); else if (pool->cpu < 0) restore_unbound_workers_cpumask(pool, cpu); - mutex_unlock(&pool->attach_mutex); + mutex_unlock(&wq_pool_attach_mutex); } /* update NUMA affinity of unbound workqueues */ @@ -4799,8 +4848,6 @@ int workqueue_offline_cpu(unsigned int cpu) return 0; } -#ifdef CONFIG_SMP - struct work_for_cpu { struct work_struct work; long (*fn)(void *); @@ -5591,7 +5638,7 @@ static void __init wq_numa_init(void) * available. Build one from cpu_to_node() which should have been * fully initialized by now. */ - tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL); + tbl = kcalloc(nr_node_ids, sizeof(tbl[0]), GFP_KERNEL); BUG_ON(!tbl); for_each_node(node) diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index d390d1be3748..66fbb5a9e633 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -31,13 +31,12 @@ struct worker { struct work_struct *current_work; /* L: work being processed */ work_func_t current_func; /* L: current_work's fn */ struct pool_workqueue *current_pwq; /* L: current_work's pwq */ - bool desc_valid; /* ->desc is valid */ struct list_head scheduled; /* L: scheduled works */ /* 64 bytes boundary on 64bit, 32 on 32bit */ struct task_struct *task; /* I: worker task */ - struct worker_pool *pool; /* I: the associated pool */ + struct worker_pool *pool; /* A: the associated pool */ /* L: for rescuers */ struct list_head node; /* A: anchored at pool->workers */ /* A: runs through worker->node */ |