Diffstat (limited to 'kernel')
45 files changed, 654 insertions, 425 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 5350fd292910..9a20016d4900 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,7 +5,7 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o softirq.o resource.o \ - sysctl.o sysctl_binary.o capability.o ptrace.o user.o \ + sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ @@ -36,7 +36,7 @@ KCOV_INSTRUMENT_stacktrace.o := n KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n KCSAN_SANITIZE_kcov.o := n -CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) +CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector # cond_syscall is currently not LTO compatible CFLAGS_sys_ni.o = $(DISABLE_LTO) diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 363b9cafc2d8..b6715964b685 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -338,8 +338,8 @@ static void bpf_iter_link_release(struct bpf_link *link) struct bpf_iter_link *iter_link = container_of(link, struct bpf_iter_link, link); - if (iter_link->aux.map) - bpf_map_put_with_uref(iter_link->aux.map); + if (iter_link->tinfo->reg_info->detach_target) + iter_link->tinfo->reg_info->detach_target(&iter_link->aux); } static void bpf_iter_link_dealloc(struct bpf_link *link) @@ -390,15 +390,35 @@ bool bpf_link_is_iter(struct bpf_link *link) int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + union bpf_iter_link_info __user *ulinfo; struct bpf_link_primer link_primer; struct bpf_iter_target_info *tinfo; - struct bpf_iter_aux_info aux = {}; + union bpf_iter_link_info linfo; struct bpf_iter_link *link; - u32 prog_btf_id, target_fd; + u32 prog_btf_id, linfo_len; bool existed = false; - struct bpf_map *map; int err; + if (attr->link_create.target_fd || attr->link_create.flags) + return -EINVAL; + + memset(&linfo, 0, sizeof(union bpf_iter_link_info)); + + ulinfo = u64_to_user_ptr(attr->link_create.iter_info); + linfo_len = attr->link_create.iter_info_len; + if (!ulinfo ^ !linfo_len) + return -EINVAL; + + if (ulinfo) { + err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo), + linfo_len); + if (err) + return err; + linfo_len = min_t(u32, linfo_len, sizeof(linfo)); + if (copy_from_user(&linfo, ulinfo, linfo_len)) + return -EFAULT; + } + prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { @@ -411,13 +431,6 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (!existed) return -ENOENT; - /* Make sure user supplied flags are target expected. 
*/ - target_fd = attr->link_create.target_fd; - if (attr->link_create.flags != tinfo->reg_info->req_linfo) - return -EINVAL; - if (!attr->link_create.flags && target_fd) - return -EINVAL; - link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); if (!link) return -ENOMEM; @@ -431,28 +444,15 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return err; } - if (tinfo->reg_info->req_linfo == BPF_ITER_LINK_MAP_FD) { - map = bpf_map_get_with_uref(target_fd); - if (IS_ERR(map)) { - err = PTR_ERR(map); - goto cleanup_link; - } - - aux.map = map; - err = tinfo->reg_info->check_target(prog, &aux); + if (tinfo->reg_info->attach_target) { + err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux); if (err) { - bpf_map_put_with_uref(map); - goto cleanup_link; + bpf_link_cleanup(&link_primer); + return err; } - - link->aux.map = map; } return bpf_link_settle(&link_primer); - -cleanup_link: - bpf_link_cleanup(&link_primer); - return err; } static void init_seq_meta(struct bpf_iter_priv_data *priv_data, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index bde93344164d..ed0b3578867c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1966,7 +1966,7 @@ void bpf_prog_array_delete_safe(struct bpf_prog_array *array, * @index: the index of the program to replace * * Skips over dummy programs, by not counting them, when calculating - * the the position of the program to replace. + * the position of the program to replace. * * Return: * * 0 - Success diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index fbe1f557cb88..af86048e5afd 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -98,12 +98,21 @@ static struct bpf_iter_reg bpf_map_reg_info = { .seq_info = &bpf_map_seq_info, }; -static int bpf_iter_check_map(struct bpf_prog *prog, - struct bpf_iter_aux_info *aux) +static int bpf_iter_attach_map(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) { u32 key_acc_size, value_acc_size, key_size, value_size; - struct bpf_map *map = aux->map; + struct bpf_map *map; bool is_percpu = false; + int err = -EINVAL; + + if (!linfo->map.map_fd) + return -EBADF; + + map = bpf_map_get_with_uref(linfo->map.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || @@ -112,7 +121,7 @@ static int bpf_iter_check_map(struct bpf_prog *prog, else if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) - return -EINVAL; + goto put_map; key_acc_size = prog->aux->max_rdonly_access; value_acc_size = prog->aux->max_rdwr_access; @@ -122,10 +131,22 @@ static int bpf_iter_check_map(struct bpf_prog *prog, else value_size = round_up(map->value_size, 8) * num_possible_cpus(); - if (key_acc_size > key_size || value_acc_size > value_size) - return -EACCES; + if (key_acc_size > key_size || value_acc_size > value_size) { + err = -EACCES; + goto put_map; + } + aux->map = map; return 0; + +put_map: + bpf_map_put_with_uref(map); + return err; +} + +static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) +{ + bpf_map_put_with_uref(aux->map); } DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, @@ -133,8 +154,8 @@ DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, static const struct bpf_iter_reg bpf_map_elem_reg_info = { .target = "bpf_map_elem", - .check_target = bpf_iter_check_map, - .req_linfo = BPF_ITER_LINK_MAP_FD, + .attach_target = bpf_iter_attach_map, + 
.detach_target = bpf_iter_detach_map, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2f343ce15747..86299a292214 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3883,7 +3883,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog * return -EINVAL; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.flags +#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len static int link_create(union bpf_attr *attr) { enum bpf_prog_type ptype; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b6ccfce3bf4c..ef938f17b944 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8294,7 +8294,7 @@ static bool stacksafe(struct bpf_func_state *old, if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != cur->stack[spi].slot_type[i % BPF_REG_SIZE]) /* Ex: old explored (safe) state has STACK_SPILL in - * this stack slot, but current has has STACK_MISC -> + * this stack slot, but current has STACK_MISC -> * this verifier states are not equivalent, * return false to continue verification of this path */ diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 18175687133a..106e4500fd53 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -11,6 +11,8 @@ #include <asm/page.h> #include <asm/sections.h> +#include <crypto/sha.h> + /* vmcoreinfo stuff */ unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; @@ -376,6 +378,53 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void) } EXPORT_SYMBOL(paddr_vmcoreinfo_note); +#define NOTES_SIZE (&__stop_notes - &__start_notes) +#define BUILD_ID_MAX SHA1_DIGEST_SIZE +#define NT_GNU_BUILD_ID 3 + +struct elf_note_section { + struct elf_note n_hdr; + u8 n_data[]; +}; + +/* + * Add build ID from .notes section as generated by the GNU ld(1) + * or LLVM lld(1) --build-id option. 
+ */ +static void add_build_id_vmcoreinfo(void) +{ + char build_id[BUILD_ID_MAX * 2 + 1]; + int n_remain = NOTES_SIZE; + + while (n_remain >= sizeof(struct elf_note)) { + const struct elf_note_section *note_sec = + &__start_notes + NOTES_SIZE - n_remain; + const u32 n_namesz = note_sec->n_hdr.n_namesz; + + if (note_sec->n_hdr.n_type == NT_GNU_BUILD_ID && + n_namesz != 0 && + !strcmp((char *)&note_sec->n_data[0], "GNU")) { + if (note_sec->n_hdr.n_descsz <= BUILD_ID_MAX) { + const u32 n_descsz = note_sec->n_hdr.n_descsz; + const u8 *s = &note_sec->n_data[n_namesz]; + + s = PTR_ALIGN(s, 4); + bin2hex(build_id, s, n_descsz); + build_id[2 * n_descsz] = '\0'; + VMCOREINFO_BUILD_ID(build_id); + return; + } + pr_warn("Build ID is too large to include in vmcoreinfo: %u > %u\n", + note_sec->n_hdr.n_descsz, + BUILD_ID_MAX); + return; + } + n_remain -= sizeof(struct elf_note) + + ALIGN(note_sec->n_hdr.n_namesz, 4) + + ALIGN(note_sec->n_hdr.n_descsz, 4); + } +} + static int __init crash_save_vmcoreinfo_init(void) { vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); @@ -394,6 +443,7 @@ static int __init crash_save_vmcoreinfo_init(void) } VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + add_build_id_vmcoreinfo(); VMCOREINFO_PAGESIZE(PAGE_SIZE); VMCOREINFO_SYMBOL(init_uts_ns); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index f4770fcfa62b..847a9d1fa634 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -1,5 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only +config NO_DMA + bool + config HAS_DMA bool depends on !NO_DMA @@ -186,11 +189,6 @@ config DMA_API_DEBUG drivers like double-freeing of DMA mappings or freeing mappings that were never allocated. - This also attempts to catch cases where a page owned by DMA is - accessed by the cpu in a way that could cause data corruption. For - example, this enables cow_user_page() to check that the source page is - not undergoing DMA. - This option causes a performance degradation. Use only if you want to debug device drivers and dma interactions. diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index f7f807fb6ebb..8e9f7b301c6d 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -448,9 +448,6 @@ void debug_dma_dump_mappings(struct device *dev) * dma_active_cacheline entry to track per event. dma_map_sg(), on the * other hand, consumes a single dma_debug_entry, but inserts 'nents' * entries into the tree. - * - * At any time debug_dma_assert_idle() can be called to trigger a - * warning if any cachelines in the given page are in the active set. */ static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT); static DEFINE_SPINLOCK(radix_lock); @@ -497,10 +494,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln) overlap = active_cacheline_set_overlap(cln, ++overlap); /* If we overflowed the overlap counter then we're potentially - * leaking dma-mappings. Otherwise, if maps and unmaps are - * balanced then this overflow may cause false negatives in - * debug_dma_assert_idle() as the cacheline may be marked idle - * prematurely. + * leaking dma-mappings. 
*/ WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP, pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"), @@ -555,53 +549,6 @@ static void active_cacheline_remove(struct dma_debug_entry *entry) spin_unlock_irqrestore(&radix_lock, flags); } -/** - * debug_dma_assert_idle() - assert that a page is not undergoing dma - * @page: page to lookup in the dma_active_cacheline tree - * - * Place a call to this routine in cases where the cpu touching the page - * before the dma completes (page is dma_unmapped) will lead to data - * corruption. - */ -void debug_dma_assert_idle(struct page *page) -{ - static struct dma_debug_entry *ents[CACHELINES_PER_PAGE]; - struct dma_debug_entry *entry = NULL; - void **results = (void **) &ents; - unsigned int nents, i; - unsigned long flags; - phys_addr_t cln; - - if (dma_debug_disabled()) - return; - - if (!page) - return; - - cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT; - spin_lock_irqsave(&radix_lock, flags); - nents = radix_tree_gang_lookup(&dma_active_cacheline, results, cln, - CACHELINES_PER_PAGE); - for (i = 0; i < nents; i++) { - phys_addr_t ent_cln = to_cacheline_number(ents[i]); - - if (ent_cln == cln) { - entry = ents[i]; - break; - } else if (ent_cln >= cln + CACHELINES_PER_PAGE) - break; - } - spin_unlock_irqrestore(&radix_lock, flags); - - if (!entry) - return; - - cln = to_cacheline_number(entry); - err_printk(entry->dev, entry, - "cpu touching an active dma mapped cacheline [cln=%pa]\n", - &cln); -} - /* * Wrapper function for adding an entry to the hash. * This function takes care of locking itself. diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c6ce894e4ce9..58cbe357fb2b 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -217,10 +217,9 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); - fs = get_fs(); - set_fs(USER_DS); + fs = force_uaccess_begin(); perf_callchain_user(&ctx, regs); - set_fs(fs); + force_uaccess_end(fs); } } diff --git a/kernel/events/core.c b/kernel/events/core.c index d1f0a7e5b182..5bfe8e3c6e44 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6453,10 +6453,9 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, /* Data. */ sp = perf_user_stack_pointer(regs); - fs = get_fs(); - set_fs(USER_DS); + fs = force_uaccess_begin(); rem = __output_copy_user(handle, (void *) sp, dump_size); - set_fs(fs); + force_uaccess_end(fs); dyn_size = dump_size - rem; perf_output_skip(handle, rem); @@ -11707,7 +11706,7 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; /* - * Reuse ptrace permission checks for now. + * Preserve ptrace permission check for backwards compatibility. * * We must hold exec_update_mutex across this and any potential * perf_install_in_context() call for this new event to @@ -11715,7 +11714,7 @@ SYSCALL_DEFINE5(perf_event_open, * perf_event_exit_task() that could imply). 
*/ err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) + if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) goto err_cred; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 25de10c904e6..649fd53dc9ad 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { get_page(new_page); page_add_new_anon_rmap(new_page, vma, addr, false); - lru_cache_add_active_or_unevictable(new_page, vma); + lru_cache_add_inactive_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); @@ -376,7 +376,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) if (!vaddr || !d) return -EINVAL; - ret = get_user_pages_remote(NULL, mm, vaddr, 1, + ret = get_user_pages_remote(mm, vaddr, 1, FOLL_WRITE, &page, &vma, NULL); if (unlikely(ret <= 0)) { /* @@ -477,7 +477,7 @@ retry: if (is_register) gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags, + ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &old_page, &vma, NULL); if (ret <= 0) return ret; @@ -2029,7 +2029,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * but we treat this as a 'remote' access since it is * essentially a kernel access to the memory. */ - result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, + result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL, NULL); if (result < 0) return result; diff --git a/kernel/exit.c b/kernel/exit.c index e731c414e024..733e80f334e7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -732,7 +732,7 @@ void __noreturn do_exit(long code) * mm_release()->clear_child_tid() from writing to a user-controlled * kernel address. 
*/ - set_fs(USER_DS); + force_uaccess_begin(); if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", @@ -1626,6 +1626,22 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options, return ret; } +int kernel_wait(pid_t pid, int *stat) +{ + struct wait_opts wo = { + .wo_type = PIDTYPE_PID, + .wo_pid = find_get_pid(pid), + .wo_flags = WEXITED, + }; + int ret; + + ret = do_wait(&wo); + if (ret > 0 && wo.wo_stat) + *stat = wo.wo_stat; + put_pid(wo.wo_pid); + return ret; +} + SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, int, options, struct rusage __user *, ru) { diff --git a/kernel/fork.c b/kernel/fork.c index 35e9894d394c..4d32190861bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2011,7 +2011,7 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; p->cpuset_slab_spread_rotor = NUMA_NO_NODE; - seqcount_init(&p->mems_allowed_seq); + seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock); #endif #ifdef CONFIG_TRACE_IRQFLAGS memset(&p->irqtrace, 0, sizeof(p->irqtrace)); diff --git a/kernel/futex.c b/kernel/futex.c index 83404124b77b..a5876694a60e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -678,7 +678,7 @@ static int fault_in_user_writeable(u32 __user *uaddr) int ret; mmap_read_lock(mm); - ret = fixup_user_fault(current, mm, (unsigned long)uaddr, + ret = fixup_user_fault(mm, (unsigned long)uaddr, FAULT_FLAG_WRITE, NULL); mmap_read_unlock(mm); @@ -3744,12 +3744,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; - /* fall through */ + fallthrough; case FUTEX_WAIT_BITSET: return futex_wait(uaddr, flags, val, timeout, val3); case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; - /* fall through */ + fallthrough; case FUTEX_WAKE_BITSET: return futex_wake(uaddr, flags, val, val3); case FUTEX_REQUEUE: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d55ba625d426..52ac5391dcc6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2731,8 +2731,10 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, do { chip = irq_data_get_irq_chip(data); - if (WARN_ON_ONCE(!chip)) - return -ENODEV; + if (WARN_ON_ONCE(!chip)) { + err = -ENODEV; + goto out_unlock; + } if (chip->irq_set_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY @@ -2745,6 +2747,7 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, if (data) err = chip->irq_set_irqchip_state(data, which, val); +out_unlock: irq_put_desc_busunlock(desc, flags); return err; } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 8f557fa1f4fe..c6c7e187ae74 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -185,14 +185,18 @@ void rearm_wake_irq(unsigned int irq) unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - if (!desc || !(desc->istate & IRQS_SUSPENDED) || - !irqd_is_wakeup_set(&desc->irq_data)) + if (!desc) return; + if (!(desc->istate & IRQS_SUSPENDED) || + !irqd_is_wakeup_set(&desc->irq_data)) + goto unlock; + desc->istate &= ~IRQS_SUSPENDED; irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); __enable_irq(desc); +unlock: irq_put_desc_busunlock(desc, flags); } diff --git a/kernel/kcov.c b/kernel/kcov.c index 6afae0bcbac4..6b8368be89c8 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -96,7 +96,7 @@ struct kcov_percpu_data { int saved_sequence; }; -DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); +static 
DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); /* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_find(u64 handle) @@ -775,7 +775,7 @@ static inline bool kcov_mode_enabled(unsigned int mode) return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED; } -void kcov_remote_softirq_start(struct task_struct *t) +static void kcov_remote_softirq_start(struct task_struct *t) { struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); unsigned int mode; @@ -792,7 +792,7 @@ void kcov_remote_softirq_start(struct task_struct *t) } } -void kcov_remote_softirq_stop(struct task_struct *t) +static void kcov_remote_softirq_stop(struct task_struct *t) { struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 78c0837bfd7b..ca40bef75a61 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -1169,24 +1169,26 @@ int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mstart, unsigned long long mend) { int i, j; - unsigned long long start, end; + unsigned long long start, end, p_start, p_end; struct crash_mem_range temp_range = {0, 0}; for (i = 0; i < mem->nr_ranges; i++) { start = mem->ranges[i].start; end = mem->ranges[i].end; + p_start = mstart; + p_end = mend; if (mstart > end || mend < start) continue; /* Truncate any area outside of range */ if (mstart < start) - mstart = start; + p_start = start; if (mend > end) - mend = end; + p_end = end; /* Found completely overlapping range */ - if (mstart == start && mend == end) { + if (p_start == start && p_end == end) { mem->ranges[i].start = 0; mem->ranges[i].end = 0; if (i < mem->nr_ranges - 1) { @@ -1197,20 +1199,29 @@ int crash_exclude_mem_range(struct crash_mem *mem, mem->ranges[j].end = mem->ranges[j+1].end; } + + /* + * Continue to check if there are another overlapping ranges + * from the current position because of shifting the above + * mem ranges. + */ + i--; + mem->nr_ranges--; + continue; } mem->nr_ranges--; return 0; } - if (mstart > start && mend < end) { + if (p_start > start && p_end < end) { /* Split original range */ - mem->ranges[i].end = mstart - 1; - temp_range.start = mend + 1; + mem->ranges[i].end = p_start - 1; + temp_range.start = p_end + 1; temp_range.end = end; - } else if (mstart != start) - mem->ranges[i].end = mstart - 1; + } else if (p_start != start) + mem->ranges[i].end = p_start - 1; else - mem->ranges[i].start = mend + 1; + mem->ranges[i].start = p_end + 1; break; } @@ -1247,7 +1258,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, unsigned long long notes_addr; unsigned long mstart, mend; - /* extra phdr for vmcoreinfo elf note */ + /* extra phdr for vmcoreinfo ELF note */ nr_phdr = nr_cpus + 1; nr_phdr += mem->nr_ranges; @@ -1255,7 +1266,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64). * I think this is required by tools like gdb. So same physical - * memory will be mapped in two elf headers. One will contain kernel + * memory will be mapped in two ELF headers. One will contain kernel * text virtual addresses and other will have __va(physical) addresses. 
*/ @@ -1282,7 +1293,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, ehdr->e_ehsize = sizeof(Elf64_Ehdr); ehdr->e_phentsize = sizeof(Elf64_Phdr); - /* Prepare one phdr of type PT_NOTE for each present cpu */ + /* Prepare one phdr of type PT_NOTE for each present CPU */ for_each_present_cpu(cpu) { phdr->p_type = PT_NOTE; notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); @@ -1324,10 +1335,10 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; phdr->p_align = 0; ehdr->e_phnum++; - phdr++; - pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, ehdr->e_phnum, phdr->p_offset); + phdr++; } *addr = buf; diff --git a/kernel/kmod.c b/kernel/kmod.c index 37c3c4b97b8e..3cd075ce2a1e 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -36,9 +36,8 @@ * * If you need less than 50 threads would mean we're dealing with systems * smaller than 3200 pages. This assumes you are capable of having ~13M memory, - * and this would only be an be an upper limit, after which the OOM killer - * would take effect. Systems like these are very unlikely if modules are - * enabled. + * and this would only be an upper limit, after which the OOM killer would take + * effect. Systems like these are very unlikely if modules are enabled. */ #define MAX_KMOD_CONCURRENT 50 static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); diff --git a/kernel/kthread.c b/kernel/kthread.c index b2807e7be772..3edaa380dc7b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1258,8 +1258,7 @@ void kthread_use_mm(struct mm_struct *mm) if (active_mm != mm) mmdrop(active_mm); - to_kthread(tsk)->oldfs = get_fs(); - set_fs(USER_DS); + to_kthread(tsk)->oldfs = force_uaccess_begin(); } EXPORT_SYMBOL_GPL(kthread_use_mm); @@ -1274,7 +1273,7 @@ void kthread_unuse_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); - set_fs(to_kthread(tsk)->oldfs); + force_uaccess_end(to_kthread(tsk)->oldfs); task_lock(tsk); sync_mm_rss(mm); diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 5525cd3ba0c8..02ef87f50df2 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -423,7 +423,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt) seq_time(m, lt->min); seq_time(m, lt->max); seq_time(m, lt->total); - seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0); + seq_time(m, lt->nr ? div64_u64(lt->total, lt->nr) : 0); } static void seq_stats(struct seq_file *m, struct lock_stat_data *data) diff --git a/kernel/module.c b/kernel/module.c index 8fa2600bde6a..1c5cff34d9f2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -422,7 +422,7 @@ static bool each_symbol_in_section(const struct symsearch *arr, } /* Returns true as soon as fn returns true, otherwise false. 
*/ -bool each_symbol_section(bool (*fn)(const struct symsearch *arr, +static bool each_symbol_section(bool (*fn)(const struct symsearch *arr, struct module *owner, void *data), void *data) @@ -484,7 +484,6 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr, } return false; } -EXPORT_SYMBOL_GPL(each_symbol_section); struct find_symbol_arg { /* Input */ @@ -496,6 +495,7 @@ struct find_symbol_arg { struct module *owner; const s32 *crc; const struct kernel_symbol *sym; + enum mod_license license; }; static bool check_exported_symbol(const struct symsearch *syms, @@ -505,9 +505,9 @@ static bool check_exported_symbol(const struct symsearch *syms, struct find_symbol_arg *fsa = data; if (!fsa->gplok) { - if (syms->licence == GPL_ONLY) + if (syms->license == GPL_ONLY) return false; - if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { + if (syms->license == WILL_BE_GPL_ONLY && fsa->warn) { pr_warn("Symbol %s is being used by a non-GPL module, " "which will not be allowed in the future\n", fsa->name); @@ -529,6 +529,7 @@ static bool check_exported_symbol(const struct symsearch *syms, fsa->owner = owner; fsa->crc = symversion(syms->crcs, symnum); fsa->sym = &syms->start[symnum]; + fsa->license = syms->license; return true; } @@ -585,9 +586,10 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms, /* Find an exported symbol and return it, along with, (optional) crc and * (optional) module which owns it. Needs preempt disabled or module_mutex. */ -const struct kernel_symbol *find_symbol(const char *name, +static const struct kernel_symbol *find_symbol(const char *name, struct module **owner, const s32 **crc, + enum mod_license *license, bool gplok, bool warn) { @@ -602,13 +604,14 @@ const struct kernel_symbol *find_symbol(const char *name, *owner = fsa.owner; if (crc) *crc = fsa.crc; + if (license) + *license = fsa.license; return fsa.sym; } pr_debug("Failed to find symbol %s\n", name); return NULL; } -EXPORT_SYMBOL_GPL(find_symbol); /* * Search for module by name: must hold module_mutex (or preempt disabled @@ -869,7 +872,7 @@ static int add_module_usage(struct module *a, struct module *b) } /* Module a uses b: caller needs module_mutex() */ -int ref_module(struct module *a, struct module *b) +static int ref_module(struct module *a, struct module *b) { int err; @@ -888,7 +891,6 @@ int ref_module(struct module *a, struct module *b) } return 0; } -EXPORT_SYMBOL_GPL(ref_module); /* Clear the unload stuff of the module. */ static void module_unload_free(struct module *mod) @@ -1077,7 +1079,7 @@ void __symbol_put(const char *symbol) struct module *owner; preempt_disable(); - if (!find_symbol(symbol, &owner, NULL, true, false)) + if (!find_symbol(symbol, &owner, NULL, NULL, true, false)) BUG(); module_put(owner); preempt_enable(); @@ -1169,11 +1171,10 @@ static inline void module_unload_free(struct module *mod) { } -int ref_module(struct module *a, struct module *b) +static int ref_module(struct module *a, struct module *b) { return strong_try_module_get(b); } -EXPORT_SYMBOL_GPL(ref_module); static inline int module_unload_init(struct module *mod) { @@ -1356,7 +1357,7 @@ static inline int check_modstruct_version(const struct load_info *info, * locking is necessary -- use preempt_disable() to placate lockdep. 
*/ preempt_disable(); - if (!find_symbol("module_layout", NULL, &crc, true, false)) { + if (!find_symbol("module_layout", NULL, &crc, NULL, true, false)) { preempt_enable(); BUG(); } @@ -1430,6 +1431,24 @@ static int verify_namespace_is_imported(const struct load_info *info, return 0; } +static bool inherit_taint(struct module *mod, struct module *owner) +{ + if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints)) + return true; + + if (mod->using_gplonly_symbols) { + pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n", + mod->name, owner->name); + return false; + } + + if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) { + pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n", + mod->name, owner->name); + set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints); + } + return true; +} /* Resolve a symbol for this module. I.e. if we find one, record usage. */ static const struct kernel_symbol *resolve_symbol(struct module *mod, @@ -1440,6 +1459,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, struct module *owner; const struct kernel_symbol *sym; const s32 *crc; + enum mod_license license; int err; /* @@ -1449,11 +1469,19 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, */ sched_annotate_sleep(); mutex_lock(&module_mutex); - sym = find_symbol(name, &owner, &crc, + sym = find_symbol(name, &owner, &crc, &license, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); if (!sym) goto unlock; + if (license == GPL_ONLY) + mod->using_gplonly_symbols = true; + + if (!inherit_taint(mod, owner)) { + sym = NULL; + goto getname; + } + if (!check_version(info, name, mod, crc)) { sym = ERR_PTR(-EINVAL); goto getname; @@ -2236,7 +2264,7 @@ void *__symbol_get(const char *symbol) const struct kernel_symbol *sym; preempt_disable(); - sym = find_symbol(symbol, &owner, NULL, true, true); + sym = find_symbol(symbol, &owner, NULL, NULL, true, true); if (sym && strong_try_module_get(owner)) sym = NULL; preempt_enable(); @@ -2272,7 +2300,7 @@ static int verify_exported_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { if (find_symbol(kernel_symbol_name(s), &owner, NULL, - true, false)) { + NULL, true, false)) { pr_err("%s: exports duplicate symbol %s" " (owned by %s)\n", mod->name, kernel_symbol_name(s), @@ -4489,7 +4517,6 @@ struct module *__module_address(unsigned long addr) } return mod; } -EXPORT_SYMBOL_GPL(__module_address); /* * is_module_text_address - is this address inside module code? @@ -4528,7 +4555,6 @@ struct module *__module_text_address(unsigned long addr) } return mod; } -EXPORT_SYMBOL_GPL(__module_text_address); /* Don't grab lock, we're oopsing. */ void print_modules(void) diff --git a/kernel/panic.c b/kernel/panic.c index e2157ca387c8..aef8872ba843 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -505,7 +505,7 @@ static void do_oops_enter_exit(void) * Return true if the calling CPU is allowed to print oops-related info. * This is a bit racy.. 
*/ -int oops_may_print(void) +bool oops_may_print(void) { return pause_on_oops_flag == 0; } @@ -551,7 +551,7 @@ static int init_oops_id(void) } late_initcall(init_oops_id); -void print_oops_end_marker(void) +static void print_oops_end_marker(void) { init_oops_id(); pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 5714f51ba9f8..f33769f97aca 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -795,6 +795,103 @@ int hibernate(void) return error; } +/** + * hibernate_quiet_exec - Execute a function with all devices frozen. + * @func: Function to execute. + * @data: Data pointer to pass to @func. + * + * Return the @func return value or an error code if it cannot be executed. + */ +int hibernate_quiet_exec(int (*func)(void *data), void *data) +{ + int error, nr_calls = 0; + + lock_system_sleep(); + + if (!hibernate_acquire()) { + error = -EBUSY; + goto unlock; + } + + pm_prepare_console(); + + error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; + goto exit; + } + + error = freeze_processes(); + if (error) + goto exit; + + lock_device_hotplug(); + + pm_suspend_clear_flags(); + + error = platform_begin(true); + if (error) + goto thaw; + + error = freeze_kernel_threads(); + if (error) + goto thaw; + + error = dpm_prepare(PMSG_FREEZE); + if (error) + goto dpm_complete; + + suspend_console(); + + error = dpm_suspend(PMSG_FREEZE); + if (error) + goto dpm_resume; + + error = dpm_suspend_end(PMSG_FREEZE); + if (error) + goto dpm_resume; + + error = platform_pre_snapshot(true); + if (error) + goto skip; + + error = func(data); + +skip: + platform_finish(true); + + dpm_resume_start(PMSG_THAW); + +dpm_resume: + dpm_resume(PMSG_THAW); + + resume_console(); + +dpm_complete: + dpm_complete(PMSG_THAW); + + thaw_kernel_threads(); + +thaw: + platform_end(true); + + unlock_device_hotplug(); + + thaw_processes(); + +exit: + __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); + + pm_restore_console(); + + hibernate_release(); + +unlock: + unlock_system_sleep(); + + return error; +} +EXPORT_SYMBOL_GPL(hibernate_quiet_exec); /** * software_resume - Resume from a saved hibernation image. 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 84758f34cdb0..8471a0f7eb32 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6431,10 +6431,10 @@ void sched_show_task(struct task_struct *p) if (!try_get_task_stack(p)) return; - printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); + pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); if (p->state == TASK_RUNNING) - printk(KERN_CONT " running task "); + pr_cont(" running task "); #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif @@ -6443,8 +6443,8 @@ void sched_show_task(struct task_struct *p) if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), ppid, + pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", + free, task_pid_nr(p), ppid, (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); @@ -6479,13 +6479,6 @@ void show_state_filter(unsigned long state_filter) { struct task_struct *g, *p; -#if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); -#else - printk(KERN_INFO - " task PC stack pid father\n"); -#endif rcu_read_lock(); for_each_process_thread(g, p) { /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3fd283892761..28709f6b0975 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1999,7 +1999,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; if (trace_sched_update_nr_running_tp_enabled()) { - call_trace_sched_update_nr_running(rq, count); + call_trace_sched_update_nr_running(rq, -count); } /* Check if we still need preemption */ diff --git a/kernel/signal.c b/kernel/signal.c index 6f16f7c5d375..42b67d2cea37 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2541,7 +2541,21 @@ bool get_signal(struct ksignal *ksig) relock: spin_lock_irq(&sighand->siglock); - current->jobctl &= ~JOBCTL_TASK_WORK; + /* + * Make sure we can safely read ->jobctl() in task_work add. As Oleg + * states: + * + * It pairs with mb (implied by cmpxchg) before READ_ONCE. So we + * roughly have + * + * task_work_add: get_signal: + * STORE(task->task_works, new_work); STORE(task->jobctl); + * mb(); mb(); + * LOAD(task->jobctl); LOAD(task->task_works); + * + * and we can rely on STORE-MB-LOAD [ in task_work_add]. + */ + smp_store_mb(current->jobctl, current->jobctl & ~JOBCTL_TASK_WORK); if (unlikely(current->task_works)) { spin_unlock_irq(&sighand->siglock); task_work_run(); diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 2af66e449aa6..946f44a9e86a 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -233,10 +233,9 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) if (current->flags & PF_KTHREAD) return 0; - fs = get_fs(); - set_fs(USER_DS); + fs = force_uaccess_begin(); arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); - set_fs(fs); + force_uaccess_end(fs); return c.len; } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 3b69a560a7ac..4d59775ea79c 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -364,7 +364,6 @@ COND_SYSCALL(socketcall); COND_SYSCALL_COMPAT(socketcall); /* compat syscalls for arm64, x86, ... 
*/ -COND_SYSCALL_COMPAT(sysctl); COND_SYSCALL_COMPAT(fanotify_mark); /* x86 */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f785de3caac0..287862f91717 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2852,6 +2852,15 @@ static struct ctl_table vm_table[] = { .proc_handler = sysctl_compaction_handler, }, { + .procname = "compaction_proactiveness", + .data = &sysctl_compaction_proactiveness, + .maxlen = sizeof(sysctl_compaction_proactiveness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { .procname = "extfrag_threshold", .data = &sysctl_extfrag_threshold, .maxlen = sizeof(int), diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c deleted file mode 100644 index 7d550cc76a3b..000000000000 --- a/kernel/sysctl_binary.c +++ /dev/null @@ -1,171 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/stat.h> -#include <linux/sysctl.h> -#include "../fs/xfs/xfs_sysctl.h" -#include <linux/sunrpc/debug.h> -#include <linux/string.h> -#include <linux/syscalls.h> -#include <linux/namei.h> -#include <linux/mount.h> -#include <linux/fs.h> -#include <linux/nsproxy.h> -#include <linux/pid_namespace.h> -#include <linux/file.h> -#include <linux/ctype.h> -#include <linux/netdevice.h> -#include <linux/kernel.h> -#include <linux/uuid.h> -#include <linux/slab.h> -#include <linux/compat.h> - -static ssize_t binary_sysctl(const int *name, int nlen, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -static void deprecated_sysctl_warning(const int *name, int nlen) -{ - int i; - - /* - * CTL_KERN/KERN_VERSION is used by older glibc and cannot - * ever go away. - */ - if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION) - return; - - if (printk_ratelimit()) { - printk(KERN_INFO - "warning: process `%s' used the deprecated sysctl " - "system call with ", current->comm); - for (i = 0; i < nlen; i++) - printk(KERN_CONT "%d.", name[i]); - printk(KERN_CONT "\n"); - } - return; -} - -#define WARN_ONCE_HASH_BITS 8 -#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS) - -static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE); - -#define FNV32_OFFSET 2166136261U -#define FNV32_PRIME 0x01000193 - -/* - * Print each legacy sysctl (approximately) only once. - * To avoid making the tables non-const use a external - * hash-table instead. - * Worst case hash collision: 6, but very rarely. - * NOTE! We don't use the SMP-safe bit tests. We simply - * don't care enough. - */ -static void warn_on_bintable(const int *name, int nlen) -{ - int i; - u32 hash = FNV32_OFFSET; - - for (i = 0; i < nlen; i++) - hash = (hash ^ name[i]) * FNV32_PRIME; - hash %= WARN_ONCE_HASH_SIZE; - if (__test_and_set_bit(hash, warn_once_bitmap)) - return; - deprecated_sysctl_warning(name, nlen); -} - -static ssize_t do_sysctl(int __user *args_name, int nlen, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) -{ - int name[CTL_MAXNAME]; - int i; - - /* Check args->nlen. 
*/ - if (nlen < 0 || nlen > CTL_MAXNAME) - return -ENOTDIR; - /* Read in the sysctl name for simplicity */ - for (i = 0; i < nlen; i++) - if (get_user(name[i], args_name + i)) - return -EFAULT; - - warn_on_bintable(name, nlen); - - return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); -} - -SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) -{ - struct __sysctl_args tmp; - size_t oldlen = 0; - ssize_t result; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - if (tmp.oldval && !tmp.oldlenp) - return -EFAULT; - - if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp)) - return -EFAULT; - - result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen, - tmp.newval, tmp.newlen); - - if (result >= 0) { - oldlen = result; - result = 0; - } - - if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp)) - return -EFAULT; - - return result; -} - - -#ifdef CONFIG_COMPAT - -struct compat_sysctl_args { - compat_uptr_t name; - int nlen; - compat_uptr_t oldval; - compat_uptr_t oldlenp; - compat_uptr_t newval; - compat_size_t newlen; - compat_ulong_t __unused[4]; -}; - -COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args) -{ - struct compat_sysctl_args tmp; - compat_size_t __user *compat_oldlenp; - size_t oldlen = 0; - ssize_t result; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - if (tmp.oldval && !tmp.oldlenp) - return -EFAULT; - - compat_oldlenp = compat_ptr(tmp.oldlenp); - if (compat_oldlenp && get_user(oldlen, compat_oldlenp)) - return -EFAULT; - - result = do_sysctl(compat_ptr(tmp.name), tmp.nlen, - compat_ptr(tmp.oldval), oldlen, - compat_ptr(tmp.newval), tmp.newlen); - - if (result >= 0) { - oldlen = result; - result = 0; - } - - if (compat_oldlenp && put_user(oldlen, compat_oldlenp)) - return -EFAULT; - - return result; -} - -#endif /* CONFIG_COMPAT */ diff --git a/kernel/task_work.c b/kernel/task_work.c index 5c0848ca1287..613b2d634af8 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -42,7 +42,13 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify) set_notify_resume(task); break; case TWA_SIGNAL: - if (lock_task_sighand(task, &flags)) { + /* + * Only grab the sighand lock if we don't already have some + * task_work pending. This pairs with the smp_store_mb() + * in get_signal(), see comment there. + */ + if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) && + lock_task_sighand(task, &flags)) { task->jobctl |= JOBCTL_TASK_WORK; signal_wake_up(task, 0); unlock_task_sighand(task, &flags); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index fcc42353f125..a09b1d61df6a 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -52,6 +52,15 @@ config GENERIC_CLOCKEVENTS_MIN_ADJUST config GENERIC_CMOS_UPDATE bool +# Select to handle posix CPU timers from task_work +# and not from the timer interrupt context +config HAVE_POSIX_CPU_TIMERS_TASK_WORK + bool + +config POSIX_CPU_TIMERS_TASK_WORK + bool + default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK + if GENERIC_CLOCKEVENTS menu "Timers subsystem" diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 2ffb466af77e..ca223a89530a 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -192,7 +192,7 @@ static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) * When a alarm timer fires, this runs through the timerqueue to * see which alarms expired, and runs those. 
If there are more alarm * timers queued for the future, we set the hrtimer to fire when - * when the next future alarm timer expires. + * the next future alarm timer expires. */ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) { diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d89da1c7e005..c4038511d5c9 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -135,7 +135,11 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { - .clock_base = { { .cpu_base = &migration_cpu_base, }, }, + .clock_base = { { + .cpu_base = &migration_cpu_base, + .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, + &migration_cpu_base.lock), + }, }, }; #define migration_base migration_cpu_base.clock_base[0] @@ -1998,8 +2002,11 @@ int hrtimers_prepare_cpu(unsigned int cpu) int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - cpu_base->clock_base[i].cpu_base = cpu_base; - timerqueue_init_head(&cpu_base->clock_base[i].active); + struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; + + clock_b->cpu_base = cpu_base; + seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); + timerqueue_init_head(&clock_b->active); } cpu_base->cpu = cpu; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 165117996ea0..a71758e34e45 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -377,6 +377,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp) */ static int posix_cpu_timer_create(struct k_itimer *new_timer) { + static struct lock_class_key posix_cpu_timers_key; struct pid *pid; rcu_read_lock(); @@ -386,6 +387,17 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) return -EINVAL; } + /* + * If posix timer expiry is handled in task work context then + * timer::it_lock can be taken without disabling interrupts as all + * other locking happens in task context. This requires a seperate + * lock class key otherwise regular posix timer expiry would record + * the lock class being taken in interrupt context and generate a + * false positive warning. + */ + if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK)) + lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key); + new_timer->kclock = &clock_posix_cpu; timerqueue_init(&new_timer->it.cpu.node); new_timer->it.cpu.pid = get_pid(pid); @@ -1080,43 +1092,163 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) return false; } +static void handle_posix_cpu_timers(struct task_struct *tsk); + +#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK +static void posix_cpu_timers_work(struct callback_head *work) +{ + handle_posix_cpu_timers(current); +} + /* - * This is called from the timer interrupt handler. The irq handler has - * already updated our counts. We need to check if any timers fire now. - * Interrupts are disabled. + * Initialize posix CPU timers task work in init task. Out of line to + * keep the callback static and to avoid header recursion hell. */ -void run_posix_cpu_timers(void) +void __init posix_cputimers_init_work(void) { - struct task_struct *tsk = current; - struct k_itimer *timer, *next; - unsigned long flags; - LIST_HEAD(firing); + init_task_work(&current->posix_cputimers_work.work, + posix_cpu_timers_work); +} - lockdep_assert_irqs_disabled(); +/* + * Note: All operations on tsk->posix_cputimer_work.scheduled happen either + * in hard interrupt context or in task context with interrupts + * disabled. 
Aside of that the writer/reader interaction is always in the + * context of the current task, which means they are strict per CPU. + */ +static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) +{ + return tsk->posix_cputimers_work.scheduled; +} - /* - * The fast path checks that there are no expired thread or thread - * group timers. If that's so, just return. - */ - if (!fastpath_timer_check(tsk)) +static inline void __run_posix_cpu_timers(struct task_struct *tsk) +{ + if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled)) return; - lockdep_posixtimer_enter(); - if (!lock_task_sighand(tsk, &flags)) { - lockdep_posixtimer_exit(); - return; + /* Schedule task work to actually expire the timers */ + tsk->posix_cputimers_work.scheduled = true; + task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME); +} + +static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk, + unsigned long start) +{ + bool ret = true; + + /* + * On !RT kernels interrupts are disabled while collecting expired + * timers, so no tick can happen and the fast path check can be + * reenabled without further checks. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + tsk->posix_cputimers_work.scheduled = false; + return true; } + /* - * Here we take off tsk->signal->cpu_timers[N] and - * tsk->cpu_timers[N] all the timers that are firing, and - * put them on the firing list. + * On RT enabled kernels ticks can happen while the expired timers + * are collected under sighand lock. But any tick which observes + * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath + * checks. So reenabling the tick work has do be done carefully: + * + * Disable interrupts and run the fast path check if jiffies have + * advanced since the collecting of expired timers started. If + * jiffies have not advanced or the fast path check did not find + * newly expired timers, reenable the fast path check in the timer + * interrupt. If there are newly expired timers, return false and + * let the collection loop repeat. */ - check_thread_timers(tsk, &firing); + local_irq_disable(); + if (start != jiffies && fastpath_timer_check(tsk)) + ret = false; + else + tsk->posix_cputimers_work.scheduled = false; + local_irq_enable(); + + return ret; +} +#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */ +static inline void __run_posix_cpu_timers(struct task_struct *tsk) +{ + lockdep_posixtimer_enter(); + handle_posix_cpu_timers(tsk); + lockdep_posixtimer_exit(); +} + +static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) +{ + return false; +} + +static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk, + unsigned long start) +{ + return true; +} +#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */ + +static void handle_posix_cpu_timers(struct task_struct *tsk) +{ + struct k_itimer *timer, *next; + unsigned long flags, start; + LIST_HEAD(firing); + + if (!lock_task_sighand(tsk, &flags)) + return; - check_process_timers(tsk, &firing); + do { + /* + * On RT locking sighand lock does not disable interrupts, + * so this needs to be careful vs. ticks. Store the current + * jiffies value. + */ + start = READ_ONCE(jiffies); + barrier(); + + /* + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. 
+ */ + check_thread_timers(tsk, &firing); + + check_process_timers(tsk, &firing); + + /* + * The above timer checks have updated the exipry cache and + * because nothing can have queued or modified timers after + * sighand lock was taken above it is guaranteed to be + * consistent. So the next timer interrupt fastpath check + * will find valid data. + * + * If timer expiry runs in the timer interrupt context then + * the loop is not relevant as timers will be directly + * expired in interrupt context. The stub function below + * returns always true which allows the compiler to + * optimize the loop out. + * + * If timer expiry is deferred to task work context then + * the following rules apply: + * + * - On !RT kernels no tick can have happened on this CPU + * after sighand lock was acquired because interrupts are + * disabled. So reenabling task work before dropping + * sighand lock and reenabling interrupts is race free. + * + * - On RT kernels ticks might have happened but the tick + * work ignored posix CPU timer handling because the + * CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work + * must be done very carefully including a check whether + * ticks have happened since the start of the timer + * expiry checks. posix_cpu_timers_enable_work() takes + * care of that and eventually lets the expiry checks + * run again. + */ + } while (!posix_cpu_timers_enable_work(tsk, start)); /* - * We must release these locks before taking any timer's lock. + * We must release sighand lock before taking any timer's lock. * There is a potential race with timer deletion here, as the * siglock now protects our private firing list. We have set * the firing flag in each timer, so that a deletion attempt @@ -1134,6 +1266,13 @@ void run_posix_cpu_timers(void) list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) { int cpu_firing; + /* + * spin_lock() is sufficient here even independent of the + * expiry context. If expiry happens in hard interrupt + * context it's obvious. For task work context it's safe + * because all other operations on timer::it_lock happen in + * task context (syscall or exit). + */ spin_lock(&timer->it_lock); list_del_init(&timer->it.cpu.elist); cpu_firing = timer->it.cpu.firing; @@ -1147,7 +1286,34 @@ void run_posix_cpu_timers(void) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } - lockdep_posixtimer_exit(); +} + +/* + * This is called from the timer interrupt handler. The irq handler has + * already updated our counts. We need to check if any timers fire now. + * Interrupts are disabled. + */ +void run_posix_cpu_timers(void) +{ + struct task_struct *tsk = current; + + lockdep_assert_irqs_disabled(); + + /* + * If the actual expiry is deferred to task work context and the + * work is already scheduled there is no point to do anything here. + */ + if (posix_cpu_timers_work_scheduled(tsk)) + return; + + /* + * The fast path checks that there are no expired thread or thread + * group timers. If that's so, just return. + */ + if (!fastpath_timer_check(tsk)) + return; + + __run_posix_cpu_timers(tsk); } /* diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0deaf4b79fb4..1c03eec6ca9b 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -229,7 +229,7 @@ void __init generic_sched_clock_init(void) { /* * If no sched_clock() function has been provided at that point, - * make it the final one one. + * make it the final one. 
*/ if (cd.actual_read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 63a632f9896c..4c47f388a83f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -39,18 +39,19 @@ enum timekeeping_adv_mode { TK_ADV_FREQ }; +DEFINE_RAW_SPINLOCK(timekeeper_lock); + /* * The most important data for readout fits into a single 64 byte * cache line. */ static struct { - seqcount_t seq; + seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; } tk_core ____cacheline_aligned = { - .seq = SEQCNT_ZERO(tk_core.seq), + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock), }; -static DEFINE_RAW_SPINLOCK(timekeeper_lock); static struct timekeeper shadow_timekeeper; /** @@ -63,7 +64,7 @@ static struct timekeeper shadow_timekeeper; * See @update_fast_timekeeper() below. */ struct tk_fast { - seqcount_t seq; + seqcount_raw_spinlock_t seq; struct tk_read_base base[2]; }; @@ -80,11 +81,13 @@ static struct clocksource dummy_clock = { }; static struct tk_fast tk_fast_mono ____cacheline_aligned = { + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { + .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock), .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; @@ -157,7 +160,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * tk_clock_read - atomic clocksource read() helper * * This helper is necessary to use in the read paths because, while the - * seqlock ensures we don't return a bad value while structures are updated, + * seqcount ensures we don't return a bad value while structures are updated, * it doesn't protect from potential crashes. There is the possibility that * the tkr's clocksource may change between the read reference, and the * clock reference passed to the read function. This can cause crashes if @@ -222,10 +225,10 @@ static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) unsigned int seq; /* - * Since we're called holding a seqlock, the data may shift + * Since we're called holding a seqcount, the data may shift * under us while we're doing the calculation. This can cause * false positives, since we'd note a problem but throw the - * results away. So nest another seqlock here to atomically + * results away. So nest another seqcount here to atomically * grab the points we are checking with. */ do { @@ -486,7 +489,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); * * To keep it NMI safe since we're accessing from tracing, we're not using a * separate timekeeper with updates to monotonic clock and boot offset - * protected with seqlocks. This has the following minor side effects: + * protected with seqcounts. This has the following minor side effects: * * (1) Its possible that a timestamp be taken after the boot offset is updated * but before the timekeeper is updated. If this happens, the new boot offset @@ -2001,7 +2004,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) * logarithmic_accumulation - shifted accumulation of cycles * * This functions accumulates a shifted interval of cycles into - * into a shifted interval nanoseconds. Allows for O(log) accumulation + * a shifted interval nanoseconds. Allows for O(log) accumulation * loop. * * Returns the unconsumed cycles. 
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index bcbb52db2256..4ca2787d1642 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -1,12 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _TIMEKEEPING_INTERNAL_H #define _TIMEKEEPING_INTERNAL_H -/* - * timekeeping debug functions - */ + #include <linux/clocksource.h> +#include <linux/spinlock.h> #include <linux/time.h> +/* + * timekeeping debug functions + */ #ifdef CONFIG_DEBUG_FS extern void tk_debug_account_sleep_time(const struct timespec64 *t); #else @@ -31,4 +33,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) } #endif +/* Semi public for serialization of non timekeeper VDSO updates. */ +extern raw_spinlock_t timekeeper_lock; + #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ae5029f984a8..a16764b0116e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2017,6 +2017,7 @@ static void __init init_timer_cpus(void) void __init init_timers(void) { init_timer_cpus(); + posix_cputimers_init_work(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 54ce6eb2ca36..88e6b8ed6ca5 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -13,6 +13,8 @@ #include <vdso/helpers.h> #include <vdso/vsyscall.h> +#include "timekeeping_internal.h" + static inline void update_vdso_data(struct vdso_data *vdata, struct timekeeper *tk) { @@ -127,3 +129,42 @@ void update_vsyscall_tz(void) __arch_sync_vdso_data(vdata); } + +/** + * vdso_update_begin - Start of a VDSO update section + * + * Allows architecture code to safely update the architecture specific VDSO + * data. Disables interrupts, acquires timekeeper lock to serialize against + * concurrent updates from timekeeping and invalidates the VDSO data + * sequence counter to prevent concurrent readers from accessing + * inconsistent data. + * + * Returns: Saved interrupt flags which need to be handed in to + * vdso_update_end(). + */ +unsigned long vdso_update_begin(void) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + vdso_write_begin(vdata); + return flags; +} + +/** + * vdso_update_end - End of a VDSO update section + * @flags: Interrupt flags as returned from vdso_update_begin() + * + * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data + * synchronization if the architecture requires it, drops timekeeper lock + * and restores interrupt flags. + */ +void vdso_update_end(unsigned long flags) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + + vdso_write_end(vdata); + __arch_sync_vdso_data(vdata); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index aeba5ee7325a..e153be351548 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -2,9 +2,9 @@ # Do not instrument the tracer itself: +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + ifdef CONFIG_FUNCTION_TRACER -ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) # Avoid recursion due to instrumentation. 
KCSAN_SANITIZE := n diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cb91ef902cc4..a8d4f253ed77 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -383,7 +383,7 @@ static DEFINE_RAW_SPINLOCK(trace_printk_lock); #define BPF_TRACE_PRINTK_SIZE 1024 -static inline __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...) +static __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...) { static char buf[BPF_TRACE_PRINTK_SIZE]; unsigned long flags; diff --git a/kernel/umh.c b/kernel/umh.c index a25433f9cd9a..fcf3ee803630 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -119,37 +119,16 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) { pid_t pid; - /* If SIGCLD is ignored kernel_wait4 won't populate the status. */ + /* If SIGCLD is ignored do_wait won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); - if (pid < 0) { + if (pid < 0) sub_info->retval = pid; - } else { - int ret = -ECHILD; - /* - * Normally it is bogus to call wait4() from in-kernel because - * wait4() wants to write the exit code to a userspace address. - * But call_usermodehelper_exec_sync() always runs as kernel - * thread (workqueue) and put_user() to a kernel address works - * OK for kernel threads, due to their having an mm_segment_t - * which spans the entire address space. - * - * Thus the __user pointer cast is valid here. - */ - kernel_wait4(pid, (int __user *)&ret, 0, NULL); - - /* - * If ret is 0, either call_usermodehelper_exec_async failed and - * the real error code is already in sub_info->retval or - * sub_info->retval is 0 anyway, so don't mess with it then. - */ - if (ret) - sub_info->retval = ret; - } + else + kernel_wait(pid, &sub_info->retval); /* Restore default kernel sig handler */ kernel_sigaction(SIGCHLD, SIG_IGN); - umh_complete(sub_info); } |
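For context on the bpf_iter changes above: BPF_LINK_CREATE for iterator programs no longer takes the target map through target_fd plus a BPF_ITER_LINK_MAP_FD flag; userspace now passes a union bpf_iter_link_info via link_create.iter_info/iter_info_len, which bpf_iter_link_attach() copies in with bpf_check_uarg_tail_zero()/copy_from_user() and hands to the target's attach_target() callback (bpf_iter_attach_map() for bpf_map_elem). Below is a minimal userspace sketch of the new interface, not part of the diff; it assumes the matching include/uapi/linux/bpf.h update (union bpf_iter_link_info and the iter_info fields), which lies outside this kernel/-limited diffstat, and prog_fd/map_fd are placeholder descriptors for an already loaded iterator program and its target map.

/*
 * Sketch only: create a bpf_map_elem iterator link with the new
 * iter_info interface, using a raw bpf(2) syscall. Assumes a uapi
 * bpf.h that already carries union bpf_iter_link_info and the
 * link_create.iter_info/iter_info_len members added with this series.
 */
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int map_elem_iter_link_create(int prog_fd, int map_fd)
{
	union bpf_iter_link_info linfo;
	union bpf_attr attr;

	memset(&linfo, 0, sizeof(linfo));
	linfo.map.map_fd = map_fd;	/* target map for the bpf_map_elem iterator */

	memset(&attr, 0, sizeof(attr));
	attr.link_create.prog_fd = prog_fd;
	attr.link_create.attach_type = BPF_TRACE_ITER;
	/* target_fd and flags must now stay zero; the map comes via iter_info */
	attr.link_create.iter_info = (__u64)(unsigned long)&linfo;
	attr.link_create.iter_info_len = sizeof(linfo);

	return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
}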