diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 2 | ||||
-rw-r--r-- | kernel/audit.c | 93 | ||||
-rw-r--r-- | kernel/auditsc.c | 47 | ||||
-rw-r--r-- | kernel/cpuset.c | 24 | ||||
-rw-r--r-- | kernel/exit.c | 2 | ||||
-rw-r--r-- | kernel/irq/handle.c | 3 | ||||
-rw-r--r-- | kernel/itimer.c | 6 | ||||
-rw-r--r-- | kernel/kallsyms.c | 13 | ||||
-rw-r--r-- | kernel/kprobes.c | 142 | ||||
-rw-r--r-- | kernel/module.c | 6 | ||||
-rw-r--r-- | kernel/power/main.c | 6 | ||||
-rw-r--r-- | kernel/printk.c | 72 | ||||
-rw-r--r-- | kernel/profile.c | 16 | ||||
-rw-r--r-- | kernel/sched.c | 9 | ||||
-rw-r--r-- | kernel/signal.c | 11 | ||||
-rw-r--r-- | kernel/spinlock.c | 8 | ||||
-rw-r--r-- | kernel/sys.c | 2 |
17 files changed, 338 insertions, 124 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index eb88b446c2cc..b01d26fe8db7 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -29,7 +29,7 @@ obj-$(CONFIG_SYSFS) += ksysfs.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o -ifneq ($(CONFIG_IA64),y) +ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond # me. I suspect most platforms don't need this, but until we know that for sure diff --git a/kernel/audit.c b/kernel/audit.c index ac26d4d960d3..9c4f1af0c794 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1,4 +1,4 @@ -/* audit.c -- Auditing support -*- linux-c -*- +/* audit.c -- Auditing support * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. * System-call specific features have moved to auditsc.c * @@ -38,7 +38,7 @@ * 6) Support low-overhead kernel-based filtering to minimize the * information that must be passed to user-space. * - * Example user-space utilities: http://people.redhat.com/faith/audit/ + * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ */ #include <linux/init.h> @@ -142,7 +142,6 @@ struct audit_buffer { int total; int type; int pid; - int count; /* Times requeued */ }; void audit_set_type(struct audit_buffer *ab, int type) @@ -239,36 +238,36 @@ void audit_log_lost(const char *message) } -static int audit_set_rate_limit(int limit) +static int audit_set_rate_limit(int limit, uid_t loginuid) { int old = audit_rate_limit; audit_rate_limit = limit; - audit_log(current->audit_context, "audit_rate_limit=%d old=%d", - audit_rate_limit, old); + audit_log(NULL, "audit_rate_limit=%d old=%d by auid %u", + audit_rate_limit, old, loginuid); return old; } -static int audit_set_backlog_limit(int limit) +static int audit_set_backlog_limit(int limit, uid_t loginuid) { int old = audit_backlog_limit; audit_backlog_limit = limit; - audit_log(current->audit_context, "audit_backlog_limit=%d old=%d", - audit_backlog_limit, old); + audit_log(NULL, "audit_backlog_limit=%d old=%d by auid %u", + audit_backlog_limit, old, loginuid); return old; } -static int audit_set_enabled(int state) +static int audit_set_enabled(int state, uid_t loginuid) { int old = audit_enabled; if (state != 0 && state != 1) return -EINVAL; audit_enabled = state; - audit_log(current->audit_context, "audit_enabled=%d old=%d", - audit_enabled, old); + audit_log(NULL, "audit_enabled=%d old=%d by auid %u", + audit_enabled, old, loginuid); return old; } -static int audit_set_failure(int state) +static int audit_set_failure(int state, uid_t loginuid) { int old = audit_failure; if (state != AUDIT_FAIL_SILENT @@ -276,8 +275,8 @@ static int audit_set_failure(int state) && state != AUDIT_FAIL_PANIC) return -EINVAL; audit_failure = state; - audit_log(current->audit_context, "audit_failure=%d old=%d", - audit_failure, old); + audit_log(NULL, "audit_failure=%d old=%d by auid %u", + audit_failure, old, loginuid); return old; } @@ -344,6 +343,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; + uid_t loginuid; /* loginuid of sender */ err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); if (err) @@ -351,6 +351,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) pid = NETLINK_CREDS(skb)->pid; uid = NETLINK_CREDS(skb)->uid; + loginuid = NETLINK_CB(skb).loginuid; seq = nlh->nlmsg_seq; data = NLMSG_DATA(nlh); @@ -371,34 +372,36 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; status_get = (struct audit_status *)data; if (status_get->mask & AUDIT_STATUS_ENABLED) { - err = audit_set_enabled(status_get->enabled); + err = audit_set_enabled(status_get->enabled, loginuid); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_FAILURE) { - err = audit_set_failure(status_get->failure); + err = audit_set_failure(status_get->failure, loginuid); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_PID) { int old = audit_pid; audit_pid = status_get->pid; - audit_log(current->audit_context, - "audit_pid=%d old=%d", audit_pid, old); + audit_log(NULL, "audit_pid=%d old=%d by auid %u", + audit_pid, old, loginuid); } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) - audit_set_rate_limit(status_get->rate_limit); + audit_set_rate_limit(status_get->rate_limit, loginuid); if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) - audit_set_backlog_limit(status_get->backlog_limit); + audit_set_backlog_limit(status_get->backlog_limit, + loginuid); break; case AUDIT_USER: ab = audit_log_start(NULL); if (!ab) break; /* audit_panic has been called */ audit_log_format(ab, - "user pid=%d uid=%d length=%d msg='%.1024s'", + "user pid=%d uid=%d length=%d loginuid=%u" + " msg='%.1024s'", pid, uid, (int)(nlh->nlmsg_len - ((char *)data - (char *)nlh)), - (char *)data); + loginuid, (char *)data); ab->type = AUDIT_USER; ab->pid = pid; audit_log_end(ab); @@ -411,7 +414,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_LIST: #ifdef CONFIG_AUDITSYSCALL err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, - uid, seq, data); + uid, seq, data, loginuid); #else err = -EOPNOTSUPP; #endif @@ -480,7 +483,7 @@ static void audit_log_move(struct audit_buffer *ab) if (ab->len == 0) return; - skb = skb_peek(&ab->sklist); + skb = skb_peek_tail(&ab->sklist); if (!skb || skb_tailroom(skb) <= ab->len + extra) { skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC); if (!skb) { @@ -519,9 +522,9 @@ static inline int audit_log_drain(struct audit_buffer *ab) retval = netlink_unicast(audit_sock, skb, audit_pid, MSG_DONTWAIT); } - if (retval == -EAGAIN && ab->count < 5) { - ++ab->count; - skb_queue_tail(&ab->sklist, skb); + if (retval == -EAGAIN && + (atomic_read(&audit_backlog)) < audit_backlog_limit) { + skb_queue_head(&ab->sklist, skb); audit_log_end_irq(ab); return 1; } @@ -537,8 +540,8 @@ static inline int audit_log_drain(struct audit_buffer *ab) if (!audit_pid) { /* No daemon */ int offset = ab->nlh ? NLMSG_SPACE(0) : 0; int len = skb->len - offset; - printk(KERN_ERR "%*.*s\n", - len, len, skb->data + offset); + skb->data[offset + len] = '\0'; + printk(KERN_ERR "%s\n", skb->data + offset); } kfree_skb(skb); ab->nlh = NULL; @@ -617,7 +620,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx) struct audit_buffer *ab = NULL; unsigned long flags; struct timespec t; - int serial = 0; + unsigned int serial; if (!audit_initialized) return NULL; @@ -659,15 +662,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx) ab->total = 0; ab->type = AUDIT_KERNEL; ab->pid = 0; - ab->count = 0; #ifdef CONFIG_AUDITSYSCALL if (ab->ctx) audit_get_stamp(ab->ctx, &t, &serial); else #endif + { t = CURRENT_TIME; - + serial = 0; + } audit_log_format(ab, "audit(%lu.%03lu:%u): ", t.tv_sec, t.tv_nsec/1000000, serial); return ab; @@ -717,6 +721,29 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) va_end(args); } +void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) +{ + int i; + + for (i=0; i<len; i++) + audit_log_format(ab, "%02x", buf[i]); +} + +void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) +{ + const unsigned char *p = string; + + while (*p) { + if (*p == '"' || *p == ' ' || *p < 0x20 || *p > 0x7f) { + audit_log_hex(ab, string, strlen(string)); + return; + } + p++; + } + audit_log_format(ab, "\"%s\"", string); +} + + /* This is a helper-function to print the d_path without using a static * buffer or allocating another buffer in addition to the one in * audit_buffer. */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 6f1931381bc9..37b3ac94bc47 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1,4 +1,4 @@ -/* auditsc.c -- System-call auditing support -*- linux-c -*- +/* auditsc.c -- System-call auditing support * Handles all system-call specific auditing features. * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. @@ -123,7 +123,7 @@ struct audit_context { int major; /* syscall number */ unsigned long argv[4]; /* syscall arguments */ int return_valid; /* return code is valid */ - int return_code;/* syscall return code */ + long return_code;/* syscall return code */ int auditable; /* 1 if record should be written */ int name_count; struct audit_names names[AUDIT_NAMES]; @@ -135,6 +135,7 @@ struct audit_context { uid_t uid, euid, suid, fsuid; gid_t gid, egid, sgid, fsgid; unsigned long personality; + int arch; #if AUDIT_DEBUG int put_count; @@ -250,7 +251,8 @@ static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) return 0; } -int audit_receive_filter(int type, int pid, int uid, int seq, void *data) +int audit_receive_filter(int type, int pid, int uid, int seq, void *data, + uid_t loginuid) { u32 flags; struct audit_entry *entry; @@ -285,6 +287,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data) err = audit_add_rule(entry, &audit_entlist); if (!err && (flags & AUDIT_AT_EXIT)) err = audit_add_rule(entry, &audit_extlist); + audit_log(NULL, "auid %u added an audit rule\n", loginuid); break; case AUDIT_DEL: flags =((struct audit_rule *)data)->flags; @@ -294,6 +297,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data) err = audit_del_rule(data, &audit_entlist); if (!err && (flags & AUDIT_AT_EXIT)) err = audit_del_rule(data, &audit_extlist); + audit_log(NULL, "auid %u removed an audit rule\n", loginuid); break; default: return -EINVAL; @@ -348,6 +352,10 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_PERS: result = (tsk->personality == value); break; + case AUDIT_ARCH: + if (ctx) + result = (ctx->arch == value); + break; case AUDIT_EXIT: if (ctx && ctx->return_valid) @@ -355,7 +363,7 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_SUCCESS: if (ctx && ctx->return_valid) - result = (ctx->return_code >= 0); + result = (ctx->return_valid == AUDITSC_SUCCESS); break; case AUDIT_DEVMAJOR: if (ctx) { @@ -648,8 +656,11 @@ static void audit_log_exit(struct audit_context *context) audit_log_format(ab, "syscall=%d", context->major); if (context->personality != PER_LINUX) audit_log_format(ab, " per=%lx", context->personality); + audit_log_format(ab, " arch=%x", context->arch); if (context->return_valid) - audit_log_format(ab, " exit=%d", context->return_code); + audit_log_format(ab, " success=%s exit=%ld", + (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", + context->return_code); audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" " pid=%d loginuid=%d uid=%d gid=%d" @@ -696,9 +707,10 @@ static void audit_log_exit(struct audit_context *context) if (!ab) continue; /* audit_panic has been called */ audit_log_format(ab, "item=%d", i); - if (context->names[i].name) - audit_log_format(ab, " name=%s", - context->names[i].name); + if (context->names[i].name) { + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, context->names[i].name); + } if (context->names[i].ino != (unsigned long)-1) audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" " uid=%d gid=%d rdev=%02x:%02x", @@ -772,7 +784,7 @@ static inline unsigned int audit_serial(void) * then the record will be written at syscall exit time (otherwise, it * will only be written if another part of the kernel requests that it * be written). */ -void audit_syscall_entry(struct task_struct *tsk, int major, +void audit_syscall_entry(struct task_struct *tsk, int arch, int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { @@ -826,6 +838,7 @@ void audit_syscall_entry(struct task_struct *tsk, int major, if (!audit_enabled) return; + context->arch = arch; context->major = major; context->argv[0] = a1; context->argv[1] = a2; @@ -849,13 +862,13 @@ void audit_syscall_entry(struct task_struct *tsk, int major, * filtering, or because some other part of the kernel write an audit * message), then write out the syscall information. In call cases, * free the names stored from getname(). */ -void audit_syscall_exit(struct task_struct *tsk, int return_code) +void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) { struct audit_context *context; get_task_struct(tsk); task_lock(tsk); - context = audit_get_context(tsk, 1, return_code); + context = audit_get_context(tsk, valid, return_code); task_unlock(tsk); /* Not having a context here is ok, since the parent may have @@ -868,6 +881,7 @@ void audit_syscall_exit(struct task_struct *tsk, int return_code) context->in_syscall = 0; context->auditable = 0; + if (context->previous) { struct audit_context *new_context = context->previous; context->previous = NULL; @@ -981,7 +995,7 @@ void audit_inode(const char *name, const struct inode *inode) } void audit_get_stamp(struct audit_context *ctx, - struct timespec *t, int *serial) + struct timespec *t, unsigned int *serial) { if (ctx) { t->tv_sec = ctx->ctime.tv_sec; @@ -996,20 +1010,21 @@ void audit_get_stamp(struct audit_context *ctx, extern int audit_set_type(struct audit_buffer *ab, int type); -int audit_set_loginuid(struct audit_context *ctx, uid_t loginuid) +int audit_set_loginuid(struct task_struct *task, uid_t loginuid) { - if (ctx) { + if (task->audit_context) { struct audit_buffer *ab; ab = audit_log_start(NULL); if (ab) { audit_log_format(ab, "login pid=%d uid=%u " "old loginuid=%u new loginuid=%u", - ctx->pid, ctx->uid, ctx->loginuid, loginuid); + task->pid, task->uid, + task->audit_context->loginuid, loginuid); audit_set_type(ab, AUDIT_LOGIN); audit_log_end(ab); } - ctx->loginuid = loginuid; + task->audit_context->loginuid = loginuid; } return 0; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 961d74044deb..00e8f2575512 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -166,9 +166,8 @@ static struct super_block *cpuset_sb = NULL; * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't * (usually) grab cpuset_sem. These are the two most performance * critical pieces of code here. The exception occurs on exit(), - * if the last task using a cpuset exits, and the cpuset was marked - * notify_on_release. In that case, the cpuset_sem is taken, the - * path to the released cpuset calculated, and a usermode call made + * when a task in a notify_on_release cpuset exits. Then cpuset_sem + * is taken, and if the cpuset count is zero, a usermode call made * to /sbin/cpuset_release_agent with the name of the cpuset (path * relative to the root of cpuset file system) as the argument. * @@ -1404,6 +1403,18 @@ void cpuset_fork(struct task_struct *tsk) * * Description: Detach cpuset from @tsk and release it. * + * Note that cpusets marked notify_on_release force every task + * in them to take the global cpuset_sem semaphore when exiting. + * This could impact scaling on very large systems. Be reluctant + * to use notify_on_release cpusets where very high task exit + * scaling is required on large systems. + * + * Don't even think about derefencing 'cs' after the cpuset use + * count goes to zero, except inside a critical section guarded + * by the cpuset_sem semaphore. If you don't hold cpuset_sem, + * then a zero cpuset use count is a license to any other task to + * nuke the cpuset immediately. + * **/ void cpuset_exit(struct task_struct *tsk) @@ -1415,10 +1426,13 @@ void cpuset_exit(struct task_struct *tsk) tsk->cpuset = NULL; task_unlock(tsk); - if (atomic_dec_and_test(&cs->count)) { + if (notify_on_release(cs)) { down(&cpuset_sem); - check_for_release(cs); + if (atomic_dec_and_test(&cs->count)) + check_for_release(cs); up(&cpuset_sem); + } else { + atomic_dec(&cs->count); } } diff --git a/kernel/exit.c b/kernel/exit.c index 7be283d98983..edaa50b5bbfa 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -846,6 +846,8 @@ fastcall NORET_TYPE void do_exit(long code) for (;;) ; } +EXPORT_SYMBOL_GPL(do_exit); + NORET_TYPE void complete_and_exit(struct completion *comp, long code) { if (comp) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 2fb0e46e11f3..436c7d93c00a 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -30,6 +30,7 @@ */ irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... NR_IRQS-1] = { + .status = IRQ_DISABLED, .handler = &no_irq_type, .lock = SPIN_LOCK_UNLOCKED } @@ -118,8 +119,6 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) */ desc->handler->ack(irq); action_ret = handle_IRQ_event(irq, regs, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); desc->handler->end(irq); return 1; } diff --git a/kernel/itimer.c b/kernel/itimer.c index e9a40e947e07..1dc988e0d2c7 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -123,7 +123,11 @@ static inline void it_real_arm(struct task_struct *p, unsigned long interval) return; if (interval > (unsigned long) LONG_MAX) interval = LONG_MAX; - p->signal->real_timer.expires = jiffies + interval; + /* the "+ 1" below makes sure that the timer doesn't go off before + * the interval requested. This could happen if + * time requested % (usecs per jiffy) is more than the usecs left + * in the current jiffy */ + p->signal->real_timer.expires = jiffies + interval + 1; add_timer(&p->signal->real_timer); } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 1627f8d6e0cd..13bcec151b57 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -46,6 +46,14 @@ static inline int is_kernel_inittext(unsigned long addr) return 0; } +static inline int is_kernel_extratext(unsigned long addr) +{ + if (addr >= (unsigned long)_sextratext + && addr <= (unsigned long)_eextratext) + return 1; + return 0; +} + static inline int is_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) @@ -169,8 +177,9 @@ const char *kallsyms_lookup(unsigned long addr, namebuf[0] = 0; if ((all_var && is_kernel(addr)) || - (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr)))) { - unsigned long symbol_end=0; + (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr) || + is_kernel_extratext(addr)))) { + unsigned long symbol_end = 0; /* do a binary search on the sorted kallsyms_addresses array */ low = 0; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1d5dd1337bd1..037142b72a49 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -44,6 +44,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; unsigned int kprobe_cpu = NR_CPUS; static DEFINE_SPINLOCK(kprobe_lock); +static struct kprobe *curr_kprobe; /* Locks kprobe: irqs must be disabled */ void lock_kprobes(void) @@ -73,22 +74,139 @@ struct kprobe *get_kprobe(void *addr) return NULL; } +/* + * Aggregate handlers for multiple kprobes support - these handlers + * take care of invoking the individual kprobe handlers on p->list + */ +int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe *kp; + + list_for_each_entry(kp, &p->list, list) { + if (kp->pre_handler) { + curr_kprobe = kp; + kp->pre_handler(kp, regs); + curr_kprobe = NULL; + } + } + return 0; +} + +void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) +{ + struct kprobe *kp; + + list_for_each_entry(kp, &p->list, list) { + if (kp->post_handler) { + curr_kprobe = kp; + kp->post_handler(kp, regs, flags); + curr_kprobe = NULL; + } + } + return; +} + +int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr) +{ + /* + * if we faulted "during" the execution of a user specified + * probe handler, invoke just that probe's fault handler + */ + if (curr_kprobe && curr_kprobe->fault_handler) { + if (curr_kprobe->fault_handler(curr_kprobe, regs, trapnr)) + return 1; + } + return 0; +} + +/* + * Fill in the required fields of the "manager kprobe". Replace the + * earlier kprobe in the hlist with the manager kprobe + */ +static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) +{ + ap->addr = p->addr; + ap->opcode = p->opcode; + memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn)); + + ap->pre_handler = aggr_pre_handler; + ap->post_handler = aggr_post_handler; + ap->fault_handler = aggr_fault_handler; + + INIT_LIST_HEAD(&ap->list); + list_add(&p->list, &ap->list); + + INIT_HLIST_NODE(&ap->hlist); + hlist_del(&p->hlist); + hlist_add_head(&ap->hlist, + &kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]); +} + +/* + * This is the second or subsequent kprobe at the address - handle + * the intricacies + * TODO: Move kcalloc outside the spinlock + */ +static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) +{ + int ret = 0; + struct kprobe *ap; + + if (old_p->break_handler || p->break_handler) { + ret = -EEXIST; /* kprobe and jprobe can't (yet) coexist */ + } else if (old_p->pre_handler == aggr_pre_handler) { + list_add(&p->list, &old_p->list); + } else { + ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC); + if (!ap) + return -ENOMEM; + add_aggr_kprobe(ap, old_p); + list_add(&p->list, &ap->list); + } + return ret; +} + +/* kprobe removal house-keeping routines */ +static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) +{ + *p->addr = p->opcode; + hlist_del(&p->hlist); + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + spin_unlock_irqrestore(&kprobe_lock, flags); + arch_remove_kprobe(p); +} + +static inline void cleanup_aggr_kprobe(struct kprobe *old_p, + struct kprobe *p, unsigned long flags) +{ + list_del(&p->list); + if (list_empty(&old_p->list)) { + cleanup_kprobe(old_p, flags); + kfree(old_p); + } else + spin_unlock_irqrestore(&kprobe_lock, flags); +} + int register_kprobe(struct kprobe *p) { int ret = 0; unsigned long flags = 0; + struct kprobe *old_p; if ((ret = arch_prepare_kprobe(p)) != 0) { goto rm_kprobe; } spin_lock_irqsave(&kprobe_lock, flags); - INIT_HLIST_NODE(&p->hlist); - if (get_kprobe(p->addr)) { - ret = -EEXIST; + old_p = get_kprobe(p->addr); + if (old_p) { + ret = register_aggr_kprobe(old_p, p); goto out; } - arch_copy_kprobe(p); + arch_copy_kprobe(p); + INIT_HLIST_NODE(&p->hlist); hlist_add_head(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); @@ -107,13 +225,17 @@ rm_kprobe: void unregister_kprobe(struct kprobe *p) { unsigned long flags; - arch_remove_kprobe(p); + struct kprobe *old_p; + spin_lock_irqsave(&kprobe_lock, flags); - *p->addr = p->opcode; - hlist_del(&p->hlist); - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); - spin_unlock_irqrestore(&kprobe_lock, flags); + old_p = get_kprobe(p->addr); + if (old_p) { + if (old_p->pre_handler == aggr_pre_handler) + cleanup_aggr_kprobe(old_p, p, flags); + else + cleanup_kprobe(p, flags); + } else + spin_unlock_irqrestore(&kprobe_lock, flags); } static struct notifier_block kprobe_exceptions_nb = { diff --git a/kernel/module.c b/kernel/module.c index 5734ab09d3f9..83b3d376708c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1758,6 +1758,7 @@ sys_init_module(void __user *umod, const char __user *uargs) { struct module *mod; + mm_segment_t old_fs = get_fs(); int ret = 0; /* Must have permission */ @@ -1775,6 +1776,9 @@ sys_init_module(void __user *umod, return PTR_ERR(mod); } + /* flush the icache in correct context */ + set_fs(KERNEL_DS); + /* Flush the instruction cache, since we've played with text */ if (mod->module_init) flush_icache_range((unsigned long)mod->module_init, @@ -1783,6 +1787,8 @@ sys_init_module(void __user *umod, flush_icache_range((unsigned long)mod->module_core, (unsigned long)mod->module_core + mod->core_size); + set_fs(old_fs); + /* Now sew it into the lists. They won't access us, since strong_try_module_get() will fail. */ stop_machine_run(__link_module, mod, NR_CPUS); diff --git a/kernel/power/main.c b/kernel/power/main.c index 7960ddf04a57..4cdebc972ff2 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -156,14 +156,14 @@ static int enter_state(suspend_state_t state) goto Unlock; } - pr_debug("PM: Preparing system for suspend\n"); + pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); if ((error = suspend_prepare(state))) goto Unlock; - pr_debug("PM: Entering state.\n"); + pr_debug("PM: Entering %s sleep\n", pm_states[state]); error = suspend_enter(state); - pr_debug("PM: Finishing up.\n"); + pr_debug("PM: Finishing wakeup.\n"); suspend_finish(state); Unlock: up(&pm_sem); diff --git a/kernel/printk.c b/kernel/printk.c index 290a07ce2c8a..01b58d7d17ff 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -160,42 +160,6 @@ static int __init console_setup(char *str) __setup("console=", console_setup); -/** - * add_preferred_console - add a device to the list of preferred consoles. - * - * The last preferred console added will be used for kernel messages - * and stdin/out/err for init. Normally this is used by console_setup - * above to handle user-supplied console arguments; however it can also - * be used by arch-specific code either to override the user or more - * commonly to provide a default console (ie from PROM variables) when - * the user has not supplied one. - */ -int __init add_preferred_console(char *name, int idx, char *options) -{ - struct console_cmdline *c; - int i; - - /* - * See if this tty is not yet registered, and - * if we have a slot free. - */ - for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) - if (strcmp(console_cmdline[i].name, name) == 0 && - console_cmdline[i].index == idx) { - selected_console = i; - return 0; - } - if (i == MAX_CMDLINECONSOLES) - return -E2BIG; - selected_console = i; - c = &console_cmdline[i]; - memcpy(c->name, name, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; - c->options = options; - c->index = idx; - return 0; -} - static int __init log_buf_len_setup(char *str) { unsigned long size = memparse(str, &str); @@ -671,6 +635,42 @@ static void call_console_drivers(unsigned long start, unsigned long end) {} #endif /** + * add_preferred_console - add a device to the list of preferred consoles. + * + * The last preferred console added will be used for kernel messages + * and stdin/out/err for init. Normally this is used by console_setup + * above to handle user-supplied console arguments; however it can also + * be used by arch-specific code either to override the user or more + * commonly to provide a default console (ie from PROM variables) when + * the user has not supplied one. + */ +int __init add_preferred_console(char *name, int idx, char *options) +{ + struct console_cmdline *c; + int i; + + /* + * See if this tty is not yet registered, and + * if we have a slot free. + */ + for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) + if (strcmp(console_cmdline[i].name, name) == 0 && + console_cmdline[i].index == idx) { + selected_console = i; + return 0; + } + if (i == MAX_CMDLINECONSOLES) + return -E2BIG; + selected_console = i; + c = &console_cmdline[i]; + memcpy(c->name, name, sizeof(c->name)); + c->name[sizeof(c->name) - 1] = 0; + c->options = options; + c->index = idx; + return 0; +} + +/** * acquire_console_sem - lock the console system for exclusive use. * * Acquires a semaphore which guarantees that the caller has diff --git a/kernel/profile.c b/kernel/profile.c index 0221a50ca867..ad8cbb75ffa2 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -49,15 +49,19 @@ static DECLARE_MUTEX(profile_flip_mutex); static int __init profile_setup(char * str) { + static char __initdata schedstr[] = "schedule"; int par; - if (!strncmp(str, "schedule", 8)) { + if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; - printk(KERN_INFO "kernel schedule profiling enabled\n"); - if (str[7] == ',') - str += 8; - } - if (get_option(&str,&par)) { + if (str[strlen(schedstr)] == ',') + str += strlen(schedstr) + 1; + if (get_option(&str, &par)) + prof_shift = par; + printk(KERN_INFO + "kernel schedule profiling enabled (shift: %ld)\n", + prof_shift); + } else if (get_option(&str, &par)) { prof_shift = par; prof_on = CPU_PROFILING; printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", diff --git a/kernel/sched.c b/kernel/sched.c index 0dc3158667a2..f12a0c8a7d98 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3755,19 +3755,22 @@ EXPORT_SYMBOL(cond_resched); */ int cond_resched_lock(spinlock_t * lock) { + int ret = 0; + if (need_lockbreak(lock)) { spin_unlock(lock); cpu_relax(); + ret = 1; spin_lock(lock); } if (need_resched()) { _raw_spin_unlock(lock); preempt_enable_no_resched(); __cond_resched(); + ret = 1; spin_lock(lock); - return 1; } - return 0; + return ret; } EXPORT_SYMBOL(cond_resched_lock); @@ -4243,7 +4246,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) /* No more Mr. Nice Guy. */ if (dest_cpu == NR_CPUS) { - tsk->cpus_allowed = cpuset_cpus_allowed(tsk); + cpus_setall(tsk->cpus_allowed); dest_cpu = any_online_cpu(tsk->cpus_allowed); /* diff --git a/kernel/signal.c b/kernel/signal.c index 8f3debc77c5b..b3c24c732c5a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -522,7 +522,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, { int sig = 0; - sig = next_signal(pending, mask); + /* SIGKILL must have priority, otherwise it is quite easy + * to create an unkillable process, sending sig < SIGKILL + * to self */ + if (unlikely(sigismember(&pending->signal, SIGKILL))) { + if (!sigismember(mask, SIGKILL)) + sig = SIGKILL; + } + + if (likely(!sig)) + sig = next_signal(pending, mask); if (sig) { if (current->notifier) { if (sigismember(current->notifier_mask, sig)) { diff --git a/kernel/spinlock.c b/kernel/spinlock.c index e15ed17863f1..0c3f9d8bbe17 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -294,7 +294,7 @@ EXPORT_SYMBOL(_spin_unlock_irq); void __lockfunc _spin_unlock_bh(spinlock_t *lock) { _raw_spin_unlock(lock); - preempt_enable(); + preempt_enable_no_resched(); local_bh_enable(); } EXPORT_SYMBOL(_spin_unlock_bh); @@ -318,7 +318,7 @@ EXPORT_SYMBOL(_read_unlock_irq); void __lockfunc _read_unlock_bh(rwlock_t *lock) { _raw_read_unlock(lock); - preempt_enable(); + preempt_enable_no_resched(); local_bh_enable(); } EXPORT_SYMBOL(_read_unlock_bh); @@ -342,7 +342,7 @@ EXPORT_SYMBOL(_write_unlock_irq); void __lockfunc _write_unlock_bh(rwlock_t *lock) { _raw_write_unlock(lock); - preempt_enable(); + preempt_enable_no_resched(); local_bh_enable(); } EXPORT_SYMBOL(_write_unlock_bh); @@ -354,7 +354,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock) if (_raw_spin_trylock(lock)) return 1; - preempt_enable(); + preempt_enable_no_resched(); local_bh_enable(); return 0; } diff --git a/kernel/sys.c b/kernel/sys.c index f64e97cabe25..f006632c2ba7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1195,7 +1195,7 @@ static int groups_from_user(struct group_info *group_info, return 0; } -/* a simple shell-metzner sort */ +/* a simple Shell sort */ static void groups_sort(struct group_info *group_info) { int base, max, stride; |