diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-09-01 16:34:25 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-09-01 16:34:25 -0700 |
commit | 34232fcfe9a383bea802af682baae5c99f22376c (patch) | |
tree | 7935364548ebb29533b499728fac1e48c373e269 /kernel | |
parent | bd30fe6a7d9b72e73c5ac9109cbc3066dde08034 (diff) | |
parent | 8c96b70171584f38940eb2ba65b84eee38b549ba (diff) |
Merge tag 'trace-v6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace
Pull tracing updates from Steven Rostedt:
"User visible changes:
- Added a way to easier filter with cpumasks:
# echo 'cpumask & CPUS{17-42}' > /sys/kernel/tracing/events/ipi_send_cpumask/filter
- Show actual size of ring buffer after modifying the ring buffer
size via buffer_size_kb.
Currently it just returns what was written, but the actual size
rounds up to the sub buffer size. Show that real size instead.
Major changes:
- Added "eventfs". This is the code that handles the inodes and
dentries of tracefs/events directory. As there are thousands of
events, and each event has several inodes and dentries that
currently exist even when tracing is never used, they take up
precious memory. Instead, eventfs will allocate the inodes and
dentries in a JIT way (similar to what procfs does). There is now
metadata that handles the events and subdirectories, and will
create the inodes and dentries when they are used.
Note, I also have patches that remove the subdirectory meta data,
but will wait till the next merge window before applying them. It's
a little more complex, and I want to make sure the dynamic code
works properly before adding more complexity, making it easier to
revert if need be.
Minor changes:
- Optimization to user event list traversal
- Remove intermediate permission of tracefs files (note the
intermediate permission removes all access to the files so it is
not a security concern, but just a clean up)
- Add the complex fix to FORTIFY_SOURCE to the kernel stack event
logic
- Other minor cleanups"
* tag 'trace-v6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace: (29 commits)
tracefs: Remove kerneldoc from struct eventfs_file
tracefs: Avoid changing i_mode to a temp value
tracing/user_events: Optimize safe list traversals
ftrace: Remove empty declaration ftrace_enable_daemon() and ftrace_disable_daemon()
tracing: Remove unused function declarations
tracing/filters: Document cpumask filtering
tracing/filters: Further optimise scalar vs cpumask comparison
tracing/filters: Optimise CPU vs cpumask filtering when the user mask is a single CPU
tracing/filters: Optimise scalar vs cpumask filtering when the user mask is a single CPU
tracing/filters: Optimise cpumask vs cpumask filtering when user mask is a single CPU
tracing/filters: Enable filtering the CPU common field by a cpumask
tracing/filters: Enable filtering a scalar field by a cpumask
tracing/filters: Enable filtering a cpumask field by another cpumask
tracing/filters: Dynamically allocate filter_pred.regex
test: ftrace: Fix kprobe test for eventfs
eventfs: Move tracing/events to eventfs
eventfs: Implement removal of meta data from eventfs
eventfs: Implement functions to create files and dirs when accessed
eventfs: Implement eventfs lookup, read, open functions
eventfs: Implement eventfs file add functions
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/trace/ring_buffer.c | 20 | ||||
-rw-r--r-- | kernel/trace/trace.c | 99 | ||||
-rw-r--r-- | kernel/trace/trace.h | 14 | ||||
-rw-r--r-- | kernel/trace/trace_entries.h | 2 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 76 | ||||
-rw-r--r-- | kernel/trace/trace_events_filter.c | 302 | ||||
-rw-r--r-- | kernel/trace/trace_events_user.c | 15 | ||||
-rw-r--r-- | kernel/trace/trace_export.c | 9 |
8 files changed, 389 insertions, 148 deletions
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 52dea5dd5362..78502d4c7214 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -692,10 +692,7 @@ static void rb_time_set(rb_time_t *t, u64 val) static inline bool rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) { - unsigned long ret; - - ret = local_cmpxchg(l, expect, set); - return ret == expect; + return local_try_cmpxchg(l, &expect, set); } static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) @@ -752,9 +749,7 @@ static void rb_time_set(rb_time_t *t, u64 val) static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) { - u64 val; - val = local64_cmpxchg(&t->time, expect, set); - return val == expect; + return local64_try_cmpxchg(&t->time, &expect, set); } #endif @@ -1494,14 +1489,11 @@ static bool rb_head_page_replace(struct buffer_page *old, { unsigned long *ptr = (unsigned long *)&old->list.prev->next; unsigned long val; - unsigned long ret; val = *ptr & ~RB_FLAG_MASK; val |= RB_PAGE_HEAD; - ret = cmpxchg(ptr, val, (unsigned long)&new->list); - - return ret == val; + return try_cmpxchg(ptr, &val, (unsigned long)&new->list); } /* @@ -3003,7 +2995,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, { unsigned long new_index, old_index; struct buffer_page *bpage; - unsigned long index; unsigned long addr; u64 write_stamp; u64 delta; @@ -3060,8 +3051,9 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, */ old_index += write_mask; new_index += write_mask; - index = local_cmpxchg(&bpage->write, old_index, new_index); - if (index == old_index) { + + /* caution: old_index gets updated on cmpxchg failure */ + if (local_try_cmpxchg(&bpage->write, &old_index, new_index)) { /* update counters */ local_sub(event_length, &cpu_buffer->entries_bytes); return true; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8e64aaad5361..3e55375f47e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3119,7 +3119,6 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, struct ftrace_stack *fstack; struct stack_entry *entry; int stackidx; - void *ptr; /* * Add one, for this function and the call to save_stack_trace() @@ -3157,32 +3156,16 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, nr_entries = stack_trace_save(fstack->calls, size, skip); } - size = nr_entries * sizeof(unsigned long); event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, - (sizeof(*entry) - sizeof(entry->caller)) + size, + struct_size(entry, caller, nr_entries), trace_ctx); if (!event) goto out; - ptr = ring_buffer_event_data(event); - entry = ptr; - - /* - * For backward compatibility reasons, the entry->caller is an - * array of 8 slots to store the stack. This is also exported - * to user space. The amount allocated on the ring buffer actually - * holds enough for the stack specified by nr_entries. This will - * go into the location of entry->caller. Due to string fortifiers - * checking the size of the destination of memcpy() it triggers - * when it detects that size is greater than 8. To hide this from - * the fortifiers, we use "ptr" and pointer arithmetic to assign caller. - * - * The below is really just: - * memcpy(&entry->caller, fstack->calls, size); - */ - ptr += offsetof(typeof(*entry), caller); - memcpy(ptr, fstack->calls, size); + entry = ring_buffer_event_data(event); entry->size = nr_entries; + memcpy(&entry->caller, fstack->calls, + flex_array_size(entry, caller, nr_entries)); if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); @@ -4206,18 +4189,12 @@ static void *s_start(struct seq_file *m, loff_t *pos) loff_t l = 0; int cpu; - /* - * copy the tracer to avoid using a global lock all around. - * iter->trace is a copy of current_trace, the pointer to the - * name may be used instead of a strcmp(), as iter->trace->name - * will point to the same string as current_trace->name. - */ mutex_lock(&trace_types_lock); - if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) { + if (unlikely(tr->current_trace != iter->trace)) { /* Close iter->trace before switching to the new current tracer */ if (iter->trace->close) iter->trace->close(iter); - *iter->trace = *tr->current_trace; + iter->trace = tr->current_trace; /* Reopen the new current tracer */ if (iter->trace->open) iter->trace->open(iter); @@ -4829,6 +4806,25 @@ static const struct seq_operations tracer_seq_ops = { .show = s_show, }; +/* + * Note, as iter itself can be allocated and freed in different + * ways, this function is only used to free its content, and not + * the iterator itself. The only requirement to all the allocations + * is that it must zero all fields (kzalloc), as freeing works with + * ethier allocated content or NULL. + */ +static void free_trace_iter_content(struct trace_iterator *iter) +{ + /* The fmt is either NULL, allocated or points to static_fmt_buf */ + if (iter->fmt != static_fmt_buf) + kfree(iter->fmt); + + kfree(iter->temp); + kfree(iter->buffer_iter); + mutex_destroy(&iter->mutex); + free_cpumask_var(iter->started); +} + static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, bool snapshot) { @@ -4870,16 +4866,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) iter->fmt = NULL; iter->fmt_size = 0; - /* - * We make a copy of the current tracer to avoid concurrent - * changes on it while we are reading. - */ mutex_lock(&trace_types_lock); - iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL); - if (!iter->trace) - goto fail; - - *iter->trace = *tr->current_trace; + iter->trace = tr->current_trace; if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; @@ -4944,9 +4932,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) fail: mutex_unlock(&trace_types_lock); - kfree(iter->trace); - kfree(iter->temp); - kfree(iter->buffer_iter); + free_trace_iter_content(iter); release: seq_release_private(inode, file); return ERR_PTR(-ENOMEM); @@ -5025,12 +5011,7 @@ static int tracing_release(struct inode *inode, struct file *file) mutex_unlock(&trace_types_lock); - mutex_destroy(&iter->mutex); - free_cpumask_var(iter->started); - kfree(iter->fmt); - kfree(iter->temp); - kfree(iter->trace); - kfree(iter->buffer_iter); + free_trace_iter_content(iter); seq_release_private(inode, file); return 0; @@ -6318,6 +6299,15 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val) per_cpu_ptr(buf->data, cpu)->entries = val; } +static void update_buffer_entries(struct array_buffer *buf, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0)); + } else { + per_cpu_ptr(buf->data, cpu)->entries = ring_buffer_size(buf->buffer, cpu); + } +} + #ifdef CONFIG_TRACER_MAX_TRACE /* resize @tr's buffer to the size of @size_tr's entries */ static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, @@ -6396,18 +6386,12 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, return ret; } - if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(&tr->max_buffer, size); - else - per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size; + update_buffer_entries(&tr->max_buffer, cpu); out: #endif /* CONFIG_TRACER_MAX_TRACE */ - if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(&tr->array_buffer, size); - else - per_cpu_ptr(tr->array_buffer.data, cpu)->entries = size; + update_buffer_entries(&tr->array_buffer, cpu); return ret; } @@ -6825,10 +6809,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) close_pipe_on_cpu(tr, iter->cpu_file); mutex_unlock(&trace_types_lock); - free_cpumask_var(iter->started); - kfree(iter->fmt); - kfree(iter->temp); - mutex_destroy(&iter->mutex); + free_trace_iter_content(iter); kfree(iter); trace_array_put(tr); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 73eaec158473..5669dd1f90d9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -77,6 +77,16 @@ enum trace_type { #undef __array #define __array(type, item, size) type item[size]; +/* + * For backward compatibility, older user space expects to see the + * kernel_stack event with a fixed size caller field. But today the fix + * size is ignored by the kernel, and the real structure is dynamic. + * Expose to user space: "unsigned long caller[8];" but the real structure + * will be "unsigned long caller[] __counted_by(size)" + */ +#undef __stack_array +#define __stack_array(type, item, size, field) type item[] __counted_by(field); + #undef __array_desc #define __array_desc(type, container, item, size) @@ -596,7 +606,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void tracing_reset_online_cpus(struct array_buffer *buf); -void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); void tracing_reset_all_online_cpus_unlocked(void); int tracing_open_generic(struct inode *inode, struct file *filp); @@ -697,7 +706,6 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list, void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos); void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos); int trace_pid_show(struct seq_file *m, void *v); -void trace_free_pid_list(struct trace_pid_list *pid_list); int trace_pid_write(struct trace_pid_list *filtered_pids, struct trace_pid_list **new_pid_list, const char __user *ubuf, size_t cnt); @@ -1334,7 +1342,7 @@ struct trace_subsystem_dir { struct list_head list; struct event_subsystem *subsystem; struct trace_array *tr; - struct dentry *entry; + struct eventfs_file *ef; int ref_count; int nr_events; }; diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 340b2fa98218..c47422b20908 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -190,7 +190,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry, F_STRUCT( __field( int, size ) - __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + __stack_array( unsigned long, caller, FTRACE_STACK_ENTRIES, size) ), F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n" diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 578f1f7d49a6..ed367d713be0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -984,7 +984,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) return; if (!--dir->nr_events) { - tracefs_remove(dir->entry); + eventfs_remove(dir->ef); list_del(&dir->list); __put_system_dir(dir); } @@ -1005,7 +1005,7 @@ static void remove_event_file_dir(struct trace_event_file *file) tracefs_remove(dir); } - + eventfs_remove(file->ef); list_del(&file->list); remove_subsystem(file->system); free_event_filter(file->filter); @@ -2291,13 +2291,13 @@ create_new_subsystem(const char *name) return NULL; } -static struct dentry * +static struct eventfs_file * event_subsystem_dir(struct trace_array *tr, const char *name, struct trace_event_file *file, struct dentry *parent) { struct event_subsystem *system, *iter; struct trace_subsystem_dir *dir; - struct dentry *entry; + int res; /* First see if we did not already create this dir */ list_for_each_entry(dir, &tr->systems, list) { @@ -2305,7 +2305,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, if (strcmp(system->name, name) == 0) { dir->nr_events++; file->system = dir; - return dir->entry; + return dir->ef; } } @@ -2329,8 +2329,8 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } else __get_system(system); - dir->entry = tracefs_create_dir(name, parent); - if (!dir->entry) { + dir->ef = eventfs_add_subsystem_dir(name, parent); + if (IS_ERR(dir->ef)) { pr_warn("Failed to create system directory %s\n", name); __put_system(system); goto out_free; @@ -2345,22 +2345,22 @@ event_subsystem_dir(struct trace_array *tr, const char *name, /* the ftrace system is special, do not create enable or filter files */ if (strcmp(name, "ftrace") != 0) { - entry = tracefs_create_file("filter", TRACE_MODE_WRITE, - dir->entry, dir, + res = eventfs_add_file("filter", TRACE_MODE_WRITE, + dir->ef, dir, &ftrace_subsystem_filter_fops); - if (!entry) { + if (res) { kfree(system->filter); system->filter = NULL; pr_warn("Could not create tracefs '%s/filter' entry\n", name); } - trace_create_file("enable", TRACE_MODE_WRITE, dir->entry, dir, + eventfs_add_file("enable", TRACE_MODE_WRITE, dir->ef, dir, &ftrace_system_enable_fops); } list_add(&dir->list, &tr->systems); - return dir->entry; + return dir->ef; out_free: kfree(dir); @@ -2413,36 +2413,37 @@ static int event_create_dir(struct dentry *parent, struct trace_event_file *file) { struct trace_event_call *call = file->event_call; + struct eventfs_file *ef_subsystem = NULL; struct trace_array *tr = file->tr; - struct dentry *d_events; const char *name; int ret; /* * If the trace point header did not define TRACE_SYSTEM - * then the system would be called "TRACE_SYSTEM". + * then the system would be called "TRACE_SYSTEM". This should + * never happen. */ - if (strcmp(call->class->system, TRACE_SYSTEM) != 0) { - d_events = event_subsystem_dir(tr, call->class->system, file, parent); - if (!d_events) - return -ENOMEM; - } else - d_events = parent; + if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0)) + return -ENODEV; + + ef_subsystem = event_subsystem_dir(tr, call->class->system, file, parent); + if (!ef_subsystem) + return -ENOMEM; name = trace_event_name(call); - file->dir = tracefs_create_dir(name, d_events); - if (!file->dir) { + file->ef = eventfs_add_dir(name, ef_subsystem); + if (IS_ERR(file->ef)) { pr_warn("Could not create tracefs '%s' directory\n", name); return -1; } if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) - trace_create_file("enable", TRACE_MODE_WRITE, file->dir, file, + eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file, &ftrace_enable_fops); #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg) - trace_create_file("id", TRACE_MODE_READ, file->dir, + eventfs_add_file("id", TRACE_MODE_READ, file->ef, (void *)(long)call->event.type, &ftrace_event_id_fops); #endif @@ -2458,27 +2459,27 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) * triggers or filters. */ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { - trace_create_file("filter", TRACE_MODE_WRITE, file->dir, + eventfs_add_file("filter", TRACE_MODE_WRITE, file->ef, file, &ftrace_event_filter_fops); - trace_create_file("trigger", TRACE_MODE_WRITE, file->dir, + eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef, file, &event_trigger_fops); } #ifdef CONFIG_HIST_TRIGGERS - trace_create_file("hist", TRACE_MODE_READ, file->dir, file, + eventfs_add_file("hist", TRACE_MODE_READ, file->ef, file, &event_hist_fops); #endif #ifdef CONFIG_HIST_TRIGGERS_DEBUG - trace_create_file("hist_debug", TRACE_MODE_READ, file->dir, file, + eventfs_add_file("hist_debug", TRACE_MODE_READ, file->ef, file, &event_hist_debug_fops); #endif - trace_create_file("format", TRACE_MODE_READ, file->dir, call, + eventfs_add_file("format", TRACE_MODE_READ, file->ef, call, &ftrace_event_format_fops); #ifdef CONFIG_TRACE_EVENT_INJECT if (call->event.type && call->class->reg) - trace_create_file("inject", 0200, file->dir, file, + eventfs_add_file("inject", 0200, file->ef, file, &event_inject_fops); #endif @@ -3631,21 +3632,22 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) { struct dentry *d_events; struct dentry *entry; + int error = 0; entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_fops); if (!entry) return -ENOMEM; - d_events = tracefs_create_dir("events", parent); - if (!d_events) { + d_events = eventfs_create_events_dir("events", parent); + if (IS_ERR(d_events)) { pr_warn("Could not create tracefs 'events' directory\n"); return -ENOMEM; } - entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events, + error = eventfs_add_events_file("enable", TRACE_MODE_WRITE, d_events, tr, &ftrace_tr_enable_fops); - if (!entry) + if (error) return -ENOMEM; /* There are not as crucial, just warn if they are not created */ @@ -3658,11 +3660,11 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) &ftrace_set_event_notrace_pid_fops); /* ring buffer internal formats */ - trace_create_file("header_page", TRACE_MODE_READ, d_events, + eventfs_add_events_file("header_page", TRACE_MODE_READ, d_events, ring_buffer_print_page_header, &ftrace_show_header_fops); - trace_create_file("header_event", TRACE_MODE_READ, d_events, + eventfs_add_events_file("header_event", TRACE_MODE_READ, d_events, ring_buffer_print_entry_header, &ftrace_show_header_fops); @@ -3750,7 +3752,7 @@ int event_trace_del_tracer(struct trace_array *tr) down_write(&trace_event_sem); __trace_remove_event_dirs(tr); - tracefs_remove(tr->event_dir); + eventfs_remove_events_dir(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1dad64267878..3a529214a21b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -46,15 +46,19 @@ static const char * ops[] = { OPS }; enum filter_pred_fn { FILTER_PRED_FN_NOP, FILTER_PRED_FN_64, + FILTER_PRED_FN_64_CPUMASK, FILTER_PRED_FN_S64, FILTER_PRED_FN_U64, FILTER_PRED_FN_32, + FILTER_PRED_FN_32_CPUMASK, FILTER_PRED_FN_S32, FILTER_PRED_FN_U32, FILTER_PRED_FN_16, + FILTER_PRED_FN_16_CPUMASK, FILTER_PRED_FN_S16, FILTER_PRED_FN_U16, FILTER_PRED_FN_8, + FILTER_PRED_FN_8_CPUMASK, FILTER_PRED_FN_S8, FILTER_PRED_FN_U8, FILTER_PRED_FN_COMM, @@ -64,21 +68,25 @@ enum filter_pred_fn { FILTER_PRED_FN_PCHAR_USER, FILTER_PRED_FN_PCHAR, FILTER_PRED_FN_CPU, + FILTER_PRED_FN_CPU_CPUMASK, + FILTER_PRED_FN_CPUMASK, + FILTER_PRED_FN_CPUMASK_CPU, FILTER_PRED_FN_FUNCTION, FILTER_PRED_FN_, FILTER_PRED_TEST_VISITED, }; struct filter_pred { - enum filter_pred_fn fn_num; - u64 val; - u64 val2; - struct regex regex; + struct regex *regex; + struct cpumask *mask; unsigned short *ops; struct ftrace_event_field *field; - int offset; + u64 val; + u64 val2; + enum filter_pred_fn fn_num; + int offset; int not; - int op; + int op; }; /* @@ -94,6 +102,8 @@ struct filter_pred { C(TOO_MANY_OPEN, "Too many '('"), \ C(TOO_MANY_CLOSE, "Too few '('"), \ C(MISSING_QUOTE, "Missing matching quote"), \ + C(MISSING_BRACE_OPEN, "Missing '{'"), \ + C(MISSING_BRACE_CLOSE, "Missing '}'"), \ C(OPERAND_TOO_LONG, "Operand too long"), \ C(EXPECT_STRING, "Expecting string field"), \ C(EXPECT_DIGIT, "Expecting numeric field"), \ @@ -103,6 +113,7 @@ struct filter_pred { C(BAD_SUBSYS_FILTER, "Couldn't find or set field in one of a subsystem's events"), \ C(TOO_MANY_PREDS, "Too many terms in predicate expression"), \ C(INVALID_FILTER, "Meaningless filter expression"), \ + C(INVALID_CPULIST, "Invalid cpulist"), \ C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ C(NO_FUNCTION, "Function not found"), \ @@ -186,6 +197,15 @@ enum { PROCESS_OR = 4, }; +static void free_predicate(struct filter_pred *pred) +{ + if (pred) { + kfree(pred->regex); + kfree(pred->mask); + kfree(pred); + } +} + /* * Without going into a formal proof, this explains the method that is used in * parsing the logical expressions. @@ -623,12 +643,64 @@ out_free: kfree(inverts); if (prog_stack) { for (i = 0; prog_stack[i].pred; i++) - kfree(prog_stack[i].pred); + free_predicate(prog_stack[i].pred); kfree(prog_stack); } return ERR_PTR(ret); } +static inline int +do_filter_cpumask(int op, const struct cpumask *mask, const struct cpumask *cmp) +{ + switch (op) { + case OP_EQ: + return cpumask_equal(mask, cmp); + case OP_NE: + return !cpumask_equal(mask, cmp); + case OP_BAND: + return cpumask_intersects(mask, cmp); + default: + return 0; + } +} + +/* Optimisation of do_filter_cpumask() for scalar fields */ +static inline int +do_filter_scalar_cpumask(int op, unsigned int cpu, const struct cpumask *mask) +{ + /* + * Per the weight-of-one cpumask optimisations, the mask passed in this + * function has a weight >= 2, so it is never equal to a single scalar. + */ + switch (op) { + case OP_EQ: + return false; + case OP_NE: + return true; + case OP_BAND: + return cpumask_test_cpu(cpu, mask); + default: + return 0; + } +} + +static inline int +do_filter_cpumask_scalar(int op, const struct cpumask *mask, unsigned int cpu) +{ + switch (op) { + case OP_EQ: + return cpumask_test_cpu(cpu, mask) && + cpumask_nth(1, mask) >= nr_cpu_ids; + case OP_NE: + return !cpumask_test_cpu(cpu, mask) || + cpumask_nth(1, mask) < nr_cpu_ids; + case OP_BAND: + return cpumask_test_cpu(cpu, mask); + default: + return 0; + } +} + enum pred_cmp_types { PRED_CMP_TYPE_NOP, PRED_CMP_TYPE_LT, @@ -672,6 +744,18 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \ } \ } +#define DEFINE_CPUMASK_COMPARISON_PRED(size) \ +static int filter_pred_##size##_cpumask(struct filter_pred *pred, void *event) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + unsigned int cpu = *addr; \ + \ + if (cpu >= nr_cpu_ids) \ + return 0; \ + \ + return do_filter_scalar_cpumask(pred->op, cpu, pred->mask); \ +} + #define DEFINE_EQUALITY_PRED(size) \ static int filter_pred_##size(struct filter_pred *pred, void *event) \ { \ @@ -693,6 +777,11 @@ DEFINE_COMPARISON_PRED(u16); DEFINE_COMPARISON_PRED(s8); DEFINE_COMPARISON_PRED(u8); +DEFINE_CPUMASK_COMPARISON_PRED(64); +DEFINE_CPUMASK_COMPARISON_PRED(32); +DEFINE_CPUMASK_COMPARISON_PRED(16); +DEFINE_CPUMASK_COMPARISON_PRED(8); + DEFINE_EQUALITY_PRED(64); DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); @@ -750,7 +839,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event) char *addr = (char *)(event + pred->offset); int cmp, match; - cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len); + cmp = pred->regex->match(addr, pred->regex, pred->regex->field_len); match = cmp ^ pred->not; @@ -763,7 +852,7 @@ static __always_inline int filter_pchar(struct filter_pred *pred, char *str) int len; len = strlen(str) + 1; /* including tailing '\0' */ - cmp = pred->regex.match(str, &pred->regex, len); + cmp = pred->regex->match(str, pred->regex, len); match = cmp ^ pred->not; @@ -813,7 +902,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) char *addr = (char *)(event + str_loc); int cmp, match; - cmp = pred->regex.match(addr, &pred->regex, str_len); + cmp = pred->regex->match(addr, pred->regex, str_len); match = cmp ^ pred->not; @@ -836,7 +925,7 @@ static int filter_pred_strrelloc(struct filter_pred *pred, void *event) char *addr = (char *)(&item[1]) + str_loc; int cmp, match; - cmp = pred->regex.match(addr, &pred->regex, str_len); + cmp = pred->regex->match(addr, pred->regex, str_len); match = cmp ^ pred->not; @@ -869,12 +958,42 @@ static int filter_pred_cpu(struct filter_pred *pred, void *event) } } +/* Filter predicate for current CPU vs user-provided cpumask */ +static int filter_pred_cpu_cpumask(struct filter_pred *pred, void *event) +{ + int cpu = raw_smp_processor_id(); + + return do_filter_scalar_cpumask(pred->op, cpu, pred->mask); +} + +/* Filter predicate for cpumask field vs user-provided cpumask */ +static int filter_pred_cpumask(struct filter_pred *pred, void *event) +{ + u32 item = *(u32 *)(event + pred->offset); + int loc = item & 0xffff; + const struct cpumask *mask = (event + loc); + const struct cpumask *cmp = pred->mask; + + return do_filter_cpumask(pred->op, mask, cmp); +} + +/* Filter predicate for cpumask field vs user-provided scalar */ +static int filter_pred_cpumask_cpu(struct filter_pred *pred, void *event) +{ + u32 item = *(u32 *)(event + pred->offset); + int loc = item & 0xffff; + const struct cpumask *mask = (event + loc); + unsigned int cpu = pred->val; + + return do_filter_cpumask_scalar(pred->op, mask, cpu); +} + /* Filter predicate for COMM. */ static int filter_pred_comm(struct filter_pred *pred, void *event) { int cmp; - cmp = pred->regex.match(current->comm, &pred->regex, + cmp = pred->regex->match(current->comm, pred->regex, TASK_COMM_LEN); return cmp ^ pred->not; } @@ -1004,7 +1123,7 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) static void filter_build_regex(struct filter_pred *pred) { - struct regex *r = &pred->regex; + struct regex *r = pred->regex; char *search; enum regex_type type = MATCH_FULL; @@ -1169,7 +1288,7 @@ static void free_prog(struct event_filter *filter) return; for (i = 0; prog[i].pred; i++) - kfree(prog[i].pred); + free_predicate(prog[i].pred); kfree(prog); } @@ -1236,8 +1355,12 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, int filter_assign_type(const char *type) { - if (strstr(type, "__data_loc") && strstr(type, "char")) - return FILTER_DYN_STRING; + if (strstr(type, "__data_loc")) { + if (strstr(type, "char")) + return FILTER_DYN_STRING; + if (strstr(type, "cpumask_t")) + return FILTER_CPUMASK; + } if (strstr(type, "__rel_loc") && strstr(type, "char")) return FILTER_RDYN_STRING; @@ -1313,24 +1436,32 @@ static int filter_pred_fn_call(struct filter_pred *pred, void *event) switch (pred->fn_num) { case FILTER_PRED_FN_64: return filter_pred_64(pred, event); + case FILTER_PRED_FN_64_CPUMASK: + return filter_pred_64_cpumask(pred, event); case FILTER_PRED_FN_S64: return filter_pred_s64(pred, event); case FILTER_PRED_FN_U64: return filter_pred_u64(pred, event); case FILTER_PRED_FN_32: return filter_pred_32(pred, event); + case FILTER_PRED_FN_32_CPUMASK: + return filter_pred_32_cpumask(pred, event); case FILTER_PRED_FN_S32: return filter_pred_s32(pred, event); case FILTER_PRED_FN_U32: return filter_pred_u32(pred, event); case FILTER_PRED_FN_16: return filter_pred_16(pred, event); + case FILTER_PRED_FN_16_CPUMASK: + return filter_pred_16_cpumask(pred, event); case FILTER_PRED_FN_S16: return filter_pred_s16(pred, event); case FILTER_PRED_FN_U16: return filter_pred_u16(pred, event); case FILTER_PRED_FN_8: return filter_pred_8(pred, event); + case FILTER_PRED_FN_8_CPUMASK: + return filter_pred_8_cpumask(pred, event); case FILTER_PRED_FN_S8: return filter_pred_s8(pred, event); case FILTER_PRED_FN_U8: @@ -1349,6 +1480,12 @@ static int filter_pred_fn_call(struct filter_pred *pred, void *event) return filter_pred_pchar(pred, event); case FILTER_PRED_FN_CPU: return filter_pred_cpu(pred, event); + case FILTER_PRED_FN_CPU_CPUMASK: + return filter_pred_cpu_cpumask(pred, event); + case FILTER_PRED_FN_CPUMASK: + return filter_pred_cpumask(pred, event); + case FILTER_PRED_FN_CPUMASK_CPU: + return filter_pred_cpumask_cpu(pred, event); case FILTER_PRED_FN_FUNCTION: return filter_pred_function(pred, event); case FILTER_PRED_TEST_VISITED: @@ -1553,9 +1690,117 @@ static int parse_pred(const char *str, void *data, goto err_free; } - pred->regex.len = len; - strncpy(pred->regex.pattern, str + s, len); - pred->regex.pattern[len] = 0; + pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL); + if (!pred->regex) + goto err_mem; + pred->regex->len = len; + strncpy(pred->regex->pattern, str + s, len); + pred->regex->pattern[len] = 0; + + } else if (!strncmp(str + i, "CPUS", 4)) { + unsigned int maskstart; + bool single; + char *tmp; + + switch (field->filter_type) { + case FILTER_CPUMASK: + case FILTER_CPU: + case FILTER_OTHER: + break; + default: + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } + + switch (op) { + case OP_EQ: + case OP_NE: + case OP_BAND: + break; + default: + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } + + /* Skip CPUS */ + i += 4; + if (str[i++] != '{') { + parse_error(pe, FILT_ERR_MISSING_BRACE_OPEN, pos + i); + goto err_free; + } + maskstart = i; + + /* Walk the cpulist until closing } */ + for (; str[i] && str[i] != '}'; i++); + if (str[i] != '}') { + parse_error(pe, FILT_ERR_MISSING_BRACE_CLOSE, pos + i); + goto err_free; + } + + if (maskstart == i) { + parse_error(pe, FILT_ERR_INVALID_CPULIST, pos + i); + goto err_free; + } + + /* Copy the cpulist between { and } */ + tmp = kmalloc((i - maskstart) + 1, GFP_KERNEL); + strscpy(tmp, str + maskstart, (i - maskstart) + 1); + + pred->mask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!pred->mask) + goto err_mem; + + /* Now parse it */ + if (cpulist_parse(tmp, pred->mask)) { + parse_error(pe, FILT_ERR_INVALID_CPULIST, pos + i); + goto err_free; + } + + /* Move along */ + i++; + + /* + * Optimisation: if the user-provided mask has a weight of one + * then we can treat it as a scalar input. + */ + single = cpumask_weight(pred->mask) == 1; + if (single) { + pred->val = cpumask_first(pred->mask); + kfree(pred->mask); + } + + if (field->filter_type == FILTER_CPUMASK) { + pred->fn_num = single ? + FILTER_PRED_FN_CPUMASK_CPU : + FILTER_PRED_FN_CPUMASK; + } else if (field->filter_type == FILTER_CPU) { + if (single) { + pred->op = pred->op == OP_BAND ? OP_EQ : pred->op; + pred->fn_num = FILTER_PRED_FN_CPU; + } else { + pred->fn_num = FILTER_PRED_FN_CPU_CPUMASK; + } + } else if (single) { + pred->op = pred->op == OP_BAND ? OP_EQ : pred->op; + pred->fn_num = select_comparison_fn(pred->op, field->size, false); + if (pred->op == OP_NE) + pred->not = 1; + } else { + switch (field->size) { + case 8: + pred->fn_num = FILTER_PRED_FN_64_CPUMASK; + break; + case 4: + pred->fn_num = FILTER_PRED_FN_32_CPUMASK; + break; + case 2: + pred->fn_num = FILTER_PRED_FN_16_CPUMASK; + break; + case 1: + pred->fn_num = FILTER_PRED_FN_8_CPUMASK; + break; + } + } /* This is either a string, or an integer */ } else if (str[i] == '\'' || str[i] == '"') { @@ -1597,9 +1842,12 @@ static int parse_pred(const char *str, void *data, goto err_free; } - pred->regex.len = len; - strncpy(pred->regex.pattern, str + s, len); - pred->regex.pattern[len] = 0; + pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL); + if (!pred->regex) + goto err_mem; + pred->regex->len = len; + strncpy(pred->regex->pattern, str + s, len); + pred->regex->pattern[len] = 0; filter_build_regex(pred); @@ -1608,7 +1856,7 @@ static int parse_pred(const char *str, void *data, } else if (field->filter_type == FILTER_STATIC_STRING) { pred->fn_num = FILTER_PRED_FN_STRING; - pred->regex.field_len = field->size; + pred->regex->field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) { pred->fn_num = FILTER_PRED_FN_STRLOC; @@ -1691,10 +1939,10 @@ static int parse_pred(const char *str, void *data, return i; err_free: - kfree(pred); + free_predicate(pred); return -EINVAL; err_mem: - kfree(pred); + free_predicate(pred); return -ENOMEM; } @@ -2287,8 +2535,8 @@ static int ftrace_function_set_filter_pred(struct filter_pred *pred, return ret; return __ftrace_function_set_filter(pred->op == OP_EQ, - pred->regex.pattern, - pred->regex.len, + pred->regex->pattern, + pred->regex->len, data); } diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 33cb6af31f39..6f046650e527 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1328,14 +1328,14 @@ static int user_field_set_string(struct ftrace_event_field *field, static int user_event_set_print_fmt(struct user_event *user, char *buf, int len) { - struct ftrace_event_field *field, *next; + struct ftrace_event_field *field; struct list_head *head = &user->fields; int pos = 0, depth = 0; const char *str_func; pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - list_for_each_entry_safe_reverse(field, next, head, link) { + list_for_each_entry_reverse(field, head, link) { if (depth != 0) pos += snprintf(buf + pos, LEN_OR_ZERO, " "); @@ -1347,7 +1347,7 @@ static int user_event_set_print_fmt(struct user_event *user, char *buf, int len) pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - list_for_each_entry_safe_reverse(field, next, head, link) { + list_for_each_entry_reverse(field, head, link) { if (user_field_is_dyn_string(field->type, &str_func)) pos += snprintf(buf + pos, LEN_OR_ZERO, ", %s(%s)", str_func, field->name); @@ -1732,7 +1732,7 @@ static int user_event_create(const char *raw_command) static int user_event_show(struct seq_file *m, struct dyn_event *ev) { struct user_event *user = container_of(ev, struct user_event, devent); - struct ftrace_event_field *field, *next; + struct ftrace_event_field *field; struct list_head *head; int depth = 0; @@ -1740,7 +1740,7 @@ static int user_event_show(struct seq_file *m, struct dyn_event *ev) head = trace_get_fields(&user->call); - list_for_each_entry_safe_reverse(field, next, head, link) { + list_for_each_entry_reverse(field, head, link) { if (depth == 0) seq_puts(m, " "); else @@ -1816,13 +1816,14 @@ out: static bool user_fields_match(struct user_event *user, int argc, const char **argv) { - struct ftrace_event_field *field, *next; + struct ftrace_event_field *field; struct list_head *head = &user->fields; int i = 0; - list_for_each_entry_safe_reverse(field, next, head, link) + list_for_each_entry_reverse(field, head, link) { if (!user_field_match(field, argc, argv, &i)) return false; + } if (i != argc) return false; diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 58f3946081e2..1698fc22afa0 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -51,6 +51,9 @@ static int ftrace_event_register(struct trace_event_call *call, #undef __array #define __array(type, item, size) type item[size]; +#undef __stack_array +#define __stack_array(type, item, size, field) __array(type, item, size) + #undef __array_desc #define __array_desc(type, container, item, size) type item[size]; @@ -114,6 +117,9 @@ static void __always_unused ____ftrace_check_##name(void) \ is_signed_type(_type), .filter_type = FILTER_OTHER, \ .len = _len }, +#undef __stack_array +#define __stack_array(_type, _item, _len, _field) __array(_type, _item, _len) + #undef __array_desc #define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len) @@ -149,6 +155,9 @@ static struct trace_event_fields ftrace_event_fields_##name[] = { \ #undef __array #define __array(type, item, len) +#undef __stack_array +#define __stack_array(type, item, len, field) + #undef __array_desc #define __array_desc(type, container, item, len) |