diff options
42 files changed, 1113 insertions, 221 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 21b7f87b00fe..276a793168a6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1509,6 +1509,15 @@ boot up that is likely to be overridden by user space start up functionality. + Optionally, the snapshot can also be defined for a tracing + instance that was created by the trace_instance= command + line parameter. + + trace_instance=foo,sched_switch ftrace_boot_snapshot=foo + + The above will cause the "foo" tracing instance to trigger + a snapshot at the end of boot up. + ftrace_dump_on_oops[=orig_cpu] [FTRACE] will dump the trace buffers on oops. If no parameter is passed, ftrace will dump @@ -6283,6 +6292,26 @@ comma-separated list of trace events to enable. See also Documentation/trace/events.rst + trace_instance=[instance-info] + [FTRACE] Create a ring buffer instance early in boot up. + This will be listed in: + + /sys/kernel/tracing/instances + + Events can be enabled at the time the instance is created + via: + + trace_instance=<name>,<system1>:<event1>,<system2>:<event2> + + Note, the "<system*>:" portion is optional if the event is + unique. + + trace_instance=foo,sched:sched_switch,irq_handler_entry,initcall + + will enable the "sched_switch" event (note, the "sched:" is optional, and + the same thing would happen if it was left off). The irq_handler_entry + event, and all events under the "initcall" system. + trace_options=[option-list] [FTRACE] Enable or disable tracer options at boot. The option-list is a comma delimited list of options diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst index 8b33850b8bf9..f5fcb8e1218f 100644 --- a/Documentation/trace/events.rst +++ b/Documentation/trace/events.rst @@ -207,6 +207,18 @@ field name:: As the kernel will have to know how to retrieve the memory that the pointer is at from user space. +You can convert any long type to a function address and search by function name:: + + call_site.function == security_prepare_creds + +The above will filter when the field "call_site" falls on the address within +"security_prepare_creds". That is, it will compare the value of "call_site" and +the filter will return true if it is greater than or equal to the start of +the function "security_prepare_creds" and less than the end of that function. + +The ".function" postfix can only be attached to values of size long, and can only +be compared with "==" or "!=". + 5.2 Setting filters ------------------- diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index 7b7e4893b8f6..479c9eac6335 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -81,6 +81,7 @@ Documentation written by Tom Zanussi .usecs display a common_timestamp in microseconds .percent display a number of percentage value .graph display a bar-graph of a value + .stacktrace display as a stacktrace (must by a long[] type) ============= ================================================= Note that in general the semantics of a given field aren't @@ -1786,6 +1787,8 @@ or assigned to a variable and referenced in a subsequent expression:: # echo 'hist:keys=next_pid:us_per_sec=1000000 ...' >> event/trigger # echo 'hist:keys=next_pid:timestamp_secs=common_timestamp/$us_per_sec ...' >> event/trigger +Variables can even hold stacktraces, which are useful with synthetic events. + 2.2.2 Synthetic Events ---------------------- @@ -1861,7 +1864,7 @@ A histogram can now be defined for the new synthetic event:: The above shows the latency "lat" in a power of 2 grouping. Like any other event, once a histogram is enabled for the event, the -output can be displayed by reading the event's 'hist' file. +output can be displayed by reading the event's 'hist' file:: # cat /sys/kernel/tracing/events/synthetic/wakeup_latency/hist @@ -1908,7 +1911,7 @@ output can be displayed by reading the event's 'hist' file. The latency values can also be grouped linearly by a given size with -the ".buckets" modifier and specify a size (in this case groups of 10). +the ".buckets" modifier and specify a size (in this case groups of 10):: # echo 'hist:keys=pid,prio,lat.buckets=10:sort=lat' >> \ /sys/kernel/tracing/events/synthetic/wakeup_latency/trigger @@ -1940,6 +1943,157 @@ the ".buckets" modifier and specify a size (in this case groups of 10). Entries: 16 Dropped: 0 +To save stacktraces, create a synthetic event with a field of type "unsigned long[]" +or even just "long[]". For example, to see how long a task is blocked in an +uninterruptible state:: + + # cd /sys/kernel/tracing + # echo 's:block_lat pid_t pid; u64 delta; unsigned long[] stack;' > dynamic_events + # echo 'hist:keys=next_pid:ts=common_timestamp.usecs,st=stacktrace if prev_state == 2' >> events/sched/sched_switch/trigger + # echo 'hist:keys=prev_pid:delta=common_timestamp.usecs-$ts,s=$st:onmax($delta).trace(block_lat,prev_pid,$delta,$s)' >> events/sched/sched_switch/trigger + # echo 1 > events/synthetic/block_lat/enable + # cat trace + + # tracer: nop + # + # entries-in-buffer/entries-written: 2/2 #P:8 + # + # _-----=> irqs-off/BH-disabled + # / _----=> need-resched + # | / _---=> hardirq/softirq + # || / _--=> preempt-depth + # ||| / _-=> migrate-disable + # |||| / delay + # TASK-PID CPU# ||||| TIMESTAMP FUNCTION + # | | | ||||| | | + <idle>-0 [005] d..4. 521.164922: block_lat: pid=0 delta=8322 stack=STACK: + => __schedule+0x448/0x7b0 + => schedule+0x5a/0xb0 + => io_schedule+0x42/0x70 + => bit_wait_io+0xd/0x60 + => __wait_on_bit+0x4b/0x140 + => out_of_line_wait_on_bit+0x91/0xb0 + => jbd2_journal_commit_transaction+0x1679/0x1a70 + => kjournald2+0xa9/0x280 + => kthread+0xe9/0x110 + => ret_from_fork+0x2c/0x50 + + <...>-2 [004] d..4. 525.184257: block_lat: pid=2 delta=76 stack=STACK: + => __schedule+0x448/0x7b0 + => schedule+0x5a/0xb0 + => schedule_timeout+0x11a/0x150 + => wait_for_completion_killable+0x144/0x1f0 + => __kthread_create_on_node+0xe7/0x1e0 + => kthread_create_on_node+0x51/0x70 + => create_worker+0xcc/0x1a0 + => worker_thread+0x2ad/0x380 + => kthread+0xe9/0x110 + => ret_from_fork+0x2c/0x50 + +A synthetic event that has a stacktrace field may use it as a key in +histogram:: + + # echo 'hist:keys=delta.buckets=100,stack.stacktrace:sort=delta' > events/synthetic/block_lat/trigger + # cat events/synthetic/block_lat/hist + + # event histogram + # + # trigger info: hist:keys=delta.buckets=100,stack.stacktrace:vals=hitcount:sort=delta.buckets=100:size=2048 [active] + # + { delta: ~ 0-99, stack.stacktrace __schedule+0xa19/0x1520 + schedule+0x6b/0x110 + io_schedule+0x46/0x80 + bit_wait_io+0x11/0x80 + __wait_on_bit+0x4e/0x120 + out_of_line_wait_on_bit+0x8d/0xb0 + __wait_on_buffer+0x33/0x40 + jbd2_journal_commit_transaction+0x155a/0x19b0 + kjournald2+0xab/0x270 + kthread+0xfa/0x130 + ret_from_fork+0x29/0x50 + } hitcount: 1 + { delta: ~ 0-99, stack.stacktrace __schedule+0xa19/0x1520 + schedule+0x6b/0x110 + io_schedule+0x46/0x80 + rq_qos_wait+0xd0/0x170 + wbt_wait+0x9e/0xf0 + __rq_qos_throttle+0x25/0x40 + blk_mq_submit_bio+0x2c3/0x5b0 + __submit_bio+0xff/0x190 + submit_bio_noacct_nocheck+0x25b/0x2b0 + submit_bio_noacct+0x20b/0x600 + submit_bio+0x28/0x90 + ext4_bio_write_page+0x1e0/0x8c0 + mpage_submit_page+0x60/0x80 + mpage_process_page_bufs+0x16c/0x180 + mpage_prepare_extent_to_map+0x23f/0x530 + } hitcount: 1 + { delta: ~ 0-99, stack.stacktrace __schedule+0xa19/0x1520 + schedule+0x6b/0x110 + schedule_hrtimeout_range_clock+0x97/0x110 + schedule_hrtimeout_range+0x13/0x20 + usleep_range_state+0x65/0x90 + __intel_wait_for_register+0x1c1/0x230 [i915] + intel_psr_wait_for_idle_locked+0x171/0x2a0 [i915] + intel_pipe_update_start+0x169/0x360 [i915] + intel_update_crtc+0x112/0x490 [i915] + skl_commit_modeset_enables+0x199/0x600 [i915] + intel_atomic_commit_tail+0x7c4/0x1080 [i915] + intel_atomic_commit_work+0x12/0x20 [i915] + process_one_work+0x21c/0x3f0 + worker_thread+0x50/0x3e0 + kthread+0xfa/0x130 + } hitcount: 3 + { delta: ~ 0-99, stack.stacktrace __schedule+0xa19/0x1520 + schedule+0x6b/0x110 + schedule_timeout+0x11e/0x160 + __wait_for_common+0x8f/0x190 + wait_for_completion+0x24/0x30 + __flush_work.isra.0+0x1cc/0x360 + flush_work+0xe/0x20 + drm_mode_rmfb+0x18b/0x1d0 [drm] + drm_mode_rmfb_ioctl+0x10/0x20 [drm] + drm_ioctl_kernel+0xb8/0x150 [drm] + drm_ioctl+0x243/0x560 [drm] + __x64_sys_ioctl+0x92/0xd0 + do_syscall_64+0x59/0x90 + entry_SYSCALL_64_after_hwframe+0x72/0xdc + } hitcount: 1 + { delta: ~ 0-99, stack.stacktrace __schedule+0xa19/0x1520 + schedule+0x6b/0x110 + schedule_timeout+0x87/0x160 + __wait_for_common+0x8f/0x190 + wait_for_completion_timeout+0x1d/0x30 + drm_atomic_helper_wait_for_flip_done+0x57/0x90 [drm_kms_helper] + intel_atomic_commit_tail+0x8ce/0x1080 [i915] + intel_atomic_commit_work+0x12/0x20 [i915] + process_one_work+0x21c/0x3f0 + worker_thread+0x50/0x3e0 + kthread+0xfa/0x130 + ret_from_fork+0x29/0x50 + } hitcount: 1 + { delta: ~ 100-199, stack.stacktrace __schedule+0xa19/0x1520 + schedule+0x6b/0x110 + schedule_hrtimeout_range_clock+0x97/0x110 + schedule_hrtimeout_range+0x13/0x20 + usleep_range_state+0x65/0x90 + pci_set_low_power_state+0x17f/0x1f0 + pci_set_power_state+0x49/0x250 + pci_finish_runtime_suspend+0x4a/0x90 + pci_pm_runtime_suspend+0xcb/0x1b0 + __rpm_callback+0x48/0x120 + rpm_callback+0x67/0x70 + rpm_suspend+0x167/0x780 + rpm_idle+0x25a/0x380 + pm_runtime_work+0x93/0xc0 + process_one_work+0x21c/0x3f0 + } hitcount: 1 + + Totals: + Hits: 10 + Entries: 7 + Dropped: 0 + 2.2.3 Hist trigger 'handlers' and 'actions' ------------------------------------------- @@ -2054,11 +2208,11 @@ The following commonly-used handler.action pairs are available: wakeup_new_test($testpid) if comm=="cyclictest"' >> \ /sys/kernel/tracing/events/sched/sched_wakeup_new/trigger - Or, equivalently, using the 'trace' keyword syntax: + Or, equivalently, using the 'trace' keyword syntax:: - # echo 'hist:keys=$testpid:testpid=pid:onmatch(sched.sched_wakeup_new).\ - trace(wakeup_new_test,$testpid) if comm=="cyclictest"' >> \ - /sys/kernel/tracing/events/sched/sched_wakeup_new/trigger + # echo 'hist:keys=$testpid:testpid=pid:onmatch(sched.sched_wakeup_new).\ + trace(wakeup_new_test,$testpid) if comm=="cyclictest"' >> \ + /sys/kernel/tracing/events/sched/sched_wakeup_new/trigger Creating and displaying a histogram based on those events is now just a matter of using the fields and new synthetic event in the @@ -2191,48 +2345,48 @@ The following commonly-used handler.action pairs are available: resulting latency, stored in wakeup_lat, exceeds the current maximum latency, a snapshot is taken. As part of the setup, all the scheduler events are also enabled, which are the events that - will show up in the snapshot when it is taken at some point: + will show up in the snapshot when it is taken at some point:: - # echo 1 > /sys/kernel/tracing/events/sched/enable + # echo 1 > /sys/kernel/tracing/events/sched/enable - # echo 'hist:keys=pid:ts0=common_timestamp.usecs \ - if comm=="cyclictest"' >> \ - /sys/kernel/tracing/events/sched/sched_waking/trigger + # echo 'hist:keys=pid:ts0=common_timestamp.usecs \ + if comm=="cyclictest"' >> \ + /sys/kernel/tracing/events/sched/sched_waking/trigger - # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0: \ - onmax($wakeup_lat).save(next_prio,next_comm,prev_pid,prev_prio, \ - prev_comm):onmax($wakeup_lat).snapshot() \ - if next_comm=="cyclictest"' >> \ - /sys/kernel/tracing/events/sched/sched_switch/trigger + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0: \ + onmax($wakeup_lat).save(next_prio,next_comm,prev_pid,prev_prio, \ + prev_comm):onmax($wakeup_lat).snapshot() \ + if next_comm=="cyclictest"' >> \ + /sys/kernel/tracing/events/sched/sched_switch/trigger When the histogram is displayed, for each bucket the max value and the saved values corresponding to the max are displayed following the rest of the fields. If a snapshot was taken, there is also a message indicating that, - along with the value and event that triggered the global maximum: + along with the value and event that triggered the global maximum:: - # cat /sys/kernel/tracing/events/sched/sched_switch/hist - { next_pid: 2101 } hitcount: 200 - max: 52 next_prio: 120 next_comm: cyclictest \ - prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 + # cat /sys/kernel/tracing/events/sched/sched_switch/hist + { next_pid: 2101 } hitcount: 200 + max: 52 next_prio: 120 next_comm: cyclictest \ + prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 - { next_pid: 2103 } hitcount: 1326 - max: 572 next_prio: 19 next_comm: cyclictest \ - prev_pid: 0 prev_prio: 120 prev_comm: swapper/1 + { next_pid: 2103 } hitcount: 1326 + max: 572 next_prio: 19 next_comm: cyclictest \ + prev_pid: 0 prev_prio: 120 prev_comm: swapper/1 - { next_pid: 2102 } hitcount: 1982 \ - max: 74 next_prio: 19 next_comm: cyclictest \ - prev_pid: 0 prev_prio: 120 prev_comm: swapper/5 + { next_pid: 2102 } hitcount: 1982 \ + max: 74 next_prio: 19 next_comm: cyclictest \ + prev_pid: 0 prev_prio: 120 prev_comm: swapper/5 - Snapshot taken (see tracing/snapshot). Details: - triggering value { onmax($wakeup_lat) }: 572 \ - triggered by event with key: { next_pid: 2103 } + Snapshot taken (see tracing/snapshot). Details: + triggering value { onmax($wakeup_lat) }: 572 \ + triggered by event with key: { next_pid: 2103 } - Totals: - Hits: 3508 - Entries: 3 - Dropped: 0 + Totals: + Hits: 3508 + Entries: 3 + Dropped: 0 In the above case, the event that triggered the global maximum has the key with next_pid == 2103. If you look at the bucket that has @@ -2310,15 +2464,15 @@ The following commonly-used handler.action pairs are available: $cwnd variable. If the value has changed, a snapshot is taken. As part of the setup, all the scheduler and tcp events are also enabled, which are the events that will show up in the snapshot - when it is taken at some point: + when it is taken at some point:: - # echo 1 > /sys/kernel/tracing/events/sched/enable - # echo 1 > /sys/kernel/tracing/events/tcp/enable + # echo 1 > /sys/kernel/tracing/events/sched/enable + # echo 1 > /sys/kernel/tracing/events/tcp/enable - # echo 'hist:keys=dport:cwnd=snd_cwnd: \ - onchange($cwnd).save(snd_wnd,srtt,rcv_wnd): \ - onchange($cwnd).snapshot()' >> \ - /sys/kernel/tracing/events/tcp/tcp_probe/trigger + # echo 'hist:keys=dport:cwnd=snd_cwnd: \ + onchange($cwnd).save(snd_wnd,srtt,rcv_wnd): \ + onchange($cwnd).snapshot()' >> \ + /sys/kernel/tracing/events/tcp/tcp_probe/trigger When the histogram is displayed, for each bucket the tracked value and the saved values corresponding to that value are displayed @@ -2341,10 +2495,10 @@ The following commonly-used handler.action pairs are available: { dport: 443 } hitcount: 211 changed: 10 snd_wnd: 26960 srtt: 17379 rcv_wnd: 28800 - Snapshot taken (see tracing/snapshot). Details:: + Snapshot taken (see tracing/snapshot). Details: - triggering value { onchange($cwnd) }: 10 - triggered by event with key: { dport: 80 } + triggering value { onchange($cwnd) }: 10 + triggered by event with key: { dport: 80 } Totals: Hits: 414 diff --git a/include/linux/kernel.h b/include/linux/kernel.h index fe6efb24d151..40bce7495af8 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -297,7 +297,7 @@ bool mac_pton(const char *s, u8 *mac); * * Use tracing_on/tracing_off when you want to quickly turn on or off * tracing. It simply enables or disables the recording of the trace events. - * This also corresponds to the user space /sys/kernel/debug/tracing/tracing_on + * This also corresponds to the user space /sys/kernel/tracing/tracing_on * file, which gives a means for the kernel and userspace to interact. * Place a tracing_off() in the kernel where you want tracing to end. * From user space, examine the trace, and then echo 1 > tracing_on diff --git a/include/linux/trace.h b/include/linux/trace.h index 80ffda871749..2a70a447184c 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -33,6 +33,18 @@ struct trace_array; int register_ftrace_export(struct trace_export *export); int unregister_ftrace_export(struct trace_export *export); +/** + * trace_array_puts - write a constant string into the trace buffer. + * @tr: The trace array to write to + * @str: The constant string to write + */ +#define trace_array_puts(tr, str) \ + ({ \ + str ? __trace_array_puts(tr, _THIS_IP_, str, strlen(str)) : -1; \ + }) +int __trace_array_puts(struct trace_array *tr, unsigned long ip, + const char *str, int size); + void trace_printk_init_buffers(void); __printf(3, 4) int trace_array_printk(struct trace_array *tr, unsigned long ip, diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 0c4c7587d6c3..6be92bf559fe 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -95,6 +95,7 @@ extern void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, extern int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii); +char *trace_seq_acquire(struct trace_seq *s, unsigned int len); #else /* CONFIG_TRACING */ static inline __printf(2, 3) @@ -139,6 +140,10 @@ static inline int trace_seq_path(struct trace_seq *s, const struct path *path) { return 0; } +static inline char *trace_seq_acquire(struct trace_seq *s, unsigned int len) +{ + return NULL; +} #endif /* CONFIG_TRACING */ #endif /* _LINUX_TRACE_SEQ_H */ diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 552f80b8362f..e299f29375bb 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -482,7 +482,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * * This is how the trace record is structured and will * * be saved into the ring buffer. These are the fields * * that will be exposed to user-space in - * * /sys/kernel/debug/tracing/events/<*>/format. + * * /sys/kernel/tracing/events/<*>/format. * * * * The declared 'local variable' is called '__entry' * * @@ -542,7 +542,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * tracepoint callback (this is used by programmatic plugins and * can also by used by generic instrumentation like SystemTap), and * it is also used to expose a structured trace record in - * /sys/kernel/debug/tracing/events/. + * /sys/kernel/tracing/events/. * * A set of (un)registration functions can be passed to the variant * TRACE_EVENT_FN to perform any (un)registration work. diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index 155c495b89ea..1f7fc1fc590c 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -4,50 +4,7 @@ #ifdef CONFIG_BPF_EVENTS -#undef __entry -#define __entry entry - -#undef __get_dynamic_array -#define __get_dynamic_array(field) \ - ((void *)__entry + (__entry->__data_loc_##field & 0xffff)) - -#undef __get_dynamic_array_len -#define __get_dynamic_array_len(field) \ - ((__entry->__data_loc_##field >> 16) & 0xffff) - -#undef __get_str -#define __get_str(field) ((char *)__get_dynamic_array(field)) - -#undef __get_bitmask -#define __get_bitmask(field) (char *)__get_dynamic_array(field) - -#undef __get_cpumask -#define __get_cpumask(field) (char *)__get_dynamic_array(field) - -#undef __get_sockaddr -#define __get_sockaddr(field) ((struct sockaddr *)__get_dynamic_array(field)) - -#undef __get_rel_dynamic_array -#define __get_rel_dynamic_array(field) \ - ((void *)(&__entry->__rel_loc_##field) + \ - sizeof(__entry->__rel_loc_##field) + \ - (__entry->__rel_loc_##field & 0xffff)) - -#undef __get_rel_dynamic_array_len -#define __get_rel_dynamic_array_len(field) \ - ((__entry->__rel_loc_##field >> 16) & 0xffff) - -#undef __get_rel_str -#define __get_rel_str(field) ((char *)__get_rel_dynamic_array(field)) - -#undef __get_rel_bitmask -#define __get_rel_bitmask(field) (char *)__get_rel_dynamic_array(field) - -#undef __get_rel_cpumask -#define __get_rel_cpumask(field) (char *)__get_rel_dynamic_array(field) - -#undef __get_rel_sockaddr -#define __get_rel_sockaddr(field) ((struct sockaddr *)__get_rel_dynamic_array(field)) +#include "stages/stage6_event_callback.h" #undef __perf_count #define __perf_count(c) (c) diff --git a/include/trace/perf.h b/include/trace/perf.h index 8f3bf1e17707..2c11181c82e0 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -4,51 +4,7 @@ #ifdef CONFIG_PERF_EVENTS -#undef __entry -#define __entry entry - -#undef __get_dynamic_array -#define __get_dynamic_array(field) \ - ((void *)__entry + (__entry->__data_loc_##field & 0xffff)) - -#undef __get_dynamic_array_len -#define __get_dynamic_array_len(field) \ - ((__entry->__data_loc_##field >> 16) & 0xffff) - -#undef __get_str -#define __get_str(field) ((char *)__get_dynamic_array(field)) - -#undef __get_bitmask -#define __get_bitmask(field) (char *)__get_dynamic_array(field) - -#undef __get_cpumask -#define __get_cpumask(field) (char *)__get_dynamic_array(field) - -#undef __get_sockaddr -#define __get_sockaddr(field) ((struct sockaddr *)__get_dynamic_array(field)) - -#undef __get_rel_dynamic_array -#define __get_rel_dynamic_array(field) \ - ((void *)__entry + \ - offsetof(typeof(*__entry), __rel_loc_##field) + \ - sizeof(__entry->__rel_loc_##field) + \ - (__entry->__rel_loc_##field & 0xffff)) - -#undef __get_rel_dynamic_array_len -#define __get_rel_dynamic_array_len(field) \ - ((__entry->__rel_loc_##field >> 16) & 0xffff) - -#undef __get_rel_str -#define __get_rel_str(field) ((char *)__get_rel_dynamic_array(field)) - -#undef __get_rel_bitmask -#define __get_rel_bitmask(field) (char *)__get_rel_dynamic_array(field) - -#undef __get_rel_cpumask -#define __get_rel_cpumask(field) (char *)__get_rel_dynamic_array(field) - -#undef __get_rel_sockaddr -#define __get_rel_sockaddr(field) ((struct sockaddr *)__get_rel_dynamic_array(field)) +#include "stages/stage6_event_callback.h" #undef __perf_count #define __perf_count(c) (__count = (c)) diff --git a/include/trace/stages/stage3_trace_output.h b/include/trace/stages/stage3_trace_output.h index 66374df61ed3..c1fb1355d309 100644 --- a/include/trace/stages/stage3_trace_output.h +++ b/include/trace/stages/stage3_trace_output.h @@ -139,3 +139,6 @@ u64 ____val = (u64)(value); \ (u32) do_div(____val, NSEC_PER_SEC); \ }) + +#undef __get_buf +#define __get_buf(len) trace_seq_acquire(p, (len)) diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h index 49c32394b53f..919b1a4da980 100644 --- a/include/trace/stages/stage6_event_callback.h +++ b/include/trace/stages/stage6_event_callback.h @@ -2,6 +2,9 @@ /* Stage 6 definitions for creating trace events */ +/* Reuse some of the stage 3 macros */ +#include "stage3_trace_output.h" + #undef __entry #define __entry entry diff --git a/include/trace/stages/stage7_class_define.h b/include/trace/stages/stage7_class_define.h index 8795429f388b..bcb960d16fc0 100644 --- a/include/trace/stages/stage7_class_define.h +++ b/include/trace/stages/stage7_class_define.h @@ -23,6 +23,7 @@ #undef __get_rel_sockaddr #undef __print_array #undef __print_hex_dump +#undef __get_buf /* * The below is not executed in the kernel. It is only what is diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index caf32389faf3..a856d4a34c67 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -242,7 +242,7 @@ config DYNAMIC_FTRACE enabled, and the functions not enabled will not affect performance of the system. - See the files in /sys/kernel/debug/tracing: + See the files in /sys/kernel/tracing: available_filter_functions set_ftrace_filter set_ftrace_notrace @@ -306,7 +306,7 @@ config STACK_TRACER select KALLSYMS help This special tracer records the maximum stack footprint of the - kernel and displays it in /sys/kernel/debug/tracing/stack_trace. + kernel and displays it in /sys/kernel/tracing/stack_trace. This tracer works by hooking into every function call that the kernel executes, and keeping a maximum stack depth value and @@ -346,7 +346,7 @@ config IRQSOFF_TRACER disabled by default and can be runtime (re-)started via: - echo 0 > /sys/kernel/debug/tracing/tracing_max_latency + echo 0 > /sys/kernel/tracing/tracing_max_latency (Note that kernel size and overhead increase with this option enabled. This option and the preempt-off timing option can be @@ -370,7 +370,7 @@ config PREEMPT_TRACER disabled by default and can be runtime (re-)started via: - echo 0 > /sys/kernel/debug/tracing/tracing_max_latency + echo 0 > /sys/kernel/tracing/tracing_max_latency (Note that kernel size and overhead increase with this option enabled. This option and the irqs-off timing option can be @@ -522,7 +522,7 @@ config TRACER_SNAPSHOT Allow tracing users to take snapshot of the current buffer using the ftrace interface, e.g.: - echo 1 > /sys/kernel/debug/tracing/snapshot + echo 1 > /sys/kernel/tracing/snapshot cat snapshot config TRACER_SNAPSHOT_PER_CPU_SWAP @@ -534,7 +534,7 @@ config TRACER_SNAPSHOT_PER_CPU_SWAP full swap (all buffers). If this is set, then the following is allowed: - echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot + echo 1 > /sys/kernel/tracing/per_cpu/cpu2/snapshot After which, only the tracing buffer for CPU 2 was swapped with the main tracing buffer, and the other CPU buffers remain the same. @@ -581,7 +581,7 @@ config PROFILE_ANNOTATED_BRANCHES This tracer profiles all likely and unlikely macros in the kernel. It will display the results in: - /sys/kernel/debug/tracing/trace_stat/branch_annotated + /sys/kernel/tracing/trace_stat/branch_annotated Note: this will add a significant overhead; only turn this on if you need to profile the system's use of these macros. @@ -594,7 +594,7 @@ config PROFILE_ALL_BRANCHES taken in the kernel is recorded whether it hit or miss. The results will be displayed in: - /sys/kernel/debug/tracing/trace_stat/branch_all + /sys/kernel/tracing/trace_stat/branch_all This option also enables the likely/unlikely profiler. @@ -645,8 +645,8 @@ config BLK_DEV_IO_TRACE Tracing also is possible using the ftrace interface, e.g.: echo 1 > /sys/block/sda/sda1/trace/enable - echo blk > /sys/kernel/debug/tracing/current_tracer - cat /sys/kernel/debug/tracing/trace_pipe + echo blk > /sys/kernel/tracing/current_tracer + cat /sys/kernel/tracing/trace_pipe If unsure, say N. diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c index c736487fc0e4..4850fdfe27f1 100644 --- a/kernel/trace/kprobe_event_gen_test.c +++ b/kernel/trace/kprobe_event_gen_test.c @@ -21,7 +21,7 @@ * Then: * * # insmod kernel/trace/kprobe_event_gen_test.ko - * # cat /sys/kernel/debug/tracing/trace + * # cat /sys/kernel/tracing/trace * * You should see many instances of the "gen_kprobe_test" and * "gen_kretprobe_test" events in the trace buffer. diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b641cab2745e..af50d931b020 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2864,7 +2864,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n" + " echo global > /sys/kernel/tracing/trace_clock\n" "or add trace_clock=global to the kernel command line\n"); } @@ -5604,11 +5604,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); */ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data) { - struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_page *bpage = data; struct page *page = virt_to_page(bpage); unsigned long flags; + if (!buffer || !buffer->buffers || !buffer->buffers[cpu]) + return; + + cpu_buffer = buffer->buffers[cpu]; + /* If the page is still in use someplace else, we can't reuse it */ if (page_ref_count(page) > 1) goto out; diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 8d77526892f4..8dfe85499d4a 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -22,7 +22,7 @@ * Then: * * # insmod kernel/trace/synth_event_gen_test.ko - * # cat /sys/kernel/debug/tracing/trace + * # cat /sys/kernel/tracing/trace * * You should see several events in the trace buffer - * "create_synth_test", "empty_synth_test", and several instances of diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 54a163ae4815..0fa59acfea17 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -49,6 +49,8 @@ #include <linux/irq_work.h> #include <linux/workqueue.h> +#include <asm/setup.h> /* COMMAND_LINE_SIZE */ + #include "trace.h" #include "trace_output.h" @@ -186,6 +188,12 @@ static char *default_bootup_tracer; static bool allocate_snapshot; static bool snapshot_at_boot; +static char boot_instance_info[COMMAND_LINE_SIZE] __initdata; +static int boot_instance_index; + +static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata; +static int boot_snapshot_index; + static int __init set_cmdline_ftrace(char *str) { strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); @@ -222,9 +230,22 @@ __setup("traceoff_on_warning", stop_trace_on_warning); static int __init boot_alloc_snapshot(char *str) { - allocate_snapshot = true; - /* We also need the main ring buffer expanded */ - ring_buffer_expanded = true; + char *slot = boot_snapshot_info + boot_snapshot_index; + int left = sizeof(boot_snapshot_info) - boot_snapshot_index; + int ret; + + if (str[0] == '=') { + str++; + if (strlen(str) >= left) + return -1; + + ret = snprintf(slot, left, "%s\t", str); + boot_snapshot_index += ret; + } else { + allocate_snapshot = true; + /* We also need the main ring buffer expanded */ + ring_buffer_expanded = true; + } return 1; } __setup("alloc_snapshot", boot_alloc_snapshot); @@ -239,6 +260,23 @@ static int __init boot_snapshot(char *str) __setup("ftrace_boot_snapshot", boot_snapshot); +static int __init boot_instance(char *str) +{ + char *slot = boot_instance_info + boot_instance_index; + int left = sizeof(boot_instance_info) - boot_instance_index; + int ret; + + if (strlen(str) >= left) + return -1; + + ret = snprintf(slot, left, "%s\t", str); + boot_instance_index += ret; + + return 1; +} +__setup("trace_instance=", boot_instance); + + static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) @@ -1001,13 +1039,8 @@ __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *ev ring_buffer_unlock_commit(buffer); } -/** - * __trace_puts - write a constant string into the trace buffer. - * @ip: The address of the caller - * @str: The constant string to write - * @size: The size of the string. - */ -int __trace_puts(unsigned long ip, const char *str, int size) +int __trace_array_puts(struct trace_array *tr, unsigned long ip, + const char *str, int size) { struct ring_buffer_event *event; struct trace_buffer *buffer; @@ -1015,7 +1048,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) unsigned int trace_ctx; int alloc; - if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) + if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; if (unlikely(tracing_selftest_running || tracing_disabled)) @@ -1024,7 +1057,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) alloc = sizeof(*entry) + size + 2; /* possible \n added */ trace_ctx = tracing_gen_ctx(); - buffer = global_trace.array_buffer.buffer; + buffer = tr->array_buffer.buffer; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, trace_ctx); @@ -1046,11 +1079,23 @@ int __trace_puts(unsigned long ip, const char *str, int size) entry->buf[size] = '\0'; __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL); + ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); out: ring_buffer_nest_end(buffer); return size; } +EXPORT_SYMBOL_GPL(__trace_array_puts); + +/** + * __trace_puts - write a constant string into the trace buffer. + * @ip: The address of the caller + * @str: The constant string to write + * @size: The size of the string. + */ +int __trace_puts(unsigned long ip, const char *str, int size) +{ + return __trace_array_puts(&global_trace, ip, str, size); +} EXPORT_SYMBOL_GPL(__trace_puts); /** @@ -1142,7 +1187,7 @@ void tracing_snapshot_instance(struct trace_array *tr) * * Note, make sure to allocate the snapshot with either * a tracing_snapshot_alloc(), or by doing it manually - * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * with: echo 1 > /sys/kernel/tracing/snapshot * * If the snapshot buffer is not allocated, it will stop tracing. * Basically making a permanent snapshot. @@ -5760,7 +5805,7 @@ static const char readme_msg[] = #ifdef CONFIG_SYNTH_EVENTS " events/synthetic_events\t- Create/append/remove/show synthetic events\n" "\t Write into this file to define/undefine new synthetic events.\n" - "\t example: echo 'myevent u64 lat; char name[]' >> synthetic_events\n" + "\t example: echo 'myevent u64 lat; char name[]; long[] stack' >> synthetic_events\n" #endif #endif ; @@ -9225,10 +9270,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) } tr->allocated_snapshot = allocate_snapshot; - /* - * Only the top level trace array gets its snapshot allocated - * from the kernel command line. - */ allocate_snapshot = false; #endif @@ -10144,6 +10185,79 @@ out: return ret; } +#ifdef CONFIG_TRACER_MAX_TRACE +__init static bool tr_needs_alloc_snapshot(const char *name) +{ + char *test; + int len = strlen(name); + bool ret; + + if (!boot_snapshot_index) + return false; + + if (strncmp(name, boot_snapshot_info, len) == 0 && + boot_snapshot_info[len] == '\t') + return true; + + test = kmalloc(strlen(name) + 3, GFP_KERNEL); + if (!test) + return false; + + sprintf(test, "\t%s\t", name); + ret = strstr(boot_snapshot_info, test) == NULL; + kfree(test); + return ret; +} + +__init static void do_allocate_snapshot(const char *name) +{ + if (!tr_needs_alloc_snapshot(name)) + return; + + /* + * When allocate_snapshot is set, the next call to + * allocate_trace_buffers() (called by trace_array_get_by_name()) + * will allocate the snapshot buffer. That will alse clear + * this flag. + */ + allocate_snapshot = true; +} +#else +static inline void do_allocate_snapshot(const char *name) { } +#endif + +__init static void enable_instances(void) +{ + struct trace_array *tr; + char *curr_str; + char *str; + char *tok; + + /* A tab is always appended */ + boot_instance_info[boot_instance_index - 1] = '\0'; + str = boot_instance_info; + + while ((curr_str = strsep(&str, "\t"))) { + + tok = strsep(&curr_str, ","); + + if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) + do_allocate_snapshot(tok); + + tr = trace_array_get_by_name(tok); + if (!tr) { + pr_warn("Failed to create instance buffer %s\n", curr_str); + continue; + } + /* Allow user space to delete it */ + trace_array_put(tr); + + while ((tok = strsep(&curr_str, ","))) { + early_enable_events(tr, tok, true); + } + } +} + __init static int tracer_alloc_buffers(void) { int ring_buf_size; @@ -10277,10 +10391,19 @@ out: void __init ftrace_boot_snapshot(void) { + struct trace_array *tr; + if (snapshot_at_boot) { tracing_snapshot(); internal_trace_puts("** Boot snapshot taken **\n"); } + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr == &global_trace) + continue; + trace_array_puts(tr, "** Boot snapshot taken **\n"); + tracing_snapshot_instance(tr); + } } void __init early_trace_init(void) @@ -10302,6 +10425,9 @@ void __init early_trace_init(void) void __init trace_init(void) { trace_event_init(); + + if (boot_instance_index) + enable_instances(); } __init static void clear_boot_tracer(void) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 085a31b978a5..f3aae2be1d53 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -113,6 +113,10 @@ enum trace_type { #define MEM_FAIL(condition, fmt, ...) \ DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__) +#define HIST_STACKTRACE_DEPTH 16 +#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) +#define HIST_STACKTRACE_SKIP 5 + /* * syscalls are special, and need special handling, this is why * they are not included in trace_entries.h @@ -1331,6 +1335,8 @@ DECLARE_PER_CPU(int, trace_buffered_event_cnt); void trace_buffered_event_disable(void); void trace_buffered_event_enable(void); +void early_enable_events(struct trace_array *tr, char *buf, bool disable_first); + static inline void __trace_event_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6a942fa275c7..654ffa40457a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2281,8 +2281,6 @@ create_new_subsystem(const char *name) if (!system->name) goto out_free; - system->filter = NULL; - system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); if (!system->filter) goto out_free; @@ -2843,7 +2841,7 @@ static __init int setup_trace_triggers(char *str) if (!trigger) break; bootup_triggers[i].event = strsep(&trigger, "."); - bootup_triggers[i].trigger = strsep(&trigger, "."); + bootup_triggers[i].trigger = trigger; if (!bootup_triggers[i].trigger) break; } @@ -3771,10 +3769,9 @@ static __init int event_trace_memsetup(void) return 0; } -static __init void -early_enable_events(struct trace_array *tr, bool disable_first) +__init void +early_enable_events(struct trace_array *tr, char *buf, bool disable_first) { - char *buf = bootup_event_buf; char *token; int ret; @@ -3827,7 +3824,7 @@ static __init int event_trace_enable(void) */ __trace_early_add_events(tr); - early_enable_events(tr, false); + early_enable_events(tr, bootup_event_buf, false); trace_printk_start_comm(); @@ -3855,7 +3852,7 @@ static __init int event_trace_enable_again(void) if (!tr) return -ENODEV; - early_enable_events(tr, true); + early_enable_events(tr, bootup_event_buf, true); return 0; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e095c3b3a50d..1dad64267878 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -64,6 +64,7 @@ enum filter_pred_fn { FILTER_PRED_FN_PCHAR_USER, FILTER_PRED_FN_PCHAR, FILTER_PRED_FN_CPU, + FILTER_PRED_FN_FUNCTION, FILTER_PRED_FN_, FILTER_PRED_TEST_VISITED, }; @@ -71,6 +72,7 @@ enum filter_pred_fn { struct filter_pred { enum filter_pred_fn fn_num; u64 val; + u64 val2; struct regex regex; unsigned short *ops; struct ftrace_event_field *field; @@ -103,6 +105,7 @@ struct filter_pred { C(INVALID_FILTER, "Meaningless filter expression"), \ C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ + C(NO_FUNCTION, "Function not found"), \ C(ERRNO, "Error"), \ C(NO_FILTER, "No filter found") @@ -876,6 +879,17 @@ static int filter_pred_comm(struct filter_pred *pred, void *event) return cmp ^ pred->not; } +/* Filter predicate for functions. */ +static int filter_pred_function(struct filter_pred *pred, void *event) +{ + unsigned long *addr = (unsigned long *)(event + pred->offset); + unsigned long start = (unsigned long)pred->val; + unsigned long end = (unsigned long)pred->val2; + int ret = *addr >= start && *addr < end; + + return pred->op == OP_EQ ? ret : !ret; +} + /* * regex_match_foo - Basic regex callbacks * @@ -1335,6 +1349,8 @@ static int filter_pred_fn_call(struct filter_pred *pred, void *event) return filter_pred_pchar(pred, event); case FILTER_PRED_FN_CPU: return filter_pred_cpu(pred, event); + case FILTER_PRED_FN_FUNCTION: + return filter_pred_function(pred, event); case FILTER_PRED_TEST_VISITED: return test_pred_visited_fn(pred, event); default: @@ -1350,8 +1366,13 @@ static int parse_pred(const char *str, void *data, struct trace_event_call *call = data; struct ftrace_event_field *field; struct filter_pred *pred = NULL; + unsigned long offset; + unsigned long size; + unsigned long ip; char num_buf[24]; /* Big enough to hold an address */ char *field_name; + char *name; + bool function = false; bool ustring = false; char q; u64 val; @@ -1393,6 +1414,12 @@ static int parse_pred(const char *str, void *data, i += len; } + /* See if the field is a kernel function name */ + if ((len = str_has_prefix(str + i, ".function"))) { + function = true; + i += len; + } + while (isspace(str[i])) i++; @@ -1423,7 +1450,71 @@ static int parse_pred(const char *str, void *data, pred->offset = field->offset; pred->op = op; - if (ftrace_event_is_function(call)) { + if (function) { + /* The field must be the same size as long */ + if (field->size != sizeof(long)) { + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } + + /* Function only works with '==' or '!=' and an unquoted string */ + switch (op) { + case OP_NE: + case OP_EQ: + break; + default: + parse_error(pe, FILT_ERR_INVALID_OP, pos + i); + goto err_free; + } + + if (isdigit(str[i])) { + /* We allow 0xDEADBEEF */ + while (isalnum(str[i])) + i++; + + len = i - s; + /* 0xfeedfacedeadbeef is 18 chars max */ + if (len >= sizeof(num_buf)) { + parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); + goto err_free; + } + + strncpy(num_buf, str + s, len); + num_buf[len] = 0; + + ret = kstrtoul(num_buf, 0, &ip); + if (ret) { + parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i); + goto err_free; + } + } else { + s = i; + for (; str[i] && !isspace(str[i]); i++) + ; + + len = i - s; + name = kmemdup_nul(str + s, len, GFP_KERNEL); + if (!name) + goto err_mem; + ip = kallsyms_lookup_name(name); + kfree(name); + if (!ip) { + parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i); + goto err_free; + } + } + + /* Now find the function start and end address */ + if (!kallsyms_lookup_size_offset(ip, &size, &offset)) { + parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i); + goto err_free; + } + + pred->fn_num = FILTER_PRED_FN_FUNCTION; + pred->val = ip - offset; + pred->val2 = pred->val + size; + + } else if (ftrace_event_is_function(call)) { /* * Perf does things different with function events. * It only allows an "ip" field, and expects a string. diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5edbf6b1da3f..89877a18f933 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -135,6 +135,7 @@ enum hist_field_fn { HIST_FIELD_FN_DIV_NOT_POWER2, HIST_FIELD_FN_DIV_MULT_SHIFT, HIST_FIELD_FN_EXECNAME, + HIST_FIELD_FN_STACK, }; /* @@ -480,10 +481,6 @@ DEFINE_HIST_FIELD_FN(u8); #define for_each_hist_key_field(i, hist_data) \ for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++) -#define HIST_STACKTRACE_DEPTH 16 -#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) -#define HIST_STACKTRACE_SKIP 5 - #define HITCOUNT_IDX 0 #define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE) @@ -1360,7 +1357,12 @@ static const char *hist_field_name(struct hist_field *field, field_name = field->name; } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) field_name = "common_timestamp"; - else if (field->flags & HIST_FIELD_FL_HITCOUNT) + else if (field->flags & HIST_FIELD_FL_STACKTRACE) { + if (field->field) + field_name = field->field->name; + else + field_name = "stacktrace"; + } else if (field->flags & HIST_FIELD_FL_HITCOUNT) field_name = "hitcount"; if (field_name == NULL) @@ -1718,6 +1720,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "percent"; else if (hist_field->flags & HIST_FIELD_FL_GRAPH) flags_str = "graph"; + else if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) + flags_str = "stacktrace"; return flags_str; } @@ -1979,7 +1983,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, } if (flags & HIST_FIELD_FL_STACKTRACE) { - hist_field->fn_num = HIST_FIELD_FN_NOP; + if (field) + hist_field->fn_num = HIST_FIELD_FN_STACK; + else + hist_field->fn_num = HIST_FIELD_FN_NOP; + hist_field->size = HIST_STACKTRACE_SIZE; + hist_field->type = kstrdup_const("unsigned long[]", GFP_KERNEL); + if (!hist_field->type) + goto free; goto out; } @@ -2312,6 +2323,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, *flags |= HIST_FIELD_FL_EXECNAME; else if (strcmp(modifier, "syscall") == 0) *flags |= HIST_FIELD_FL_SYSCALL; + else if (strcmp(modifier, "stacktrace") == 0) + *flags |= HIST_FIELD_FL_STACKTRACE; else if (strcmp(modifier, "log2") == 0) *flags |= HIST_FIELD_FL_LOG2; else if (strcmp(modifier, "usecs") == 0) @@ -2351,6 +2364,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->enable_timestamps = true; if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS) hist_data->attrs->ts_in_usecs = true; + } else if (strcmp(field_name, "stacktrace") == 0) { + *flags |= HIST_FIELD_FL_STACKTRACE; } else if (strcmp(field_name, "common_cpu") == 0) *flags |= HIST_FIELD_FL_CPU; else if (strcmp(field_name, "hitcount") == 0) @@ -3111,6 +3126,9 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, unsigned int i, j, var_idx; u64 var_val; + /* Make sure stacktrace can fit in the string variable length */ + BUILD_BUG_ON((HIST_STACKTRACE_DEPTH + 1) * sizeof(long) >= STR_VAR_LEN_MAX); + for (i = 0, j = field_var_str_start; i < n_field_vars; i++) { struct field_var *field_var = field_vars[i]; struct hist_field *var = field_var->var; @@ -3119,13 +3137,26 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, var_val = hist_fn_call(val, elt, buffer, rbe, rec); var_idx = var->var.idx; - if (val->flags & HIST_FIELD_FL_STRING) { + if (val->flags & (HIST_FIELD_FL_STRING | + HIST_FIELD_FL_STACKTRACE)) { char *str = elt_data->field_var_str[j++]; char *val_str = (char *)(uintptr_t)var_val; unsigned int size; - size = min(val->size, STR_VAR_LEN_MAX); - strscpy(str, val_str, size); + if (val->flags & HIST_FIELD_FL_STRING) { + size = min(val->size, STR_VAR_LEN_MAX); + strscpy(str, val_str, size); + } else { + char *stack_start = str + sizeof(unsigned long); + int e; + + e = stack_trace_save((void *)stack_start, + HIST_STACKTRACE_DEPTH, + HIST_STACKTRACE_SKIP); + if (e < HIST_STACKTRACE_DEPTH - 1) + ((unsigned long *)stack_start)[e] = 0; + *((unsigned long *)str) = e; + } var_val = (u64)(uintptr_t)str; } tracing_map_set_var(elt, var_idx, var_val); @@ -3824,7 +3855,8 @@ static void save_field_var(struct hist_trigger_data *hist_data, { hist_data->field_vars[hist_data->n_field_vars++] = field_var; - if (field_var->val->flags & HIST_FIELD_FL_STRING) + /* Stack traces are saved in the string storage too */ + if (field_var->val->flags & (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) hist_data->n_field_var_str++; } @@ -3849,6 +3881,9 @@ static int check_synth_field(struct synth_event *event, && field->is_dynamic) return 0; + if (strstr(hist_field->type, "long[") && field->is_stack) + return 0; + if (strcmp(field->type, hist_field->type) != 0) { if (field->size != hist_field->size || (!field->is_string && field->is_signed != hist_field->is_signed)) @@ -4103,7 +4138,8 @@ static int action_create(struct hist_trigger_data *hist_data, } hist_data->save_vars[hist_data->n_save_vars++] = field_var; - if (field_var->val->flags & HIST_FIELD_FL_STRING) + if (field_var->val->flags & + (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) hist_data->n_save_var_str++; kfree(param); } @@ -4242,6 +4278,19 @@ static u64 hist_field_execname(struct hist_field *hist_field, return (u64)(unsigned long)(elt_data->comm); } +static u64 hist_field_stack(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + u32 str_item = *(u32 *)(event + hist_field->field->offset); + int str_loc = str_item & 0xffff; + char *addr = (char *)(event + str_loc); + + return (u64)(unsigned long)addr; +} + static u64 hist_fn_call(struct hist_field *hist_field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -4305,6 +4354,8 @@ static u64 hist_fn_call(struct hist_field *hist_field, return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event); case HIST_FIELD_FN_EXECNAME: return hist_field_execname(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_STACK: + return hist_field_stack(hist_field, elt, buffer, rbe, event); default: return 0; } @@ -4351,7 +4402,8 @@ static int create_var_field(struct hist_trigger_data *hist_data, if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_EXECNAME) update_var_execname(hist_data->fields[val_idx]); - if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING) + if (!ret && hist_data->fields[val_idx]->flags & + (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++; return ret; @@ -5092,7 +5144,8 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, if (hist_field->flags & HIST_FIELD_FL_VAR) { var_idx = hist_field->var.idx; - if (hist_field->flags & HIST_FIELD_FL_STRING) { + if (hist_field->flags & + (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) { unsigned int str_start, var_str_idx, idx; char *str, *val_str; unsigned int size; @@ -5105,9 +5158,20 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, str = elt_data->field_var_str[idx]; val_str = (char *)(uintptr_t)hist_val; - size = min(hist_field->size, STR_VAR_LEN_MAX); - strscpy(str, val_str, size); - + if (hist_field->flags & HIST_FIELD_FL_STRING) { + size = min(hist_field->size, STR_VAR_LEN_MAX); + strscpy(str, val_str, size); + } else { + char *stack_start = str + sizeof(unsigned long); + int e; + + e = stack_trace_save((void *)stack_start, + HIST_STACKTRACE_DEPTH, + HIST_STACKTRACE_SKIP); + if (e < HIST_STACKTRACE_DEPTH - 1) + ((unsigned long *)stack_start)[e] = 0; + *((unsigned long *)str) = e; + } hist_val = (u64)(uintptr_t)str; } tracing_map_set_var(elt, var_idx, hist_val); @@ -5193,8 +5257,17 @@ static void event_hist_trigger(struct event_trigger_data *data, if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { memset(entries, 0, HIST_STACKTRACE_SIZE); - stack_trace_save(entries, HIST_STACKTRACE_DEPTH, - HIST_STACKTRACE_SKIP); + if (key_field->field) { + unsigned long *stack, n_entries; + + field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec); + stack = (unsigned long *)(long)field_contents; + n_entries = *stack; + memcpy(entries, ++stack, n_entries * sizeof(unsigned long)); + } else { + stack_trace_save(entries, HIST_STACKTRACE_DEPTH, + HIST_STACKTRACE_SKIP); + } key = entries; } else { field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec); @@ -5297,7 +5370,10 @@ static void hist_trigger_print_key(struct seq_file *m, seq_printf(m, "%s: %-30s[%3llu]", field_name, syscall_name, uval); } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { - seq_puts(m, "stacktrace:\n"); + if (key_field->field) + seq_printf(m, "%s.stacktrace", key_field->field->name); + else + seq_puts(m, "stacktrace:\n"); hist_trigger_stacktrace_print(m, key + key_field->offset, HIST_STACKTRACE_DEPTH); @@ -5842,7 +5918,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) if (hist_field->flags) { if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) && - !(hist_field->flags & HIST_FIELD_FL_EXPR)) { + !(hist_field->flags & HIST_FIELD_FL_EXPR) && + !(hist_field->flags & HIST_FIELD_FL_STACKTRACE)) { const char *flags = get_hist_field_flags(hist_field); if (flags) @@ -5875,9 +5952,12 @@ static int event_hist_trigger_print(struct seq_file *m, if (i > hist_data->n_vals) seq_puts(m, ","); - if (field->flags & HIST_FIELD_FL_STACKTRACE) - seq_puts(m, "stacktrace"); - else + if (field->flags & HIST_FIELD_FL_STACKTRACE) { + if (field->field) + seq_printf(m, "%s.stacktrace", field->field->name); + else + seq_puts(m, "stacktrace"); + } else hist_field_print(m, field); } diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 67592eed0be8..70bddb25d9c0 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -173,6 +173,14 @@ static int synth_field_is_string(char *type) return false; } +static int synth_field_is_stack(char *type) +{ + if (strstr(type, "long[") != NULL) + return true; + + return false; +} + static int synth_field_string_size(char *type) { char buf[4], *end, *start; @@ -248,6 +256,8 @@ static int synth_field_size(char *type) size = sizeof(gfp_t); else if (synth_field_is_string(type)) size = synth_field_string_size(type); + else if (synth_field_is_stack(type)) + size = 0; return size; } @@ -292,6 +302,8 @@ static const char *synth_field_fmt(char *type) fmt = "%x"; else if (synth_field_is_string(type)) fmt = "%.*s"; + else if (synth_field_is_stack(type)) + fmt = "%s"; return fmt; } @@ -371,6 +383,23 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, i == se->n_fields - 1 ? "" : " "); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } + } else if (se->fields[i]->is_stack) { + u32 offset, data_offset, len; + unsigned long *p, *end; + + offset = (u32)entry->fields[n_u64]; + data_offset = offset & 0xffff; + len = offset >> 16; + + p = (void *)entry + data_offset; + end = (void *)p + len - (sizeof(long) - 1); + + trace_seq_printf(s, "%s=STACK:\n", se->fields[i]->name); + + for (; *p && p < end; p++) + trace_seq_printf(s, "=> %pS\n", (void *)*p); + n_u64++; + } else { struct trace_print_flags __flags[] = { __def_gfpflag_names, {-1, NULL} }; @@ -416,8 +445,7 @@ static unsigned int trace_string(struct synth_trace_event *entry, if (is_dynamic) { u32 data_offset; - data_offset = offsetof(typeof(*entry), fields); - data_offset += event->n_u64 * sizeof(u64); + data_offset = struct_size(entry, fields, event->n_u64); data_offset += data_size; len = kern_fetch_store_strlen((unsigned long)str_val); @@ -447,6 +475,43 @@ static unsigned int trace_string(struct synth_trace_event *entry, return len; } +static unsigned int trace_stack(struct synth_trace_event *entry, + struct synth_event *event, + long *stack, + unsigned int data_size, + unsigned int *n_u64) +{ + unsigned int len; + u32 data_offset; + void *data_loc; + + data_offset = struct_size(entry, fields, event->n_u64); + data_offset += data_size; + + for (len = 0; len < HIST_STACKTRACE_DEPTH; len++) { + if (!stack[len]) + break; + } + + /* Include the zero'd element if it fits */ + if (len < HIST_STACKTRACE_DEPTH) + len++; + + len *= sizeof(long); + + /* Find the dynamic section to copy the stack into. */ + data_loc = (void *)entry + data_offset; + memcpy(data_loc, stack, len); + + /* Fill in the field that holds the offset/len combo */ + data_offset |= len << 16; + *(u32 *)&entry->fields[*n_u64] = data_offset; + + (*n_u64)++; + + return len; +} + static notrace void trace_event_raw_event_synth(void *__data, u64 *var_ref_vals, unsigned int *var_ref_idx) @@ -473,7 +538,12 @@ static notrace void trace_event_raw_event_synth(void *__data, val_idx = var_ref_idx[field_pos]; str_val = (char *)(long)var_ref_vals[val_idx]; - len = kern_fetch_store_strlen((unsigned long)str_val); + if (event->dynamic_fields[i]->is_stack) { + len = *((unsigned long *)str_val); + len *= sizeof(unsigned long); + } else { + len = kern_fetch_store_strlen((unsigned long)str_val); + } fields_size += len; } @@ -499,6 +569,12 @@ static notrace void trace_event_raw_event_synth(void *__data, event->fields[i]->is_dynamic, data_size, &n_u64); data_size += len; /* only dynamic string increments */ + } else if (event->fields[i]->is_stack) { + long *stack = (long *)(long)var_ref_vals[val_idx]; + + len = trace_stack(entry, event, stack, + data_size, &n_u64); + data_size += len; } else { struct synth_field *field = event->fields[i]; u64 val = var_ref_vals[val_idx]; @@ -561,6 +637,9 @@ static int __set_synth_event_print_fmt(struct synth_event *event, event->fields[i]->is_dynamic) pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(%s)", event->fields[i]->name); + else if (event->fields[i]->is_stack) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_stacktrace(%s)", event->fields[i]->name); else pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", event->fields[i]->name); @@ -697,7 +776,8 @@ static struct synth_field *parse_synth_field(int argc, char **argv, ret = -EINVAL; goto free; } else if (size == 0) { - if (synth_field_is_string(field->type)) { + if (synth_field_is_string(field->type) || + synth_field_is_stack(field->type)) { char *type; len = sizeof("__data_loc ") + strlen(field->type) + 1; @@ -728,6 +808,8 @@ static struct synth_field *parse_synth_field(int argc, char **argv, if (synth_field_is_string(field->type)) field->is_string = true; + else if (synth_field_is_stack(field->type)) + field->is_stack = true; field->is_signed = synth_field_signed(field->type); out: diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 210e1f168392..04f0fdae19a1 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1539,7 +1539,7 @@ static void osnoise_sleep(void) wake_time = ktime_add_us(ktime_get(), interval); __set_current_state(TASK_INTERRUPTIBLE); - while (schedule_hrtimeout_range(&wake_time, 0, HRTIMER_MODE_ABS)) { + while (schedule_hrtimeout(&wake_time, HRTIMER_MODE_ABS)) { if (kthread_should_stop()) break; } diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 9c90b3a7dce2..e5e299260d0c 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -403,3 +403,26 @@ int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, return 1; } EXPORT_SYMBOL(trace_seq_hex_dump); + +/* + * trace_seq_acquire - acquire seq buffer with size len + * @s: trace sequence descriptor + * @len: size of buffer to be acquired + * + * acquire buffer with size of @len from trace_seq for output usage, + * user can fill string into that buffer. + * + * Returns start address of acquired buffer. + * + * it allow multiple usage in one trace output function call. + */ +char *trace_seq_acquire(struct trace_seq *s, unsigned int len) +{ + char *ret = trace_seq_buffer_ptr(s); + + if (!WARN_ON_ONCE(seq_buf_buffer_left(&s->seq) < len)) + seq_buf_commit(&s->seq, len); + + return ret; +} +EXPORT_SYMBOL(trace_seq_acquire); diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h index b29595fe3ac5..43f6fb6078db 100644 --- a/kernel/trace/trace_synth.h +++ b/kernel/trace/trace_synth.h @@ -18,6 +18,7 @@ struct synth_field { bool is_signed; bool is_string; bool is_dynamic; + bool is_stack; }; struct synth_event { diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index f23144af5743..8d1507dd0724 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -571,8 +571,8 @@ static void for_each_tracepoint_range( bool trace_module_has_bad_taint(struct module *mod) { return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) | - (1 << TAINT_UNSIGNED_MODULE) | - (1 << TAINT_TEST)); + (1 << TAINT_UNSIGNED_MODULE) | (1 << TAINT_TEST) | + (1 << TAINT_LIVEPATCH)); } static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list); diff --git a/samples/Kconfig b/samples/Kconfig index 0d81c00289ee..daf14c35f071 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -46,6 +46,13 @@ config SAMPLE_FTRACE_DIRECT_MULTI that hooks to wake_up_process and schedule, and prints the function addresses. +config SAMPLE_FTRACE_OPS + tristate "Build custom ftrace ops example" + depends on FUNCTION_TRACER + help + This builds an ftrace ops example that hooks two functions and + measures the time taken to invoke one function a number of times. + config SAMPLE_TRACE_ARRAY tristate "Build sample module for kernel access to Ftrace instancess" depends on EVENT_TRACING && m diff --git a/samples/Makefile b/samples/Makefile index 9832ef3f8fcb..7cb632ef88ee 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_SAMPLE_TRACE_CUSTOM_EVENTS) += trace_events/ obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace_printk/ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace/ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace/ +obj-$(CONFIG_SAMPLE_FTRACE_OPS) += ftrace/ obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += ftrace/ subdir-$(CONFIG_SAMPLE_UHID) += uhid obj-$(CONFIG_VIDEO_PCI_SKELETON) += v4l/ diff --git a/samples/ftrace/Makefile b/samples/ftrace/Makefile index faf8cdb79c5f..589baf2ec4e3 100644 --- a/samples/ftrace/Makefile +++ b/samples/ftrace/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-too.o obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-modify.o obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace-direct-multi.o obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace-direct-multi-modify.o +obj-$(CONFIG_SAMPLE_FTRACE_OPS) += ftrace-ops.o CFLAGS_sample-trace-array.o := -I$(src) obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += sample-trace-array.o diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c index de5a0f67f320..d93abbcb1f4c 100644 --- a/samples/ftrace/ftrace-direct-modify.c +++ b/samples/ftrace/ftrace-direct-modify.c @@ -3,7 +3,6 @@ #include <linux/kthread.h> #include <linux/ftrace.h> #include <asm/asm-offsets.h> -#include <asm/nospec-branch.h> extern void my_direct_func1(void); extern void my_direct_func2(void); @@ -26,6 +25,7 @@ static unsigned long my_ip = (unsigned long)schedule; #ifdef CONFIG_X86_64 #include <asm/ibt.h> +#include <asm/nospec-branch.h> asm ( " .pushsection .text, \"ax\", @progbits\n" diff --git a/samples/ftrace/ftrace-direct-multi-modify.c b/samples/ftrace/ftrace-direct-multi-modify.c index a825dbd2c9cf..b58c594efb51 100644 --- a/samples/ftrace/ftrace-direct-multi-modify.c +++ b/samples/ftrace/ftrace-direct-multi-modify.c @@ -3,7 +3,6 @@ #include <linux/kthread.h> #include <linux/ftrace.h> #include <asm/asm-offsets.h> -#include <asm/nospec-branch.h> extern void my_direct_func1(unsigned long ip); extern void my_direct_func2(unsigned long ip); @@ -24,6 +23,7 @@ extern void my_tramp2(void *); #ifdef CONFIG_X86_64 #include <asm/ibt.h> +#include <asm/nospec-branch.h> asm ( " .pushsection .text, \"ax\", @progbits\n" diff --git a/samples/ftrace/ftrace-direct-multi.c b/samples/ftrace/ftrace-direct-multi.c index d955a2650605..c27cf130c319 100644 --- a/samples/ftrace/ftrace-direct-multi.c +++ b/samples/ftrace/ftrace-direct-multi.c @@ -5,7 +5,6 @@ #include <linux/ftrace.h> #include <linux/sched/stat.h> #include <asm/asm-offsets.h> -#include <asm/nospec-branch.h> extern void my_direct_func(unsigned long ip); @@ -19,6 +18,7 @@ extern void my_tramp(void *); #ifdef CONFIG_X86_64 #include <asm/ibt.h> +#include <asm/nospec-branch.h> asm ( " .pushsection .text, \"ax\", @progbits\n" diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c index e13fb59a2b47..8139dce2a31c 100644 --- a/samples/ftrace/ftrace-direct-too.c +++ b/samples/ftrace/ftrace-direct-too.c @@ -4,7 +4,6 @@ #include <linux/mm.h> /* for handle_mm_fault() */ #include <linux/ftrace.h> #include <asm/asm-offsets.h> -#include <asm/nospec-branch.h> extern void my_direct_func(struct vm_area_struct *vma, unsigned long address, unsigned int flags); @@ -21,6 +20,7 @@ extern void my_tramp(void *); #ifdef CONFIG_X86_64 #include <asm/ibt.h> +#include <asm/nospec-branch.h> asm ( " .pushsection .text, \"ax\", @progbits\n" diff --git a/samples/ftrace/ftrace-direct.c b/samples/ftrace/ftrace-direct.c index 1f769d0db20f..1d3d307ca33d 100644 --- a/samples/ftrace/ftrace-direct.c +++ b/samples/ftrace/ftrace-direct.c @@ -4,7 +4,6 @@ #include <linux/sched.h> /* for wake_up_process() */ #include <linux/ftrace.h> #include <asm/asm-offsets.h> -#include <asm/nospec-branch.h> extern void my_direct_func(struct task_struct *p); @@ -18,6 +17,7 @@ extern void my_tramp(void *); #ifdef CONFIG_X86_64 #include <asm/ibt.h> +#include <asm/nospec-branch.h> asm ( " .pushsection .text, \"ax\", @progbits\n" diff --git a/samples/ftrace/ftrace-ops.c b/samples/ftrace/ftrace-ops.c new file mode 100644 index 000000000000..68d6685c80bd --- /dev/null +++ b/samples/ftrace/ftrace-ops.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/ftrace.h> +#include <linux/ktime.h> +#include <linux/module.h> + +#include <asm/barrier.h> + +/* + * Arbitrary large value chosen to be sufficiently large to minimize noise but + * sufficiently small to complete quickly. + */ +static unsigned int nr_function_calls = 100000; +module_param(nr_function_calls, uint, 0); +MODULE_PARM_DESC(nr_function_calls, "How many times to call the relevant tracee"); + +/* + * The number of ops associated with a call site affects whether a tracer can + * be called directly or whether it's necessary to go via the list func, which + * can be significantly more expensive. + */ +static unsigned int nr_ops_relevant = 1; +module_param(nr_ops_relevant, uint, 0); +MODULE_PARM_DESC(nr_ops_relevant, "How many ftrace_ops to associate with the relevant tracee"); + +/* + * On architectures where all call sites share the same trampoline, having + * tracers enabled for distinct functions can force the use of the list func + * and incur overhead for all call sites. + */ +static unsigned int nr_ops_irrelevant; +module_param(nr_ops_irrelevant, uint, 0); +MODULE_PARM_DESC(nr_ops_irrelevant, "How many ftrace_ops to associate with the irrelevant tracee"); + +/* + * On architectures with DYNAMIC_FTRACE_WITH_REGS, saving the full pt_regs can + * be more expensive than only saving the minimal necessary regs. + */ +static bool save_regs; +module_param(save_regs, bool, 0); +MODULE_PARM_DESC(save_regs, "Register ops with FTRACE_OPS_FL_SAVE_REGS (save all registers in the trampoline)"); + +static bool assist_recursion; +module_param(assist_recursion, bool, 0); +MODULE_PARM_DESC(assist_reursion, "Register ops with FTRACE_OPS_FL_RECURSION"); + +static bool assist_rcu; +module_param(assist_rcu, bool, 0); +MODULE_PARM_DESC(assist_reursion, "Register ops with FTRACE_OPS_FL_RCU"); + +/* + * By default, a trivial tracer is used which immediately returns to mimimize + * overhead. Sometimes a consistency check using a more expensive tracer is + * desireable. + */ +static bool check_count; +module_param(check_count, bool, 0); +MODULE_PARM_DESC(check_count, "Check that tracers are called the expected number of times\n"); + +/* + * Usually it's not interesting to leave the ops registered after the test + * runs, but sometimes it can be useful to leave them registered so that they + * can be inspected through the tracefs 'enabled_functions' file. + */ +static bool persist; +module_param(persist, bool, 0); +MODULE_PARM_DESC(persist, "Successfully load module and leave ftrace ops registered after test completes\n"); + +/* + * Marked as noinline to ensure that an out-of-line traceable copy is + * generated by the compiler. + * + * The barrier() ensures the compiler won't elide calls by determining there + * are no side-effects. + */ +static noinline void tracee_relevant(void) +{ + barrier(); +} + +/* + * Marked as noinline to ensure that an out-of-line traceable copy is + * generated by the compiler. + * + * The barrier() ensures the compiler won't elide calls by determining there + * are no side-effects. + */ +static noinline void tracee_irrelevant(void) +{ + barrier(); +} + +struct sample_ops { + struct ftrace_ops ops; + unsigned int count; +}; + +static void ops_func_nop(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + /* do nothing */ +} + +static void ops_func_count(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + struct sample_ops *self; + + self = container_of(op, struct sample_ops, ops); + self->count++; +} + +static struct sample_ops *ops_relevant; +static struct sample_ops *ops_irrelevant; + +static struct sample_ops *ops_alloc_init(void *tracee, ftrace_func_t func, + unsigned long flags, int nr) +{ + struct sample_ops *ops; + + ops = kcalloc(nr, sizeof(*ops), GFP_KERNEL); + if (WARN_ON_ONCE(!ops)) + return NULL; + + for (unsigned int i = 0; i < nr; i++) { + ops[i].ops.func = func; + ops[i].ops.flags = flags; + WARN_ON_ONCE(ftrace_set_filter_ip(&ops[i].ops, (unsigned long)tracee, 0, 0)); + WARN_ON_ONCE(register_ftrace_function(&ops[i].ops)); + } + + return ops; +} + +static void ops_destroy(struct sample_ops *ops, int nr) +{ + if (!ops) + return; + + for (unsigned int i = 0; i < nr; i++) { + WARN_ON_ONCE(unregister_ftrace_function(&ops[i].ops)); + ftrace_free_filter(&ops[i].ops); + } + + kfree(ops); +} + +static void ops_check(struct sample_ops *ops, int nr, + unsigned int expected_count) +{ + if (!ops || !check_count) + return; + + for (unsigned int i = 0; i < nr; i++) { + if (ops->count == expected_count) + continue; + pr_warn("Counter called %u times (expected %u)\n", + ops->count, expected_count); + } +} + +static ftrace_func_t tracer_relevant = ops_func_nop; +static ftrace_func_t tracer_irrelevant = ops_func_nop; + +static int __init ftrace_ops_sample_init(void) +{ + unsigned long flags = 0; + ktime_t start, end; + u64 period; + + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS) && save_regs) { + pr_info("this kernel does not support saving registers\n"); + save_regs = false; + } else if (save_regs) { + flags |= FTRACE_OPS_FL_SAVE_REGS; + } + + if (assist_recursion) + flags |= FTRACE_OPS_FL_RECURSION; + + if (assist_rcu) + flags |= FTRACE_OPS_FL_RCU; + + if (check_count) { + tracer_relevant = ops_func_count; + tracer_irrelevant = ops_func_count; + } + + pr_info("registering:\n" + " relevant ops: %u\n" + " tracee: %ps\n" + " tracer: %ps\n" + " irrelevant ops: %u\n" + " tracee: %ps\n" + " tracer: %ps\n" + " saving registers: %s\n" + " assist recursion: %s\n" + " assist RCU: %s\n", + nr_ops_relevant, tracee_relevant, tracer_relevant, + nr_ops_irrelevant, tracee_irrelevant, tracer_irrelevant, + save_regs ? "YES" : "NO", + assist_recursion ? "YES" : "NO", + assist_rcu ? "YES" : "NO"); + + ops_relevant = ops_alloc_init(tracee_relevant, tracer_relevant, + flags, nr_ops_relevant); + ops_irrelevant = ops_alloc_init(tracee_irrelevant, tracer_irrelevant, + flags, nr_ops_irrelevant); + + start = ktime_get(); + for (unsigned int i = 0; i < nr_function_calls; i++) + tracee_relevant(); + end = ktime_get(); + + ops_check(ops_relevant, nr_ops_relevant, nr_function_calls); + ops_check(ops_irrelevant, nr_ops_irrelevant, 0); + + period = ktime_to_ns(ktime_sub(end, start)); + + pr_info("Attempted %u calls to %ps in %lluns (%lluns / call)\n", + nr_function_calls, tracee_relevant, + period, div_u64(period, nr_function_calls)); + + if (persist) + return 0; + + ops_destroy(ops_relevant, nr_ops_relevant); + ops_destroy(ops_irrelevant, nr_ops_irrelevant); + + /* + * The benchmark completed sucessfully, but there's no reason to keep + * the module around. Return an error do the user doesn't have to + * manually unload the module. + */ + return -EINVAL; +} +module_init(ftrace_ops_sample_init); + +static void __exit ftrace_ops_sample_exit(void) +{ + ops_destroy(ops_relevant, nr_ops_relevant); + ops_destroy(ops_irrelevant, nr_ops_irrelevant); +} +module_exit(ftrace_ops_sample_exit); + +MODULE_AUTHOR("Mark Rutland"); +MODULE_DESCRIPTION("Example of using custom ftrace_ops"); +MODULE_LICENSE("GPL"); diff --git a/samples/user_events/example.c b/samples/user_events/example.c index d06dc24156ec..18e34c9d708e 100644 --- a/samples/user_events/example.c +++ b/samples/user_events/example.c @@ -23,8 +23,8 @@ #endif /* Assumes debugfs is mounted */ -const char *data_file = "/sys/kernel/debug/tracing/user_events_data"; -const char *status_file = "/sys/kernel/debug/tracing/user_events_status"; +const char *data_file = "/sys/kernel/tracing/user_events_data"; +const char *status_file = "/sys/kernel/tracing/user_events_status"; static int event_status(long **status) { diff --git a/scripts/tracing/draw_functrace.py b/scripts/tracing/draw_functrace.py index 438516bdfb3c..42fa87300941 100755 --- a/scripts/tracing/draw_functrace.py +++ b/scripts/tracing/draw_functrace.py @@ -12,9 +12,9 @@ calls. Only the functions's names and the call time are provided. Usage: Be sure that you have CONFIG_FUNCTION_TRACER - # mount -t debugfs nodev /sys/kernel/debug - # echo function > /sys/kernel/debug/tracing/current_tracer - $ cat /sys/kernel/debug/tracing/trace_pipe > ~/raw_trace_func + # mount -t tracefs nodev /sys/kernel/tracing + # echo function > /sys/kernel/tracing/current_tracer + $ cat /sys/kernel/tracing/trace_pipe > ~/raw_trace_func Wait some times but not too much, the script is a bit slow. Break the pipe (Ctrl + Z) $ scripts/tracing/draw_functrace.py < ~/raw_trace_func > draw_functrace diff --git a/tools/lib/api/fs/tracing_path.c b/tools/lib/api/fs/tracing_path.c index b8e457c841ab..7ba3e81274e8 100644 --- a/tools/lib/api/fs/tracing_path.c +++ b/tools/lib/api/fs/tracing_path.c @@ -14,8 +14,8 @@ #include "tracing_path.h" static char tracing_mnt[PATH_MAX] = "/sys/kernel/debug"; -static char tracing_path[PATH_MAX] = "/sys/kernel/debug/tracing"; -static char tracing_events_path[PATH_MAX] = "/sys/kernel/debug/tracing/events"; +static char tracing_path[PATH_MAX] = "/sys/kernel/tracing"; +static char tracing_events_path[PATH_MAX] = "/sys/kernel/tracing/events"; static void __tracing_path_set(const char *tracing, const char *mountpoint) { diff --git a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc new file mode 100644 index 000000000000..e2ff3bf4df80 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc @@ -0,0 +1,58 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: event filter function - test event filtering on functions +# requires: set_event events/kmem/kmem_cache_free/filter +# flags: instance + +fail() { #msg + echo $1 + exit_fail +} + +echo "Test event filter function name" +echo 0 > tracing_on +echo 0 > events/enable +echo > trace +echo 'call_site.function == exit_mmap' > events/kmem/kmem_cache_free/filter +echo 1 > events/kmem/kmem_cache_free/enable +echo 1 > tracing_on +ls > /dev/null +echo 0 > events/kmem/kmem_cache_free/enable + +hitcnt=`grep kmem_cache_free trace| grep exit_mmap | wc -l` +misscnt=`grep kmem_cache_free trace| grep -v exit_mmap | wc -l` + +if [ $hitcnt -eq 0 ]; then + exit_fail +fi + +if [ $misscnt -gt 0 ]; then + exit_fail +fi + +address=`grep ' exit_mmap$' /proc/kallsyms | cut -d' ' -f1` + +echo "Test event filter function address" +echo 0 > tracing_on +echo 0 > events/enable +echo > trace +echo "call_site.function == 0x$address" > events/kmem/kmem_cache_free/filter +echo 1 > events/kmem/kmem_cache_free/enable +echo 1 > tracing_on +sleep 1 +echo 0 > events/kmem/kmem_cache_free/enable + +hitcnt=`grep kmem_cache_free trace| grep exit_mmap | wc -l` +misscnt=`grep kmem_cache_free trace| grep -v exit_mmap | wc -l` + +if [ $hitcnt -eq 0 ]; then + exit_fail +fi + +if [ $misscnt -gt 0 ]; then + exit_fail +fi + +reset_events_filter + +exit 0 diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack.tc new file mode 100644 index 000000000000..755dbe94ccf4 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-stack.tc @@ -0,0 +1,24 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: event trigger - test inter-event histogram trigger trace action with dynamic string param +# requires: set_event synthetic_events events/sched/sched_process_exec/hist "long[]' >> synthetic_events":README + +fail() { #msg + echo $1 + exit_fail +} + +echo "Test create synthetic event with stack" + + +echo 's:wake_lat pid_t pid; u64 delta; unsigned long[] stack;' > dynamic_events +echo 'hist:keys=next_pid:ts=common_timestamp.usecs,st=stacktrace if prev_state == 1||prev_state == 2' >> events/sched/sched_switch/trigger +echo 'hist:keys=prev_pid:delta=common_timestamp.usecs-$ts,s=$st:onmax($delta).trace(wake_lat,prev_pid,$delta,$s)' >> events/sched/sched_switch/trigger +echo 1 > events/synthetic/wake_lat/enable +sleep 1 + +if ! grep -q "=>.*sched" trace; then + fail "Failed to create synthetic event with stack" +fi + +exit 0 diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc index 2968cdc7df30..366f1f3ad906 100644 --- a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc @@ -70,6 +70,12 @@ grep "myevent[[:space:]]unsigned long var" synthetic_events echo "myevent char var[10]" > synthetic_events grep "myevent[[:space:]]char\[10\] var" synthetic_events +if grep -q 'long\[\]' README; then + # test stacktrace type + echo "myevent unsigned long[] var" > synthetic_events + grep "myevent[[:space:]]unsigned long\[\] var" synthetic_events +fi + do_reset exit 0 diff --git a/tools/tracing/latency/latency-collector.c b/tools/tracing/latency/latency-collector.c index 59a7f2346eab..0fd9c747d396 100644 --- a/tools/tracing/latency/latency-collector.c +++ b/tools/tracing/latency/latency-collector.c @@ -1584,7 +1584,7 @@ static void *do_printloop(void *arg) /* * Toss a coin to decide if we want to sleep before printing * out the backtrace. The reason for this is that opening - * /sys/kernel/debug/tracing/trace will cause a blackout of + * /sys/kernel/tracing/trace will cause a blackout of * hundreds of ms, where no latencies will be noted by the * latency tracer. Thus by randomly sleeping we try to avoid * missing traces systematically due to this. With this option |