/* Sysprof -- Sampling, systemwide CPU profiler * Copyright 2004, Red Hat, Inc. * Copyright 2004, 2005, Soeren Sandmann * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #include #include #include #include #include #include #include #include "stackstash.h" #include "collector.h" #include "watch.h" #include "elfparser.h" #include "tracker.h" #include "perf_counter.h" #include "util.h" #define d_print(...) #define N_PAGES 32 /* Number of pages in the ringbuffer */ #define N_WAKEUP_EVENTS 149 typedef struct counter_t counter_t; typedef struct sample_event_t sample_event_t; typedef struct mmap_event_t mmap_event_t; typedef struct comm_event_t comm_event_t; typedef struct exit_event_t exit_event_t; typedef struct fork_event_t fork_event_t; typedef union counter_event_t counter_event_t; static void process_event (Collector *collector, counter_t *counter, counter_event_t *event); struct counter_t { Collector * collector; int fd; struct perf_counter_mmap_page * mmap_page; uint8_t * data; uint64_t tail; int cpu; }; struct sample_event_t { struct perf_event_header header; uint64_t ip; uint32_t pid, tid; uint64_t n_ips; uint64_t ips[1]; }; struct comm_event_t { struct perf_event_header header; uint32_t pid, tid; char comm[1]; }; struct mmap_event_t { struct perf_event_header header; uint32_t pid, tid; uint64_t addr; uint64_t len; uint64_t pgoff; char filename[1]; }; struct fork_event_t { struct perf_event_header header; uint32_t pid, ppid; uint32_t tid, ptid; }; struct exit_event_t { struct perf_event_header header; uint32_t pid, ppid; uint32_t tid, ptid; }; union counter_event_t { struct perf_event_header header; mmap_event_t mmap; comm_event_t comm; sample_event_t sample; fork_event_t fork; exit_event_t exit; }; struct Collector { CollectorFunc callback; gpointer data; tracker_t * tracker; GTimeVal latest_reset; int prev_samples; int n_samples; GList * counters; gboolean use_hw_counters; }; static int get_n_cpus (void) { return sysconf (_SC_NPROCESSORS_ONLN); } static int sysprof_perf_counter_open (struct perf_counter_attr *attr, pid_t pid, int cpu, int group_fd, unsigned long flags) { #ifndef __NR_perf_counter_open #if defined(__i386__) #define __NR_perf_counter_open 336 #elif defined(__x86_64__) #define __NR_perf_counter_open 298 #elif defined(__arm__) #define __NR_perf_counter_open 364 #elif defined(__bfin__) #define __NR_perf_counter_open 369 #elif defined(__frv__) #define __NR_perf_counter_open 336 #elif defined(__m68k__) #define __NR_perf_counter_open 332 #elif defined(__MICROBLAZE__) #define __NR_perf_counter_open 366 #elif defined(__mips__) && defined(_ABIO32) #define __NR_perf_counter_open 4333 #elif defined(__mips__) && defined(_ABIN32) #define __NR_perf_counter_open 6296 #elif defined(__mips__) && defined(_ABI64) #define __NR_perf_counter_open 5292 #elif defined(__mn10300__) #define __NR_perf_counter_open 337 #elif defined(__hppa__) #define __NR_perf_counter_open 318 #elif defined(__ppc__) || defined(__ppc64__) #define __NR_perf_counter_open 319 #elif defined(__s390__) #define __NR_perf_counter_open 331 #elif defined(__sh__) && (!defined(__SH5__) || __SH5__ == 32) #define __NR_perf_counter_open 336 #elif defined(__sh__) && defined(__SH5__) && __SH5__ == 64 #define __NR_perf_counter_open 364 #elif defined(__sparc__) || defined(__sparc64__) #define __NR_perf_counter_open 327 #endif #endif attr->size = sizeof(*attr); return syscall (__NR_perf_counter_open, attr, pid, cpu, group_fd, flags); } static double timeval_to_ms (const GTimeVal *timeval) { return (timeval->tv_sec * G_USEC_PER_SEC + timeval->tv_usec) / 1000.0; } static double time_diff (const GTimeVal *first, const GTimeVal *second) { double first_ms = timeval_to_ms (first); double second_ms = timeval_to_ms (second); return first_ms - second_ms; } #define RESET_DEAD_PERIOD 250 static gboolean in_dead_period (Collector *collector) { GTimeVal now; double diff; g_get_current_time (&now); diff = time_diff (&now, &collector->latest_reset); if (diff >= 0.0 && diff < RESET_DEAD_PERIOD) return TRUE; return FALSE; } static int get_page_size (void) { static int page_size; static gboolean has_page_size = FALSE; if (!has_page_size) { page_size = getpagesize(); has_page_size = TRUE; } return page_size; } static void on_read (gpointer data) { counter_t *counter = data; int mask = (N_PAGES * get_page_size() - 1); int n_bytes = mask + 1; gboolean skip_samples; Collector *collector; uint64_t head, tail; collector = counter->collector; tail = counter->tail; head = counter->mmap_page->data_head; rmb(); if (head < tail) { g_warning ("sysprof fails at ring buffers (head "FMT64", tail "FMT64"\n", head, tail); tail = head; } #if 0 /* Verify that the double mapping works */ x = g_random_int() & mask; g_assert (*(counter->data + x) == *(counter->data + x + n_bytes)); #endif skip_samples = in_dead_period (collector); #if 0 g_print ("n bytes %d\n", head - tail); #endif while (head - tail >= sizeof (struct perf_event_header)) { struct perf_event_header *header; guint8 buffer[4096]; guint8 *free_me; free_me = NULL; /* Note that: * * - perf events are a multiple of 64 bits * - the perf event header is 64 bits * - the data area is a multiple of 64 bits * * which means there will always be space for one header, which means we * can safely dereference the size field. */ header = (struct perf_event_header *)(counter->data + (tail & mask)); if (header->size > head - tail) { /* The kernel did not generate a complete event. * I don't think that can happen, but we may as well * be paranoid. */ break; } if (counter->data + (tail & mask) + header->size > counter->data + n_bytes) { int n_before, n_after; guint8 *b; if (header->size > sizeof (buffer)) free_me = b = g_malloc (header->size); else b = buffer; n_after = (tail & mask) + header->size - n_bytes; n_before = header->size - n_after; memcpy (b, counter->data + (tail & mask), n_before); memcpy (b + n_before, counter->data, n_after); header = (struct perf_event_header *)b; } if (!skip_samples || header->type != PERF_EVENT_SAMPLE) { if (header->type == PERF_EVENT_SAMPLE) collector->n_samples++; process_event (collector, counter, (counter_event_t *)header); } if (free_me) g_free (free_me); tail += header->size; } counter->tail = tail; counter->mmap_page->data_tail = tail; if (collector->callback) { if (collector->n_samples - collector->prev_samples >= N_WAKEUP_EVENTS) { gboolean first_sample = collector->prev_samples == 0; collector->callback (first_sample, collector->data); collector->prev_samples = collector->n_samples; } } } static void * fail (GError **err, const char *what) { g_set_error (err, COLLECTOR_ERROR, COLLECTOR_ERROR_FAILED, "%s: %s", what, g_strerror (errno)); return NULL; } static void * map_buffer (counter_t *counter, GError **err) { int n_bytes = N_PAGES * get_page_size(); void *address; address = mmap (NULL, n_bytes + get_page_size(), PROT_READ | PROT_WRITE, MAP_SHARED, counter->fd, 0); if (address == MAP_FAILED) return fail (err, "mmap"); return address; } static gboolean counter_set_output (counter_t *counter, int output) { return ioctl (counter->fd, PERF_COUNTER_IOC_SET_OUTPUT, output) == 0; } static void counter_enable (counter_t *counter) { ioctl (counter->fd, PERF_COUNTER_IOC_ENABLE); } static void counter_disable (counter_t *counter) { d_print ("disable\n"); ioctl (counter->fd, PERF_COUNTER_IOC_DISABLE); } static counter_t * counter_new (Collector *collector, int cpu, counter_t *output, GError **err) { struct perf_counter_attr attr; counter_t *counter; int fd; counter = g_new (counter_t, 1); memset (&attr, 0, sizeof (attr)); attr.type = PERF_TYPE_HARDWARE; attr.config = PERF_COUNT_HW_CPU_CYCLES; attr.sample_period = 1200000 ; /* In number of clock cycles - * FIXME: consider using frequency instead */ attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_CALLCHAIN; attr.wakeup_events = N_WAKEUP_EVENTS; attr.disabled = TRUE; attr.mmap = 1; attr.comm = 1; attr.task = 1; attr.exclude_idle = 1; if (!collector->use_hw_counters || (fd = sysprof_perf_counter_open (&attr, -1, cpu, -1, 0)) < 0) { attr.type = PERF_TYPE_SOFTWARE; attr.config = PERF_COUNT_SW_CPU_CLOCK; attr.sample_period = 1000000; fd = sysprof_perf_counter_open (&attr, -1, cpu, -1, 0); } if (fd < 0) return fail (err, "Could not open performance counter"); counter->collector = collector; counter->fd = fd; counter->cpu = cpu; if (output && counter_set_output (counter, output->fd)) { counter->mmap_page = NULL; counter->data = NULL; counter->tail = 0; } else { counter->mmap_page = map_buffer (counter, err); if (!counter->mmap_page || counter->mmap_page == MAP_FAILED) return NULL; counter->data = (uint8_t *)counter->mmap_page + get_page_size (); counter->tail = 0; fd_add_watch (fd, counter); fd_set_read_callback (fd, on_read); } return counter; } static void counter_free (counter_t *counter) { d_print ("munmap\n"); munmap (counter->mmap_page, (N_PAGES + 1) * get_page_size()); fd_remove_watch (counter->fd); close (counter->fd); g_free (counter); } /* * Collector */ static void enable_counters (Collector *collector) { GList *list; d_print ("enable\n"); for (list = collector->counters; list != NULL; list = list->next) { counter_t *counter = list->data; counter_enable (counter); } } static void disable_counters (Collector *collector) { GList *list; d_print ("disable\n"); for (list = collector->counters; list != NULL; list = list->next) { counter_t *counter = list->data; counter_disable (counter); } } void collector_reset (Collector *collector) { /* Disable the counters so that we won't track * the activity of tracker_free()/tracker_new() * * They will still record fork/mmap/etc. so * we can keep an accurate log of process creation */ if (collector->counters) { d_print ("disable counters\n"); disable_counters (collector); } if (collector->tracker) { tracker_free (collector->tracker); collector->tracker = tracker_new (); } collector->n_samples = 0; collector->prev_samples = 0; g_get_current_time (&collector->latest_reset); if (collector->counters) { d_print ("enable counters\n"); enable_counters (collector); } } /* callback is called whenever a new sample arrives */ Collector * collector_new (gboolean use_hw_counters, CollectorFunc callback, gpointer data) { Collector *collector = g_new0 (Collector, 1); collector->callback = callback; collector->data = data; collector->tracker = NULL; collector->use_hw_counters = use_hw_counters; collector_reset (collector); return collector; } static void process_mmap (Collector *collector, mmap_event_t *mmap) { tracker_add_map (collector->tracker, mmap->pid, mmap->addr, mmap->addr + mmap->len, mmap->pgoff, 0, /* inode */ mmap->filename); } static void process_comm (Collector *collector, comm_event_t *comm) { d_print ("pid, tid: %d %d", comm->pid, comm->tid); tracker_add_process (collector->tracker, comm->pid, comm->comm); } static void process_fork (Collector *collector, fork_event_t *fork) { d_print ("ppid: %d pid: %d ptid: %d tid %d\n", fork->ppid, fork->pid, fork->ptid, fork->tid); tracker_add_fork (collector->tracker, fork->ppid, fork->pid); } static void process_exit (Collector *collector, exit_event_t *exit) { d_print ("for %d %d", exit->pid, exit->tid); tracker_add_exit (collector->tracker, exit->pid); } static void process_sample (Collector *collector, sample_event_t *sample) { uint64_t *ips; int n_ips; d_print ("pid, tid: %d %d", sample->pid, sample->tid); if (sample->n_ips == 0) { uint64_t trace[3]; if (sample->header.misc & PERF_EVENT_MISC_KERNEL) { trace[0] = PERF_CONTEXT_KERNEL; trace[1] = sample->ip; trace[2] = PERF_CONTEXT_USER; ips = trace; n_ips = 3; } else { trace[0] = PERF_CONTEXT_USER; trace[1] = sample->ip; ips = trace; n_ips = 2; } } else { ips = sample->ips; n_ips = sample->n_ips; } tracker_add_sample (collector->tracker, sample->pid, ips, n_ips); } static void process_event (Collector *collector, counter_t *counter, counter_event_t *event) { char *name; switch (event->header.type) { case PERF_EVENT_MMAP: name = "mmap"; break; case PERF_EVENT_LOST: name = "lost"; break; case PERF_EVENT_COMM: name = "comm"; break; case PERF_EVENT_EXIT: name = "exit"; break; case PERF_EVENT_THROTTLE: name = "throttle"; break; case PERF_EVENT_UNTHROTTLE: name = "unthrottle"; break; case PERF_EVENT_FORK: name = "fork"; break; case PERF_EVENT_READ: name = "read"; break; case PERF_EVENT_SAMPLE: name = "samp"; break; default: name = "unknown"; break; } d_print ("cpu %d :: %s :: ", counter->cpu, name); switch (event->header.type) { case PERF_EVENT_MMAP: process_mmap (collector, &event->mmap); break; case PERF_EVENT_LOST: g_print ("lost event\n"); break; case PERF_EVENT_COMM: process_comm (collector, &event->comm); break; case PERF_EVENT_EXIT: process_exit (collector, &event->exit); break; case PERF_EVENT_THROTTLE: g_print ("throttle\n"); break; case PERF_EVENT_UNTHROTTLE: g_print ("unthrottle\n"); break; case PERF_EVENT_FORK: process_fork (collector, &event->fork); break; case PERF_EVENT_READ: break; case PERF_EVENT_SAMPLE: process_sample (collector, &event->sample); break; default: g_warning ("unknown event: %d (%d)\n", event->header.type, event->header.size); break; } d_print ("\n"); } gboolean collector_start (Collector *collector, GError **err) { int n_cpus = get_n_cpus (); int i; counter_t *output; if (!collector->tracker) collector->tracker = tracker_new (); output = NULL; for (i = 0; i < n_cpus; ++i) { counter_t *counter = counter_new (collector, i, output, err); if (!counter) { GList *list; for (list = collector->counters; list != NULL; list = list->next) counter_free (list->data); collector->tracker = NULL; return FALSE; } collector->counters = g_list_append (collector->counters, counter); if (!output) output = counter; } enable_counters (collector); return TRUE; } void collector_stop (Collector *collector) { GList *list; if (!collector->counters) return; /* Read any remaining data */ for (list = collector->counters; list != NULL; list = list->next) { counter_t *counter = list->data; if (counter->data) on_read (counter); counter_free (counter); } g_list_free (collector->counters); collector->counters = NULL; } int collector_get_n_samples (Collector *collector) { return collector->n_samples; } Profile * collector_create_profile (Collector *collector) { /* The collector must be stopped when you create a profile */ g_assert (!collector->counters); return tracker_create_profile (collector->tracker); } GQuark collector_error_quark (void) { static GQuark q = 0; if (q == 0) q = g_quark_from_static_string ("collector-error-quark"); return q; }