diff options
author | René Stadler <mail@renestadler.de> | 2011-11-16 20:37:21 +0100 |
---|---|---|
committer | René Stadler <mail@renestadler.de> | 2011-11-16 20:46:04 +0100 |
commit | f617bf6dcd6feb21d1ea058ecf2b3c1af5cc535a (patch) | |
tree | d1c4eee0ae2695dd93e534b0e7d61cd0faef086e /GstDebugViewer | |
parent | 381d04ad354b7240fa376c0149be1e1bc5000fd8 (diff) |
Data: fix up out-of-order log lines
This is important because we rely on monotonically increasing timestamps for
binary searches in various places.
Overhead for an already sorted file with 1 million lines is less than 5%.
Diffstat (limited to 'GstDebugViewer')
-rw-r--r-- | GstDebugViewer/Data.py | 89 |
1 files changed, 87 insertions, 2 deletions
diff --git a/GstDebugViewer/Data.py b/GstDebugViewer/Data.py index 4b3b68d..4c5d70d 100644 --- a/GstDebugViewer/Data.py +++ b/GstDebugViewer/Data.py @@ -183,6 +183,75 @@ class Producer (object): for consumer in self.consumers: consumer.handle_load_finished () +class SortHelper (object): + + def __init__ (self, fileobj, offsets): + + self._gen = self.__gen (fileobj, offsets) + self._gen.next () + + # Override in the instance, for performance (this gets called in an + # inner loop): + self.find_insert_position = self._gen.send + + @staticmethod + def find_insert_position (insert_time_string): + + # Stub for documentary purposes. + + pass + + @staticmethod + def __gen (fileobj, offsets): + + from math import floor + tell = fileobj.tell + seek = fileobj.seek + read = fileobj.read + time_len = len (time_args (0)) + + # We remember the previous insertion point. This gives a nice speed up + # for larger bubbles which are already sorted. TODO: In practice, log + # lines only get out of order across threads. Need to check if it pays + # to parse the thread here and maintain multiple insertion points for + # heavily interleaved parts of the log. + pos = 0 + pos_time_string = "" + + insert_pos = None + while True: + insert_time_string = (yield insert_pos) + + save_offset = tell () + + if pos_time_string <= insert_time_string: + lo = pos + hi = len (offsets) + else: + lo = 0 + hi = pos + + # This is a bisection search, except we don't cut the range in the + # middle each time, but at the 90th percentile. This is because + logs are "mostly sorted", so the insertion point is much more + likely to be at the end anyways: + while lo < hi: + mid = int (floor (lo * 0.1 + hi * 0.9)) + seek (offsets[mid]) + mid_time_string = read (time_len) + if insert_time_string < mid_time_string: + hi = mid + else: + lo = mid + 1 + pos = lo + # Caller will replace row at pos with the new one, so this is + # correct: + pos_time_string = insert_time_string + + insert_pos = pos + + seek (save_offset) + class LineCache (Producer): _lines_per_iteration = 50000 @@ -233,17 +302,33 @@ class LineCache (Producer): self.__fileobj.seek (0) limit = self._lines_per_iteration + last_line = "" i = 0 + sort_helper = SortHelper (self.__fileobj, offsets) + find_insert_position = sort_helper.find_insert_position while True: offset = tell () line = readline () if not line: break + # if line[18] == "\x1b": + # line = strip_escape (line) match = rexp_match (line) if match is None: continue - levels_append (dict_levels_get (match.group (1), debug_level_none)) - offsets_append (offset) + + # Timestamp is in the very beginning of the row, and can be sorted + # by lexical comparison. That's why we don't bother parsing the + # time to integer. We also don't have to take a substring here, + # which would be a useless memcpy. + if line >= last_line: + levels_append (dict_levels_get (match.group (1), debug_level_none)) + offsets_append (offset) + last_line = line + else: + pos = find_insert_position (line) + levels.insert (pos, dict_levels_get (match.group (1), debug_level_none)) + offsets.insert (pos, offset) i += 1 if i >= limit: i = 0 |