diff options
author | Jonathan Corbet <corbet@lwn.net> | 2011-07-11 13:51:58 -0600 |
---|---|---|
committer | Jonathan Corbet <corbet@lwn.net> | 2011-07-11 13:51:58 -0600 |
commit | 47ffed3ceed85f4e2c97cfec055f7ca4301616f3 (patch) | |
tree | f82252ba41cca26e6a9e974264240fcef6a16563 | |
parent | 85004f0f9b73d7504606be5ccd0cc08688129eb1 (diff) | |
parent | 69f9ad7e643fa72da257d358821072f789dbc3ce (diff) |
Merge branch 'refactoring' of git://gitorious.org/mining-tools/gitdm into german
-rw-r--r-- | ConfigFile.py | 141 | ||||
-rw-r--r-- | README | 47 | ||||
-rw-r--r-- | csv.py | 40 | ||||
-rw-r--r-- | csvdump.py | 88 | ||||
-rw-r--r-- | database.py | 19 | ||||
-rwxr-xr-x | gitdm | 195 | ||||
-rw-r--r-- | gitdm.config | 5 | ||||
-rw-r--r-- | logparser.py | 90 | ||||
-rw-r--r-- | patterns.py | 54 | ||||
-rw-r--r-- | reports.py | 44 | ||||
-rw-r--r-- | sample-config/filetypes.txt | 362 |
11 files changed, 926 insertions, 159 deletions
diff --git a/ConfigFile.py b/ConfigFile.py index 32a4aec..b6981a4 100644 --- a/ConfigFile.py +++ b/ConfigFile.py @@ -13,18 +13,42 @@ import sys, re, datetime, os.path import database -# -# Read a line and strip out junk. -# -def ReadConfigLine (file): - line = file.readline () - if not line: - return None - line = line.split('#')[0] # Get rid of any comments - line = line.strip () # and extra white space - if len (line) == 0: # we got rid of everything - return ReadConfigLine (file) - return line +class ReadConfigLine: + """ + ReadConfigLine provides a iterator to extract line + from an config file without comments. + + Typical use case: + + fd = open(filename, 'r') + for line in ReadConfigLine(fd): + parse_line(line) + fd.close(fd) + """ + + def __init__(self, fd): + self.fd = fd + self.buffer = None + self.patch = [] + + def __iter__(self): + return self + + def next(self): + line = self.fd.readline() + while line: + line = line.split('#')[0] # Get rid of any comments + line = line.strip() # and extra white space + if len(line) == 0: # we got rid of everything + line = self.fd.readline() + else: + break + + if not line: + raise StopIteration + + return line + # # Give up and die. 
@@ -38,19 +62,19 @@ def croak (message): # def ReadEmailAliases (name): try: - file = open (name, 'r') + fd = open (name, 'r') except IOError: croak ('Unable to open email alias file %s' % (name)) - line = ReadConfigLine (file) - while line: + + for line in ReadConfigLine (fd): m = re.match ('^("[^"]+"|\S+)\s+(.+)$', line) if not m or len (m.groups ()) != 2: croak ('Funky email alias line "%s"' % (line)) if m and m.group (2).find ('@') <= 0: croak ('Non-addresses in email alias "%s"' % (line)) database.AddEmailAlias (m.group (1).replace ('"', ''), m.group (2)) - line = ReadConfigLine (file) - file.close () + + fd.close () # # The Email/Employer map @@ -59,11 +83,11 @@ EMMpat = re.compile (r'^([^\s]+)\s+([^<]+)\s*(<\s*(\d+-\d+-\d+)\s*)?$') def ReadEmailEmployers (name): try: - file = open (name, 'r') + fd = open (name, 'r') except IOError: croak ('Unable to open email/employer file %s' % (name)) - line = ReadConfigLine (file) - while line: + + for line in ReadConfigLine (fd): m = EMMpat.match (line) if not m: croak ('Funky email/employer line "%s"' % (line)) @@ -71,8 +95,8 @@ def ReadEmailEmployers (name): company = m.group (2).strip () enddate = ParseDate (m.group (4)) database.AddEmailEmployerMapping (email, company, enddate) - line = ReadConfigLine (file) - file.close () + + fd.close () def ParseDate (cdate): if not cdate: @@ -83,22 +107,22 @@ def ParseDate (cdate): def ReadGroupMap (fname, employer): try: - file = open (fname, 'r') + fd = open (fname, 'r') except IOError: croak ('Unable to open group map file %s' % (fname)) - line = ReadConfigLine (file) - while line: + + for line in ReadConfigLine (fd): database.AddEmailEmployerMapping (line, employer) - line = ReadConfigLine (file) - file.close () + + fd.close () # # Read in a virtual employer description. 
# -def ReadVirtual (file, name): +def ReadVirtual (fd, name): ve = database.VirtualEmployer (name) - line = ReadConfigLine (file) - while line: + + for line in ReadConfigLine (fd): sl = line.split (None, 1) first = sl[0] if first == 'end': @@ -116,23 +140,57 @@ def ReadVirtual (file, name): if not (0 < percent <= 100): croak ('Bad split value "%s" for virtual empl %s' % (first, name)) ve.addsplit (' '.join (sl[1:]), percent/100.0) - line = ReadConfigLine (file) # # We should never get here # croak ('Missing "end" line for virtual employer %s' % (name)) # +# Read file type patterns for more fine graned reports +# +def ReadFileType (filename): + try: + fd = open (filename, 'r') + except IOError: + croak ('Unable to open file type mapping file %s' % (filename)) + patterns = {} + order = [] + regex_order = re.compile ('^order\s+(.*)$') + regex_file_type = re.compile ('^filetype\s+(\S+)\s+(.+)$') + + for line in ReadConfigLine (fd): + o = regex_order.match (line) + if o: + # Consider only the first definition in the config file + elements = o.group(1).replace (' ', '') + order = order or elements.split(',') + continue + + m = regex_file_type.match (line) + if not m or len (m.groups ()) != 2: + ConfigFile.croak ('Funky file type line "%s"' % (line)) + if not patterns.has_key (m.group (1)): + patterns[m.group (1)] = [] + if m.group (1) not in order: + print '%s not found, appended to the last order' % m.group (1) + order.append (m.group (1)) + + patterns[m.group (1)].append (re.compile (m.group (2), re.IGNORECASE)) + + fd.close () + return patterns, order + +# # Read an overall config file. 
# def ConfigFile (name, confdir): try: - file = open (name, 'r') + fd = open (name, 'r') except IOError: croak ('Unable to open config file %s' % (name)) - line = ReadConfigLine (file) - while line: + + for line in ReadConfigLine (fd): sline = line.split (None, 2) if len (sline) < 2: croak ('Funky config line: "%s"' % (line)) @@ -146,7 +204,20 @@ def ConfigFile (name, confdir): ReadGroupMap (os.path.join (confdir, sline[1]), sline[2]) elif sline[0] == 'VirtualEmployer': ReadVirtual (file, ' '.join (sline[1:])) + elif sline[0] == 'FileTypeMap': + patterns, order = ReadFileType (os.path.join (confdir, sline[1])) + database.FileTypes = database.FileType (patterns, order) else: croak ('Unrecognized config line: "%s"' % (line)) - line = ReadConfigLine (file) + + +if __name__ == '__main__': + '''Test the iterato for reading configuration files''' + try: + fd = open(sys.argv[1]) + except: + croak('Usage: %s <config-file>' % sys.argv[0]) + + for line in ReadConfigLine(fd): + print line @@ -20,6 +20,10 @@ Run it like this: git log -p -M [details] | gitdm [options] +Alternatively, you can run with: + + git log --numstat -M [details] | gitdm -n [options] + The [details] tell git which changesets are of interest; the [options] can be: @@ -32,26 +36,35 @@ be: By default, "./gitdm.config" is used. -d Omit the developer reports, giving employer information - only. + only. - -D Rather than create the usual statistics, create a - file (datelc) providing lines changed per day, where the first column - displays the changes happened only on that day and the second sums - the day it happnened with the previous ones. This option is suitable - for feeding to a tool like gnuplot. + -D Rather than create the usual statistics, create a file (datelc.csv) + providing lines changed per day, where the first column displays + the changes happened only on that day and the second sums the day it + happnened with the previous ones. This option is suitable for + feeding to a tool like gnuplot. 
-h file Generate HTML output to the given file -l num Only list the top <num> entries in each report. + -n Use --numstat instead of generated patches to get the statistics. + -o file Write text output to the given file (default is stdout). + -p prefix Dump out the database categorized by changeset and by file type. + It requires -n, otherwise it is not possible to get separated results. + -r pat Only generate statistics for changes to files whose name matches the given regular expression. -s Ignore Signed-off-by lines which match the author of each patch. + -t Generate a report by type of contribution (code, documentation, etc.). + It requires -n, otherwise this option is ignored silently. + + -u Group all unknown developers under the "(Unknown)" employer. @@ -68,6 +81,10 @@ looks like: git log -p -M v2.6.19..v2.6.20 | \ gitdm -u -s -a -o results -h results.html +or: + + git log --numstat -M v2.6.19..v2.6.20 | \ + gitdm -u -s -a -n -o results -h results.html CONFIGURATION FILE @@ -134,6 +151,24 @@ end for example, no check to ensure that the percentages add up to something rational. +FileTypeMap file + + Map file names/extensions onto file types. These files contain lines + like: + + order <type1>,<type2>,...,<typeN> + + filetype <type> <regex> + ... + + This construct allows fine-grained reports by type of contribution + (build, code, image, multimedia, documentation, etc.) + + Order is important because it is possible to have overlap between + filenames. For instance, ltmain.sh fits better as 'build' instead of + 'code' (the filename instead of '\.sh$'). The first element in order + has precedence over the next ones.
+ OTHER TOOLS @@ -1,40 +0,0 @@ -# -# aggregate per-month statistics for people -# -import sys, datetime - -class CSVStat: - def __init__ (self, name, employer, date): - self.name = name - self.employer = employer - self.added = self.removed = 0 - self.date = date - def accumulate (self, p): - self.added = self.added + p.added - self.removed = self.removed + p.removed - -PeriodCommitHash = { } - -def AccumulatePatch (p, Aggregate): - date = "%.2d-%.2d-01"%(p.date.year, p.date.month) - if (Aggregate == 'week'): - date = "%.2d-%.2d"%(p.date.isocalendar()[0], p.date.isocalendar()[1]) - authdatekey = "%s-%s"%(p.author.name, date) - if authdatekey not in PeriodCommitHash: - empl = p.author.emailemployer (p.email, p.date) - stat = CSVStat (p.author.name, empl, date) - PeriodCommitHash[authdatekey] = stat - else: - stat = PeriodCommitHash[authdatekey] - stat.accumulate (p) - -def OutputCSV (file): - if file is None: - return - file.write ("Name\tAffliation\tDate\tAdded\tRemoved\n") - for date, stat in PeriodCommitHash.items(): - # sanitise names " is common and \" sometimes too - empl_name = stat.employer.name.replace ("\"", ".").replace ("\\", ".") - author_name = stat.name.replace ("\"", ".").replace ("\\", ".") - file.write ("\"%s\"\t\"%s\"\t%s\t%d\t%d\n"%(author_name, empl_name, stat.date, \ - stat.added, stat.removed)) diff --git a/csvdump.py b/csvdump.py new file mode 100644 index 0000000..b76a5f6 --- /dev/null +++ b/csvdump.py @@ -0,0 +1,88 @@ +# +# aggregate per-month statistics for people +# +import sys, datetime +import csv + +class CSVStat: + def __init__ (self, name, email, employer, date): + self.name = name + self.email = email + self.employer = employer + self.added = self.removed = 0 + self.date = date + def accumulate (self, p): + self.added = self.added + p.added + self.removed = self.removed + p.removed + +PeriodCommitHash = { } + +def AccumulatePatch (p, Aggregate): + date = "%.2d-%.2d-01"%(p.date.year, p.date.month) + if (Aggregate == 'week'): + date = 
"%.2d-%.2d"%(p.date.isocalendar()[0], p.date.isocalendar()[1]) + authdatekey = "%s-%s"%(p.author.name, date) + if authdatekey not in PeriodCommitHash: + empl = p.author.emailemployer (p.email, p.date) + stat = CSVStat (p.author.name, p.email, empl, date) + PeriodCommitHash[authdatekey] = stat + else: + stat = PeriodCommitHash[authdatekey] + stat.accumulate (p) + +ChangeSets = [] +FileTypes = [] + +def store_patch(patch): + if not patch.merge: + employer = patch.author.emailemployer(patch.email, patch.date) + employer = employer.name.replace('"', '.').replace ('\\', '.') + author = patch.author.name.replace ('"', '.').replace ('\\', '.') + author = patch.author.name.replace ("'", '.') + try: + domain = patch.email.split('@')[1] + except: + domain = patch.email + ChangeSets.append([patch.commit, str(patch.date), + patch.email, domain, author, employer, + patch.added, patch.removed]) + for (filetype, (added, removed)) in patch.filetypes.iteritems(): + FileTypes.append([patch.commit, filetype, added, removed]) + + +def save_csv (prefix='data'): + # Dump the ChangeSets + if len(ChangeSets) > 0: + fd = open('%s-changesets.csv' % prefix, 'w') + writer = csv.writer (fd, quoting=csv.QUOTE_NONNUMERIC) + writer.writerow (['Commit', 'Date', 'Domain', + 'Email', 'Name', 'Affliation', + 'Added', 'Removed']) + for commit in ChangeSets: + writer.writerow(commit) + + # Dump the file types + if len(FileTypes) > 0: + fd = open('%s-filetypes.csv' % prefix, 'w') + writer = csv.writer (fd, quoting=csv.QUOTE_NONNUMERIC) + + writer.writerow (['Commit', 'Type', 'Added', 'Removed']) + for commit in FileTypes: + writer.writerow(commit) + + + +def OutputCSV (file): + if file is None: + return + writer = csv.writer (file, quoting=csv.QUOTE_NONNUMERIC) + writer.writerow (['Name', 'Email', 'Affliation', 'Date', + 'Added', 'Removed']) + for date, stat in PeriodCommitHash.items(): + # sanitise names " is common and \" sometimes too + empl_name = stat.employer.name.replace ('"', '.').replace ('\\', 
'.') + author_name = stat.name.replace ('"', '.').replace ('\\', '.') + writer.writerow ([author_name, stat.email, empl_name, stat.date, + stat.added, stat.removed]) + +__all__ = [ 'AccumulatePatch', 'OutputCSV', 'store_patch' ] diff --git a/database.py b/database.py index b5d9382..6a62adc 100644 --- a/database.py +++ b/database.py @@ -188,6 +188,25 @@ class VirtualEmployer (Employer): # Should check that they add up too, but I'm lazy Employers[self.name] = self +class FileType: + def __init__ (self, patterns={}, order=[]): + self.patterns = patterns + self.order = order + + def guess_file_type (self, filename, patterns=None, order=None): + patterns = patterns or self.patterns + order = order or self.order + + for file_type in order: + if patterns.has_key (file_type): + for patt in patterns[file_type]: + if patt.search (filename): + return file_type + + return 'unknown' + +FileTypes = None + # # Mix all the virtual employers into their real destinations. # @@ -1,4 +1,5 @@ #!/usr/bin/pypy +#-*- coding:utf-8 -*- # # @@ -6,15 +7,17 @@ # # Copyright 2007-11 Eklektix, Inc. # Copyright 2007-11 Jonathan Corbet <corbet@lwn.net> +# Copyright 2011 Germán Póo-Caamaño <gpoo@gnome.org> # # This file may be distributed under the terms of the GNU General # Public License, version 2. 
-import database, csv, ConfigFile, reports +import database, csvdump, ConfigFile, reports import getopt, datetime import os, re, sys, rfc822, string -from patterns import * +import logparser +from patterns import patterns Today = datetime.date.today() @@ -32,11 +35,14 @@ DateStats = 0 AuthorSOBs = 1 FileFilter = None CSVFile = None +CSVPrefix = None AkpmOverLt = 0 DumpDB = 0 CFName = 'gitdm.config' DirName = '' Aggregate = 'month' +Numstat = 0 +ReportByFileType = 0 # # Options: @@ -48,7 +54,9 @@ Aggregate = 'month' # -D Output date statistics # -h hfile HTML output to hfile # -l count Maximum length for output lists +# -n Use numstats instead of generated patch from git log # -o file File for text output +# -p prefix Prefix for CSV output # -r pattern Restrict to files matching pattern # -s Ignore author SOB lines # -u Map unknown employers to '(Unknown)' @@ -59,9 +67,10 @@ Aggregate = 'month' def ParseOpts (): global MapUnknown, DevReports global DateStats, AuthorSOBs, FileFilter, AkpmOverLt, DumpDB - global CFName, CSVFile, DirName, Aggregate + global CFName, CSVFile, CSVPrefix,DirName, Aggregate, Numstat + global ReportByFileType - opts, rest = getopt.getopt (sys.argv[1:], 'ab:dc:Dh:l:o:r:suwx:z') + opts, rest = getopt.getopt (sys.argv[1:], 'ab:dc:Dh:l:no:p:r:stuwx:z') for opt in opts: if opt[0] == '-a': AkpmOverLt = 1 @@ -77,13 +86,19 @@ def ParseOpts (): reports.SetHTMLOutput (open (opt[1], 'w')) elif opt[0] == '-l': reports.SetMaxList (int (opt[1])) + elif opt[0] == '-n': + Numstat = 1 elif opt[0] == '-o': reports.SetOutput (open (opt[1], 'w')) + elif opt[0] == '-p': + CSVPrefix = opt[1] elif opt[0] == '-r': print 'Filter on "%s"' % (opt[1]) FileFilter = re.compile (opt[1]) elif opt[0] == '-s': AuthorSOBs = 0 + elif opt[0] == '-t': + ReportByFileType = 1 elif opt[0] == '-u': MapUnknown = 1 elif opt[0] == '-x': @@ -139,6 +154,8 @@ def PrintDateStats(): # Let's slowly try to move some smarts into this class. 
# class patch: + (ADDED, REMOVED) = range (2) + def __init__ (self, commit): self.commit = commit self.merge = self.added = self.removed = 0 @@ -148,6 +165,7 @@ class patch: self.reviews = [ ] self.testers = [ ] self.reports = [ ] + self.filetypes = {} def addreviewer (self, reviewer): self.reviews.append (reviewer) @@ -157,36 +175,57 @@ class patch: def addreporter (self, reporter): self.reports.append (reporter) + + def addfiletype (self, filetype, added, removed): + if self.filetypes.has_key (filetype): + self.filetypes[filetype][self.ADDED] += added + self.filetypes[filetype][self.REMOVED] += removed + else: + self.filetypes[filetype] = [added, removed] + +def parse_numstat(line, file_filter): + """ + Receive a line of text, determine if fits a numstat line and + parse the added and removed lines as well as the file type. + """ + m = patterns['numstat'].match (line) + if m: + filename = m.group (3) + # If we have a file filter, check for file lines. + if file_filter and not file_filter.search (filename): + return None, None, None, None + + try: + added = int (m.group (1)) + removed = int (m.group (2)) + except ValueError: + # A binary file (image, etc.) is marked with '-' + added = removed = 0 + + m = patterns['rename'].match (filename) + if m: + filename = '%s%s%s' % (m.group (1), m.group (3), m.group (4)) + + filetype = database.FileTypes.guess_file_type (os.path.basename(filename)) + return filename, filetype, added, removed + else: + return None, None, None, None + # # The core hack for grabbing the information about a changeset. 
# -def grabpatch(): - global NextLine - - while (1): - m = Pcommit.match (NextLine) - if m: - break; - NextLine = sys.stdin.readline () - if not NextLine: - return +def grabpatch(logpatch): + m = patterns['commit'].match (logpatch[0]) + if not m: + return None p = patch(m.group (1)) - NextLine = sys.stdin.readline () ignore = (FileFilter is not None) - while NextLine: - Line = NextLine - # - # If this line starts a new commit, drop out. - # - m = Pcommit.match (Line) - if m: - break - NextLine = sys.stdin.readline () + for Line in logpatch[1:]: # # Maybe it's an author line? # - m = Pauthor.match (Line) + m = patterns['author'].match (Line) if m: p.email = database.RemapEmail (m.group (2)) p.author = LookupStoreHacker(m.group (1), p.email) @@ -194,7 +233,7 @@ def grabpatch(): # # Could be a signed-off-by: # - m = Psob.match (Line) + m = patterns['signed-off-by'].match (Line) if m: email = database.RemapEmail (m.group (2)) sobber = LookupStoreHacker(m.group (1), email) @@ -204,24 +243,26 @@ def grabpatch(): # # Various other tags of interest. 
# - m = Preview.match (Line) # Reviewed-by: + m = patterns['reviewed-by'].match (Line) if m: email = database.RemapEmail (m.group (2)) p.addreviewer (LookupStoreHacker(m.group (1), email)) continue - m = Ptest.match (Line) # Tested-by: + m = patterns['tested-by'].match (Line) if m: email = database.RemapEmail (m.group (2)) p.addtester (LookupStoreHacker (m.group (1), email)) p.author.testcredit (patch) continue - m = Prep.match (Line) # Reported-by: + # Reported-by: + m = patterns['reported-by'].match (Line) if m: email = database.RemapEmail (m.group (2)) p.addreporter (LookupStoreHacker (m.group (1), email)) p.author.reportcredit (patch) continue - m = Preptest.match (Line) # Reported-and-tested-by: + # Reported-and-tested-by: + m = patterns['reported-and-tested-by'].match (Line) if m: email = database.RemapEmail (m.group (2)) h = LookupStoreHacker (m.group (1), email) @@ -233,14 +274,14 @@ def grabpatch(): # # If this one is a merge, make note of the fact. # - m = Pmerge.match (Line) + m = patterns['merge'].match (Line) if m: p.merge = 1 continue # # See if it's the date. # - m = Pdate.match (Line) + m = patterns['date'].match (Line) if m: dt = rfc822.parsedate(m.group (2)) p.date = datetime.date (dt[0], dt[1], dt[2]) @@ -248,20 +289,29 @@ def grabpatch(): sys.stderr.write ('Funky date: %s\n' % p.date) p.date = Today continue - # - # If we have a file filter, check for file lines. - # - if FileFilter: - ignore = ApplyFileFilter (Line, ignore) - # - # OK, maybe it's part of the diff itself. - # - if not ignore: - if Padd.match (Line): - p.added += 1 - continue - if Prem.match (Line): - p.removed += 1 + if not Numstat: + # + # If we have a file filter, check for file lines. + # + if FileFilter: + ignore = ApplyFileFilter (Line, ignore) + # + # OK, maybe it's part of the diff itself. 
+ # + if not ignore: + if patterns['add'].match (Line): + p.added += 1 + continue + if patterns['rem'].match (Line): + p.removed += 1 + else: + # Get the statistics (lines added/removes) using numstats + # and without requiring a diff (--numstat instead -p) + (filename, filetype, added, removed) = parse_numstat (Line, FileFilter) + if filename: + p.added += added + p.removed += removed + p.addfiletype (filetype, added, removed) if '@' in p.author.name: GripeAboutAuthorName (p.author.name) @@ -279,7 +329,7 @@ def ApplyFileFilter (line, ignore): # If this is the first file line (--- a/), set ignore one way # or the other. # - m = Pfilea.match (line) + m = patterns['filea'].match (line) if m: file = m.group (1) if FileFilter.search (file): @@ -288,13 +338,29 @@ def ApplyFileFilter (line, ignore): # # For the second line, we can turn ignore off, but not on # - m = Pfileb.match (line) + m = patterns['fileb'].match (line) if m: file = m.group (1) if FileFilter.search (file): return 0 return ignore +def is_svntag(logpatch): + """ + This is a workaround for a bug on the migration to Git + from Subversion found in GNOME. It may happen in other + repositories as well. + """ + + for Line in logpatch: + m = patterns['svn-tag'].match(Line.strip()) + if m: + sys.stderr.write ('(W) detected a commit on a svn tag: %s\n' % + (m.group (0),)) + return True + + return False + # # If this patch is signed off by both Andrew Morton and Linus Torvalds, # remove the (redundant) Linus signoff. 
@@ -324,7 +390,6 @@ if AkpmOverLt == 1: Akpm = ('akpm@linux-foundation.org', LookupStoreHacker ('Andrew Morton', 'akpm@linux-foundation.org')) -NextLine = sys.stdin.readline () TotalChanged = TotalAdded = TotalRemoved = 0 # @@ -332,12 +397,23 @@ TotalChanged = TotalAdded = TotalRemoved = 0 # print >> sys.stderr, 'Grabbing changesets...\r', +patches = logparser.LogPatchSplitter(sys.stdin) printcount = CSCount = 0 -while (1): + +for logpatch in patches: if (printcount % 50) == 0: print >> sys.stderr, 'Grabbing changesets...%d\r' % printcount, printcount += 1 - p = grabpatch() + + # We want to ignore commits on svn tags since in Subversion + # thats mean a copy of the whole repository, which leads to + # wrong results. Some migrations from Subversion to Git does + # not catch all this tags/copy and import them just as a new + # big changeset. + if is_svntag(logpatch): + continue + + p = grabpatch(logpatch) if not p: break # if p.added > 100000 or p.removed > 100000: @@ -373,8 +449,9 @@ while (1): hacker.addtested (p) for hacker in p.reports: hacker.addreport (p) - CSCount += 1 - csv.AccumulatePatch (p, Aggregate) + CSCount += 1 + csvdump.AccumulatePatch (p, Aggregate) + csvdump.store_patch (p) print >> sys.stderr, 'Grabbing changesets...done ' if DumpDB: @@ -403,10 +480,16 @@ if TotalChanged == 0: if DateStats: PrintDateStats () -csv.OutputCSV (CSVFile) -if CSVFile is not None: - CSVFile.close () +if CSVPrefix: + csvdump.save_csv (CSVPrefix) + +if CSVFile: + csvdump.OutputCSV (CSVFile) + CSVFile.close () if DevReports: reports.DevReports (hlist, TotalChanged, CSCount, TotalRemoved) reports.EmplReports (elist, TotalChanged, CSCount) + +if ReportByFileType and Numstat: + reports.ReportByFileType (hlist) diff --git a/gitdm.config b/gitdm.config index 588d6ef..3ae2f20 100644 --- a/gitdm.config +++ b/gitdm.config @@ -20,3 +20,8 @@ EmailMap sample-config/domain-map # # GroupMap sample-config/illuminati The Illuminati # +# +# Use FileTypeMap to map a file types to file names 
using regular +# regular expressions. +# +FileTypeMap sample-config/filetypes.txt diff --git a/logparser.py b/logparser.py new file mode 100644 index 0000000..b375034 --- /dev/null +++ b/logparser.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +# +# Copyright © 2009 Germán Póo-Caamaño <gpoo@gnome.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + +import sys +from patterns import patterns + +class LogPatchSplitter: + """ + LogPatchSplitters provides a iterator to extract every + changeset from a git log output. + + Typical use case: + + patches = LogPatchSplitter(sys.stdin) + + for patch in patches: + parse_patch(patch) + """ + + def __init__(self, fd): + self.fd = fd + self.buffer = None + self.patch = [] + + def __iter__(self): + return self + + def next(self): + patch = self.__grab_patch__() + if not patch: + raise StopIteration + return patch + + def __grab_patch__(self): + """ + Extract a patch from the file descriptor and the + patch is returned as a list of lines. + """ + + patch = [] + line = self.buffer or self.fd.readline() + + while line: + m = patterns['commit'].match(line) + if m: + patch = [line] + break + line = self.fd.readline() + + if not line: + return None + + line = self.fd.readline() + while line: + # If this line starts a new commit, drop out. 
+ m = patterns['commit'].match(line) + if m: + self.buffer = line + break + + patch.append(line) + self.buffer = None + line = self.fd.readline() + + return patch + + +if __name__ == '__main__': + patches = LogPatchSplitter(sys.stdin) + + for patch in patches: + print '---------- NEW PATCH ----------' + for line in patch: + print line, diff --git a/patterns.py b/patterns.py index e63efb6..803e532 100644 --- a/patterns.py +++ b/patterns.py @@ -1,10 +1,12 @@ # +# -*- coding:utf-8 -*- # Pull together regular expressions used in multiple places. # # This code is part of the LWN git data miner. # # Copyright 2007-11 Eklektix, Inc. # Copyright 2007-11 Jonathan Corbet <corbet@lwn.net> +# Copyright 2011 Germán Póo-Caamaño <gpoo@gnome.org> # # This file may be distributed under the terms of the GNU General # Public License, version 2. @@ -16,24 +18,34 @@ import re # expressions." Now they have two problems. # -- Jamie Zawinski # -Pemail = r'\s+"?([^<"]+)"?\s<([^>]+)>' # just email addr + name -Pcommit = re.compile (r'^commit ([0-9a-f ]+)$') -Pauthor = re.compile (r'^Author:' + Pemail + '$') -Psob = re.compile (r'^\s+Signed-off-by:' + Pemail + '.*$') -Pmerge = re.compile (r'^Merge:.*$') -Padd = re.compile (r'^\+[^+].*$') -Prem = re.compile (r'^-[^-].*$') -Pdate = re.compile (r'^(Commit)?Date:\s+(.*)$') -Pfilea = re.compile (r'^---\s+(.*)$') -Pfileb = re.compile (r'^\+\+\+\s+(.*)$') -Preview = re.compile (r'^\s+Reviewed-by:' + Pemail + '.*$') -Ptest = re.compile (r'^\s+tested-by:' + Pemail + '.*$', re.I) -Prep = re.compile (r'^\s+Reported-by:' + Pemail + '.*$') -Preptest = re.compile (r'^\s+reported-and-tested-by:' + Pemail + '.*$', re.I) -# -# Merges are described with a variety of lines. -# -PExtMerge = re.compile(r'^ +Merge( branch .* of)? ([^ ]+:[^ ]+)\n$') -PIntMerge = re.compile(r'^ +(Merge|Pull) .* into .*$') -# PIntMerge2 = re.compile(r"^ +Merge branch(es)? 
'.*$") -PIntMerge2 = re.compile(r"^ +Merge .*$") +_pemail = r'\s+"?([^<"]+)"?\s<([^>]+)>' # just email addr + name + +patterns = { + 'commit': re.compile (r'^commit ([0-9a-f ]+)$'), + 'author': re.compile (r'^Author:' + _pemail + '$'), + 'signed-off-by': re.compile (r'^\s+Signed-off-by:' + _pemail + '.*$'), + 'merge': re.compile (r'^Merge:.*$'), + 'add': re.compile (r'^\+[^+].*$'), + 'rem': re.compile (r'^-[^-].*$'), + 'date': re.compile (r'^(Commit)?Date:\s+(.*)$'), + # filea, fileb are used only in 'parche mode' (-p) + 'filea': re.compile (r'^---\s+(.*)$'), + 'fileb': re.compile (r'^\+\+\+\s+(.*)$'), + 'reviewed-by': re.compile (r'^\s+Reviewed-by:' + _pemail+ '.*$'), + 'tested-by': re.compile (r'^\s+tested-by:' + _pemail + '.*$', re.I), + 'reported-by': re.compile (r'^\s+Reported-by:' + _pemail + '.*$'), + 'reported-and-tested-by': re.compile (r'^\s+reported-and-tested-by:' + _pemail + '.*$', re.I), + # + # Merges are described with a variety of lines. + # + 'ExtMerge': re.compile(r'^ +Merge( branch .* of)? ([^ ]+:[^ ]+)\n$'), + 'IntMerge': re.compile(r'^ +(Merge|Pull) .* into .*$'), + # PIntMerge2 = re.compile(r"^ +Merge branch(es)? '.*$"), + 'IntMerge2': re.compile(r"^ +Merge .*$"), + # Another way to get the statistics (per file). 
+ # It implies --numstat + 'numstat': re.compile('^(\d+|-)\s+(\d+|-)\s+(.*)$'), + 'rename' : re.compile('(.*)\{(.*) => (.*)\}(.*)'), + # Detect errors on svn conversions + 'svn-tag': re.compile("^svn path=/tags/(.*)/?; revision=([0-9]+)$"), +} @@ -340,4 +340,46 @@ def EmplReports (elist, totalchanged, cscount): ReportByELChanged (elist, totalchanged) ReportByESOBs (elist) ReportByEHackers (elist) - + +def ReportByFileType (hacker_list): + total = {} + total_by_hacker = {} + + BeginReport ('Developer contributions by type') + for h in hacker_list: + by_hacker = {} + for patch in h.patches: + # Get a summary by hacker + for (filetype, (added, removed)) in patch.filetypes.iteritems(): + if by_hacker.has_key(filetype): + by_hacker[filetype][patch.ADDED] += added + by_hacker[filetype][patch.REMOVED] += removed + else: + by_hacker[filetype] = [added, removed] + + # Update the totals + if total.has_key(filetype): + total[filetype][patch.ADDED] += added + total[filetype][patch.REMOVED] += removed + else: + total[filetype] = [added, removed, []] + + # Print a summary by hacker + print h.name + for filetype, counters in by_hacker.iteritems(): + print '\t', filetype, counters + h_added = by_hacker[filetype][patch.ADDED] + h_removed = by_hacker[filetype][patch.REMOVED] + total[filetype][2].append ([h.name, h_added, h_removed]) + + # Print the global summary + BeginReport ('Contributions by type and developers') + for filetype, (added, removed, hackers) in total.iteritems(): + print filetype, added, removed + for h, h_added, h_removed in hackers: + print '\t%s: [%d, %d]' % (h, h_added, h_removed) + + # Print the very global summary + BeginReport ('General contributions by type') + for filetype, (added, removed, hackers) in total.iteritems(): + print filetype, added, removed diff --git a/sample-config/filetypes.txt b/sample-config/filetypes.txt new file mode 100644 index 0000000..e24c396 --- /dev/null +++ b/sample-config/filetypes.txt @@ -0,0 +1,362 @@ +# -*- coding:utf-8 -*- +# 
Copyright (C) 2006 Libresoft +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# Authors : Gregorio Robles <grex@gsyc.escet.urjc.es> +# Authors : Germán Póo-Caamaño <gpoo@gnome.org> +# +# This file contains associations parameters regarding filetypes +# (documentation, develompent, multimedia, images...) +# +# format: +# filetype <type> <regex> [<comment>] +# +# Order: +# The list should keep an order, so filetypes can be counted properly. +# ie. we want ltmain.sh -> 'build' instead of 'code'. +# +# If there is an filetype which is not in order but has values, it will +# be added at the end. 
+# +order image,translation,ui,multimedia,package,build,code,documentation,devel-doc +# +# +# Code files (headers and the like included +# (most common languages first +# +filetype code \.c$ # C +filetype code \.pc$ # C +filetype code \.ec$ # C +filetype code \.ecp$ # C +filetype code \.C$ # C++ +filetype code \.cpp$ # C++ +filetype code \.c\+\+$ # C++ +filetype code \.cxx$ # C++ +filetype code \.cc$ # C++ +filetype code \.pcc$ # C++ +filetype code \.cpy$ # C++ +filetype code \.h$ # C or C++ header +filetype code \.hh$ # C++ header +filetype code \.hpp$ # C++ header +filetype code \.hxx$ # C++ header +filetype code \.sh$ # Shell +filetype code \.pl$ # Perl +filetype code \.pm$ # Perl +filetype code \.pod$ # Perl +filetype code \.perl$ # Perl +filetype code \.cgi$ # CGI +filetype code \.php$ # PHP +filetype code \.php3$ # PHP +filetype code \.php4$ # PHP +filetype code \.inc$ # PHP +filetype code \.py$ # Python +filetype code \.java$ # Java +filetype code \.class$ # Java Class (or at least a class in some OOPL +filetype code \.ada$ # ADA +filetype code \.ads$ # ADA +filetype code \.adb$ # ADA +filetype code \.pad$ # ADA +filetype code \.s$ # Assembly +filetype code \.S$ # Assembly +filetype code \.asm$ # Assembly +filetype code \.awk$ # awk +filetype code \.cs$ # C# +filetype code \.csh$ # CShell (including tcsh +filetype code \.cob$ # COBOL +filetype code \.cbl$ # COBOL +filetype code \.COB$ # COBOL +filetype code \.CBL$ # COBOL +filetype code \.exp$ # Expect +filetype code \.l$ # (F lex +filetype code \.ll$ # (F lex +filetype code \.lex$ # (F lex +filetype code \.f$ # Fortran +filetype code \.f77$ # Fortran +filetype code \.F$ # Fortran +filetype code \.hs$ # Haskell +filetype code \.lhs$ # Not preprocessed Haskell +filetype code \.el$ # LISP (including Scheme +filetype code \.scm$ # LISP (including Scheme +filetype code \.lsp$ # LISP (including Scheme +filetype code \.jl$ # LISP (including Scheme +filetype code \.ml$ # ML +filetype code \.ml3$ # ML +filetype code 
\.m3$ # Modula3 +filetype code \.i3$ # Modula3 +filetype code \.m$ # Objective-C +filetype code \.p$ # Pascal +filetype code \.pas$ # Pascal +filetype code \.rb$ # Ruby +filetype code \.sed$ # sed +filetype code \.tcl$ # TCL +filetype code \.tk$ # TCL +filetype code \.itk$ # TCL +filetype code \.y$ # Yacc +filetype code \.yy$ # Yacc +filetype code \.idl$ # CORBA IDL +filetype code \.gnorba$ # GNOME CORBA IDL +filetype code \.oafinfo$ # GNOME OAF +filetype code \.mcopclass$ # MCOP IDL compiler generated class +filetype code \.autoforms$ # Autoform +filetype code \.atf$ # Autoform +filetype code \.gnuplot$ +filetype code \.xs$ # Shared library? Seen a lot of them in gnome-perl +filetype code \.js$ # JavaScript (and who knows, maybe more +filetype code \.patch$ +filetype code \.diff$ # Sometimes patches appear this way +filetype code \.ids$ # Not really sure what this means +filetype code \.upd$ # ¿¿¿??? (from Kcontrol +filetype code $.ad$ # ¿¿¿??? (from Kdisplay and mc +filetype code $.i$ # Appears in the kbindings for Qt +filetype code $.pri$ # from Qt +filetype code \.schema$ # Not really sure what this means +filetype code \.fd$ # Something to do with latex +filetype code \.cls$ # Something to do with latex +filetype code \.pro$ # Postscript generation +filetype code \.ppd$ # PDF generation +filetype code \.dlg$ # Not really sure what this means +filetype code \.plugin$ # Plug-in file +filetype code \.dsp # Microsoft Developer Studio Project File +filetype code \.vim$ # vim syntax file +filetype code \.trm$ # gnuplot term file +filetype code \.font$ # Font mapping +filetype code \.ccg$ # C++ files - Found in gtkmm* +filetype code \.hg$ # C++ headers - Found in gtkmm* +filetype code \.dtd # XML Document Type Definition +filetype code \.bat # DOS batch files +filetype code \.vala # Vala +filetype code \.py\.in$ +filetype code \.rhtml$ # eRuby +filetype code \.sql$ # SQL script +# +# +# Development documentation files (for hacking generally +# +filetype devel-doc 
^readme.*$ +filetype devel-doc ^changelog.* +filetype devel-doc ^todo.*$ +filetype devel-doc ^credits.*$ +filetype devel-doc ^authors.*$ +filetype devel-doc ^changes.*$ +filetype devel-doc ^news.*$ +filetype devel-doc ^install.*$ +filetype devel-doc ^hacking.*$ +filetype devel-doc ^copyright.*$ +filetype devel-doc ^licen(s|c)e.*$ +filetype devel-doc ^copying.*$ +filetype devel-doc manifest$ +filetype devel-doc faq$ +filetype devel-doc building$ +filetype devel-doc howto$ +filetype devel-doc design$ +filetype devel-doc \.files$ +filetype devel-doc files$ +filetype devel-doc subdirs$ +filetype devel-doc maintainers$ +filetype devel-doc developers$ +filetype devel-doc contributors$ +filetype devel-doc thanks$ +filetype devel-doc releasing$ +filetype devel-doc test$ +filetype devel-doc testing$ +filetype devel-doc build$ +filetype devel-doc comments?$ +filetype devel-doc bugs$ +filetype devel-doc buglist$ +filetype devel-doc problems$ +filetype devel-doc debug$ +filetype devel-doc hacks$ +filetype devel-doc hacking$ +filetype devel-doc versions?$ +filetype devel-doc mappings$ +filetype devel-doc tips$ +filetype devel-doc ideas?$ +filetype devel-doc spec$ +filetype devel-doc compiling$ +filetype devel-doc notes$ +filetype devel-doc missing$ +filetype devel-doc done$ +filetype devel-doc \.omf$ # XML-based format used in GNOME +filetype devel-doc \.lsm$ +filetype devel-doc ^doxyfile$ +filetype devel-doc \.kdevprj$ +filetype devel-doc \.directory$ +filetype devel-doc \.dox$ +filetype devel-doc \.doap$ +# +# +# Building, compiling, configuration and CVS admin files +# +filetype build \.in.*$ +filetype build configure.*$ +filetype build makefile.*$ +filetype build config\.sub$ +filetype build config\.guess$ +filetype build config\.status$ +filetype build ltmain\.sh$ +filetype build autogen\.sh$ +filetype build config$ +filetype build conf$ +filetype build cvsignore$ +filetype build \.cfg$ +filetype build \.m4$ +filetype build \.mk$ +filetype build \.mak$ +filetype build 
\.make$ +filetype build \.mbx$ +filetype build \.protocol$ +filetype build \.version$ +filetype build mkinstalldirs$ +filetype build install-sh$ +filetype build rules$ +filetype build \.kdelnk$ +filetype build \.menu$ +filetype build linguas$ # Build translations +filetype build potfiles.*$ # Build translations +filetype build \.shlibs$ # Shared libraries +# filetype build %debian% +# filetype build %specs/% +filetype build \.spec$ # It seems theyre necessary for RPM build +filetype build \.def$ # build bootstrap for DLLs on win32 +# +# +# Documentation files +# +# filetype documentation doc/% +# filetype documentation %HOWTO% +filetype documentation \.html$ +filetype documentation \.txt$ +filetype documentation \.ps(\.gz|\.bz2)?$ +filetype documentation \.dvi(\.gz|\.bz2)?$ +filetype documentation \.lyx$ +filetype documentation \.tex$ +filetype documentation \.texi$ +filetype documentation \.pdf(\.gz|\.bz2)?$ +filetype documentation \.djvu$ +filetype documentation \.epub$ +filetype documentation \.sgml$ +filetype documentation \.docbook$ +filetype documentation \.wml$ +filetype documentation \.xhtml$ +filetype documentation \.phtml$ +filetype documentation \.shtml$ +filetype documentation \.htm$ +filetype documentation \.rdf$ +filetype documentation \.phtm$ +filetype documentation \.tmpl$ +filetype documentation \.ref$ # References +filetype documentation \.css$ +# filetype documentation %tutorial% +filetype documentation \.templates$ +filetype documentation \.dsl$ +filetype documentation \.ent$ +filetype documentation \.xml$ +filetype documentation \.xmi$ +filetype documentation \.xsl$ +filetype documentation \.entities$ +filetype documentation \.[1-7]$ # Man pages +filetype documentation \.man$ +filetype documentation \.manpages$ +filetype documentation \.doc$ +filetype documentation \.rtf$ +filetype documentation \.wpd$ +filetype documentation \.qt3$ +filetype documentation man\d?/.*\.\d$ +filetype documentation \.docs$ +filetype documentation \.sdw$ # 
OpenOffice.org Writer document +filetype documentation \.odt$ # OpenOffice.org document +filetype documentation \.en$ # Files in English language +filetype documentation \.de$ # Files in German +filetype documentation \.es$ # Files in Spanish +filetype documentation \.fr$ # Files in French +filetype documentation \.it$ # Files in Italian +filetype documentation \.cz$ # Files in Czech +filetype documentation \.page$ # Mallard +filetype documentation \.page.stub$ # Mallard stub +# +# +# Images +# +filetype image \.png$ +filetype image \.jpg$ +filetype image \.jpeg$ +filetype image \.bmp$ +filetype image \.gif$ +filetype image \.xbm$ +filetype image \.eps$ +filetype image \.mng$ +filetype image \.pnm$ +filetype image \.pbm$ +filetype image \.ppm$ +filetype image \.pgm$ +filetype image \.gbr$ +filetype image \.svg$ +filetype image \.fig$ +filetype image \.tif$ +filetype image \.swf$ +filetype image \.svgz$ +filetype image \.shape$ # XML files used for shapes for instance in Kivio +filetype image \.sml$ # XML files used for shapes for instance in Kivio +filetype image \.bdf$ # vfontcap - Vector Font Capability Database (VFlib Version 2 +filetype image \.ico$ +filetype image \.dia$ # We consider .dia as images, I dont want them in unknown +# +# +# Translation files +# +filetype translation \.po$ +filetype translation \.pot$ +filetype translation \.charset$ +filetype translation \.mo$ +# +# +# User interface files +# +filetype ui \.desktop$ +filetype ui \.ui$ +filetype ui \.xpm$ +filetype ui \.xcf$ +filetype ui \.3ds$ +filetype ui \.theme$ +filetype ui \.kimap$ +filetype ui \.glade$ +filetype ui \.gtkbuilder$ +filetype ui rc$ +# +# +# Sound files +# +filetype multimedia \.mp3$ +filetype multimedia \.ogg$ +filetype multimedia \.wav$ +filetype multimedia \.au$ +filetype multimedia \.mid$ +filetype multimedia \.vorbis$ +filetype multimedia \.midi$ +filetype multimedia \.arts$ +# +# +# Packages (yes, there are people who upload packages to the repo) +# +filetype package 
\.tar$ +filetype package \.tar.gz$ +filetype package \.tar.bz2$ +filetype package \.tar.xz$ +filetype package \.tgz$ +filetype package \.deb$ +filetype package \.rpm$ +filetype package \.srpm$ +filetype package \.ebuild$ |