#!/usr/bin/python
#
#
# This code is part of the LWN git data miner.
#
# Copyright 2007-9 LWN.net
# Copyright 2007-9 Jonathan Corbet
#
# This file may be distributed under the terms of the GNU General
# Public License, version 2.
#
# The script reads "git log"-style output on standard input, one
# changeset at a time, accumulates per-developer and per-employer
# statistics and hands them to the reports module.
#

import database, csv, ConfigFile, reports
import getopt, datetime
import os, re, sys, rfc822, string
from patterns import *

Today = datetime.date.today()

#
# Remember author names we have griped about.
#
GripedAuthorNames = [ ]

#
# Control options.
#
MapUnknown = 0
DevReports = 1
DateStats = 0
AuthorSOBs = 1
FileFilter = None
CSVFile = None
AkpmOverLt = 0
DumpDB = 0
CFName = 'gitdm.config'
DirName = ''

#
# Options:
#
# -a            Andrew Morton's signoffs shadow Linus's
# -b dir        Specify the base directory to fetch the configuration files
# -c cfile      Specify a configuration file
# -d            Output individual developer stats
# -D            Output date statistics
# -h hfile      HTML output to hfile
# -l count      Maximum length for output lists
# -o file       File for text output
# -r pattern    Restrict to files matching pattern
# -s            Ignore author SOB lines
# -u            Map unknown employers to '(Unknown)'
# -x file.csv   Export raw statistics as CSV
# -z            Dump out the hacker database at completion

def ParseOpts ():
    """Parse sys.argv and set the module-level control options.

    Output destinations (-h, -o, -l) are passed straight to the reports
    module; everything else is stored in the globals declared above.
    """
    global MapUnknown, DevReports
    global DateStats, AuthorSOBs, FileFilter, AkpmOverLt, DumpDB
    global CFName, CSVFile, DirName

    opts, rest = getopt.getopt (sys.argv[1:], 'ab:dc:Dh:l:o:r:sux:z')
    for opt in opts:
        if opt[0] == '-a':
            AkpmOverLt = 1
        elif opt[0] == '-b':
            DirName = opt[1]
        elif opt[0] == '-c':
            CFName = opt[1]
        elif opt[0] == '-d':
            # Note that -d turns the developer reports *off*.
            DevReports = 0
        elif opt[0] == '-D':
            DateStats = 1
        elif opt[0] == '-h':
            reports.SetHTMLOutput (open (opt[1], 'w'))
        elif opt[0] == '-l':
            reports.SetMaxList (int (opt[1]))
        elif opt[0] == '-o':
            reports.SetOutput (open (opt[1], 'w'))
        elif opt[0] == '-r':
            print ('Filter on "%s"' % (opt[1]))
            FileFilter = re.compile (opt[1])
        elif opt[0] == '-s':
            AuthorSOBs = 0
        elif opt[0] == '-u':
            MapUnknown = 1
        elif opt[0] == '-x':
            CSVFile = open (opt[1], 'w')
            print ("open output file " + opt[1] + "\n")
        elif opt[0] == '-z':
            DumpDB = 1


def LookupStoreHacker (name, email):
    """Return the hacker record for (name, email), creating it if needed.

    The lookup is by (remapped) email first, then by name; a name hit
    gets the new email attached to the existing record.
    """
    email = database.RemapEmail (email)
    h = database.LookupEmail (email)
    if h: # already there
        return h
    elist = database.LookupEmployer (email, MapUnknown)
    h = database.LookupName (name)
    if h: # new email
        h.addemail (email, elist)
        return h
    return database.StoreHacker(name, elist, email)

#
# Date tracking.
#

DateMap = { }

def AddDateLines(date, lines):
    """Add `lines` changed lines to the running total kept for `date`.

    Patches of more than 1000000 lines are reported and skipped.
    """
    if lines > 1000000:
        print ('Skip big patch (%d)' % lines)
        return
    DateMap[date] = DateMap.get (date, 0) + lines

def PrintDateStats():
    """Write per-date and cumulative changed-line counts to datelc.csv."""
    total = 0
    datef = open ('datelc.csv', 'w')
    # Make sure the file is closed (and so flushed) even if a write fails.
    try:
        datef.write('Date,Changed,Total Changed\n')
        for date in sorted (DateMap):
            total += DateMap[date]
            datef.write ('%d/%02d/%02d,%d,%d\n' % (date.year, date.month,
                                                 date.day, DateMap[date],
                                                 total))
    finally:
        datef.close ()

#
# Let's slowly try to move some smarts into this class.
#
class patch:
    """Everything gathered about a single changeset."""

    def __init__ (self, commit):
        self.commit = commit
        self.merge = self.added = self.removed = 0
        self.author = LookupStoreHacker('Unknown hacker', 'unknown@hacker.net')
        self.email = 'unknown@hacker.net'
        # Default date, so that a changeset for which no date line was
        # matched can still be processed by the main loop (which reads
        # p.date unconditionally).
        self.date = Today
        self.sobs = [ ]       # (email, hacker) tuples
        self.reviews = [ ]
        self.testers = [ ]
        self.reports = [ ]

    def addreviewer (self, reviewer):
        self.reviews.append (reviewer)

    def addtester (self, tester):
        self.testers.append (tester)

    def addreporter (self, reporter):
        self.reports.append (reporter)

#
# The core hack for grabbing the information about a changeset.
#
def grabpatch():
    """Read one changeset from stdin and return it as a patch object.

    The global NextLine always holds the next unprocessed input line, so
    that the commit line which terminates one changeset is still
    available as the start of the following one.  Returns None once the
    input is exhausted without finding another commit line.
    """
    global NextLine

    #
    # Skip ahead to the next commit line.
    #
    while True:
        m = Pcommit.match (NextLine)
        if m:
            break
        NextLine = sys.stdin.readline ()
        if not NextLine:
            return

    p = patch(m.group (1))
    NextLine = sys.stdin.readline ()
    # With a file filter active, diff lines are ignored until a matching
    # file header has been seen.
    ignore = (FileFilter is not None)
    while NextLine:
        Line = NextLine
        #
        # If this line starts a new commit, drop out.
        #
        m = Pcommit.match (Line)
        if m:
            break
        NextLine = sys.stdin.readline ()
        #
        # Maybe it's an author line?
        #
        m = Pauthor.match (Line)
        if m:
            p.email = database.RemapEmail (m.group (2))
            p.author = LookupStoreHacker(m.group (1), p.email)
            continue
        #
        # Could be a signed-off-by:
        #
        m = Psob.match (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            sobber = LookupStoreHacker(m.group (1), email)
            if sobber != p.author or AuthorSOBs:
                p.sobs.append ((email, LookupStoreHacker(m.group (1), m.group (2))))
            continue
        #
        # Various other tags of interest.
        #
        m = Preview.match (Line)  # Reviewed-by:
        if m:
            email = database.RemapEmail (m.group (2))
            p.addreviewer (LookupStoreHacker(m.group (1), email))
            continue
        m = Ptest.match (Line)    # Tested-by:
        if m:
            email = database.RemapEmail (m.group (2))
            p.addtester (LookupStoreHacker (m.group (1), email))
            # Credit goes to the author for this patch instance (not the
            # patch class itself).
            p.author.testcredit (p)
            continue
        m = Prep.match (Line)     # Reported-by:
        if m:
            email = database.RemapEmail (m.group (2))
            p.addreporter (LookupStoreHacker (m.group (1), email))
            p.author.reportcredit (p)
            continue
        m = Preptest.match (Line)  # Reported-and-tested-by:
        if m:
            email = database.RemapEmail (m.group (2))
            h = LookupStoreHacker (m.group (1), email)
            p.addreporter (h)
            p.addtester (h)
            p.author.reportcredit (p)
            p.author.testcredit (p)
            continue
        #
        # If this one is a merge, make note of the fact.
        #
        m = Pmerge.match (Line)
        if m:
            p.merge = 1
            continue
        #
        # See if it's the date.
        #
        m = Pdate.match (Line)
        if m:
            dt = rfc822.parsedate(m.group (2))
            if dt is None:
                # parsedate() returns None for a date it cannot parse;
                # complain and keep the default set in patch.__init__.
                sys.stderr.write ('Unparsable date: %s\n' % m.group (2))
                continue
            p.date = datetime.date (dt[0], dt[1], dt[2])
            if p.date > Today:
                sys.stderr.write ('Funky date: %s\n' % p.date)
                p.date = Today
            continue
        #
        # If we have a file filter, check for file lines.
        #
        if FileFilter:
            ignore = ApplyFileFilter (Line, ignore)
        #
        # OK, maybe it's part of the diff itself.
        #
        if not ignore:
            if Padd.match (Line):
                p.added += 1
                continue
            if Prem.match (Line):
                p.removed += 1

    if '@' in p.author.name:
        GripeAboutAuthorName (p.author.name)

    return p

def GripeAboutAuthorName (name):
    """Complain, once per name, about an author name containing '@'."""
    if name in GripedAuthorNames:
        return
    GripedAuthorNames.append (name)
    print ('%s is an author name, probably not what you want' % (name))

def ApplyFileFilter (line, ignore):
    """Return the new "ignore" state after looking at one input line.

    Lines which are not file header lines leave `ignore` unchanged.
    """
    #
    # If this is the first file line (--- a/), set ignore one way
    # or the other.
    #
    m = Pfilea.match (line)
    if m:
        fname = m.group (1)
        if FileFilter.search (fname):
            return 0
        return 1
    #
    # For the second line, we can turn ignore off, but not on
    #
    m = Pfileb.match (line)
    if m:
        fname = m.group (1)
        if FileFilter.search (fname):
            return 0
    return ignore

#
# If this patch is signed off by both Andrew Morton and Linus Torvalds,
# remove the (redundant) Linus signoff.
#
def TrimLTSOBs (p):
    """Drop the Linus signoff from p.sobs when Akpm signed off as well.

    Linus and Akpm are (email, hacker) tuples which only exist when the
    -a option was given, hence the AkpmOverLt test comes first.
    """
    if AkpmOverLt == 1 and Linus in p.sobs and Akpm in p.sobs:
        p.sobs.remove (Linus)

#
# Here starts the real program.
#
ParseOpts ()

#
# Read the config files.
#
ConfigFile.ConfigFile (CFName, DirName)

#
# Let's pre-seed the database with a couple of hackers
# we want to remember.
#
if AkpmOverLt == 1:
    Linus = ('torvalds@linux-foundation.org',
             LookupStoreHacker ('Linus Torvalds', 'torvalds@linux-foundation.org'))
    Akpm = ('akpm@linux-foundation.org',
            LookupStoreHacker ('Andrew Morton', 'akpm@linux-foundation.org'))

NextLine = sys.stdin.readline ()
TotalChanged = TotalAdded = TotalRemoved = 0

#
# Snarf changesets.
#
# The progress messages end in '\r' and no newline, so each one
# overwrites the previous one on a terminal.
#
sys.stderr.write ('Grabbing changesets...\r')

printcount = CSCount = 0
while True:
    if (printcount % 50) == 0:
        sys.stderr.write ('Grabbing changesets...%d\r' % printcount)
    printcount += 1
    p = grabpatch()
    if not p:
        break
#    if p.added > 100000 or p.removed > 100000:
#        print 'Skipping massive add', p.commit
#        continue
    if FileFilter and p.added == 0 and p.removed == 0:
        continue

    #
    # Record some global information - but only if this patch had
    # stuff which wasn't ignored.
    #
    if ((p.added + p.removed) > 0 or not FileFilter) and not p.merge:
        TotalAdded += p.added
        TotalRemoved += p.removed
        TotalChanged += max (p.added, p.removed)
        AddDateLines (p.date, max (p.added, p.removed))
        empl = p.author.emailemployer (p.email, p.date)
        empl.AddCSet (p)
        if AkpmOverLt:
            TrimLTSOBs (p)
        for sobemail, sobber in p.sobs:
            empl = sobber.emailemployer (sobemail, p.date)
            empl.AddSOB()

    if not p.merge:
        p.author.addpatch (p)
        for sobemail, sob in p.sobs:
            sob.addsob (p)
        for hacker in p.reviews:
            hacker.addreview (p)
        for hacker in p.testers:
            hacker.addtested (p)
        for hacker in p.reports:
            hacker.addreport (p)
        CSCount += 1
    csv.AccumulatePatch (p)
sys.stderr.write ('Grabbing changesets...done \n')

if DumpDB:
    database.DumpDB ()
#
# Say something
#
hlist = database.AllHackers ()
elist = database.AllEmployers ()
ndev = nempl = 0
for h in hlist:
    if len (h.patches) > 0:
        ndev += 1
for e in elist:
    if e.count > 0:
        nempl += 1
reports.Write ('Processed %d csets from %d developers\n' % (CSCount,
                                                            ndev))
reports.Write ('%d employers found\n' % (nempl))
reports.Write ('A total of %d lines added, %d removed (delta %d)\n' %
               (TotalAdded, TotalRemoved, TotalAdded - TotalRemoved))
if TotalChanged == 0:
    TotalChanged = 1 # HACK to avoid div by zero
if DateStats:
    PrintDateStats ()

csv.OutputCSV (CSVFile)
if CSVFile is not None:
    CSVFile.close ()

if DevReports:
    reports.DevReports (hlist, TotalChanged, CSCount, TotalRemoved)
reports.EmplReports (elist, TotalChanged, CSCount)