summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJonathan Corbet <corbet@lwn.net>2011-07-11 13:51:58 -0600
committerJonathan Corbet <corbet@lwn.net>2011-07-11 13:51:58 -0600
commit47ffed3ceed85f4e2c97cfec055f7ca4301616f3 (patch)
treef82252ba41cca26e6a9e974264240fcef6a16563
parent85004f0f9b73d7504606be5ccd0cc08688129eb1 (diff)
parent69f9ad7e643fa72da257d358821072f789dbc3ce (diff)
Merge branch 'refactoring' of git://gitorious.org/mining-tools/gitdm into german
-rw-r--r--ConfigFile.py141
-rw-r--r--README47
-rw-r--r--csv.py40
-rw-r--r--csvdump.py88
-rw-r--r--database.py19
-rwxr-xr-xgitdm195
-rw-r--r--gitdm.config5
-rw-r--r--logparser.py90
-rw-r--r--patterns.py54
-rw-r--r--reports.py44
-rw-r--r--sample-config/filetypes.txt362
11 files changed, 926 insertions, 159 deletions
diff --git a/ConfigFile.py b/ConfigFile.py
index 32a4aec..b6981a4 100644
--- a/ConfigFile.py
+++ b/ConfigFile.py
@@ -13,18 +13,42 @@
import sys, re, datetime, os.path
import database
-#
-# Read a line and strip out junk.
-#
-def ReadConfigLine (file):
- line = file.readline ()
- if not line:
- return None
- line = line.split('#')[0] # Get rid of any comments
- line = line.strip () # and extra white space
- if len (line) == 0: # we got rid of everything
- return ReadConfigLine (file)
- return line
+class ReadConfigLine:
+ """
+ ReadConfigLine provides an iterator to extract lines
+ from a config file, with comments and blank lines removed.
+
+ Typical use case:
+
+ fd = open(filename, 'r')
+ for line in ReadConfigLine(fd):
+ parse_line(line)
+ fd.close()
+ """
+
+ def __init__(self, fd):
+ self.fd = fd
+ self.buffer = None
+ self.patch = []
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ line = self.fd.readline()
+ while line:
+ line = line.split('#')[0] # Get rid of any comments
+ line = line.strip() # and extra white space
+ if len(line) == 0: # we got rid of everything
+ line = self.fd.readline()
+ else:
+ break
+
+ if not line:
+ raise StopIteration
+
+ return line
+
#
# Give up and die.
@@ -38,19 +62,19 @@ def croak (message):
#
def ReadEmailAliases (name):
try:
- file = open (name, 'r')
+ fd = open (name, 'r')
except IOError:
croak ('Unable to open email alias file %s' % (name))
- line = ReadConfigLine (file)
- while line:
+
+ for line in ReadConfigLine (fd):
m = re.match ('^("[^"]+"|\S+)\s+(.+)$', line)
if not m or len (m.groups ()) != 2:
croak ('Funky email alias line "%s"' % (line))
if m and m.group (2).find ('@') <= 0:
croak ('Non-addresses in email alias "%s"' % (line))
database.AddEmailAlias (m.group (1).replace ('"', ''), m.group (2))
- line = ReadConfigLine (file)
- file.close ()
+
+ fd.close ()
#
# The Email/Employer map
@@ -59,11 +83,11 @@ EMMpat = re.compile (r'^([^\s]+)\s+([^<]+)\s*(<\s*(\d+-\d+-\d+)\s*)?$')
def ReadEmailEmployers (name):
try:
- file = open (name, 'r')
+ fd = open (name, 'r')
except IOError:
croak ('Unable to open email/employer file %s' % (name))
- line = ReadConfigLine (file)
- while line:
+
+ for line in ReadConfigLine (fd):
m = EMMpat.match (line)
if not m:
croak ('Funky email/employer line "%s"' % (line))
@@ -71,8 +95,8 @@ def ReadEmailEmployers (name):
company = m.group (2).strip ()
enddate = ParseDate (m.group (4))
database.AddEmailEmployerMapping (email, company, enddate)
- line = ReadConfigLine (file)
- file.close ()
+
+ fd.close ()
def ParseDate (cdate):
if not cdate:
@@ -83,22 +107,22 @@ def ParseDate (cdate):
def ReadGroupMap (fname, employer):
try:
- file = open (fname, 'r')
+ fd = open (fname, 'r')
except IOError:
croak ('Unable to open group map file %s' % (fname))
- line = ReadConfigLine (file)
- while line:
+
+ for line in ReadConfigLine (fd):
database.AddEmailEmployerMapping (line, employer)
- line = ReadConfigLine (file)
- file.close ()
+
+ fd.close ()
#
# Read in a virtual employer description.
#
-def ReadVirtual (file, name):
+def ReadVirtual (fd, name):
ve = database.VirtualEmployer (name)
- line = ReadConfigLine (file)
- while line:
+
+ for line in ReadConfigLine (fd):
sl = line.split (None, 1)
first = sl[0]
if first == 'end':
@@ -116,23 +140,57 @@ def ReadVirtual (file, name):
if not (0 < percent <= 100):
croak ('Bad split value "%s" for virtual empl %s' % (first, name))
ve.addsplit (' '.join (sl[1:]), percent/100.0)
- line = ReadConfigLine (file)
#
# We should never get here
#
croak ('Missing "end" line for virtual employer %s' % (name))
#
+# Read file type patterns for more fine-grained reports
+#
+def ReadFileType (filename):
+ try:
+ fd = open (filename, 'r')
+ except IOError:
+ croak ('Unable to open file type mapping file %s' % (filename))
+ patterns = {}
+ order = []
+ regex_order = re.compile ('^order\s+(.*)$')
+ regex_file_type = re.compile ('^filetype\s+(\S+)\s+(.+)$')
+
+ for line in ReadConfigLine (fd):
+ o = regex_order.match (line)
+ if o:
+ # Consider only the first definition in the config file
+ elements = o.group(1).replace (' ', '')
+ order = order or elements.split(',')
+ continue
+
+ m = regex_file_type.match (line)
+ if not m or len (m.groups ()) != 2:
+ ConfigFile.croak ('Funky file type line "%s"' % (line))
+ if not patterns.has_key (m.group (1)):
+ patterns[m.group (1)] = []
+ if m.group (1) not in order:
+ print '%s not found, appended to the last order' % m.group (1)
+ order.append (m.group (1))
+
+ patterns[m.group (1)].append (re.compile (m.group (2), re.IGNORECASE))
+
+ fd.close ()
+ return patterns, order
+
+#
# Read an overall config file.
#
def ConfigFile (name, confdir):
try:
- file = open (name, 'r')
+ fd = open (name, 'r')
except IOError:
croak ('Unable to open config file %s' % (name))
- line = ReadConfigLine (file)
- while line:
+
+ for line in ReadConfigLine (fd):
sline = line.split (None, 2)
if len (sline) < 2:
croak ('Funky config line: "%s"' % (line))
@@ -146,7 +204,20 @@ def ConfigFile (name, confdir):
ReadGroupMap (os.path.join (confdir, sline[1]), sline[2])
elif sline[0] == 'VirtualEmployer':
ReadVirtual (file, ' '.join (sline[1:]))
+ elif sline[0] == 'FileTypeMap':
+ patterns, order = ReadFileType (os.path.join (confdir, sline[1]))
+ database.FileTypes = database.FileType (patterns, order)
else:
croak ('Unrecognized config line: "%s"' % (line))
- line = ReadConfigLine (file)
+
+
+if __name__ == '__main__':
+ '''Test the iterato for reading configuration files'''
+ try:
+ fd = open(sys.argv[1])
+ except:
+ croak('Usage: %s <config-file>' % sys.argv[0])
+
+ for line in ReadConfigLine(fd):
+ print line
diff --git a/README b/README
index 7226541..dab372e 100644
--- a/README
+++ b/README
@@ -20,6 +20,10 @@ Run it like this:
git log -p -M [details] | gitdm [options]
+Alternatively, you can run with:
+
+ git log --numstat -M [details] | gitdm -n [options]
+
The [details] tell git which changesets are of interest; the [options] can
be:
@@ -32,26 +36,35 @@ be:
By default, "./gitdm.config" is used.
-d Omit the developer reports, giving employer information
- only.
+ only.
- -D Rather than create the usual statistics, create a
- file (datelc) providing lines changed per day, where the first column
- displays the changes happened only on that day and the second sums
- the day it happnened with the previous ones. This option is suitable
- for feeding to a tool like gnuplot.
+ -D Rather than create the usual statistics, create a file (datelc.csv)
+ providing lines changed per day, where the first column displays
+ the changes that happened only on that day and the second is the
+ running total of that day and all previous ones. This option is
+ suitable for feeding to a tool like gnuplot.
-h file Generate HTML output to the given file
-l num Only list the top <num> entries in each report.
+ -n Use --numstat instead of generated patches to get the statistics.
+
-o file Write text output to the given file (default is stdout).
+ -p prefix Dump out the database categorized by changeset and by file type.
+ It requires -n, otherwise it is not possible to get separated results.
+
-r pat Only generate statistics for changes to files whose
name matches the given regular expression.
-s Ignore Signed-off-by lines which match the author of
each patch.
+ -t Generate a report by type of contribution (code, documentation, etc.).
+ It requires -n, otherwise this option is ignored silently.
+
+
-u Group all unknown developers under the "(Unknown)"
employer.
@@ -68,6 +81,10 @@ looks like:
git log -p -M v2.6.19..v2.6.20 | \
gitdm -u -s -a -o results -h results.html
+or:
+
+ git log --numstat -M v2.6.19..v2.6.20 | \
+ gitdm -u -s -a -n -o results -h results.html
CONFIGURATION FILE
@@ -134,6 +151,24 @@ end
for example, no check to ensure that the percentages add up to
something rational.
+FileTypeMap file
+
+ Map file names/extensions onto file types. These files contain lines
+ like:
+
+ order <type1>,<type2>,...,<typeN>
+
+ filetype <type> <regex>
+ ...
+
+ This construct allows fine-grained reports by type of contribution
+ (build, code, image, multimedia, documentation, etc.)
+
+ Order is important because patterns can overlap, so a file name may
+ match more than one type. For instance, ltmain.sh fits better as 'build'
+ than as 'code' (matching the filename rather than '\.sh$'). The first
+ element in order has precedence over the next ones.
+
OTHER TOOLS
diff --git a/csv.py b/csv.py
deleted file mode 100644
index cec1f06..0000000
--- a/csv.py
+++ /dev/null
@@ -1,40 +0,0 @@
-#
-# aggregate per-month statistics for people
-#
-import sys, datetime
-
-class CSVStat:
- def __init__ (self, name, employer, date):
- self.name = name
- self.employer = employer
- self.added = self.removed = 0
- self.date = date
- def accumulate (self, p):
- self.added = self.added + p.added
- self.removed = self.removed + p.removed
-
-PeriodCommitHash = { }
-
-def AccumulatePatch (p, Aggregate):
- date = "%.2d-%.2d-01"%(p.date.year, p.date.month)
- if (Aggregate == 'week'):
- date = "%.2d-%.2d"%(p.date.isocalendar()[0], p.date.isocalendar()[1])
- authdatekey = "%s-%s"%(p.author.name, date)
- if authdatekey not in PeriodCommitHash:
- empl = p.author.emailemployer (p.email, p.date)
- stat = CSVStat (p.author.name, empl, date)
- PeriodCommitHash[authdatekey] = stat
- else:
- stat = PeriodCommitHash[authdatekey]
- stat.accumulate (p)
-
-def OutputCSV (file):
- if file is None:
- return
- file.write ("Name\tAffliation\tDate\tAdded\tRemoved\n")
- for date, stat in PeriodCommitHash.items():
- # sanitise names " is common and \" sometimes too
- empl_name = stat.employer.name.replace ("\"", ".").replace ("\\", ".")
- author_name = stat.name.replace ("\"", ".").replace ("\\", ".")
- file.write ("\"%s\"\t\"%s\"\t%s\t%d\t%d\n"%(author_name, empl_name, stat.date, \
- stat.added, stat.removed))
diff --git a/csvdump.py b/csvdump.py
new file mode 100644
index 0000000..b76a5f6
--- /dev/null
+++ b/csvdump.py
@@ -0,0 +1,88 @@
+#
+# aggregate per-month statistics for people
+#
+import sys, datetime
+import csv
+
+class CSVStat:
+ def __init__ (self, name, email, employer, date):
+ self.name = name
+ self.email = email
+ self.employer = employer
+ self.added = self.removed = 0
+ self.date = date
+ def accumulate (self, p):
+ self.added = self.added + p.added
+ self.removed = self.removed + p.removed
+
+PeriodCommitHash = { }
+
+def AccumulatePatch (p, Aggregate):
+ date = "%.2d-%.2d-01"%(p.date.year, p.date.month)
+ if (Aggregate == 'week'):
+ date = "%.2d-%.2d"%(p.date.isocalendar()[0], p.date.isocalendar()[1])
+ authdatekey = "%s-%s"%(p.author.name, date)
+ if authdatekey not in PeriodCommitHash:
+ empl = p.author.emailemployer (p.email, p.date)
+ stat = CSVStat (p.author.name, p.email, empl, date)
+ PeriodCommitHash[authdatekey] = stat
+ else:
+ stat = PeriodCommitHash[authdatekey]
+ stat.accumulate (p)
+
+ChangeSets = []
+FileTypes = []
+
+def store_patch(patch):
+ if not patch.merge:
+ employer = patch.author.emailemployer(patch.email, patch.date)
+ employer = employer.name.replace('"', '.').replace ('\\', '.')
+ author = patch.author.name.replace ('"', '.').replace ('\\', '.')
+ author = patch.author.name.replace ("'", '.')
+ try:
+ domain = patch.email.split('@')[1]
+ except:
+ domain = patch.email
+ ChangeSets.append([patch.commit, str(patch.date),
+ patch.email, domain, author, employer,
+ patch.added, patch.removed])
+ for (filetype, (added, removed)) in patch.filetypes.iteritems():
+ FileTypes.append([patch.commit, filetype, added, removed])
+
+
+def save_csv (prefix='data'):
+ # Dump the ChangeSets
+ if len(ChangeSets) > 0:
+ fd = open('%s-changesets.csv' % prefix, 'w')
+ writer = csv.writer (fd, quoting=csv.QUOTE_NONNUMERIC)
+ writer.writerow (['Commit', 'Date', 'Domain',
+ 'Email', 'Name', 'Affliation',
+ 'Added', 'Removed'])
+ for commit in ChangeSets:
+ writer.writerow(commit)
+
+ # Dump the file types
+ if len(FileTypes) > 0:
+ fd = open('%s-filetypes.csv' % prefix, 'w')
+ writer = csv.writer (fd, quoting=csv.QUOTE_NONNUMERIC)
+
+ writer.writerow (['Commit', 'Type', 'Added', 'Removed'])
+ for commit in FileTypes:
+ writer.writerow(commit)
+
+
+
+def OutputCSV (file):
+ if file is None:
+ return
+ writer = csv.writer (file, quoting=csv.QUOTE_NONNUMERIC)
+ writer.writerow (['Name', 'Email', 'Affliation', 'Date',
+ 'Added', 'Removed'])
+ for date, stat in PeriodCommitHash.items():
+ # sanitise names " is common and \" sometimes too
+ empl_name = stat.employer.name.replace ('"', '.').replace ('\\', '.')
+ author_name = stat.name.replace ('"', '.').replace ('\\', '.')
+ writer.writerow ([author_name, stat.email, empl_name, stat.date,
+ stat.added, stat.removed])
+
+__all__ = [ 'AccumulatePatch', 'OutputCSV', 'store_patch' ]
diff --git a/database.py b/database.py
index b5d9382..6a62adc 100644
--- a/database.py
+++ b/database.py
@@ -188,6 +188,25 @@ class VirtualEmployer (Employer):
# Should check that they add up too, but I'm lazy
Employers[self.name] = self
+class FileType:
+ def __init__ (self, patterns={}, order=[]):
+ self.patterns = patterns
+ self.order = order
+
+ def guess_file_type (self, filename, patterns=None, order=None):
+ patterns = patterns or self.patterns
+ order = order or self.order
+
+ for file_type in order:
+ if patterns.has_key (file_type):
+ for patt in patterns[file_type]:
+ if patt.search (filename):
+ return file_type
+
+ return 'unknown'
+
+FileTypes = None
+
#
# Mix all the virtual employers into their real destinations.
#
diff --git a/gitdm b/gitdm
index a5b9c4a..41634e6 100755
--- a/gitdm
+++ b/gitdm
@@ -1,4 +1,5 @@
#!/usr/bin/pypy
+#-*- coding:utf-8 -*-
#
#
@@ -6,15 +7,17 @@
#
# Copyright 2007-11 Eklektix, Inc.
# Copyright 2007-11 Jonathan Corbet <corbet@lwn.net>
+# Copyright 2011 Germán Póo-Caamaño <gpoo@gnome.org>
#
# This file may be distributed under the terms of the GNU General
# Public License, version 2.
-import database, csv, ConfigFile, reports
+import database, csvdump, ConfigFile, reports
import getopt, datetime
import os, re, sys, rfc822, string
-from patterns import *
+import logparser
+from patterns import patterns
Today = datetime.date.today()
@@ -32,11 +35,14 @@ DateStats = 0
AuthorSOBs = 1
FileFilter = None
CSVFile = None
+CSVPrefix = None
AkpmOverLt = 0
DumpDB = 0
CFName = 'gitdm.config'
DirName = ''
Aggregate = 'month'
+Numstat = 0
+ReportByFileType = 0
#
# Options:
@@ -48,7 +54,9 @@ Aggregate = 'month'
# -D Output date statistics
# -h hfile HTML output to hfile
# -l count Maximum length for output lists
+# -n Use numstats instead of generated patch from git log
# -o file File for text output
+# -p prefix Prefix for CSV output
# -r pattern Restrict to files matching pattern
# -s Ignore author SOB lines
# -u Map unknown employers to '(Unknown)'
@@ -59,9 +67,10 @@ Aggregate = 'month'
def ParseOpts ():
global MapUnknown, DevReports
global DateStats, AuthorSOBs, FileFilter, AkpmOverLt, DumpDB
- global CFName, CSVFile, DirName, Aggregate
+ global CFName, CSVFile, CSVPrefix,DirName, Aggregate, Numstat
+ global ReportByFileType
- opts, rest = getopt.getopt (sys.argv[1:], 'ab:dc:Dh:l:o:r:suwx:z')
+ opts, rest = getopt.getopt (sys.argv[1:], 'ab:dc:Dh:l:no:p:r:stuwx:z')
for opt in opts:
if opt[0] == '-a':
AkpmOverLt = 1
@@ -77,13 +86,19 @@ def ParseOpts ():
reports.SetHTMLOutput (open (opt[1], 'w'))
elif opt[0] == '-l':
reports.SetMaxList (int (opt[1]))
+ elif opt[0] == '-n':
+ Numstat = 1
elif opt[0] == '-o':
reports.SetOutput (open (opt[1], 'w'))
+ elif opt[0] == '-p':
+ CSVPrefix = opt[1]
elif opt[0] == '-r':
print 'Filter on "%s"' % (opt[1])
FileFilter = re.compile (opt[1])
elif opt[0] == '-s':
AuthorSOBs = 0
+ elif opt[0] == '-t':
+ ReportByFileType = 1
elif opt[0] == '-u':
MapUnknown = 1
elif opt[0] == '-x':
@@ -139,6 +154,8 @@ def PrintDateStats():
# Let's slowly try to move some smarts into this class.
#
class patch:
+ (ADDED, REMOVED) = range (2)
+
def __init__ (self, commit):
self.commit = commit
self.merge = self.added = self.removed = 0
@@ -148,6 +165,7 @@ class patch:
self.reviews = [ ]
self.testers = [ ]
self.reports = [ ]
+ self.filetypes = {}
def addreviewer (self, reviewer):
self.reviews.append (reviewer)
@@ -157,36 +175,57 @@ class patch:
def addreporter (self, reporter):
self.reports.append (reporter)
+
+ def addfiletype (self, filetype, added, removed):
+ if self.filetypes.has_key (filetype):
+ self.filetypes[filetype][self.ADDED] += added
+ self.filetypes[filetype][self.REMOVED] += removed
+ else:
+ self.filetypes[filetype] = [added, removed]
+
+def parse_numstat(line, file_filter):
+ """
+ Receive a line of text, determine whether it is a numstat line, and
+ parse the added and removed line counts as well as the file type.
+ """
+ m = patterns['numstat'].match (line)
+ if m:
+ filename = m.group (3)
+ # If we have a file filter, check for file lines.
+ if file_filter and not file_filter.search (filename):
+ return None, None, None, None
+
+ try:
+ added = int (m.group (1))
+ removed = int (m.group (2))
+ except ValueError:
+ # A binary file (image, etc.) is marked with '-'
+ added = removed = 0
+
+ m = patterns['rename'].match (filename)
+ if m:
+ filename = '%s%s%s' % (m.group (1), m.group (3), m.group (4))
+
+ filetype = database.FileTypes.guess_file_type (os.path.basename(filename))
+ return filename, filetype, added, removed
+ else:
+ return None, None, None, None
+
#
# The core hack for grabbing the information about a changeset.
#
-def grabpatch():
- global NextLine
-
- while (1):
- m = Pcommit.match (NextLine)
- if m:
- break;
- NextLine = sys.stdin.readline ()
- if not NextLine:
- return
+def grabpatch(logpatch):
+ m = patterns['commit'].match (logpatch[0])
+ if not m:
+ return None
p = patch(m.group (1))
- NextLine = sys.stdin.readline ()
ignore = (FileFilter is not None)
- while NextLine:
- Line = NextLine
- #
- # If this line starts a new commit, drop out.
- #
- m = Pcommit.match (Line)
- if m:
- break
- NextLine = sys.stdin.readline ()
+ for Line in logpatch[1:]:
#
# Maybe it's an author line?
#
- m = Pauthor.match (Line)
+ m = patterns['author'].match (Line)
if m:
p.email = database.RemapEmail (m.group (2))
p.author = LookupStoreHacker(m.group (1), p.email)
@@ -194,7 +233,7 @@ def grabpatch():
#
# Could be a signed-off-by:
#
- m = Psob.match (Line)
+ m = patterns['signed-off-by'].match (Line)
if m:
email = database.RemapEmail (m.group (2))
sobber = LookupStoreHacker(m.group (1), email)
@@ -204,24 +243,26 @@ def grabpatch():
#
# Various other tags of interest.
#
- m = Preview.match (Line) # Reviewed-by:
+ m = patterns['reviewed-by'].match (Line)
if m:
email = database.RemapEmail (m.group (2))
p.addreviewer (LookupStoreHacker(m.group (1), email))
continue
- m = Ptest.match (Line) # Tested-by:
+ m = patterns['tested-by'].match (Line)
if m:
email = database.RemapEmail (m.group (2))
p.addtester (LookupStoreHacker (m.group (1), email))
p.author.testcredit (patch)
continue
- m = Prep.match (Line) # Reported-by:
+ # Reported-by:
+ m = patterns['reported-by'].match (Line)
if m:
email = database.RemapEmail (m.group (2))
p.addreporter (LookupStoreHacker (m.group (1), email))
p.author.reportcredit (patch)
continue
- m = Preptest.match (Line) # Reported-and-tested-by:
+ # Reported-and-tested-by:
+ m = patterns['reported-and-tested-by'].match (Line)
if m:
email = database.RemapEmail (m.group (2))
h = LookupStoreHacker (m.group (1), email)
@@ -233,14 +274,14 @@ def grabpatch():
#
# If this one is a merge, make note of the fact.
#
- m = Pmerge.match (Line)
+ m = patterns['merge'].match (Line)
if m:
p.merge = 1
continue
#
# See if it's the date.
#
- m = Pdate.match (Line)
+ m = patterns['date'].match (Line)
if m:
dt = rfc822.parsedate(m.group (2))
p.date = datetime.date (dt[0], dt[1], dt[2])
@@ -248,20 +289,29 @@ def grabpatch():
sys.stderr.write ('Funky date: %s\n' % p.date)
p.date = Today
continue
- #
- # If we have a file filter, check for file lines.
- #
- if FileFilter:
- ignore = ApplyFileFilter (Line, ignore)
- #
- # OK, maybe it's part of the diff itself.
- #
- if not ignore:
- if Padd.match (Line):
- p.added += 1
- continue
- if Prem.match (Line):
- p.removed += 1
+ if not Numstat:
+ #
+ # If we have a file filter, check for file lines.
+ #
+ if FileFilter:
+ ignore = ApplyFileFilter (Line, ignore)
+ #
+ # OK, maybe it's part of the diff itself.
+ #
+ if not ignore:
+ if patterns['add'].match (Line):
+ p.added += 1
+ continue
+ if patterns['rem'].match (Line):
+ p.removed += 1
+ else:
+ # Get the statistics (lines added/removed) using numstat
+ # and without requiring a diff (--numstat instead of -p)
+ (filename, filetype, added, removed) = parse_numstat (Line, FileFilter)
+ if filename:
+ p.added += added
+ p.removed += removed
+ p.addfiletype (filetype, added, removed)
if '@' in p.author.name:
GripeAboutAuthorName (p.author.name)
@@ -279,7 +329,7 @@ def ApplyFileFilter (line, ignore):
# If this is the first file line (--- a/), set ignore one way
# or the other.
#
- m = Pfilea.match (line)
+ m = patterns['filea'].match (line)
if m:
file = m.group (1)
if FileFilter.search (file):
@@ -288,13 +338,29 @@ def ApplyFileFilter (line, ignore):
#
# For the second line, we can turn ignore off, but not on
#
- m = Pfileb.match (line)
+ m = patterns['fileb'].match (line)
if m:
file = m.group (1)
if FileFilter.search (file):
return 0
return ignore
+def is_svntag(logpatch):
+ """
+ This is a workaround for a bug in the migration from
+ Subversion to Git found in GNOME. It may happen in other
+ repositories as well.
+ """
+
+ for Line in logpatch:
+ m = patterns['svn-tag'].match(Line.strip())
+ if m:
+ sys.stderr.write ('(W) detected a commit on a svn tag: %s\n' %
+ (m.group (0),))
+ return True
+
+ return False
+
#
# If this patch is signed off by both Andrew Morton and Linus Torvalds,
# remove the (redundant) Linus signoff.
@@ -324,7 +390,6 @@ if AkpmOverLt == 1:
Akpm = ('akpm@linux-foundation.org',
LookupStoreHacker ('Andrew Morton', 'akpm@linux-foundation.org'))
-NextLine = sys.stdin.readline ()
TotalChanged = TotalAdded = TotalRemoved = 0
#
@@ -332,12 +397,23 @@ TotalChanged = TotalAdded = TotalRemoved = 0
#
print >> sys.stderr, 'Grabbing changesets...\r',
+patches = logparser.LogPatchSplitter(sys.stdin)
printcount = CSCount = 0
-while (1):
+
+for logpatch in patches:
if (printcount % 50) == 0:
print >> sys.stderr, 'Grabbing changesets...%d\r' % printcount,
printcount += 1
- p = grabpatch()
+
+ # We want to ignore commits on svn tags, since in Subversion a
+ # tag is a copy of the whole repository, which leads to wrong
+ # results. Some migrations from Subversion to Git do not catch
+ # all of these tag copies and import them as just one new,
+ # big changeset.
+ if is_svntag(logpatch):
+ continue
+
+ p = grabpatch(logpatch)
if not p:
break
# if p.added > 100000 or p.removed > 100000:
@@ -373,8 +449,9 @@ while (1):
hacker.addtested (p)
for hacker in p.reports:
hacker.addreport (p)
- CSCount += 1
- csv.AccumulatePatch (p, Aggregate)
+ CSCount += 1
+ csvdump.AccumulatePatch (p, Aggregate)
+ csvdump.store_patch (p)
print >> sys.stderr, 'Grabbing changesets...done '
if DumpDB:
@@ -403,10 +480,16 @@ if TotalChanged == 0:
if DateStats:
PrintDateStats ()
-csv.OutputCSV (CSVFile)
-if CSVFile is not None:
- CSVFile.close ()
+if CSVPrefix:
+ csvdump.save_csv (CSVPrefix)
+
+if CSVFile:
+ csvdump.OutputCSV (CSVFile)
+ CSVFile.close ()
if DevReports:
reports.DevReports (hlist, TotalChanged, CSCount, TotalRemoved)
reports.EmplReports (elist, TotalChanged, CSCount)
+
+if ReportByFileType and Numstat:
+ reports.ReportByFileType (hlist)
diff --git a/gitdm.config b/gitdm.config
index 588d6ef..3ae2f20 100644
--- a/gitdm.config
+++ b/gitdm.config
@@ -20,3 +20,8 @@ EmailMap sample-config/domain-map
#
# GroupMap sample-config/illuminati The Illuminati
#
+#
+# Use FileTypeMap to map file names to file types using regular
+# expressions.
+#
+FileTypeMap sample-config/filetypes.txt
diff --git a/logparser.py b/logparser.py
new file mode 100644
index 0000000..b375034
--- /dev/null
+++ b/logparser.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+#-*- coding:utf-8 -*-
+#
+# Copyright © 2009 Germán Póo-Caamaño <gpoo@gnome.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+
+import sys
+from patterns import patterns
+
+class LogPatchSplitter:
+ """
+ LogPatchSplitter provides an iterator to extract every
+ changeset from a git log output.
+
+ Typical use case:
+
+ patches = LogPatchSplitter(sys.stdin)
+
+ for patch in patches:
+ parse_patch(patch)
+ """
+
+ def __init__(self, fd):
+ self.fd = fd
+ self.buffer = None
+ self.patch = []
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ patch = self.__grab_patch__()
+ if not patch:
+ raise StopIteration
+ return patch
+
+ def __grab_patch__(self):
+ """
+ Extract a patch from the file descriptor and the
+ patch is returned as a list of lines.
+ """
+
+ patch = []
+ line = self.buffer or self.fd.readline()
+
+ while line:
+ m = patterns['commit'].match(line)
+ if m:
+ patch = [line]
+ break
+ line = self.fd.readline()
+
+ if not line:
+ return None
+
+ line = self.fd.readline()
+ while line:
+ # If this line starts a new commit, drop out.
+ m = patterns['commit'].match(line)
+ if m:
+ self.buffer = line
+ break
+
+ patch.append(line)
+ self.buffer = None
+ line = self.fd.readline()
+
+ return patch
+
+
+if __name__ == '__main__':
+ patches = LogPatchSplitter(sys.stdin)
+
+ for patch in patches:
+ print '---------- NEW PATCH ----------'
+ for line in patch:
+ print line,
diff --git a/patterns.py b/patterns.py
index e63efb6..803e532 100644
--- a/patterns.py
+++ b/patterns.py
@@ -1,10 +1,12 @@
#
+# -*- coding:utf-8 -*-
# Pull together regular expressions used in multiple places.
#
# This code is part of the LWN git data miner.
#
# Copyright 2007-11 Eklektix, Inc.
# Copyright 2007-11 Jonathan Corbet <corbet@lwn.net>
+# Copyright 2011 Germán Póo-Caamaño <gpoo@gnome.org>
#
# This file may be distributed under the terms of the GNU General
# Public License, version 2.
@@ -16,24 +18,34 @@ import re
# expressions." Now they have two problems.
# -- Jamie Zawinski
#
-Pemail = r'\s+"?([^<"]+)"?\s<([^>]+)>' # just email addr + name
-Pcommit = re.compile (r'^commit ([0-9a-f ]+)$')
-Pauthor = re.compile (r'^Author:' + Pemail + '$')
-Psob = re.compile (r'^\s+Signed-off-by:' + Pemail + '.*$')
-Pmerge = re.compile (r'^Merge:.*$')
-Padd = re.compile (r'^\+[^+].*$')
-Prem = re.compile (r'^-[^-].*$')
-Pdate = re.compile (r'^(Commit)?Date:\s+(.*)$')
-Pfilea = re.compile (r'^---\s+(.*)$')
-Pfileb = re.compile (r'^\+\+\+\s+(.*)$')
-Preview = re.compile (r'^\s+Reviewed-by:' + Pemail + '.*$')
-Ptest = re.compile (r'^\s+tested-by:' + Pemail + '.*$', re.I)
-Prep = re.compile (r'^\s+Reported-by:' + Pemail + '.*$')
-Preptest = re.compile (r'^\s+reported-and-tested-by:' + Pemail + '.*$', re.I)
-#
-# Merges are described with a variety of lines.
-#
-PExtMerge = re.compile(r'^ +Merge( branch .* of)? ([^ ]+:[^ ]+)\n$')
-PIntMerge = re.compile(r'^ +(Merge|Pull) .* into .*$')
-# PIntMerge2 = re.compile(r"^ +Merge branch(es)? '.*$")
-PIntMerge2 = re.compile(r"^ +Merge .*$")
+_pemail = r'\s+"?([^<"]+)"?\s<([^>]+)>' # just email addr + name
+
+patterns = {
+ 'commit': re.compile (r'^commit ([0-9a-f ]+)$'),
+ 'author': re.compile (r'^Author:' + _pemail + '$'),
+ 'signed-off-by': re.compile (r'^\s+Signed-off-by:' + _pemail + '.*$'),
+ 'merge': re.compile (r'^Merge:.*$'),
+ 'add': re.compile (r'^\+[^+].*$'),
+ 'rem': re.compile (r'^-[^-].*$'),
+ 'date': re.compile (r'^(Commit)?Date:\s+(.*)$'),
+ # filea, fileb are used only in 'patch mode' (-p)
+ 'filea': re.compile (r'^---\s+(.*)$'),
+ 'fileb': re.compile (r'^\+\+\+\s+(.*)$'),
+ 'reviewed-by': re.compile (r'^\s+Reviewed-by:' + _pemail+ '.*$'),
+ 'tested-by': re.compile (r'^\s+tested-by:' + _pemail + '.*$', re.I),
+ 'reported-by': re.compile (r'^\s+Reported-by:' + _pemail + '.*$'),
+ 'reported-and-tested-by': re.compile (r'^\s+reported-and-tested-by:' + _pemail + '.*$', re.I),
+ #
+ # Merges are described with a variety of lines.
+ #
+ 'ExtMerge': re.compile(r'^ +Merge( branch .* of)? ([^ ]+:[^ ]+)\n$'),
+ 'IntMerge': re.compile(r'^ +(Merge|Pull) .* into .*$'),
+ # PIntMerge2 = re.compile(r"^ +Merge branch(es)? '.*$"),
+ 'IntMerge2': re.compile(r"^ +Merge .*$"),
+ # Another way to get the statistics (per file).
+ # It implies --numstat
+ 'numstat': re.compile('^(\d+|-)\s+(\d+|-)\s+(.*)$'),
+ 'rename' : re.compile('(.*)\{(.*) => (.*)\}(.*)'),
+ # Detect errors on svn conversions
+ 'svn-tag': re.compile("^svn path=/tags/(.*)/?; revision=([0-9]+)$"),
+}
diff --git a/reports.py b/reports.py
index 268fe0a..9b8cce9 100644
--- a/reports.py
+++ b/reports.py
@@ -340,4 +340,46 @@ def EmplReports (elist, totalchanged, cscount):
ReportByELChanged (elist, totalchanged)
ReportByESOBs (elist)
ReportByEHackers (elist)
-
+
+def ReportByFileType (hacker_list):
+ total = {}
+ total_by_hacker = {}
+
+ BeginReport ('Developer contributions by type')
+ for h in hacker_list:
+ by_hacker = {}
+ for patch in h.patches:
+ # Get a summary by hacker
+ for (filetype, (added, removed)) in patch.filetypes.iteritems():
+ if by_hacker.has_key(filetype):
+ by_hacker[filetype][patch.ADDED] += added
+ by_hacker[filetype][patch.REMOVED] += removed
+ else:
+ by_hacker[filetype] = [added, removed]
+
+ # Update the totals
+ if total.has_key(filetype):
+ total[filetype][patch.ADDED] += added
+ total[filetype][patch.REMOVED] += removed
+ else:
+ total[filetype] = [added, removed, []]
+
+ # Print a summary by hacker
+ print h.name
+ for filetype, counters in by_hacker.iteritems():
+ print '\t', filetype, counters
+ h_added = by_hacker[filetype][patch.ADDED]
+ h_removed = by_hacker[filetype][patch.REMOVED]
+ total[filetype][2].append ([h.name, h_added, h_removed])
+
+ # Print the global summary
+ BeginReport ('Contributions by type and developers')
+ for filetype, (added, removed, hackers) in total.iteritems():
+ print filetype, added, removed
+ for h, h_added, h_removed in hackers:
+ print '\t%s: [%d, %d]' % (h, h_added, h_removed)
+
+ # Print the very global summary
+ BeginReport ('General contributions by type')
+ for filetype, (added, removed, hackers) in total.iteritems():
+ print filetype, added, removed
diff --git a/sample-config/filetypes.txt b/sample-config/filetypes.txt
new file mode 100644
index 0000000..e24c396
--- /dev/null
+++ b/sample-config/filetypes.txt
@@ -0,0 +1,362 @@
+# -*- coding:utf-8 -*-
+# Copyright (C) 2006 Libresoft
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# Authors : Gregorio Robles <grex@gsyc.escet.urjc.es>
+# Authors : Germán Póo-Caamaño <gpoo@gnome.org>
+#
+# This file contains the association parameters for filetypes
+# (documentation, development, multimedia, images...)
+#
+# format:
+# filetype <type> <regex> [<comment>]
+#
+# Order:
+# The list should keep an order, so filetypes can be counted properly.
+# i.e. we want ltmain.sh -> 'build' instead of 'code'.
+#
+# If there is a filetype which is not in the order but has values, it will
+# be added at the end.
+#
+order image,translation,ui,multimedia,package,build,code,documentation,devel-doc
+#
+#
+# Code files (headers and the like included)
+# (most common languages first)
+#
+filetype code \.c$ # C
+filetype code \.pc$ # C
+filetype code \.ec$ # C
+filetype code \.ecp$ # C
+filetype code \.C$ # C++
+filetype code \.cpp$ # C++
+filetype code \.c\+\+$ # C++
+filetype code \.cxx$ # C++
+filetype code \.cc$ # C++
+filetype code \.pcc$ # C++
+filetype code \.cpy$ # C++
+filetype code \.h$ # C or C++ header
+filetype code \.hh$ # C++ header
+filetype code \.hpp$ # C++ header
+filetype code \.hxx$ # C++ header
+filetype code \.sh$ # Shell
+filetype code \.pl$ # Perl
+filetype code \.pm$ # Perl
+filetype code \.pod$ # Perl
+filetype code \.perl$ # Perl
+filetype code \.cgi$ # CGI
+filetype code \.php$ # PHP
+filetype code \.php3$ # PHP
+filetype code \.php4$ # PHP
+filetype code \.inc$ # PHP
+filetype code \.py$ # Python
+filetype code \.java$ # Java
+filetype code \.class$ # Java Class (or at least a class in some OOPL)
+filetype code \.ada$ # ADA
+filetype code \.ads$ # ADA
+filetype code \.adb$ # ADA
+filetype code \.pad$ # ADA
+filetype code \.s$ # Assembly
+filetype code \.S$ # Assembly
+filetype code \.asm$ # Assembly
+filetype code \.awk$ # awk
+filetype code \.cs$ # C#
+filetype code \.csh$ # CShell (including tcsh)
+filetype code \.cob$ # COBOL
+filetype code \.cbl$ # COBOL
+filetype code \.COB$ # COBOL
+filetype code \.CBL$ # COBOL
+filetype code \.exp$ # Expect
+filetype code \.l$ # (F)lex
+filetype code \.ll$ # (F)lex
+filetype code \.lex$ # (F)lex
+filetype code \.f$ # Fortran
+filetype code \.f77$ # Fortran
+filetype code \.F$ # Fortran
+filetype code \.hs$ # Haskell
+filetype code \.lhs$ # Not preprocessed Haskell
+filetype code \.el$ # LISP (including Scheme)
+filetype code \.scm$ # LISP (including Scheme)
+filetype code \.lsp$ # LISP (including Scheme)
+filetype code \.jl$ # LISP (including Scheme)
+filetype code \.ml$ # ML
+filetype code \.ml3$ # ML
+filetype code \.m3$ # Modula3
+filetype code \.i3$ # Modula3
+filetype code \.m$ # Objective-C
+filetype code \.p$ # Pascal
+filetype code \.pas$ # Pascal
+filetype code \.rb$ # Ruby
+filetype code \.sed$ # sed
+filetype code \.tcl$ # TCL
+filetype code \.tk$ # TCL
+filetype code \.itk$ # TCL
+filetype code \.y$ # Yacc
+filetype code \.yy$ # Yacc
+filetype code \.idl$ # CORBA IDL
+filetype code \.gnorba$ # GNOME CORBA IDL
+filetype code \.oafinfo$ # GNOME OAF
+filetype code \.mcopclass$ # MCOP IDL compiler generated class
+filetype code \.autoforms$ # Autoform
+filetype code \.atf$ # Autoform
+filetype code \.gnuplot$
+filetype code \.xs$ # Shared library? Seen a lot of them in gnome-perl
+filetype code \.js$ # JavaScript (and who knows, maybe more)
+filetype code \.patch$
+filetype code \.diff$ # Sometimes patches appear this way
+filetype code \.ids$ # Not really sure what this means
+filetype code \.upd$ # ¿¿¿??? (from Kcontrol)
+filetype code \.ad$ # ¿¿¿??? (from Kdisplay and mc)
+filetype code \.i$ # Appears in the kbindings for Qt
+filetype code \.pri$ # from Qt
+filetype code \.schema$ # Not really sure what this means
+filetype code \.fd$ # Something to do with latex
+filetype code \.cls$ # Something to do with latex
+filetype code \.pro$ # Postscript generation
+filetype code \.ppd$ # PDF generation
+filetype code \.dlg$ # Not really sure what this means
+filetype code \.plugin$ # Plug-in file
+filetype code \.dsp$ # Microsoft Developer Studio Project File
+filetype code \.vim$ # vim syntax file
+filetype code \.trm$ # gnuplot term file
+filetype code \.font$ # Font mapping
+filetype code \.ccg$ # C++ files - Found in gtkmm*
+filetype code \.hg$ # C++ headers - Found in gtkmm*
+filetype code \.dtd$ # XML Document Type Definition
+filetype code \.bat$ # DOS batch files
+filetype code \.vala$ # Vala
+filetype code \.py\.in$
+filetype code \.rhtml$ # eRuby
+filetype code \.sql$ # SQL script
+#
+#
+# Development documentation files (for hacking generally)
+#
+filetype devel-doc ^readme.*$
+filetype devel-doc ^changelog.*
+filetype devel-doc ^todo.*$
+filetype devel-doc ^credits.*$
+filetype devel-doc ^authors.*$
+filetype devel-doc ^changes.*$
+filetype devel-doc ^news.*$
+filetype devel-doc ^install.*$
+filetype devel-doc ^hacking.*$
+filetype devel-doc ^copyright.*$
+filetype devel-doc ^licen(s|c)e.*$
+filetype devel-doc ^copying.*$
+filetype devel-doc manifest$
+filetype devel-doc faq$
+filetype devel-doc building$
+filetype devel-doc howto$
+filetype devel-doc design$
+filetype devel-doc \.files$
+filetype devel-doc files$
+filetype devel-doc subdirs$
+filetype devel-doc maintainers$
+filetype devel-doc developers$
+filetype devel-doc contributors$
+filetype devel-doc thanks$
+filetype devel-doc releasing$
+filetype devel-doc test$
+filetype devel-doc testing$
+filetype devel-doc build$
+filetype devel-doc comments?$
+filetype devel-doc bugs$
+filetype devel-doc buglist$
+filetype devel-doc problems$
+filetype devel-doc debug$
+filetype devel-doc hacks$
+filetype devel-doc hacking$
+filetype devel-doc versions?$
+filetype devel-doc mappings$
+filetype devel-doc tips$
+filetype devel-doc ideas?$
+filetype devel-doc spec$
+filetype devel-doc compiling$
+filetype devel-doc notes$
+filetype devel-doc missing$
+filetype devel-doc done$
+filetype devel-doc \.omf$ # XML-based format used in GNOME
+filetype devel-doc \.lsm$
+filetype devel-doc ^doxyfile$
+filetype devel-doc \.kdevprj$
+filetype devel-doc \.directory$
+filetype devel-doc \.dox$
+filetype devel-doc \.doap$
+#
+#
+# Building, compiling, configuration and CVS admin files
+#
+filetype build \.in.*$
+filetype build configure.*$
+filetype build makefile.*$
+filetype build config\.sub$
+filetype build config\.guess$
+filetype build config\.status$
+filetype build ltmain\.sh$
+filetype build autogen\.sh$
+filetype build config$
+filetype build conf$
+filetype build cvsignore$
+filetype build \.cfg$
+filetype build \.m4$
+filetype build \.mk$
+filetype build \.mak$
+filetype build \.make$
+filetype build \.mbx$
+filetype build \.protocol$
+filetype build \.version$
+filetype build mkinstalldirs$
+filetype build install-sh$
+filetype build rules$
+filetype build \.kdelnk$
+filetype build \.menu$
+filetype build linguas$ # Build translations
+filetype build potfiles.*$ # Build translations
+filetype build \.shlibs$ # Shared libraries
+# filetype build %debian%
+# filetype build %specs/%
+filetype build \.spec$ # It seems they're necessary for RPM build
+filetype build \.def$ # build bootstrap for DLLs on win32
+#
+#
+# Documentation files
+#
+# filetype documentation doc/%
+# filetype documentation %HOWTO%
+filetype documentation \.html$
+filetype documentation \.txt$
+filetype documentation \.ps(\.gz|\.bz2)?$
+filetype documentation \.dvi(\.gz|\.bz2)?$
+filetype documentation \.lyx$
+filetype documentation \.tex$
+filetype documentation \.texi$
+filetype documentation \.pdf(\.gz|\.bz2)?$
+filetype documentation \.djvu$
+filetype documentation \.epub$
+filetype documentation \.sgml$
+filetype documentation \.docbook$
+filetype documentation \.wml$
+filetype documentation \.xhtml$
+filetype documentation \.phtml$
+filetype documentation \.shtml$
+filetype documentation \.htm$
+filetype documentation \.rdf$
+filetype documentation \.phtm$
+filetype documentation \.tmpl$
+filetype documentation \.ref$ # References
+filetype documentation \.css$
+# filetype documentation %tutorial%
+filetype documentation \.templates$
+filetype documentation \.dsl$
+filetype documentation \.ent$
+filetype documentation \.xml$
+filetype documentation \.xmi$
+filetype documentation \.xsl$
+filetype documentation \.entities$
+filetype documentation \.[1-7]$ # Man pages
+filetype documentation \.man$
+filetype documentation \.manpages$
+filetype documentation \.doc$
+filetype documentation \.rtf$
+filetype documentation \.wpd$
+filetype documentation \.qt3$
+filetype documentation man\d?/.*\.\d$
+filetype documentation \.docs$
+filetype documentation \.sdw$ # OpenOffice.org Writer document
+filetype documentation \.odt$ # OpenOffice.org document
+filetype documentation \.en$ # Files in English language
+filetype documentation \.de$ # Files in German
+filetype documentation \.es$ # Files in Spanish
+filetype documentation \.fr$ # Files in French
+filetype documentation \.it$ # Files in Italian
+filetype documentation \.cz$ # Files in Czech
+filetype documentation \.page$ # Mallard
+filetype documentation \.page\.stub$ # Mallard stub
+#
+#
+# Images
+#
+filetype image \.png$
+filetype image \.jpg$
+filetype image \.jpeg$
+filetype image \.bmp$
+filetype image \.gif$
+filetype image \.xbm$
+filetype image \.eps$
+filetype image \.mng$
+filetype image \.pnm$
+filetype image \.pbm$
+filetype image \.ppm$
+filetype image \.pgm$
+filetype image \.gbr$
+filetype image \.svg$
+filetype image \.fig$
+filetype image \.tif$
+filetype image \.swf$
+filetype image \.svgz$
+filetype image \.shape$ # XML files used for shapes for instance in Kivio
+filetype image \.sml$ # XML files used for shapes for instance in Kivio
+filetype image \.bdf$ # vfontcap - Vector Font Capability Database (VFlib Version 2)
+filetype image \.ico$
+filetype image \.dia$ # We consider .dia as images, I don't want them in unknown
+#
+#
+# Translation files
+#
+filetype translation \.po$
+filetype translation \.pot$
+filetype translation \.charset$
+filetype translation \.mo$
+#
+#
+# User interface files
+#
+filetype ui \.desktop$
+filetype ui \.ui$
+filetype ui \.xpm$
+filetype ui \.xcf$
+filetype ui \.3ds$
+filetype ui \.theme$
+filetype ui \.kimap$
+filetype ui \.glade$
+filetype ui \.gtkbuilder$
+filetype ui rc$
+#
+#
+# Sound files
+#
+filetype multimedia \.mp3$
+filetype multimedia \.ogg$
+filetype multimedia \.wav$
+filetype multimedia \.au$
+filetype multimedia \.mid$
+filetype multimedia \.vorbis$
+filetype multimedia \.midi$
+filetype multimedia \.arts$
+#
+#
+# Packages (yes, there are people who upload packages to the repo)
+#
+filetype package \.tar$
+filetype package \.tar\.gz$
+filetype package \.tar\.bz2$
+filetype package \.tar\.xz$
+filetype package \.tgz$
+filetype package \.deb$
+filetype package \.rpm$
+filetype package \.srpm$
+filetype package \.ebuild$