19 files changed, 3132 insertions, 0 deletions
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..6eb19a4
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,29 @@
+Copyright (c) 2003, WiseGuys Internet B.V.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+- Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+- Neither the name of the WiseGuys Internet B.V. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README b/README
new file mode 100644
index 0000000..d8626a5
--- /dev/null
+++ b/README
@@ -0,0 +1,141 @@
+
+  :: libTextCat 2.0 ::
+     
+  What is it?
+  
+     Libtextcat is a library with functions that implement the
+     classification technique described in Cavnar & Trenkle, "N-Gram-Based
+     Text Categorization" [1]. It was primarily developed for language
+     guessing, a task on which it is known to perform with near-perfect
+     accuracy.
+     
+     The central idea of the Cavnar & Trenkle technique is to calculate a
+     "fingerprint" of a document with an unknown category, and compare this
+     with the fingerprints of a number of documents of which the categories
+     are known. The categories of the closest matches are output as the
+     classification. A fingerprint is a list of the most frequent n-grams
+     occurring in a document, ordered by frequency. Fingerprints are
+     compared with a simple out-of-place metric. See the article for more
+     details.
+     
+     Considerable effort went into making this implementation fast and
+     efficient. The language guesser processes over 100 documents/second on
+     a simple PC, which makes it practical for many uses. It was developed
+     for use in our webcrawler and search engine software, in which it it
+     handles millions of documents a day.
+     
+  Download
+  
+     The library is released under the BSD License, which basicly states
+     that you can do anything you like with it as long as you mention us
+     and make it clear that this library is covered by the BSD License. It
+     also exempts us from any liability, should this library eat your hard
+     disc, kill your cat or classify your attorney's e-mails as spam.
+     
+     The current version is 2.0.
+  	 
+     As of yet there is no development version.
+     
+     The distribution contains a configuration file for Gertjan van Noord's
+     language models, which are distributed under the GNU Public License.
+     Please note that this license does not allow you to distribute them as
+     a part of closed source software package. In time, we will provide you
+     with language models under a less restrictive license.
+     
+  Installation
+  
+     As of now, we have no autoconfig script. A "make all" in the src/ dir
+     should do the trick. If that doesn't work, you'll have to dive into
+     the mess we call our Makefile.
+     
+     The library is known to compile flawlessly on GNU/Linux for x86. If
+     you manage to get it working on other systems, please drop us a note,
+     and we'll proudly add you to this page.
+     
+  Quickstart: language guesser
+  
+     Assuming that you have successfully compiled the library, you still
+     need some language models to start guessing languages. If you don't
+     feel like creating them yourself (cf. [2]Creating your own
+     fingerprints below), you can use the excellent collection of over 70
+     language models provided in Gertjan van Noord's "TextCat" package. We
+     have provided a configuration file for these models in the langclass
+     directory. Hack it at will.
+
+       * Download the textcat package at
+  	 http://odur.let.rug.nl/~vannoord/TextCat/
+       * mkdir ~/text_cat
+       * cd ~/text_cat
+       * tar xzf ~/your_download_dir/text_cat.tgz
+       * ~/libtextcat/src/testtextcat ~/libtextcat/langclass/conf.txt
+  	 
+     Paste some text onto the commandline, and watch it get classified.
+     
+  Using the API
+  
+     Classifying the language of a textbuffer can be as easy as:
+
+       #include "textcat.h"
+       ...
+       void *h = textcat_Init( "conf.txt" );
+       ...
+       printf( "Language: %s\n", textcat_Classify(h, buffer, 400);
+       ...
+       textcat_Done(h);
+      
+  Creating your own fingerprints
+  
+     The createfp program allows you to easily create your own document
+     fingerprints. Just feed it an example document on standard input, and
+     store the standard output:
+
+       % createfp < mydocument.txt > myfingerprint.txt
+  
+     Put the names of your fingerprints in a configuration file, add some
+     id's and you're ready to classify.
+     
+  Performance tuning
+  
+     This library was made with efficiency in mind. There are couple of
+     parameters you may wish to tweak if you intend to use it for other
+     tasks than language guessing.
+     
+     The most important thing is buffer size. For reliable language
+     guessing the classifier only needs a couple of hundreds of bytes max.
+     So don't feed it 100KB of text unless you are creating a fingerprint.
+     
+     If you insist on feeding the classifier lots of text, try fiddling
+     with TABLEPOW, which determines the size of the hash table that is
+     used to store the n-grams. Making it too small will result in many
+     hashtable clashes, making it too large will cause wild memory
+     behaviour and both are bad for the performance.
+     
+     Putting the most probable models at the top of the list in your config
+     file improves performance, because this will raise the threshold for
+     likely candidates more quickly.
+     
+     Since the speed of the classifier is roughly linear with respect to
+     the number of models, you should consider how many models you really
+     need. In case of language guessing: do you really want to recognize
+     every language ever invented?
+     
+  References
+  
+     [1] The document that started it all can be downloaded at John M.
+     Trenkle's site: N-Gram-Based Text Categorization
+     
+       http://www.novodynamics.com/trenkle/papers/sdair-94-bc.ps.gz
+  
+     [2] The Perl implementation by Gertjan van Noord (code + language
+     models): downloadable from his [7]website
+  
+       http://odur.let.rug.nl/~vannoord/TextCat/
+     
+  Contact
+  
+     Praise and flames may be directed at us through
+     libtextcat AT wise-guys.nl. If there is enough interest, we'll whip up
+     a mailing list. The current project maintainer is Frank Scheelen.
+       
+  c. 2003 WiseGuys Internet B.V.
+
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..24dffef
--- /dev/null
+++ b/TODO
@@ -0,0 +1,9 @@
+
+TODO
+
+- Better Makefile
+- Improve error handling and reporting.
+- Split off table and heap routines to separate source files.
+- Speed optimizations. Some speed might be gained by comparing
+  the unknown fingerprint to all known fingerprints at once.
+- More docs, more comments.
+\ No newline at end of file
diff --git a/include/textcat.h b/include/textcat.h
new file mode 120000
index 0000000..9db5260
--- /dev/null
+++ b/include/textcat.h
@@ -0,0 +1 @@
+../src/textcat.h
+\ No newline at end of file
diff --git a/langclass/conf.txt b/langclass/conf.txt
new file mode 100644
index 0000000..4eb9f73
--- /dev/null
+++ b/langclass/conf.txt
@@ -0,0 +1,89 @@
+#
+# A sample config file for the language models 
+# provided with Gertjan van Noords language guesser
+# (http://odur.let.rug.nl/~vannoord/TextCat/)
+#
+# Notes: 
+# - You may consider eliminating a couple of small languages from this
+# list because they cause false positives with big languages and are
+# bad for performance. (Do you really want to recognize Drents?)
+# - Putting the most probable languages at the top of the list
+# improves performance, because this will raise the threshold for
+# likely candidates more quickly.
+#
+LM/afrikaans.lm			afrikaans
+LM/albanian.lm			albanian
+LM/amharic-utf.lm		amharic-utf
+LM/arabic-iso8859_6.lm		arabic-iso8859_6
+LM/arabic-windows1256.lm	arabic-windows1256
+LM/armenian.lm			armenian
+LM/basque.lm			basque
+LM/belarus-windows1251.lm	belarus-windows1251
+LM/bosnian.lm			bosnian
+LM/breton.lm			breton
+LM/bulgarian-iso8859_5.lm	bulgarian-iso8859_5
+LM/catalan.lm			catalan
+LM/chinese-big5.lm		chinese-big5
+LM/chinese-gb2312.lm		chinese-gb2312
+LM/croatian-ascii.lm		croatian-ascii
+LM/czech-iso8859_2.lm		czech-iso8859_2
+LM/danish.lm			danish
+LM/drents.lm			drents   # Dutch dialect
+LM/dutch.lm			dutch
+LM/english.lm			english
+LM/esperanto.lm			esperanto
+LM/estonian.lm			estonian
+LM/finnish.lm			finnish
+LM/french.lm			french
+LM/frisian.lm			frisian
+LM/georgian.lm			georgian
+LM/german.lm			german
+LM/greek-iso8859-7.lm		greek-iso8859-7
+LM/hebrew-iso8859_8.lm		hebrew-iso8859_8
+LM/hindi.lm			hindi
+LM/hungarian.lm			hungarian
+LM/icelandic.lm			icelandic
+LM/indonesian.lm		indonesian
+LM/irish.lm			irish
+LM/italian.lm			italian
+LM/japanese-euc_jp.lm		japanese-euc_jp
+LM/japanese-shift_jis.lm	japanese-shift_jis
+LM/korean.lm			korean
+LM/latin.lm			latin
+LM/latvian.lm			latvian
+LM/lithuanian.lm		lithuanian
+LM/malay.lm			malay
+LM/manx.lm			manx
+LM/marathi.lm			marathi
+LM/middle_frisian.lm		middle_frisian
+LM/mingo.lm			mingo
+LM/nepali.lm			nepali
+LM/norwegian.lm			norwegian
+LM/persian.lm			persian
+LM/polish.lm			polish
+LM/portuguese.lm		portuguese
+LM/quechua.lm			quechua
+LM/romanian.lm			romanian
+LM/rumantsch.lm			rumantsch
+LM/russian-iso8859_5.lm		russian-iso8859_5
+LM/russian-koi8_r.lm		russian-koi8_r
+LM/russian-windows1251.lm	russian-windows1251
+LM/sanskrit.lm			sanskrit
+LM/scots.lm			scots
+LM/scots_gaelic.lm		scots_gaelic
+LM/serbian-ascii.lm		serbian-ascii
+LM/slovak-ascii.lm		slovak-ascii
+LM/slovak-windows1250.lm	slovak-windows1250
+LM/slovenian-ascii.lm		slovenian-ascii
+LM/slovenian-iso8859_2.lm	slovenian-iso8859_2
+LM/spanish.lm			spanish
+LM/swahili.lm			swahili
+LM/swedish.lm			swedish
+LM/tagalog.lm			tagalog
+LM/tamil.lm			tamil
+LM/thai.lm			thai
+LM/turkish.lm			turkish
+LM/ukrainian-koi8_r.lm		ukrainian-koi8_r
+LM/vietnamese.lm		vietnamese
+LM/welsh.lm			welsh
+LM/yiddish-utf.lm		yiddish-utf
diff --git a/lib/libtextcat.a b/lib/libtextcat.a
new file mode 120000
index 0000000..53dc620
--- /dev/null
+++ b/lib/libtextcat.a
@@ -0,0 +1 @@
+../src/libtextcat.a
+\ No newline at end of file
diff --git a/src/.#fingerprint.c.1.2 b/src/.#fingerprint.c.1.2
new file mode 100644
index 0000000..19d55d9
--- /dev/null
+++ b/src/.#fingerprint.c.1.2
@@ -0,0 +1,634 @@
+/**
+ * fingerprint.c -- Routines for creating an n-gram fingerprint of a
+ * buffer.
+ *
+ * A fingerprint is a list of most common n-grams, ordered by
+ * frequency. (Note that we can use other strings than n-grams, for
+ * instance entire words.)
+ *
+ * Mar 28, 2003 frank@wise-guys.nl -- created
+ *
+ * TODO:
+ * - put table/heap datastructure in a separate file.
+ * */
+#include <string.h>
+#include <ctype.h>
+
+#include "wglib.h"
+#include "wg_mempool.h"
+
+#define INVALID(c) (isspace(c) || isdigit(c)) 
+
+#define TABLEPOW   13
+#define TABLESIZE  (1<<TABLEPOW)
+#define TABLEMASK  ((TABLESIZE)-1)
+
+#define MAXOUTOFPLACE 400
+#define MINDOCSIZE     25
+
+typedef struct {
+
+	sint2 rank;
+	char str[6];
+
+} ngram_t;
+
+typedef struct fprint_s {
+
+	const char *name;
+	ngram_t *fprint;
+	uint4 size;
+	
+} fprint_t;
+
+typedef struct entry_s {
+	char str[6];
+	unsigned int cnt;
+	struct entry_s *next;
+} entry_t;
+
+typedef struct table_s {	
+	void *pool;
+	entry_t **table;
+	entry_t *heap;
+
+	struct table_s *next;
+	
+	uint4 heapsize;
+	uint4 size;
+} table_t;
+
+
+
+/*
+ * fast and furious little hash function
+ *
+ * (Note that we could use some kind of rolling checksum, and update it
+ * during n-gram construction) 
+ */
+static uint4 simplehash( const char *p, int len )
+{
+	sint4 h = len * 13;
+	while (*p) {
+		h = (h<<5)-h + *p++;
+	}
+	return (uint4)h;
+}
+
+
+/* checks if n-gram lex is a prefix of key and of length len */
+inline int issame( char *lex, char *key, int len )
+{
+	int i;
+	for (i=0; i<len; i++) {
+		if ( key[i] != lex[i] ) {
+			return 0;
+		}
+	}
+	if ( lex[i] != 0 ) {
+		return 0;
+	}
+	return 1;
+}
+
+
+/* increases frequency of ngram(p,len) */
+static inline int increasefreq( table_t *t, char *p, int len ) 
+{	
+	uint4 hash = simplehash( p, len ) & TABLEMASK;				
+	entry_t *entry = t->table[ hash ];
+	
+	wg_debug(NGRAMDEBUG, "%u-GRAM: %s\n", len,p);
+
+	while ( entry ) {				
+		if ( issame( entry->str, p, len ) ) {
+			/*** Found it! ***/
+			entry->cnt++;
+			return 1;
+		}
+		else {
+			entry = entry->next;
+		}
+	}
+
+	/*** Not found, so create ***/
+	entry = wgmempool_alloc( t->pool, sizeof(entry_t) );
+	strcpy( entry->str, p );
+	entry->cnt = 1;
+
+	entry->next = t->table[hash];
+	t->table[hash] = entry;
+
+	return 1;
+}
+
+
+/* looks up ngram(p,len) */
+static entry_t *findfreq( table_t *t, char *p, int len ) 
+{	
+	uint4 hash = simplehash( p, len ) & TABLEMASK;				
+	entry_t *entry = t->table[ hash ];
+	
+	while ( entry ) {				
+		if ( issame( entry->str, p, len ) ) {
+			return entry;
+		}
+		else {
+			entry = entry->next;
+		}
+	}
+
+	return NULL;
+}
+
+static void dumptable(table_t *t)
+{
+	int i;
+	for (i=0; i<TABLESIZE; i++) {
+
+		entry_t *p = t->table[i];
+
+		while (p) {
+
+			printf("%5u %s\n", p->cnt, p->str);
+			p = p->next;
+		}
+
+	}
+}
+
+
+#define GREATER(x,y) ((x).cnt > (y).cnt)
+#define LESS(x,y)    ((x).cnt < (y).cnt)
+
+inline static void siftup( table_t *t, unsigned int child )
+{
+	entry_t *heap = t->heap;
+	unsigned int parent = (child-1) >> 1;
+	entry_t tmp;
+
+	while ( child > 0 ) {
+		if ( GREATER(heap[parent],heap[child]) ) {
+			memcpy( &tmp, &heap[parent], sizeof(entry_t) );
+			memcpy( &heap[parent], &heap[child], sizeof(entry_t) );
+			memcpy( &heap[child], &tmp, sizeof(entry_t) );
+		}
+		else {
+			return;
+		}
+
+		child = parent;
+		parent = (child-1) >> 1;
+	}
+}
+
+
+inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
+{
+	entry_t *heap = t->heap;
+	unsigned int child = parent*2 + 1;
+	entry_t tmp;
+
+	while ( child < heapsize ) {
+		if ( child+1 < heapsize && GREATER(heap[child], heap[child+1]) ) {
+			child++;
+		}
+		if ( GREATER(heap[parent], heap[child] ) ) {
+			memcpy( &tmp, &heap[parent], sizeof(entry_t) );
+			memcpy( &heap[parent], &heap[child], sizeof(entry_t) );
+			memcpy( &heap[child], &tmp, sizeof(entry_t) );
+		}
+		else {
+			return;
+		}
+		parent = child;
+		child = (parent*2)+1;
+	}
+}
+
+
+static int heapinsert( table_t *t, entry_t *item )
+{
+	entry_t *heap = t->heap;
+
+	/*** Still room for an entry? ***/
+	if (t->size < t->heapsize) {
+		memcpy( &(heap[t->size]), item, sizeof(entry_t));
+		siftup( t, t->size );
+		t->size++;		
+		return 0;
+	}
+
+	/*** Worse than the worst performer? ***/
+	if ( LESS(*item, heap[0]) ) {
+		return 0;
+	}
+
+	/*** Insert into heap and reheap ***/
+	memcpy( &(heap[0]), item, sizeof(entry_t));
+	siftdown( t, t->size, 0 );
+	return 0;
+}
+
+
+extern int heapextract( table_t *t, entry_t *item )
+{
+	entry_t *p;
+
+	if (t->size == 0 ) {
+		return 0;
+	}
+
+	p = &(t->heap[0]);
+
+	memcpy( item, p, sizeof( entry_t ));
+	memcpy( &(t->heap[0]), &(t->heap[t->size-1]), sizeof(entry_t) );
+
+	siftdown(t,t->size, 0);
+	t->size--;
+
+	return 1;
+}
+
+
+/*** Makes a heap of all table entries ***/
+static int table2heap(table_t *t)
+{
+	int i;
+	uint4 cnt = 0;
+	uint4 bufsize = 2048;
+
+	/*** Fill result heap ***/
+	for (i=0; i<TABLESIZE; i++) {
+		entry_t *p = t->table[i];	       
+		while (p) {
+			heapinsert(t, p);
+			p = p->next;
+		}
+	}     
+	return 1;
+}
+
+
+static table_t *inittable(uint4 maxngrams)
+{	
+	table_t *result = (table_t *)wg_zalloc( sizeof(table_t) );
+	result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE );
+	result->pool = wgmempool_Init( 10000, 10 );
+
+	result->heap = (entry_t *)wg_malloc( sizeof(entry_t) * maxngrams );
+	result->heapsize = maxngrams;
+	result->size = 0;
+
+	return result;
+}
+
+static void tabledone( table_t *t )
+{
+	if (!t) {
+		return;
+	}
+	wgmempool_Done(t->pool);
+	wg_free(t->table);
+	wg_free(t->heap);
+	wg_free(t);	
+}
+
+
+extern void *fprint_Init(const char *name)
+{
+	fprint_t *h = (fprint_t *)wg_zalloc( sizeof(fprint_t) );
+	
+	if ( name ) {
+		h->name = wg_strdup(name);
+	}
+
+	return (void *)h;
+}
+
+
+extern void fprint_Done( void *handle )
+{
+	fprint_t *h = (fprint_t *)handle;
+
+	if ( h->name ) {
+		wg_free( (void *)h->name );
+	}
+	if ( h->fprint ) {
+		wg_free( h->fprint );
+	}
+
+	wg_free( h );
+}
+
+extern const char *fprint_Name( void *handle )
+{
+	fprint_t *h = (fprint_t *)handle;
+	return  h->name;
+}
+
+/**
+ * Function that prepares buffer for n-grammification:
+ * runs of invalid characters are collapsed to a single
+ * underscore.
+ *
+ * Function is implemented as a finite state machine.
+ */
+static char *prepbuffer( const char *src, size_t bufsize )
+{
+	const char *p = src;
+	char *dest = (char *)wg_malloc( bufsize + 3 );
+	char *w = dest;
+	char *wlimit = dest + bufsize + 1;
+
+ START:
+	if ( INVALID(*p) ) {
+		goto SPACE;
+	}
+	else if ( *p == '\0' ) {
+		goto END;
+	}
+
+	*w++ = '_';
+	if ( w == wlimit ) {
+		goto STOP;
+	}
+
+	goto WORD;
+
+
+ SPACE:
+	/*** Inside string of invalid characters ***/
+	p++;
+	if ( INVALID(*p) ) {
+		goto SPACE;
+	}
+	else if ( *p == '\0' ) {
+		goto END;
+	}
+
+	*w++ = '_';
+	if ( w == wlimit ) {
+		goto STOP;
+	}
+
+	goto WORD;
+
+ WORD:
+	/*** Inside string of valid characters ***/
+	*w++ = *p++;
+	if ( w == wlimit ) {
+		goto END;
+	}
+	else if ( INVALID(*p) ) {
+		goto SPACE;
+	}
+	else if ( *p == '\0' ) {
+		goto STOP;
+	}
+	goto WORD;
+
+ END:
+	*w++ = '_';
+
+ STOP:
+	*w++ = '\0';
+
+	/*** Docs that are too small for a fingerprint, are refused ***/
+	if ( w - dest < MINDOCSIZE ) {
+		wg_free(dest);
+		return NULL;
+	}
+
+	return dest;
+}
+
+
+static void createngramtable( table_t *t, const char *buf )
+{
+	char n[6];
+	const char *p = buf;
+	int i;
+
+	/*** Get all n-grams where 1<=n<=5. Allow underscores only at borders. ***/
+	for (;;p++) {
+
+		const char *q = p;
+		char *m = n;
+
+		/*** First char may be an underscore ***/
+		*m++ = *q++;
+		*m = '\0';
+
+		increasefreq( t, n, 1 );
+
+		if ( *q == '\0' ) {
+			return;
+		}
+
+		/*** Let the compiler unroll this ***/
+		for ( i=2; i<=5; i++) {
+
+			*m++ = *q;
+			*m = '\0';
+
+			increasefreq( t, n, i );
+
+			if ( *q == '_' ) break;
+			q++;
+			if ( *q == '\0' ) {
+				return;
+			}
+		}
+	}
+	return;
+}
+
+
+static int ngramcmp(const void *a, const void *b)
+{
+	ngram_t *x = (ngram_t *)a;
+	ngram_t *y = (ngram_t *)b;
+	
+	return strcmp( x->str, y->str );
+}
+
+/**
+ * Create a fingerprint: 
+ * - record the frequency of each unique n-gram in a hash table
+ * - take the most frequent n-grams
+ * - sort them alphabetically, recording their relative rank
+ */
+extern int fprint_Create( void *handle, const char *buffer, uint4 bufsize, uint4 maxngrams )
+{
+	sint4 i;
+	fprint_t *h = (fprint_t *)handle;
+	table_t *t = inittable(maxngrams);
+	char *tmp;
+
+	if ( bufsize < MINDOCSIZE ) {
+		return 0;
+	}
+
+	/*** Throw out all invalid chars ***/
+	tmp = prepbuffer( buffer, bufsize ); 	
+	if ( tmp == NULL ) {
+		return 0;
+	}
+
+	/*** Create a hash table containing n-gram counts ***/
+	createngramtable(t, tmp);
+
+	/*** Take the top N n-grams and add them to the profile ***/
+	table2heap(t);	
+	maxngrams = WGMIN( maxngrams, t->size );       
+
+	h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams );
+	h->size = maxngrams;
+
+	/*** Pull n-grams out of heap (backwards) ***/
+	for (i=maxngrams-1; i>=0; i--) {
+
+		entry_t tmp;
+
+		heapextract(t, &tmp);
+		
+		/*** the string and its rank is all we need ***/
+		strcpy( h->fprint[i].str, tmp.str );
+		h->fprint[i].rank = i;
+	}
+
+	tabledone(t);
+	wg_free(tmp);
+
+	/*** Sort n-grams alphabetically, for easy comparison ***/
+	qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp ); 
+	return 1;
+}
+
+extern void fprint_Show( void *handle )
+{
+	fprint_t *h = (fprint_t *)handle;
+	int i;
+	printf("------ %s --------\n", h->name );
+	for (i=0; i<h->size; i++) {
+		printf("%3u: '%s' [%u]\n", i, h->fprint[i].str,  h->fprint[i].rank);
+	}
+
+
+}
+
+extern int fprint_Read( void *handle, const char *fname, int maxngrams )
+{
+	fprint_t *h = (fprint_t *)handle;
+	FILE *fp;
+	char line[1024];
+	int cnt = 0;
+
+	fp = fopen( fname, "r" );
+	if (!fp) {
+		return 0;
+	}
+	
+	h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t));
+
+	while (cnt < maxngrams && wg_getline(line,1024,fp)) {
+
+		char *p;
+
+		wg_trim(line, line);
+
+		p = strpbrk( line, " \t" );
+		if ( p ) {
+			*p = '\0';
+		}
+
+		if ( strlen(line) > 5 ) {
+			continue;
+		}
+
+		strcpy( h->fprint[cnt].str, line );
+		h->fprint[cnt].rank = cnt;
+
+		cnt++;
+	}
+
+	h->size = cnt;
+
+	/*** Sort n-grams, for easy comparison later on ***/
+	qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp ); 
+
+	fclose(fp);
+
+	return 1;
+}
+
+
+extern sint4 fprint_Compare( void *cat, void *unknown, int cutoff )
+{
+	fprint_t *c = (fprint_t *)cat;
+	fprint_t *u = (fprint_t *)unknown;
+	int i = 0;
+	int j = 0;
+	sint4 sum = 0;
+	
+	/*** Compare the profiles in mergesort fashion ***/
+	while ( i < c->size && j < u->size ) {
+
+		int cmp = strcmp( c->fprint[i].str, u->fprint[j].str );
+
+		if ( cmp < 0 ) {
+			i++;
+		}
+		else if ( cmp == 0 ) {
+			sum += abs( c->fprint[i].rank - u->fprint[j].rank );
+			if ( sum > cutoff ) {
+				return 1000000000;
+			}
+			i++;
+			j++;
+		}
+		else {
+			sum += MAXOUTOFPLACE;
+			if ( sum > cutoff ) {
+				return 1000000000;
+			}
+			j++;
+		}
+	}
+
+	/*** Process tail of unknown, if any ***/
+	while ( j < u->size ) {
+		sum += MAXOUTOFPLACE;
+		if ( sum > cutoff ) {
+			return 1000000000;
+		}
+		j++;
+	}
+
+	return sum;
+	       	
+}
+
+
+#if 0
+int main(int argc, char **argv)
+{
+	char buf[100 * 1024];
+	char tmp[100 * 1024];
+	int size;
+	fprint_t *h;
+	
+	wg_meminit();
+	{
+		//size = fread( buf, 1, 100*1024, stdin );
+		h = fprint_Init("test");
+		buf[size] = '\0';
+		//fprint_Create(h, buf, strlen(buf), 100);
+		fprint_Read( h, "/home/frank/textcat/LM/dutch.lm", 100 );
+		fprint_Show(h);
+		fprint_Done(h);
+		wg_printmemstats();
+	}
+}
+#endif
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..1253f38
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,62 @@
+#
+# Apr 1, 2003 frank@wise-guys.nl
+# Apr 2, 2003 gnarl@wise-guys.nl
+#
+#--------------------------------------------------------------------------
+# Macro Variables
+#--------------------------------------------------------------------------
+#MDEBUG		=-DMEMDEBUG -DMEMVERBOSE
+#
+SRC		=/home/src
+RANLIB		=ranlib
+RM		=/bin/rm -f
+CC		=gcc
+LIBS		=
+LEX		=flex
+LDFLAGS		=-g
+OPTS		=-static -O3 -DWGTIMER -D_THREAD_SAFE -D_GNU_SOURCE -march=i686 \
+		 -funroll-loops --pipe -D__GNU_C__
+VERBOSE		= -DVERBOSE
+IFLAGS		=
+
+WARNS		=-W -Winline -Wall -Wshadow -Wno-unused -Wpointer-arith 
+
+CFLAGS		=$(VERBOSE) $(IFLAGS) $(LDFLAGS) -D_GNU_SOURCE -D_REENTRANT -DWGTIMER $(WARNS) $(OPTS) $(MDEBUG)  
+
+OBJECTS2	=	fingerprint.o textcat.o common.o wg_mempool.o
+OBJECTS		=	testtextcat.o
+OBJECTS3	=	createfp.o
+
+SOURCES		=	fingerprint.c textcat.c common.c wg_mempool.c testtextcat.c createfp.c
+
+LIBS1 		=	-lm 
+
+LIBS 		= $(LIBS1)
+
+#--------------------------------------------------------------------------
+# Dependencies
+#--------------------------------------------------------------------------
+all: textcat testtextcat createfp
+
+$(OBJECTS) : Makefile
+
+textcat: $(OBJECTS2)
+	$(RM) -f libtextcat.a ; ar cr libtextcat.a $(OBJECTS2) ; $(RANLIB) libtextcat.a
+
+testtextcat: $(OBJECTS) libtextcat.a
+	$(CC) $(CFLAGS) -static -o $@ $(OBJECTS) libtextcat.a $(LDFLAGS) $(LIBS) 
+
+createfp: $(OBJECTS3) libtextcat.a
+	$(CC) $(CFLAGS) -static -o $@ $(OBJECTS3) libtextcat.a $(LDFLAGS) $(LIBS) 
+
+dep:	depend
+#
+depend: 
+	makedepend -- $(CFLAGS) -- $(SOURCES)
+#
+$(SOURCES): Makefile
+#
+clean:
+	$(RM) core *.o *~ testtextcat createfp *.a
+
+# DO NOT DELETE
diff --git a/src/common.c b/src/common.c
new file mode 100644
index 0000000..054dae4
--- /dev/null
+++ b/src/common.c
@@ -0,0 +1,376 @@
+/*
+ * common.c -- miscellanious helper functions.
+ *
+ * Copyright (c) 2003, WiseGuys Internet B.V.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ * 
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <ctype.h>
+#include "common.h"
+
+extern void wgmem_error( const char *fmt, ... )
+{
+        va_list ap;
+
+	fprintf(stderr, "MEMERROR : ");
+        va_start(ap, fmt);
+        vfprintf(stderr, fmt, ap);
+        va_end(ap);
+
+	exit(-1);
+}
+
+
+extern void *wg_malloc( size_t size )
+{
+	void *result;
+	if ( !size ) {
+		wgmem_error( "Error mallocing 0 bytes.\n" );
+	}
+
+	result = malloc( size );
+	if ( !result ) {
+		wgmem_error( "Error while mallocing %u bytes.\n", size );
+	}
+
+	return result;
+}
+
+extern void *wg_calloc( size_t nmemb, size_t size )
+{
+	void *result;
+	if ( !size || !nmemb ) {
+		wgmem_error( "Error callocing 0 bytes.\n" );
+	}
+
+	result = calloc( nmemb, size );
+	if ( !result ) {
+		wgmem_error( "Error while callocing %u elements of %u bytes.\n", nmemb, size);
+	}
+
+	return result;
+}
+
+extern void *wg_zalloc( size_t size )
+{
+	void *result;
+
+	if (!size) {
+		wgmem_error( "Error zallocing 0 bytes.\n" );
+	}
+
+	result = calloc(1, size);
+	if ( !result ) {
+		wgmem_error( "Error while zallocing %u bytes.\n", size );
+	}
+
+	return result;
+}
+
+extern char* wg_strdup( const char *s )
+{
+	char *result = strdup( s );
+
+	if ( !result ) {
+		wgmem_error( "Error while strduping %u bytes.\n", strlen(s) );
+	}
+
+	return( result ); 
+}
+
+extern void* wg_realloc( void *ptr, size_t size ) 
+{ 
+	void *result;
+
+	if (!size) {
+		wgmem_error( "Error reallocing 0 bytes.\n" );
+	}
+
+	result = realloc( ptr, size );
+	if ( !result ) {
+		wgmem_error( "Error while reallocing %u bytes.\n", size );
+	}
+
+	return( result ); 
+}
+
+extern void wg_free( void *mem )
+{
+        if ( mem ) {
+                free(mem);
+        }
+}
+
+extern char *wg_getline( char *line, int size, FILE *fp )
+{
+        char *p;
+
+        if ( fgets(line, size, fp) == NULL ) {
+		return NULL;
+	}
+        
+        /** kill term null **/
+        if ( (p = strpbrk( line, "\n\r" )) ) {
+                *p = '\0';
+        }           
+  
+        return line;
+}
+
+
+
+/*
+ * wg_split: split a line into segments, using whitespace-sequences as separators.
+ *
+ * ARGUMENTS:
+ * - result:
+ * 
+ *  After the split, this array contains pointers to the start of each
+ *  detected segment. Must be preallocated and at least as large as
+ *  maxsegments. The pointers point into the dest buffer.
+ * 
+ * - dest: 
+ * 
+ *  String into which result points as an index. Must be preallocated, and
+ *  at least as big as src. You can use src as dest, but in that case src
+ *  is overwritten!
+ * 
+ * - src:  
+ * 
+ *  The string to split. Sequences of whitespace are treated as separators, unless
+ *  escaped. There are two ways to escape: by using single quotes (anything
+ *  between single quotes is treated as one segment), or by using a backslash
+ *  to escape the next character. The backslash escape works inside quotation
+ *  as well.
+ * 
+ *  Example:
+ *  
+ *  "It\'s   very\ easy   'to  use WiseGuys\' wg_split()' function" is split into:
+ *  
+ *  "It's"
+ *  "very easy"
+ *  "to  use WiseGuys' wg_split()"
+ *  "function"
+ *  
+ * - maxsegments: 
+ * 
+ *  The maximum number of segments. If the splitter runs out of segments,
+ *  the remainder of the string is stored in the last segment.
+ *  
+ * RETURN VALUE:
+ * The number of segments found.
+ */
+unsigned int wg_split( char **result, char *dest, char *src, int maxsegments )
+{
+	char *p = src;
+	char *w = dest;
+	int cnt = 0;
+	int state=0;
+
+	if ( maxsegments == 0 ) {
+		return 0;
+	}
+
+	maxsegments--;
+
+	while (cnt < maxsegments && *p) {
+
+		switch (state) {
+		case 0:
+			/*** Skip spaces ***/
+			while ( isspace((int) *p) ) {
+				p++;
+			}
+			state = 1;
+
+		case 1: 
+			/*** Start segment ***/
+			result[cnt] = w;
+			cnt++;
+			state = 2;
+
+		case 2:
+			/*** Unquoted segment ***/
+			while (*p) {
+				if ( isspace((int) *p) ) {
+					*w++ = '\0';
+					p++;
+					state = 0;
+					break;
+				}				
+				else if ( *p == '\'' ) {
+					/*** Start quotation ***/
+					p++;
+					state = 3;
+					break;
+				}
+				else if ( *p == '\\' && p[1] ) {
+					/*** Escape ***/
+					p++;
+					*w++ = *p++;
+				}
+				else {
+					*w++ = *p++;
+				}
+			}
+			break;
+
+		case 3:
+			/*** Inside quotes ***/
+			while (*p) {
+				if (*p == '\'') {
+					p++;
+					break;
+				}
+				else if ( *p == '\\' && p[1] ) {
+					/*** Escape ***/
+					p++;
+					*w++ = *p++;
+				}
+				else {
+					*w++ = *p++;
+				}
+			}
+			state = 2;
+			break;
+
+		}
+	}
+
+	if ( !*p ) {
+		*w = '\0';
+		return cnt;
+	}
+
+	/*** We ran out of segments; copy the remainder of the string into last segment ***/
+	result[cnt++] = w;
+	while (*p) {
+		*w++ = *p++;
+	}
+	*w = '\0';
+	return cnt;
+}
+
+
+extern void wg_timerstart(wgtimer_t *t)
+{
+        gettimeofday( &(t->start), NULL );
+}
+        
+
+
+extern uint4 wg_timerstop(wgtimer_t *t)
+{
+	uint4 result;
+        gettimeofday( &(t->stop), NULL );
+        result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 +
+                 (t->stop.tv_usec - t->start.tv_usec);
+
+	t->start.tv_sec = t->stop.tv_sec;
+	t->start.tv_usec = t->stop.tv_usec;
+
+	return result;
+}
+
+
+/**
+ * wg_strgmov -- a guarded strcpy() variation
+ * 
+ * copies src to dest (including terminating zero), and returns
+ * pointer to position of terminating zero in dest. The function is
+ * guaranteed not to write past destlimit. If the copy couldn't be
+ * finished, the function returns NULL after restoring the first 
+ * character in dest for your convenience (since this is usually a zero).  
+ */
+char *wg_strgmov( char *dest, const char *src, const char *destlimit )
+{
+	char tmp, *w;
+	
+	if ( !dest || dest >= destlimit ) {
+		return NULL;
+	}
+
+	tmp = *dest;
+	w = dest;
+
+	while ( *src ) {
+
+		*w++ = *src++;
+
+		if ( w == destlimit ) {
+			/*** restore old situation ***/
+			*dest = tmp;
+			return NULL;
+		}
+	}
+
+	*w = '\0';
+	return w;
+
+}
+
+/*
+ * wg_trim() -- remove whitespace surrounding a string. 
+ *
+ * Example: "   bla   bla   bla   " becomes "bla   bla   bla" after trimming.
+ *
+ * ARGUMENTS
+ * - dest : After the trim, this will point to the trimmed string.
+ *          Must be preallocated and at least as big as src. You can
+ *          use src as dest.
+ * - src : Points to the string to be trimmed.
+ *
+ * RETURNS:
+ * dest
+ */
+char *wg_trim( char *dest, const char *src )
+{
+	char *lastnonspace = &dest[-1];
+	const char *p = src;
+	char *w = dest;
+	
+	while ( isspace((int)*p) ) {
+		p++;
+	}
+	while (*p) {
+		if ( !isspace((int)*p) ) {
+			lastnonspace = w;
+		}
+		*w++ = *p++;
+	}
+	lastnonspace[1] = '\0';
+
+	return dest;
+}
+
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000..5dc4656
--- /dev/null
+++ b/src/common.h
@@ -0,0 +1,83 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+/**
+ * common.h -- a mixed bag of helper functions 
+ *
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ * 
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <inttypes.h>
+#include <sys/time.h>
+#include <time.h>
+
+#define WGMIN(x,y)         ((x)<=(y)?(x):(y))
+#define WGMAX(x,y)         ((x)<=(y)?(y):(x))
+#define __STR__(x)         #x
+#define WGSTR(x)           __STR__(x)
+
+typedef uint32_t	uint4;
+typedef uint16_t	uint2;
+typedef uint8_t		uchar;
+
+typedef int32_t		sint4;
+typedef int16_t		sint2;
+typedef int8_t		schar;
+
+typedef int8_t		boole;
+
+
+typedef struct wgtimer_s {
+        struct timeval start;
+        struct timeval stop;
+} wgtimer_t;
+
+
+extern void *wg_malloc( size_t size );
+extern void *wg_calloc( size_t nmemb, size_t size );
+extern void *wg_zalloc( size_t size );
+extern char* wg_strdup( const char *s );
+extern void* wg_realloc( void *ptr, size_t size ) ;
+extern void wg_free( void *mem );
+
+extern char *wg_getline( char *line, int size, FILE *fp );
+
+extern void wg_timerstart(wgtimer_t *t);
+extern uint4 wg_timerstop(wgtimer_t *t);
+
+extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments );
+extern char *wg_strgmov( char *dest, const char *src, const char *destlimit );
+extern char *wg_trim( char *dest, const char *src );
+
+      
+#endif
+
diff --git a/src/constants.h b/src/constants.h
new file mode 100644
index 0000000..0bdae6c
--- /dev/null
+++ b/src/constants.h
@@ -0,0 +1,82 @@
+#ifndef _CONSTANTS_H_
+#define _CONSTANTS_H_
+
+/*
+ * constants.h -- some constants used throughout the code. Not pretty,
+ * but certainly convenient.
+ *
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ * 
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <limits.h>
+
+
+#define VERSION    2
+#define SUBVERSION 0
+#define DESCRIPTION "out of place"
+
+/* Reported matches are those fingerprints with a score less than best
+ * score * THRESHOLDVALUE (i.e. a THRESHOLDVALUE of 1.03 means matches
+ * must score within 3% from the best score.)  
+ */
+#define THRESHOLDVALUE  1.03
+
+/* If more than MAXCANDIDATES matches are found, the classifier reports
+ * unknown, because the input is obviously confusing.
+ */
+#define MAXCANDIDATES   5
+
+/* The size of the buffer used to report the classification.
+ */
+#define MAXOUTPUTSIZE 1024
+
+/* Maximum number of n-grams in a fingerprint */
+#define MAXNGRAMS  400
+
+/* Maximum size of an n-gram? */
+#define MAXNGRAMSIZE 5
+
+/* Which characters are not acceptable in n-grams? */
+#define INVALID(c) (isspace(c) || isdigit(c)) 
+
+/* Minimum size (in characters) for accepting a document */
+#define MINDOCSIZE  25
+
+/* Maximum penalty for missing an n-gram in fingerprint */
+#define MAXOUTOFPLACE 400
+
+/* Size of hash table is 2^TABLEPOW. */
+#define TABLEPOW  13
+
+#define MAXSCORE  INT_MAX
+
+#endif
diff --git a/src/createfp.c b/src/createfp.c
new file mode 100644
index 0000000..1e3e0cd
--- /dev/null
+++ b/src/createfp.c
@@ -0,0 +1,84 @@
+/*
+ * createfprint.c - can be used to create a fingerprint of a document.
+ *
+ * Copyright (c) 2003, WiseGuys Internet B.V.
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ * 
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "fingerprint.h"
+#include "common.h"
+
+#define BLOCKSIZE 4096
+
+char *myread(FILE *fp)
+{
+	char *buf;
+	size_t size = 0;
+	size_t maxsize = BLOCKSIZE*2;
+
+	buf = (char *)wg_malloc( maxsize );
+	do {
+		size_t read = fread( buf+size, 1, BLOCKSIZE, fp );
+		size += read;
+		if ( size + BLOCKSIZE > maxsize ) {
+			maxsize *= 2;
+			buf = (char *)wg_realloc( buf, maxsize );
+		}
+
+	} while (!feof(stdin));
+
+	buf[size] = '\0';
+	buf = (char *)wg_realloc( buf, size+1 );
+
+	return buf;
+}
+
+
+int main( int argc, char **argv )
+{
+	void *h;
+	char *result;
+	wgtimer_t tm;
+	int i;
+	char *buf;
+
+	buf = myread(stdin);
+
+	h = fp_Init(NULL);
+	if ( fp_Create( h, buf, strlen(buf), 400 ) == 0 ) {
+		fprintf(stderr, "There was an error creating the fingerprint\n");
+		exit(-1);
+	}
+	fp_Print(h,stdout);
+	fp_Done(h);
+	wg_free(buf);
+
+	return 0;
+}
diff --git a/src/fingerprint.c b/src/fingerprint.c
new file mode 100644
index 0000000..0d14558
--- /dev/null
+++ b/src/fingerprint.c
@@ -0,0 +1,704 @@
+/**
+ * fingerprint.c -- Routines for creating an n-gram fingerprint of a
+ * buffer.
+ *
+ * Copyright (c) 2003, WiseGuys Internet B.V.
+ * All rights reserved.
+ *
+ * THE BSD LICENSE
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ * 
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * DESCRIPTION
+ *
+ * A fingerprint is a list of most common n-grams, ordered by
+ * frequency. (Note that we can use other strings than n-grams, for
+ * instance entire words.)
+ *
+ * HOW DOES IT WORK?
+ *
+ * - Buffer is sliced up into n-grams
+ * - N-grams are inserted into a hash table that records their frequency
+ * - The table entries are filtered through a N-sized heap to
+ *   get the N most frequent n-grams.
+ *
+ * The reason why we go through the trouble of doing a partial
+ * (heap)sort is that a full quicksort behaves horribly on the data:
+ * most n-grams have a very low count, resulting in a data set in
+ * nearly-sorted order. This causes quicksort to behave very badly. 
+ * Heapsort, on the other hand, behaves handsomely: worst case is
+ * Mlog(N) for M n-grams filtered through a N-sized heap.
+ *
+ * REVISION HISTORY
+ *
+ * Mar 28, 2003 frank@wise-guys.nl -- created
+ *
+ * TODO:
+ * - put table/heap datastructure in a separate file.
+ * */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "common.h"
+#include "wg_mempool.h"
+#include "constants.h"
+
+
+#define TABLESIZE  (1<<TABLEPOW)
+#define TABLEMASK  ((TABLESIZE)-1)
+
+typedef struct {
+
+	sint2 rank;
+	char str[MAXNGRAMSIZE+1];
+
+} ngram_t;
+
+typedef struct fp_s {
+
+	const char *name;
+	ngram_t *fprint;
+	uint4 size;
+	
+} fp_t;
+
+typedef struct entry_s {
+	char str[MAXNGRAMSIZE+1];
+	unsigned int cnt;
+	struct entry_s *next;
+} entry_t;
+
+typedef struct table_s {	
+	void *pool;
+	entry_t **table;
+	entry_t *heap;
+
+	struct table_s *next;
+	
+	uint4 heapsize;
+	uint4 size;
+} table_t;
+
+
+
+/*
+ * fast and furious little hash function
+ *
+ * (Note that we could use some kind of rolling checksum, and update it
+ * during n-gram construction) 
+ */
+static uint4 simplehash( const char *p, int len )
+{
+	sint4 h = len * 13;
+	while (*p) {
+		h = (h<<5)-h + *p++;
+	}
+	return (uint4)h;
+}
+
+
+/* checks if n-gram lex is a prefix of key and of length len */
+inline int issame( char *lex, char *key, int len )
+{
+	int i;
+	for (i=0; i<len; i++) {
+		if ( key[i] != lex[i] ) {
+			return 0;
+		}
+	}
+	if ( lex[i] != 0 ) {
+		return 0;
+	}
+	return 1;
+}
+
+
+/* increases frequency of ngram(p,len) */
+static inline int increasefreq( table_t *t, char *p, int len ) 
+{	
+	uint4 hash = simplehash( p, len ) & TABLEMASK;				
+	entry_t *entry = t->table[ hash ];
+	
+	while ( entry ) {				
+		if ( issame( entry->str, p, len ) ) {
+			/*** Found it! ***/
+			entry->cnt++;
+			return 1;
+		}
+		else {
+			entry = entry->next;
+		}
+	}
+
+	/*** Not found, so create ***/
+	entry = wgmempool_alloc( t->pool, sizeof(entry_t) );
+	strcpy( entry->str, p );
+	entry->cnt = 1;
+
+	entry->next = t->table[hash];
+	t->table[hash] = entry;
+
+	return 1;
+}
+
+
+/* looks up ngram(p,len) */
+static entry_t *findfreq( table_t *t, char *p, int len ) 
+{	
+	uint4 hash = simplehash( p, len ) & TABLEMASK;				
+	entry_t *entry = t->table[ hash ];
+	
+	while ( entry ) {				
+		if ( issame( entry->str, p, len ) ) {
+			return entry;
+		}
+		else {
+			entry = entry->next;
+		}
+	}
+
+	return NULL;
+}
+
+static void dumptable(table_t *t)
+{
+	int i;
+	for (i=0; i<TABLESIZE; i++) {
+
+		entry_t *p = t->table[i];
+
+		while (p) {
+
+			printf("%5u %s\n", p->cnt, p->str);
+			p = p->next;
+		}
+
+	}
+}
+
+
+#define GREATER(x,y) ((x).cnt > (y).cnt)
+#define LESS(x,y)    ((x).cnt < (y).cnt)
+
+inline static void siftup( table_t *t, unsigned int child )
+{
+	entry_t *heap = t->heap;
+	unsigned int parent = (child-1) >> 1;
+	entry_t tmp;
+
+	while ( child > 0 ) {
+		if ( GREATER(heap[parent],heap[child]) ) {
+			memcpy( &tmp, &heap[parent], sizeof(entry_t) );
+			memcpy( &heap[parent], &heap[child], sizeof(entry_t) );
+			memcpy( &heap[child], &tmp, sizeof(entry_t) );
+		}
+		else {
+			return;
+		}
+
+		child = parent;
+		parent = (child-1) >> 1;
+	}
+}
+
+
+inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
+{
+	entry_t *heap = t->heap;
+	unsigned int child = parent*2 + 1;
+	entry_t tmp;
+
+	while ( child < heapsize ) {
+		if ( child+1 < heapsize && GREATER(heap[child], heap[child+1]) ) {
+			child++;
+		}
+		if ( GREATER(heap[parent], heap[child] ) ) {
+			memcpy( &tmp, &heap[parent], sizeof(entry_t) );
+			memcpy( &heap[parent], &heap[child], sizeof(entry_t) );
+			memcpy( &heap[child], &tmp, sizeof(entry_t) );
+		}
+		else {
+			return;
+		}
+		parent = child;
+		child = (parent*2)+1;
+	}
+}
+
+
+static int heapinsert( table_t *t, entry_t *item )
+{
+	entry_t *heap = t->heap;
+
+	/*** Still room for an entry? ***/
+	if (t->size < t->heapsize) {
+		memcpy( &(heap[t->size]), item, sizeof(entry_t));
+		siftup( t, t->size );
+		t->size++;		
+		return 0;
+	}
+
+	/*** Worse than the worst performer? ***/
+	if ( LESS(*item, heap[0]) ) {
+		return 0;
+	}
+
+	/*** Insert into heap and reheap ***/
+	memcpy( &(heap[0]), item, sizeof(entry_t));
+	siftdown( t, t->size, 0 );
+	return 0;
+}
+
+
+extern int heapextract( table_t *t, entry_t *item )
+{
+	entry_t *p;
+
+	if (t->size == 0 ) {
+		return 0;
+	}
+
+	p = &(t->heap[0]);
+
+	memcpy( item, p, sizeof( entry_t ));
+	memcpy( &(t->heap[0]), &(t->heap[t->size-1]), sizeof(entry_t) );
+
+	siftdown(t,t->size, 0);
+	t->size--;
+
+	return 1;
+}
+
+
+/*** Makes a heap of all table entries ***/
+static int table2heap(table_t *t)
+{
+	int i;
+	uint4 cnt = 0;
+	uint4 bufsize = 2048;
+
+	/*** Fill result heap ***/
+	for (i=0; i<TABLESIZE; i++) {
+		entry_t *p = t->table[i];	       
+		while (p) {
+			heapinsert(t, p);
+			p = p->next;
+		}
+	}     
+	return 1;
+}
+
+
+static table_t *inittable(uint4 maxngrams)
+{	
+	table_t *result = (table_t *)wg_zalloc( sizeof(table_t) );
+	result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE );
+	result->pool = wgmempool_Init( 10000, 10 );
+
+	result->heap = (entry_t *)wg_malloc( sizeof(entry_t) * maxngrams );
+	result->heapsize = maxngrams;
+	result->size = 0;
+
+	return result;
+}
+
+static void tabledone( table_t *t )
+{
+	if (!t) {
+		return;
+	}
+	wgmempool_Done(t->pool);
+	wg_free(t->table);
+	wg_free(t->heap);
+	wg_free(t);	
+}
+
+
+extern void *fp_Init(const char *name)
+{
+	fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) );
+	
+	if ( name ) {
+		h->name = wg_strdup(name);
+	}
+
+	return (void *)h;
+}
+
+
+extern void fp_Done( void *handle )
+{
+	fp_t *h = (fp_t *)handle;
+
+	if ( h->name ) {
+		wg_free( (void *)h->name );
+	}
+	if ( h->fprint ) {
+		wg_free( h->fprint );
+	}
+
+	wg_free( h );
+}
+
+extern const char *fp_Name( void *handle )
+{
+	fp_t *h = (fp_t *)handle;
+	return  h->name;
+}
+
+/**
+ * Function that prepares buffer for n-grammification:
+ * runs of invalid characters are collapsed to a single
+ * underscore.
+ *
+ * Function is implemented as a finite state machine.
+ */
+static char *prepbuffer( const char *src, size_t bufsize )
+{
+	const char *p = src;
+	char *dest = (char *)wg_malloc( bufsize + 3 );
+	char *w = dest;
+	char *wlimit = dest + bufsize + 1;
+
+ START:
+	if ( INVALID(*p) ) {
+		goto SPACE;
+	}
+	else if ( *p == '\0' ) {
+		goto END;
+	}
+
+	*w++ = '_';
+	if ( w == wlimit ) {
+		goto STOP;
+	}
+
+	goto WORD;
+
+
+ SPACE:
+	/*** Inside string of invalid characters ***/
+	p++;
+	if ( INVALID(*p) ) {
+		goto SPACE;
+	}
+	else if ( *p == '\0' ) {
+		goto END;
+	}
+
+	*w++ = '_';
+	if ( w == wlimit ) {
+		goto STOP;
+	}
+
+	goto WORD;
+
+ WORD:
+	/*** Inside string of valid characters ***/
+	*w++ = *p++;
+	if ( w == wlimit ) {
+		goto END;
+	}
+	else if ( INVALID(*p) ) {
+		goto SPACE;
+	}
+	else if ( *p == '\0' ) {
+		goto STOP;
+	}
+	goto WORD;
+
+ END:
+	*w++ = '_';
+
+ STOP:
+	*w++ = '\0';
+
+	/*** Docs that are too small for a fingerprint, are refused ***/
+	if ( w - dest < MINDOCSIZE ) {
+		wg_free(dest);
+		return NULL;
+	}
+
+	return dest;
+}
+
+
+static void createngramtable( table_t *t, const char *buf )
+{
+	char n[MAXNGRAMSIZE+1];
+	const char *p = buf;
+	int i;
+
+	/*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/
+	for (;;p++) {
+
+		const char *q = p;
+		char *m = n;
+
+		/*** First char may be an underscore ***/
+		*m++ = *q++;
+		*m = '\0';
+
+		increasefreq( t, n, 1 );
+
+		if ( *q == '\0' ) {
+			return;
+		}
+
+		/*** Let the compiler unroll this ***/
+		for ( i=2; i<=MAXNGRAMSIZE; i++) {
+
+			*m++ = *q;
+			*m = '\0';
+
+			increasefreq( t, n, i );
+
+			if ( *q == '_' ) break;
+			q++;
+			if ( *q == '\0' ) {
+				return;
+			}
+		}
+	}
+	return;
+}
+
+
+static int mystrcmp(const char *a, const char *b)
+{
+	int i;
+	while ( *a && *a == *b ) {
+		a++;
+		b++;
+	}
+	return (*a - *b);
+}
+
+
+static int ngramcmp_str(const void *a, const void *b)
+{
+	ngram_t *x = (ngram_t *)a;
+	ngram_t *y = (ngram_t *)b;
+	
+	return mystrcmp( x->str, y->str );
+}
+
+static int ngramcmp_rank(const void *a, const void *b)
+{
+	ngram_t *x = (ngram_t *)a;
+	ngram_t *y = (ngram_t *)b;
+	
+	return x->rank - y->rank;
+}
+
+/**
+ * Create a fingerprint: 
+ * - record the frequency of each unique n-gram in a hash table
+ * - take the most frequent n-grams
+ * - sort them alphabetically, recording their relative rank
+ */
+extern int fp_Create( void *handle, const char *buffer, uint4 bufsize, uint4 maxngrams )
+{
+	sint4 i = 0;
+	fp_t *h = NULL;
+	table_t *t  = NULL;
+	char *tmp   = NULL;
+
+	if ( bufsize < MINDOCSIZE ) {
+		return 0;
+	}
+
+	/*** Throw out all invalid chars ***/
+	tmp = prepbuffer( buffer, bufsize ); 	
+	if ( tmp == NULL ) {
+		return 0;
+	}
+
+	h = (fp_t*)handle;
+	t = inittable(maxngrams);
+
+	/*** Create a hash table containing n-gram counts ***/
+	createngramtable(t, tmp);
+
+	/*** Take the top N n-grams and add them to the profile ***/
+	table2heap(t);	
+	maxngrams = WGMIN( maxngrams, t->size );       
+
+	h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams );
+	h->size = maxngrams;
+
+	/*** Pull n-grams out of heap (backwards) ***/
+	for (i=maxngrams-1; i>=0; i--) {
+
+		entry_t tmp2;
+
+		heapextract(t, &tmp2);
+		
+		/*** the string and its rank is all we need ***/
+		strcpy( h->fprint[i].str, tmp2.str );
+		h->fprint[i].rank = i;
+	}
+
+	tabledone(t);
+	wg_free(tmp);
+
+	/*** Sort n-grams alphabetically, for easy comparison ***/
+	qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); 
+	return 1;
+}
+
+extern void fp_Debug( void *handle )
+{
+	fp_t *h = (fp_t *)handle;
+	uint4 i;
+	printf("------ %s --------\n", h->name );
+	for (i=0; i<h->size; i++) {
+		printf("%3u: '%s' [%u]\n", i, h->fprint[i].str,  h->fprint[i].rank);
+	}
+
+
+}
+
+extern int fp_Read( void *handle, const char *fname, int maxngrams )
+{
+	fp_t *h = (fp_t *)handle;
+	FILE *fp;
+	char line[1024];
+	int cnt = 0;
+
+	fp = fopen( fname, "r" );
+	if (!fp) {
+#ifdef VERBOSE
+		fprintf( stderr, "Failed to open fingerprint file '%s'\n", fname);
+#endif
+		return 0;
+	}
+	
+	h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t));
+
+	while (cnt < maxngrams && wg_getline(line,1024,fp)) {
+
+		char *p;
+
+		wg_trim(line, line);
+
+		p = strpbrk( line, " \t" );
+		if ( p ) {
+			*p = '\0';
+		}
+
+		if ( strlen(line) > MAXNGRAMSIZE ) {
+			continue;
+		}
+
+		strcpy( h->fprint[cnt].str, line );
+		h->fprint[cnt].rank = cnt;
+
+		cnt++;
+	}
+
+	h->size = cnt;
+
+	/*** Sort n-grams, for easy comparison later on ***/
+	qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); 
+
+	fclose(fp);
+
+	return 1;
+}
+
+
+
+extern void fp_Print( void *handle, FILE *fp )
+{
+	uint4 i;
+	fp_t *h = (fp_t *)handle;
+	ngram_t *tmp = wg_malloc( sizeof(ngram_t) * h->size );
+	
+	/*** Make a temporary and sort it on rank ***/
+	memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) );
+	qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank ); 
+
+	for (i=0; i<h->size; i++) {
+		fprintf( fp, "%s\n", tmp[i].str );
+	}
+	wg_free( tmp );
+}
+
+
+
+extern sint4 fp_Compare( void *cat, void *unknown, int cutoff )
+{
+	fp_t *c = (fp_t *)cat;
+	fp_t *u = (fp_t *)unknown;
+	uint4 i = 0;
+	uint4 j = 0;
+	sint4 sum = 0;
+	
+	/*** Compare the profiles in mergesort fashion ***/
+	while ( i < c->size && j < u->size ) {
+
+		int cmp = mystrcmp( c->fprint[i].str, u->fprint[j].str );
+
+		if ( cmp < 0 ) {
+			i++;
+		}
+		else if ( cmp == 0 ) {
+			sum += abs( c->fprint[i].rank - u->fprint[j].rank );
+			if ( sum > cutoff ) {
+				return MAXSCORE;
+			}
+			i++;
+			j++;
+		}
+		else {
+			sum += MAXOUTOFPLACE;
+			if ( sum > cutoff ) {
+				return MAXSCORE;
+			}
+			j++;
+		}
+	}
+
+	/*** Process tail of unknown, if any ***/
+	while ( j < u->size ) {
+		sum += MAXOUTOFPLACE;
+		if ( sum > cutoff ) {
+			return MAXSCORE;
+		}
+		j++;
+	}
+
+	return sum;
+	       	
+}
+
+
diff --git a/src/fingerprint.h b/src/fingerprint.h
new file mode 100644
index 0000000..de55fe9
--- /dev/null
+++ b/src/fingerprint.h
@@ -0,0 +1,47 @@
+#ifndef _FINGERPRINT_H_
+#define _FINGERPRINT_H_
+/*
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ * 
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "common.h"
+
+extern void *fp_Init(const char *name);
+extern void fp_Done( void *handle );
+extern int fp_Create( void *handle, const char *buffer, uint4 bufsize, uint4 maxngrams );
+extern int fp_Read( void *handle, const char *fname, int maxngrams );
+extern sint4 fp_Compare( void *cat, void *unknown, int cutoff );
+extern void fp_Show( void *handle );
+extern const char *fp_Name( void *handle );
+extern void fp_Print( void *handle, FILE *fp );
+
+#endif
diff --git a/src/testtextcat.c b/src/testtextcat.c
new file mode 100644
index 0000000..3fecc94
--- /dev/null
+++ b/src/testtextcat.c
@@ -0,0 +1,98 @@
+/*
+ * testtextcat.c -- a simple commandline classifier. Feed it input on
+ * standard in and it will feed you a classification on standard out.
+ *
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ * 
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "textcat.h"
+#include "common.h"
+
+#define BLOCKSIZE 4096
+
+char *myread(FILE *fp)
+{
+	char *buf;
+	size_t size = 0;
+	size_t maxsize = BLOCKSIZE*2;
+
+	buf = (char *)wg_malloc( maxsize );
+	do {
+		size_t read = fread( buf+size, 1, BLOCKSIZE, fp );
+		size += read;
+		if ( size + BLOCKSIZE > maxsize ) {
+			maxsize *= 2;
+			buf = (char *)wg_realloc( buf, maxsize );
+		}
+
+	} while (!feof(stdin));
+
+	buf[size] = '\0';
+	buf = (char *)wg_realloc( buf, size+1 );
+
+	return buf;
+}
+
+
+int main( int argc, char **argv )
+{
+	void *h;
+	char *result;
+	wgtimer_t tm;
+	int i;
+	char *buf;
+
+	printf("%s\n", textcat_Version());
+
+	h = textcat_Init( argc>1?argv[1]:"conf.txt" );
+	if ( !h ) {
+		printf("Unable to init. Aborting.\n");
+		exit(-1);
+	}
+
+	buf = myread(stdin);
+
+	wg_timerstart(&tm);
+
+	/*** We only need a little text to determine the language ***/
+	buf[1024] = '\0';
+	result = textcat_Classify( h, buf, strlen(buf)+1 );
+	printf("Result == %s\n", result);
+	
+	fprintf(stderr, "That took %u ms.\n", wg_timerstop(&tm)/1000);
+
+	textcat_Done(h);
+
+	wg_free(buf);
+
+	return 0;
+}
diff --git a/src/textcat.c b/src/textcat.c
new file mode 100644
index 0000000..b64734c
--- /dev/null
+++ b/src/textcat.c
@@ -0,0 +1,237 @@
+/**
+ * textcat.c -- routines for categorizing text
+ *
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ * 
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * DESCRIPTION
+ *
+ * These routines use the N-gram fingerprinting technique as described
+ * in Cavnar and Trenkle, (1994.), N-Gram-Based Text Categorization.
+ * (cf. http://www.nonlineardynamics.com/trenkle/)
+ *
+ * REVISION HISTORY
+ *
+ * Mar 27, 2003 frank@wise-guys.nl -- created
+ *
+ * IMPROVEMENTS:
+ * - If two n-grams have the same frequency count, choose the shortest
+ * - Use a better similarity measure (the article suggests Wilcoxon rank test)
+ * - The profiles are matched one by one, which results in redundant lookups.
+ * - Make the thingy reentrant as well as thread-safe. (Reentrancy is abandoned
+ *   by the use of the output buffer in textcat_t.)
+ */
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+#include "fingerprint.h"
+#include "textcat.h"
+#include "constants.h"
+
+
+typedef struct {
+
+	void **fprint;
+	uint4 size;
+	uint4 maxsize;
+
+	char output[MAXOUTPUTSIZE];
+
+} textcat_t;
+
+
+typedef struct {
+	int score;
+	const char *name;
+} candidate_t;
+
+
+static int cmpcandidates(const void *a, const void *b)
+{
+	candidate_t *x = (candidate_t *)a;
+	candidate_t *y = (candidate_t *)b;
+
+	if ( x->score < y->score ) {
+		return -1;
+	}
+	if ( x->score > y->score ) {
+		return 1;
+	}
+	return 0;
+}
+
+
+extern void textcat_Done( void *handle )
+{
+	textcat_t *h = (textcat_t *)handle;
+	uint4 i;
+
+	for (i=0; i<h->size; i++) {
+		fp_Done( h->fprint[i] );
+	}
+	wg_free( h->fprint );
+	wg_free( h );
+
+}
+
+extern void *textcat_Init( const char *conffile )
+{
+	textcat_t *h;
+	char line[1024];
+	FILE *fp;
+
+	fp = fopen( conffile, "r" );
+	if ( !fp ) {
+#ifdef VERBOSE
+		fprintf( stderr, "Failed to open config file '%s'\n", conffile);
+#endif
+		return NULL;
+	}
+
+	h = (textcat_t *)wg_malloc(sizeof(textcat_t));
+	h->size = 0;
+	h->maxsize = 16;
+	h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize );
+
+	while ( wg_getline( line, 1024, fp ) ) {
+		char *p;
+		char *segment[4];
+		double startcnt = 0.0;		
+		int res;
+
+		/*** Skip comments ***/
+		if (( p = strchr(line,'#') )) {
+			*p = '\0';
+		}
+		if ((res = wg_split( segment, line, line, 4)) < 2 ) {
+			continue;
+		}
+
+		/*** Ensure enough space ***/
+		if ( h->size == h->maxsize ) {
+			h->maxsize *= 2;
+			h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
+		}
+
+		/*** Load data ***/
+		if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) {
+			goto ERROR;
+		}
+		if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) {
+			textcat_Done(h);
+			goto ERROR;
+		}		
+		h->size++;
+	}
+
+	fclose(fp);
+	return h;
+
+ ERROR:
+	fclose(fp);
+	return NULL;
+
+}
+
+
+extern char *textcat_Classify( void *handle, const char *buffer, size_t size )
+{
+	textcat_t *h = (textcat_t *)handle;
+	uint4 i, cnt = 0;
+	int minscore = MAXSCORE;
+	int min = -1;
+	int threshold = minscore;
+	char *result = h->output;
+
+	candidate_t *candidates = (candidate_t *)alloca( sizeof(candidate_t) * h->size );
+
+	void *unknown;
+
+	unknown = fp_Init(NULL);
+	if ( fp_Create( unknown, buffer, size, MAXNGRAMS ) == 0 ) {
+		/*** Too little information ***/
+		result = _TEXTCAT_RESULT_SHORT;
+		goto READY;
+	}
+	
+	/*** Calculate the score for each category. ***/
+	for (i=0; i<h->size; i++) {
+		int score = fp_Compare( h->fprint[i], unknown, threshold );
+		candidates[i].score = score;
+		candidates[i].name = fp_Name( h->fprint[i] );
+		if ( score < minscore ) {
+			minscore = score;
+			threshold = (int)( (double)score * THRESHOLDVALUE );
+		}
+	}
+
+	/*** Find the best performers ***/
+	for (i=0; i<h->size; i++) {
+		if ( candidates[i].score < threshold ) {
+
+			if ( ++cnt == MAXCANDIDATES+1 ) {
+				break;
+			}
+
+			memcpy( &candidates[cnt-1], &candidates[i], sizeof(candidate_t) );
+
+		}
+	}
+
+	/*** The verdict ***/
+	if ( cnt == MAXCANDIDATES+1 ) {
+		result = _TEXTCAT_RESULT_UNKOWN;
+	}
+	else {
+		char *p = result;
+		char *plimit = result+MAXOUTPUTSIZE;
+		
+		qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates );
+
+		*p = '\0';
+		for (i=0; i<cnt; i++) {
+			p = wg_strgmov( p, "[", plimit );
+			p = wg_strgmov( p, candidates[i].name, plimit );
+			p = wg_strgmov( p, "]", plimit );
+		}
+	}
+ READY:
+	fp_Done(unknown);
+	return result;
+}
+
+
+extern char *textcat_Version()
+{
+	return "TextCat " WGSTR(VERSION) "." WGSTR(SUBVERSION) " (" DESCRIPTION ")";
+}
diff --git a/src/textcat.h b/src/textcat.h
new file mode 100644
index 0000000..746e131
--- /dev/null
+++ b/src/textcat.h
@@ -0,0 +1,80 @@
+#ifndef _TEXTCAT_H_
+#define _TEXTCAT_H_
+/*
+ * textcat.h -- routines for categorizing text
+ *
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ * 
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+
+#define _TEXTCAT_RESULT_UNKOWN        "UNKNOWN"
+#define _TEXTCAT_RESULT_SHORT         "SHORT"
+
+
+/**
+ * textcat_Init() - Initialize the text classifier. The textfile
+ * conffile should contain a list of fingerprint filenames and
+ * identification strings for the categories.  The filenames should be
+ * reachable from the current working directory. The identification
+ * strings will are used in the classification output.
+ * 
+ * Returns: handle on success, NULL on error. (At the moment, the
+ * only way errors can occur, is when the library cannot read the
+ * conffile, or one of the fingerprint files listed in it.)
+ */
+extern void *textcat_Init( const char *conffile );
+
+/**
+ * textcat_Done() - Free up resources for handle
+ */
+extern void textcat_Done( void *handle );
+
+/**
+ * textcat_Classify() - Give the most likely categories for buffer
+ * with length size.
+ *
+ * Returns: string containing a list of category id's, each one
+ * between square brackets, "UNKNOWN" when not recognized, "SHORT" if the
+ * document was too short to make a reliable assessment.
+ *
+ * Performace note: longer buffers take longer to process. However,
+ * for many uses it is not necessary to categorize the whole buffer.
+ * For language classification, a few hundred bytes will suffice.  
+ */
+extern char *textcat_Classify( void *handle, const char *buffer, size_t size );
+
+/**
+ * textcat_Version() - Returns a string describing the version of this classifier.
+ */
+extern char *textcat_Version();
+#endif
diff --git a/src/wg_mempool.c b/src/wg_mempool.c
new file mode 100644
index 0000000..d489ffd
--- /dev/null
+++ b/src/wg_mempool.c
@@ -0,0 +1,231 @@
+/*
+ * wg_mempool.c -- Functions for managing a memory pool. 
+ *
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ * 
+ * THE BSD LICENSE
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ * 
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include "common.h"
+
+typedef struct memblock_s {
+	char *pool;             
+	char *p;
+	char *pend;
+	struct memblock_s *next;
+} memblock_t;
+
+
+typedef struct mempool_s {
+	memblock_t *first;	/* linked list of blocks in use */
+	memblock_t *spare;      /* linked list of spare blocks */
+	size_t maxallocsize;
+	size_t blocksize;
+} mempool_t;
+
+
+
+static void addblock( mempool_t *h )
+{
+	memblock_t *block;
+
+	/*** Spare blocks from previous round? ***/
+	if ( h->spare ) {
+		/*** Use spare block ***/
+		block = h->spare;
+		h->spare = block->next;
+	}
+	else {
+		/*** Make a new block ***/
+		block = (memblock_t *)wg_malloc( sizeof(memblock_t) );
+		block->pool = (char *)wg_malloc( h->blocksize );
+	}
+
+	block->p = block->pool;
+	block->pend = block->pool + h->blocksize - h->maxallocsize;
+        block->next = h->first;
+	h->first = block;
+}
+
+
+extern void *wgmempool_Init(size_t blocksize, size_t maxstrsize)
+{
+	mempool_t *result = (mempool_t *)wg_malloc( sizeof(mempool_t) );
+	
+	result->first = NULL;
+	result->spare = NULL;
+	result->blocksize = blocksize;
+	result->maxallocsize = maxstrsize?(maxstrsize + 1):0;
+	addblock( result );
+
+	return (void *)result;
+}
+
+
+extern void wgmempool_Done( void *handle )
+{
+	mempool_t *h = (mempool_t *)handle;
+
+	memblock_t *p;
+
+	/*** Active blocks ***/
+	p = h->first;
+	while (p) {
+		memblock_t *next = p->next;
+		wg_free( p->pool );
+
+		memset( p, 0, sizeof(memblock_t)); /* for safety */
+		wg_free( p );
+
+		p = next;
+	}
+
+	/*** Spare blocks ***/
+	p = h->spare;
+	while (p) {
+		memblock_t *next = p->next;
+		wg_free( p->pool );
+
+		memset( p, 0, sizeof(memblock_t)); /* for safety */
+		wg_free( p );
+
+		p = next;
+	}
+
+	memset( h, 0, sizeof(mempool_t)); /* for safety */
+	wg_free(h);
+}
+
+extern void wgmempool_Reset( void *handle )
+{
+	mempool_t *h = (mempool_t *)handle;
+	memblock_t *p;
+
+	if ( !h->first ) {
+		return;
+	}
+
+	/*** Find last active block ***/
+	p = h->first;
+	while (p->next) {
+		p = p->next;
+	}
+
+	/*** Append spare list to it ***/
+	p->next = h->spare;
+	h->spare = h->first;
+	h->first = NULL;	
+
+	/*** Start with a new block ***/
+	addblock(h);
+}
+
+
+extern void *wgmempool_alloc( void *handle, size_t size )
+{
+	void *result;
+	mempool_t *h = (mempool_t *)handle;
+	memblock_t *block = h->first;
+
+	/*** Too little space left in block? ***/
+	if ( block->p + size > block->pend + h->maxallocsize ) {
+		addblock(h);
+		block = h->first;
+	}
+	result = (void *)block->p;
+	block->p += size;
+	return result;
+}
+
+
+
+extern char *wgmempool_strdup( void *handle, const char *str )
+{
+	char *w, *result;
+	mempool_t *h = (mempool_t *)handle;
+	memblock_t *block = h->first;
+
+	/*** Create extra room? ***/
+	if ( h->maxallocsize ) {
+		if ( block->p >= block->pend ) {
+			addblock(h);
+			block = h->first;
+		}
+	}
+	else if ( block->p + strlen(str) + 1 >= block->pend ) {
+		addblock(h);
+		block = h->first;
+	}
+
+	/*** Enough room, so copy string ***/
+	result = w = block->p;
+        while (*str) {
+		*w++ = *str++;
+	}
+	*w++ = '\0';
+	block->p = w;
+	return result;
+}
+
+
+extern char *wgmempool_getline( void *handle, size_t size, FILE *fp )
+{
+	char *result, *p;
+	mempool_t *h = (mempool_t *)handle;
+	memblock_t *block = h->first;
+
+	/*** Enough space? ***/
+	if ( block->p + size > block->pend + h->maxallocsize ) {
+		addblock(h);
+		block = h->first;
+	}
+
+	result = (char *)block->p;
+        fgets(result, size, fp) ;
+        
+        /** end of stream? **/
+        if ( feof( fp ) ) {
+                return NULL;
+        }
+        
+        /** find end of line **/
+	p = result;
+	while (*p && *p != '\n' && *p != '\r' ) {
+		p++;
+	}
+	*p++ = '\0';
+  
+	block->p = p;
+	return result;
+}
+
+
diff --git a/src/wg_mempool.h b/src/wg_mempool.h
new file mode 100644
index 0000000..73d40af
--- /dev/null
+++ b/src/wg_mempool.h
@@ -0,0 +1,144 @@
+#ifndef _WGMEMPOOL_H_
+#define _WGMEMPOOL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+ * mempool.c -- functions for efficient, pooled memory allocation
+ *
+ * THE BSD LICENSE
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ * 
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * DESCRIPTION
+ *
+ * These functions can be used to maintain a pool from which memory
+ * can be claimed without performing numerous allocs. 
+ *
+ * A memory pool can increase time and space efficiency by factors
+ * in situations where:
+ *
+ * - you need numerous allocations of little chunks of memory, e.g.
+ *   a huge list of keywords.
+ * - you need little or no deallocations: preferable just one big 
+ *   dealloc at the end. 
+ *
+ * A memory pool is unfavorable in situations where you constantly
+ * alloc and free memory. 
+ *
+ * NOTES
+ *
+ * - A memory pool consists of "blocks" of a fixed size, which are
+ *   added as the need for extra memory grows. 
+ * 
+ * - Deallocation of the individual fragments is not possible. The
+ *   pool can only be deallocated in its entirety.
+ *
+ */
+
+
+#include "common.h"
+
+/* 
+ * wgmempool_Init() -- intialize a memory pool
+ *
+ * ARGUMENTS
+ *
+ * - blocksize : size of blocks of which pool consists
+ * - maxstrsize: 
+ *            -  > 0 : the maximum size of a string that is copied 
+ *               with wgmempool_strdup(). Omits the necessity of boundschecking
+ *               in wgmempool_strdup(), making is slighlty faster.
+ *            -  0 : causes wgmempool_strdup() to do its own boundschecking,
+ *               which makes it somewhat slower.
+ *
+ * RETURN VALUE
+ *
+ * mempool handler on success, NULL on error.
+ * 
+ */
+extern void *wgmempool_Init(uint4 blocksize, size_t maxstrsize);
+
+
+/*
+ * wgmempool_Done() -- deallocate memory pool in its entirety
+ */
+extern void wgmempool_Done( void *handle );
+
+
+/*
+ * wgmempool_Reset() -- resets a memory pool for reuse
+ *
+ * wgmempool_Reset() preserves already claimed memory for reuse, making
+ * it more time efficient than doing a wgmempool_Done() and wgmempool_Init().
+ */
+extern void wgmempool_Reset( void *handle );
+
+
+/*
+ * wgmempool_alloc() -- Allocate size bytes of memory in mempool 
+ *
+ * ARGUMENTS
+ *
+ * - size: number of bytes to claim
+ *
+ * RETURN VALUE
+ *
+ * - pointer to claimed memory on success
+ * - NULL on error
+ */
+extern void *wgmempool_alloc( void *handle, size_t size );
+
+
+/*
+ * wgmempool_strdup() -- Allocate and copy a string to mempool
+ *
+ * ARGUMENTS
+ *
+ * - str : null-terminated string. If maxstrsize > 0 during
+ *         init, strlen(str) may not be greater than this value.
+ *
+ * RETURN VALUE
+ *
+ * - pointer to duplicated string on success
+ * - NULL on error
+ */
+extern char *wgmempool_strdup( void *handle, const char* str );
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
+