summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--LICENSE29
-rw-r--r--README141
-rw-r--r--TODO9
l---------include/textcat.h1
-rw-r--r--langclass/conf.txt89
l---------lib/libtextcat.a1
-rw-r--r--src/.#fingerprint.c.1.2634
-rw-r--r--src/Makefile62
-rw-r--r--src/common.c376
-rw-r--r--src/common.h83
-rw-r--r--src/constants.h82
-rw-r--r--src/createfp.c84
-rw-r--r--src/fingerprint.c704
-rw-r--r--src/fingerprint.h47
-rw-r--r--src/testtextcat.c98
-rw-r--r--src/textcat.c237
-rw-r--r--src/textcat.h80
-rw-r--r--src/wg_mempool.c231
-rw-r--r--src/wg_mempool.h144
19 files changed, 3132 insertions, 0 deletions
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..6eb19a4
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,29 @@
+Copyright (c) 2003, WiseGuys Internet B.V.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+- Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+- Neither the name of the WiseGuys Internet B.V. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README b/README
new file mode 100644
index 0000000..d8626a5
--- /dev/null
+++ b/README
@@ -0,0 +1,141 @@
+
+ :: libTextCat 2.0 ::
+
+ What is it?
+
+ Libtextcat is a library with functions that implement the
+ classification technique described in Cavnar & Trenkle, "N-Gram-Based
+ Text Categorization" [1]. It was primarily developed for language
+ guessing, a task on which it is known to perform with near-perfect
+ accuracy.
+
+ The central idea of the Cavnar & Trenkle technique is to calculate a
+ "fingerprint" of a document with an unknown category, and compare this
+ with the fingerprints of a number of documents of which the categories
+ are known. The categories of the closest matches are output as the
+ classification. A fingerprint is a list of the most frequent n-grams
+ occurring in a document, ordered by frequency. Fingerprints are
+ compared with a simple out-of-place metric. See the article for more
+ details.
+
+ Considerable effort went into making this implementation fast and
+ efficient. The language guesser processes over 100 documents/second on
+ a simple PC, which makes it practical for many uses. It was developed
+ for use in our webcrawler and search engine software, in which it it
+ handles millions of documents a day.
+
+ Download
+
+ The library is released under the BSD License, which basicly states
+ that you can do anything you like with it as long as you mention us
+ and make it clear that this library is covered by the BSD License. It
+ also exempts us from any liability, should this library eat your hard
+ disc, kill your cat or classify your attorney's e-mails as spam.
+
+ The current version is 2.0.
+
+ As of yet there is no development version.
+
+ The distribution contains a configuration file for Gertjan van Noord's
+ language models, which are distributed under the GNU Public License.
+ Please note that this license does not allow you to distribute them as
+ a part of closed source software package. In time, we will provide you
+ with language models under a less restrictive license.
+
+ Installation
+
+ As of now, we have no autoconfig script. A "make all" in the src/ dir
+ should do the trick. If that doesn't work, you'll have to dive into
+ the mess we call our Makefile.
+
+ The library is known to compile flawlessly on GNU/Linux for x86. If
+ you manage to get it working on other systems, please drop us a note,
+ and we'll proudly add you to this page.
+
+ Quickstart: language guesser
+
+ Assuming that you have successfully compiled the library, you still
+ need some language models to start guessing languages. If you don't
+ feel like creating them yourself (cf. [2]Creating your own
+ fingerprints below), you can use the excellent collection of over 70
+ language models provided in Gertjan van Noord's "TextCat" package. We
+ have provided a configuration file for these models in the langclass
+ directory. Hack it at will.
+
+ * Download the textcat package at
+ http://odur.let.rug.nl/~vannoord/TextCat/
+ * mkdir ~/text_cat
+ * cd ~/text_cat
+ * tar xzf ~/your_download_dir/text_cat.tgz
+ * ~/libtextcat/src/testtextcat ~/libtextcat/langclass/conf.txt
+
+ Paste some text onto the commandline, and watch it get classified.
+
+ Using the API
+
+ Classifying the language of a textbuffer can be as easy as:
+
+ #include "textcat.h"
+ ...
+ void *h = textcat_Init( "conf.txt" );
+ ...
+ printf( "Language: %s\n", textcat_Classify(h, buffer, 400);
+ ...
+ textcat_Done(h);
+
+ Creating your own fingerprints
+
+ The createfp program allows you to easily create your own document
+ fingerprints. Just feed it an example document on standard input, and
+ store the standard output:
+
+ % createfp < mydocument.txt > myfingerprint.txt
+
+ Put the names of your fingerprints in a configuration file, add some
+ id's and you're ready to classify.
+
+ Performance tuning
+
+ This library was made with efficiency in mind. There are couple of
+ parameters you may wish to tweak if you intend to use it for other
+ tasks than language guessing.
+
+ The most important thing is buffer size. For reliable language
+ guessing the classifier only needs a couple of hundreds of bytes max.
+ So don't feed it 100KB of text unless you are creating a fingerprint.
+
+ If you insist on feeding the classifier lots of text, try fiddling
+ with TABLEPOW, which determines the size of the hash table that is
+ used to store the n-grams. Making it too small will result in many
+ hashtable clashes, making it too large will cause wild memory
+ behaviour and both are bad for the performance.
+
+ Putting the most probable models at the top of the list in your config
+ file improves performance, because this will raise the threshold for
+ likely candidates more quickly.
+
+ Since the speed of the classifier is roughly linear with respect to
+ the number of models, you should consider how many models you really
+ need. In case of language guessing: do you really want to recognize
+ every language ever invented?
+
+ References
+
+ [1] The document that started it all can be downloaded at John M.
+ Trenkle's site: N-Gram-Based Text Categorization
+
+ http://www.novodynamics.com/trenkle/papers/sdair-94-bc.ps.gz
+
+ [2] The Perl implementation by Gertjan van Noord (code + language
+ models): downloadable from his [7]website
+
+ http://odur.let.rug.nl/~vannoord/TextCat/
+
+ Contact
+
+ Praise and flames may be directed at us through
+ libtextcat AT wise-guys.nl. If there is enough interest, we'll whip up
+ a mailing list. The current project maintainer is Frank Scheelen.
+
+ c. 2003 WiseGuys Internet B.V.
+
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..24dffef
--- /dev/null
+++ b/TODO
@@ -0,0 +1,9 @@
+
+TODO
+
+- Better Makefile
+- Improve error handling and reporting.
+- Split off table and heap routines to separate source files.
+- Speed optimizations. Some speed might be gained by comparing
+ the unknown fingerprint to all known fingerprints at once.
+- More docs, more comments. \ No newline at end of file
diff --git a/include/textcat.h b/include/textcat.h
new file mode 120000
index 0000000..9db5260
--- /dev/null
+++ b/include/textcat.h
@@ -0,0 +1 @@
+../src/textcat.h \ No newline at end of file
diff --git a/langclass/conf.txt b/langclass/conf.txt
new file mode 100644
index 0000000..4eb9f73
--- /dev/null
+++ b/langclass/conf.txt
@@ -0,0 +1,89 @@
+#
+# A sample config file for the language models
+# provided with Gertjan van Noords language guesser
+# (http://odur.let.rug.nl/~vannoord/TextCat/)
+#
+# Notes:
+# - You may consider eliminating a couple of small languages from this
+# list because they cause false positives with big languages and are
+# bad for performance. (Do you really want to recognize Drents?)
+# - Putting the most probable languages at the top of the list
+# improves performance, because this will raise the threshold for
+# likely candidates more quickly.
+#
+LM/afrikaans.lm afrikaans
+LM/albanian.lm albanian
+LM/amharic-utf.lm amharic-utf
+LM/arabic-iso8859_6.lm arabic-iso8859_6
+LM/arabic-windows1256.lm arabic-windows1256
+LM/armenian.lm armenian
+LM/basque.lm basque
+LM/belarus-windows1251.lm belarus-windows1251
+LM/bosnian.lm bosnian
+LM/breton.lm breton
+LM/bulgarian-iso8859_5.lm bulgarian-iso8859_5
+LM/catalan.lm catalan
+LM/chinese-big5.lm chinese-big5
+LM/chinese-gb2312.lm chinese-gb2312
+LM/croatian-ascii.lm croatian-ascii
+LM/czech-iso8859_2.lm czech-iso8859_2
+LM/danish.lm danish
+LM/drents.lm drents # Dutch dialect
+LM/dutch.lm dutch
+LM/english.lm english
+LM/esperanto.lm esperanto
+LM/estonian.lm estonian
+LM/finnish.lm finnish
+LM/french.lm french
+LM/frisian.lm frisian
+LM/georgian.lm georgian
+LM/german.lm german
+LM/greek-iso8859-7.lm greek-iso8859-7
+LM/hebrew-iso8859_8.lm hebrew-iso8859_8
+LM/hindi.lm hindi
+LM/hungarian.lm hungarian
+LM/icelandic.lm icelandic
+LM/indonesian.lm indonesian
+LM/irish.lm irish
+LM/italian.lm italian
+LM/japanese-euc_jp.lm japanese-euc_jp
+LM/japanese-shift_jis.lm japanese-shift_jis
+LM/korean.lm korean
+LM/latin.lm latin
+LM/latvian.lm latvian
+LM/lithuanian.lm lithuanian
+LM/malay.lm malay
+LM/manx.lm manx
+LM/marathi.lm marathi
+LM/middle_frisian.lm middle_frisian
+LM/mingo.lm mingo
+LM/nepali.lm nepali
+LM/norwegian.lm norwegian
+LM/persian.lm persian
+LM/polish.lm polish
+LM/portuguese.lm portuguese
+LM/quechua.lm quechua
+LM/romanian.lm romanian
+LM/rumantsch.lm rumantsch
+LM/russian-iso8859_5.lm russian-iso8859_5
+LM/russian-koi8_r.lm russian-koi8_r
+LM/russian-windows1251.lm russian-windows1251
+LM/sanskrit.lm sanskrit
+LM/scots.lm scots
+LM/scots_gaelic.lm scots_gaelic
+LM/serbian-ascii.lm serbian-ascii
+LM/slovak-ascii.lm slovak-ascii
+LM/slovak-windows1250.lm slovak-windows1250
+LM/slovenian-ascii.lm slovenian-ascii
+LM/slovenian-iso8859_2.lm slovenian-iso8859_2
+LM/spanish.lm spanish
+LM/swahili.lm swahili
+LM/swedish.lm swedish
+LM/tagalog.lm tagalog
+LM/tamil.lm tamil
+LM/thai.lm thai
+LM/turkish.lm turkish
+LM/ukrainian-koi8_r.lm ukrainian-koi8_r
+LM/vietnamese.lm vietnamese
+LM/welsh.lm welsh
+LM/yiddish-utf.lm yiddish-utf
diff --git a/lib/libtextcat.a b/lib/libtextcat.a
new file mode 120000
index 0000000..53dc620
--- /dev/null
+++ b/lib/libtextcat.a
@@ -0,0 +1 @@
+../src/libtextcat.a \ No newline at end of file
diff --git a/src/.#fingerprint.c.1.2 b/src/.#fingerprint.c.1.2
new file mode 100644
index 0000000..19d55d9
--- /dev/null
+++ b/src/.#fingerprint.c.1.2
@@ -0,0 +1,634 @@
+/**
+ * fingerprint.c -- Routines for creating an n-gram fingerprint of a
+ * buffer.
+ *
+ * A fingerprint is a list of most common n-grams, ordered by
+ * frequency. (Note that we can use other strings than n-grams, for
+ * instance entire words.)
+ *
+ * Mar 28, 2003 frank@wise-guys.nl -- created
+ *
+ * TODO:
+ * - put table/heap datastructure in a separate file.
+ * */
+#include <string.h>
+#include <ctype.h>
+
+#include "wglib.h"
+#include "wg_mempool.h"
+
+#define INVALID(c) (isspace(c) || isdigit(c))
+
+#define TABLEPOW 13
+#define TABLESIZE (1<<TABLEPOW)
+#define TABLEMASK ((TABLESIZE)-1)
+
+#define MAXOUTOFPLACE 400
+#define MINDOCSIZE 25
+
+typedef struct {
+
+ sint2 rank;
+ char str[6];
+
+} ngram_t;
+
+typedef struct fprint_s {
+
+ const char *name;
+ ngram_t *fprint;
+ uint4 size;
+
+} fprint_t;
+
+typedef struct entry_s {
+ char str[6];
+ unsigned int cnt;
+ struct entry_s *next;
+} entry_t;
+
+typedef struct table_s {
+ void *pool;
+ entry_t **table;
+ entry_t *heap;
+
+ struct table_s *next;
+
+ uint4 heapsize;
+ uint4 size;
+} table_t;
+
+
+
+/*
+ * fast and furious little hash function
+ *
+ * (Note that we could use some kind of rolling checksum, and update it
+ * during n-gram construction)
+ */
+static uint4 simplehash( const char *p, int len )
+{
+ sint4 h = len * 13;
+ while (*p) {
+ h = (h<<5)-h + *p++;
+ }
+ return (uint4)h;
+}
+
+
+/* checks if n-gram lex is a prefix of key and of length len */
+inline int issame( char *lex, char *key, int len )
+{
+ int i;
+ for (i=0; i<len; i++) {
+ if ( key[i] != lex[i] ) {
+ return 0;
+ }
+ }
+ if ( lex[i] != 0 ) {
+ return 0;
+ }
+ return 1;
+}
+
+
+/* increases frequency of ngram(p,len) */
+static inline int increasefreq( table_t *t, char *p, int len )
+{
+ uint4 hash = simplehash( p, len ) & TABLEMASK;
+ entry_t *entry = t->table[ hash ];
+
+ wg_debug(NGRAMDEBUG, "%u-GRAM: %s\n", len,p);
+
+ while ( entry ) {
+ if ( issame( entry->str, p, len ) ) {
+ /*** Found it! ***/
+ entry->cnt++;
+ return 1;
+ }
+ else {
+ entry = entry->next;
+ }
+ }
+
+ /*** Not found, so create ***/
+ entry = wgmempool_alloc( t->pool, sizeof(entry_t) );
+ strcpy( entry->str, p );
+ entry->cnt = 1;
+
+ entry->next = t->table[hash];
+ t->table[hash] = entry;
+
+ return 1;
+}
+
+
+/* looks up ngram(p,len) */
+static entry_t *findfreq( table_t *t, char *p, int len )
+{
+ uint4 hash = simplehash( p, len ) & TABLEMASK;
+ entry_t *entry = t->table[ hash ];
+
+ while ( entry ) {
+ if ( issame( entry->str, p, len ) ) {
+ return entry;
+ }
+ else {
+ entry = entry->next;
+ }
+ }
+
+ return NULL;
+}
+
+static void dumptable(table_t *t)
+{
+ int i;
+ for (i=0; i<TABLESIZE; i++) {
+
+ entry_t *p = t->table[i];
+
+ while (p) {
+
+ printf("%5u %s\n", p->cnt, p->str);
+ p = p->next;
+ }
+
+ }
+}
+
+
+#define GREATER(x,y) ((x).cnt > (y).cnt)
+#define LESS(x,y) ((x).cnt < (y).cnt)
+
+inline static void siftup( table_t *t, unsigned int child )
+{
+ entry_t *heap = t->heap;
+ unsigned int parent = (child-1) >> 1;
+ entry_t tmp;
+
+ while ( child > 0 ) {
+ if ( GREATER(heap[parent],heap[child]) ) {
+ memcpy( &tmp, &heap[parent], sizeof(entry_t) );
+ memcpy( &heap[parent], &heap[child], sizeof(entry_t) );
+ memcpy( &heap[child], &tmp, sizeof(entry_t) );
+ }
+ else {
+ return;
+ }
+
+ child = parent;
+ parent = (child-1) >> 1;
+ }
+}
+
+
+inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
+{
+ entry_t *heap = t->heap;
+ unsigned int child = parent*2 + 1;
+ entry_t tmp;
+
+ while ( child < heapsize ) {
+ if ( child+1 < heapsize && GREATER(heap[child], heap[child+1]) ) {
+ child++;
+ }
+ if ( GREATER(heap[parent], heap[child] ) ) {
+ memcpy( &tmp, &heap[parent], sizeof(entry_t) );
+ memcpy( &heap[parent], &heap[child], sizeof(entry_t) );
+ memcpy( &heap[child], &tmp, sizeof(entry_t) );
+ }
+ else {
+ return;
+ }
+ parent = child;
+ child = (parent*2)+1;
+ }
+}
+
+
+static int heapinsert( table_t *t, entry_t *item )
+{
+ entry_t *heap = t->heap;
+
+ /*** Still room for an entry? ***/
+ if (t->size < t->heapsize) {
+ memcpy( &(heap[t->size]), item, sizeof(entry_t));
+ siftup( t, t->size );
+ t->size++;
+ return 0;
+ }
+
+ /*** Worse than the worst performer? ***/
+ if ( LESS(*item, heap[0]) ) {
+ return 0;
+ }
+
+ /*** Insert into heap and reheap ***/
+ memcpy( &(heap[0]), item, sizeof(entry_t));
+ siftdown( t, t->size, 0 );
+ return 0;
+}
+
+
+extern int heapextract( table_t *t, entry_t *item )
+{
+ entry_t *p;
+
+ if (t->size == 0 ) {
+ return 0;
+ }
+
+ p = &(t->heap[0]);
+
+ memcpy( item, p, sizeof( entry_t ));
+ memcpy( &(t->heap[0]), &(t->heap[t->size-1]), sizeof(entry_t) );
+
+ siftdown(t,t->size, 0);
+ t->size--;
+
+ return 1;
+}
+
+
+/*** Makes a heap of all table entries ***/
+static int table2heap(table_t *t)
+{
+ int i;
+ uint4 cnt = 0;
+ uint4 bufsize = 2048;
+
+ /*** Fill result heap ***/
+ for (i=0; i<TABLESIZE; i++) {
+ entry_t *p = t->table[i];
+ while (p) {
+ heapinsert(t, p);
+ p = p->next;
+ }
+ }
+ return 1;
+}
+
+
+static table_t *inittable(uint4 maxngrams)
+{
+ table_t *result = (table_t *)wg_zalloc( sizeof(table_t) );
+ result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE );
+ result->pool = wgmempool_Init( 10000, 10 );
+
+ result->heap = (entry_t *)wg_malloc( sizeof(entry_t) * maxngrams );
+ result->heapsize = maxngrams;
+ result->size = 0;
+
+ return result;
+}
+
+static void tabledone( table_t *t )
+{
+ if (!t) {
+ return;
+ }
+ wgmempool_Done(t->pool);
+ wg_free(t->table);
+ wg_free(t->heap);
+ wg_free(t);
+}
+
+
+extern void *fprint_Init(const char *name)
+{
+ fprint_t *h = (fprint_t *)wg_zalloc( sizeof(fprint_t) );
+
+ if ( name ) {
+ h->name = wg_strdup(name);
+ }
+
+ return (void *)h;
+}
+
+
+extern void fprint_Done( void *handle )
+{
+ fprint_t *h = (fprint_t *)handle;
+
+ if ( h->name ) {
+ wg_free( (void *)h->name );
+ }
+ if ( h->fprint ) {
+ wg_free( h->fprint );
+ }
+
+ wg_free( h );
+}
+
+extern const char *fprint_Name( void *handle )
+{
+ fprint_t *h = (fprint_t *)handle;
+ return h->name;
+}
+
+/**
+ * Function that prepares buffer for n-grammification:
+ * runs of invalid characters are collapsed to a single
+ * underscore.
+ *
+ * Function is implemented as a finite state machine.
+ */
+static char *prepbuffer( const char *src, size_t bufsize )
+{
+ const char *p = src;
+ char *dest = (char *)wg_malloc( bufsize + 3 );
+ char *w = dest;
+ char *wlimit = dest + bufsize + 1;
+
+ START:
+ if ( INVALID(*p) ) {
+ goto SPACE;
+ }
+ else if ( *p == '\0' ) {
+ goto END;
+ }
+
+ *w++ = '_';
+ if ( w == wlimit ) {
+ goto STOP;
+ }
+
+ goto WORD;
+
+
+ SPACE:
+ /*** Inside string of invalid characters ***/
+ p++;
+ if ( INVALID(*p) ) {
+ goto SPACE;
+ }
+ else if ( *p == '\0' ) {
+ goto END;
+ }
+
+ *w++ = '_';
+ if ( w == wlimit ) {
+ goto STOP;
+ }
+
+ goto WORD;
+
+ WORD:
+ /*** Inside string of valid characters ***/
+ *w++ = *p++;
+ if ( w == wlimit ) {
+ goto END;
+ }
+ else if ( INVALID(*p) ) {
+ goto SPACE;
+ }
+ else if ( *p == '\0' ) {
+ goto STOP;
+ }
+ goto WORD;
+
+ END:
+ *w++ = '_';
+
+ STOP:
+ *w++ = '\0';
+
+ /*** Docs that are too small for a fingerprint, are refused ***/
+ if ( w - dest < MINDOCSIZE ) {
+ wg_free(dest);
+ return NULL;
+ }
+
+ return dest;
+}
+
+
+static void createngramtable( table_t *t, const char *buf )
+{
+ char n[6];
+ const char *p = buf;
+ int i;
+
+ /*** Get all n-grams where 1<=n<=5. Allow underscores only at borders. ***/
+ for (;;p++) {
+
+ const char *q = p;
+ char *m = n;
+
+ /*** First char may be an underscore ***/
+ *m++ = *q++;
+ *m = '\0';
+
+ increasefreq( t, n, 1 );
+
+ if ( *q == '\0' ) {
+ return;
+ }
+
+ /*** Let the compiler unroll this ***/
+ for ( i=2; i<=5; i++) {
+
+ *m++ = *q;
+ *m = '\0';
+
+ increasefreq( t, n, i );
+
+ if ( *q == '_' ) break;
+ q++;
+ if ( *q == '\0' ) {
+ return;
+ }
+ }
+ }
+ return;
+}
+
+
+static int ngramcmp(const void *a, const void *b)
+{
+ ngram_t *x = (ngram_t *)a;
+ ngram_t *y = (ngram_t *)b;
+
+ return strcmp( x->str, y->str );
+}
+
+/**
+ * Create a fingerprint:
+ * - record the frequency of each unique n-gram in a hash table
+ * - take the most frequent n-grams
+ * - sort them alphabetically, recording their relative rank
+ */
+extern int fprint_Create( void *handle, const char *buffer, uint4 bufsize, uint4 maxngrams )
+{
+ sint4 i;
+ fprint_t *h = (fprint_t *)handle;
+ table_t *t = inittable(maxngrams);
+ char *tmp;
+
+ if ( bufsize < MINDOCSIZE ) {
+ return 0;
+ }
+
+ /*** Throw out all invalid chars ***/
+ tmp = prepbuffer( buffer, bufsize );
+ if ( tmp == NULL ) {
+ return 0;
+ }
+
+ /*** Create a hash table containing n-gram counts ***/
+ createngramtable(t, tmp);
+
+ /*** Take the top N n-grams and add them to the profile ***/
+ table2heap(t);
+ maxngrams = WGMIN( maxngrams, t->size );
+
+ h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams );
+ h->size = maxngrams;
+
+ /*** Pull n-grams out of heap (backwards) ***/
+ for (i=maxngrams-1; i>=0; i--) {
+
+ entry_t tmp;
+
+ heapextract(t, &tmp);
+
+ /*** the string and its rank is all we need ***/
+ strcpy( h->fprint[i].str, tmp.str );
+ h->fprint[i].rank = i;
+ }
+
+ tabledone(t);
+ wg_free(tmp);
+
+ /*** Sort n-grams alphabetically, for easy comparison ***/
+ qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp );
+ return 1;
+}
+
+extern void fprint_Show( void *handle )
+{
+ fprint_t *h = (fprint_t *)handle;
+ int i;
+ printf("------ %s --------\n", h->name );
+ for (i=0; i<h->size; i++) {
+ printf("%3u: '%s' [%u]\n", i, h->fprint[i].str, h->fprint[i].rank);
+ }
+
+
+}
+
+extern int fprint_Read( void *handle, const char *fname, int maxngrams )
+{
+ fprint_t *h = (fprint_t *)handle;
+ FILE *fp;
+ char line[1024];
+ int cnt = 0;
+
+ fp = fopen( fname, "r" );
+ if (!fp) {
+ return 0;
+ }
+
+ h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t));
+
+ while (cnt < maxngrams && wg_getline(line,1024,fp)) {
+
+ char *p;
+
+ wg_trim(line, line);
+
+ p = strpbrk( line, " \t" );
+ if ( p ) {
+ *p = '\0';
+ }
+
+ if ( strlen(line) > 5 ) {
+ continue;
+ }
+
+ strcpy( h->fprint[cnt].str, line );
+ h->fprint[cnt].rank = cnt;
+
+ cnt++;
+ }
+
+ h->size = cnt;
+
+ /*** Sort n-grams, for easy comparison later on ***/
+ qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp );
+
+ fclose(fp);
+
+ return 1;
+}
+
+
+extern sint4 fprint_Compare( void *cat, void *unknown, int cutoff )
+{
+ fprint_t *c = (fprint_t *)cat;
+ fprint_t *u = (fprint_t *)unknown;
+ int i = 0;
+ int j = 0;
+ sint4 sum = 0;
+
+ /*** Compare the profiles in mergesort fashion ***/
+ while ( i < c->size && j < u->size ) {
+
+ int cmp = strcmp( c->fprint[i].str, u->fprint[j].str );
+
+ if ( cmp < 0 ) {
+ i++;
+ }
+ else if ( cmp == 0 ) {
+ sum += abs( c->fprint[i].rank - u->fprint[j].rank );
+ if ( sum > cutoff ) {
+ return 1000000000;
+ }
+ i++;
+ j++;
+ }
+ else {
+ sum += MAXOUTOFPLACE;
+ if ( sum > cutoff ) {
+ return 1000000000;
+ }
+ j++;
+ }
+ }
+
+ /*** Process tail of unknown, if any ***/
+ while ( j < u->size ) {
+ sum += MAXOUTOFPLACE;
+ if ( sum > cutoff ) {
+ return 1000000000;
+ }
+ j++;
+ }
+
+ return sum;
+
+}
+
+
+#if 0
+int main(int argc, char **argv)
+{
+ char buf[100 * 1024];
+ char tmp[100 * 1024];
+ int size;
+ fprint_t *h;
+
+ wg_meminit();
+ {
+ //size = fread( buf, 1, 100*1024, stdin );
+ h = fprint_Init("test");
+ buf[size] = '\0';
+ //fprint_Create(h, buf, strlen(buf), 100);
+ fprint_Read( h, "/home/frank/textcat/LM/dutch.lm", 100 );
+ fprint_Show(h);
+ fprint_Done(h);
+ wg_printmemstats();
+ }
+}
+#endif
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..1253f38
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,62 @@
+#
+# Apr 1, 2003 frank@wise-guys.nl
+# Apr 2, 2003 gnarl@wise-guys.nl
+#
+#--------------------------------------------------------------------------
+# Macro Variables
+#--------------------------------------------------------------------------
+#MDEBUG =-DMEMDEBUG -DMEMVERBOSE
+#
+SRC =/home/src
+RANLIB =ranlib
+RM =/bin/rm -f
+CC =gcc
+LIBS =
+LEX =flex
+LDFLAGS =-g
+OPTS =-static -O3 -DWGTIMER -D_THREAD_SAFE -D_GNU_SOURCE -march=i686 \
+ -funroll-loops --pipe -D__GNU_C__
+VERBOSE = -DVERBOSE
+IFLAGS =
+
+WARNS =-W -Winline -Wall -Wshadow -Wno-unused -Wpointer-arith
+
+CFLAGS =$(VERBOSE) $(IFLAGS) $(LDFLAGS) -D_GNU_SOURCE -D_REENTRANT -DWGTIMER $(WARNS) $(OPTS) $(MDEBUG)
+
+OBJECTS2 = fingerprint.o textcat.o common.o wg_mempool.o
+OBJECTS = testtextcat.o
+OBJECTS3 = createfp.o
+
+SOURCES = fingerprint.c textcat.c common.c wg_mempool.c testtextcat.c createfp.c
+
+LIBS1 = -lm
+
+LIBS = $(LIBS1)
+
+#--------------------------------------------------------------------------
+# Dependencies
+#--------------------------------------------------------------------------
+all: textcat testtextcat createfp
+
+$(OBJECTS) : Makefile
+
+textcat: $(OBJECTS2)
+ $(RM) -f libtextcat.a ; ar cr libtextcat.a $(OBJECTS2) ; $(RANLIB) libtextcat.a
+
+testtextcat: $(OBJECTS) libtextcat.a
+ $(CC) $(CFLAGS) -static -o $@ $(OBJECTS) libtextcat.a $(LDFLAGS) $(LIBS)
+
+createfp: $(OBJECTS3) libtextcat.a
+ $(CC) $(CFLAGS) -static -o $@ $(OBJECTS3) libtextcat.a $(LDFLAGS) $(LIBS)
+
+dep: depend
+#
+depend:
+ makedepend -- $(CFLAGS) -- $(SOURCES)
+#
+$(SOURCES): Makefile
+#
+clean:
+ $(RM) core *.o *~ testtextcat createfp *.a
+
+# DO NOT DELETE
diff --git a/src/common.c b/src/common.c
new file mode 100644
index 0000000..054dae4
--- /dev/null
+++ b/src/common.c
@@ -0,0 +1,376 @@
+/*
+ * common.c -- miscellanious helper functions.
+ *
+ * Copyright (c) 2003, WiseGuys Internet B.V.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <ctype.h>
+#include "common.h"
+
+extern void wgmem_error( const char *fmt, ... )
+{
+ va_list ap;
+
+ fprintf(stderr, "MEMERROR : ");
+ va_start(ap, fmt);
+ vfprintf(stderr, fmt, ap);
+ va_end(ap);
+
+ exit(-1);
+}
+
+
+extern void *wg_malloc( size_t size )
+{
+ void *result;
+ if ( !size ) {
+ wgmem_error( "Error mallocing 0 bytes.\n" );
+ }
+
+ result = malloc( size );
+ if ( !result ) {
+ wgmem_error( "Error while mallocing %u bytes.\n", size );
+ }
+
+ return result;
+}
+
+extern void *wg_calloc( size_t nmemb, size_t size )
+{
+ void *result;
+ if ( !size || !nmemb ) {
+ wgmem_error( "Error callocing 0 bytes.\n" );
+ }
+
+ result = calloc( nmemb, size );
+ if ( !result ) {
+ wgmem_error( "Error while callocing %u elements of %u bytes.\n", nmemb, size);
+ }
+
+ return result;
+}
+
+extern void *wg_zalloc( size_t size )
+{
+ void *result;
+
+ if (!size) {
+ wgmem_error( "Error zallocing 0 bytes.\n" );
+ }
+
+ result = calloc(1, size);
+ if ( !result ) {
+ wgmem_error( "Error while zallocing %u bytes.\n", size );
+ }
+
+ return result;
+}
+
+extern char* wg_strdup( const char *s )
+{
+ char *result = strdup( s );
+
+ if ( !result ) {
+ wgmem_error( "Error while strduping %u bytes.\n", strlen(s) );
+ }
+
+ return( result );
+}
+
+extern void* wg_realloc( void *ptr, size_t size )
+{
+ void *result;
+
+ if (!size) {
+ wgmem_error( "Error reallocing 0 bytes.\n" );
+ }
+
+ result = realloc( ptr, size );
+ if ( !result ) {
+ wgmem_error( "Error while reallocing %u bytes.\n", size );
+ }
+
+ return( result );
+}
+
+extern void wg_free( void *mem )
+{
+ if ( mem ) {
+ free(mem);
+ }
+}
+
+extern char *wg_getline( char *line, int size, FILE *fp )
+{
+ char *p;
+
+ if ( fgets(line, size, fp) == NULL ) {
+ return NULL;
+ }
+
+ /** kill term null **/
+ if ( (p = strpbrk( line, "\n\r" )) ) {
+ *p = '\0';
+ }
+
+ return line;
+}
+
+
+
+/*
+ * wg_split: split a line into segments, using whitespace-sequences as separators.
+ *
+ * ARGUMENTS:
+ * - result:
+ *
+ * After the split, this array contains pointers to the start of each
+ * detected segment. Must be preallocated and at least as large as
+ * maxsegments. The pointers point into the dest buffer.
+ *
+ * - dest:
+ *
+ * String into which result points as an index. Must be preallocated, and
+ * at least as big as src. You can use src as dest, but in that case src
+ * is overwritten!
+ *
+ * - src:
+ *
+ * The string to split. Sequences of whitespace are treated as separators, unless
+ * escaped. There are two ways to escape: by using single quotes (anything
+ * between single quotes is treated as one segment), or by using a backslash
+ * to escape the next character. The backslash escape works inside quotation
+ * as well.
+ *
+ * Example:
+ *
+ * "It\'s very\ easy 'to use WiseGuys\' wg_split()' function" is split into:
+ *
+ * "It's"
+ * "very easy"
+ * "to use WiseGuys' wg_split()"
+ * "function"
+ *
+ * - maxsegments:
+ *
+ * The maximum number of segments. If the splitter runs out of segments,
+ * the remainder of the string is stored in the last segment.
+ *
+ * RETURN VALUE:
+ * The number of segments found.
+ */
+unsigned int wg_split( char **result, char *dest, char *src, int maxsegments )
+{
+ char *p = src;
+ char *w = dest;
+ int cnt = 0;
+ int state=0;
+
+ if ( maxsegments == 0 ) {
+ return 0;
+ }
+
+ maxsegments--;
+
+ while (cnt < maxsegments && *p) {
+
+ switch (state) {
+ case 0:
+ /*** Skip spaces ***/
+ while ( isspace((int) *p) ) {
+ p++;
+ }
+ state = 1;
+
+ case 1:
+ /*** Start segment ***/
+ result[cnt] = w;
+ cnt++;
+ state = 2;
+
+ case 2:
+ /*** Unquoted segment ***/
+ while (*p) {
+ if ( isspace((int) *p) ) {
+ *w++ = '\0';
+ p++;
+ state = 0;
+ break;
+ }
+ else if ( *p == '\'' ) {
+ /*** Start quotation ***/
+ p++;
+ state = 3;
+ break;
+ }
+ else if ( *p == '\\' && p[1] ) {
+ /*** Escape ***/
+ p++;
+ *w++ = *p++;
+ }
+ else {
+ *w++ = *p++;
+ }
+ }
+ break;
+
+ case 3:
+ /*** Inside quotes ***/
+ while (*p) {
+ if (*p == '\'') {
+ p++;
+ break;
+ }
+ else if ( *p == '\\' && p[1] ) {
+ /*** Escape ***/
+ p++;
+ *w++ = *p++;
+ }
+ else {
+ *w++ = *p++;
+ }
+ }
+ state = 2;
+ break;
+
+ }
+ }
+
+ if ( !*p ) {
+ *w = '\0';
+ return cnt;
+ }
+
+ /*** We ran out of segments; copy the remainder of the string into last segment ***/
+ result[cnt++] = w;
+ while (*p) {
+ *w++ = *p++;
+ }
+ *w = '\0';
+ return cnt;
+}
+
+
+extern void wg_timerstart(wgtimer_t *t)
+{
+ gettimeofday( &(t->start), NULL );
+}
+
+
+
+extern uint4 wg_timerstop(wgtimer_t *t)
+{
+ uint4 result;
+ gettimeofday( &(t->stop), NULL );
+ result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 +
+ (t->stop.tv_usec - t->start.tv_usec);
+
+ t->start.tv_sec = t->stop.tv_sec;
+ t->start.tv_usec = t->stop.tv_usec;
+
+ return result;
+}
+
+
+/**
+ * wg_strgmov -- a guarded strcpy() variation
+ *
+ * copies src to dest (including terminating zero), and returns
+ * pointer to position of terminating zero in dest. The function is
+ * guaranteed not to write past destlimit. If the copy couldn't be
+ * finished, the function returns NULL after restoring the first
+ * character in dest for your convenience (since this is usually a zero).
+ */
+char *wg_strgmov( char *dest, const char *src, const char *destlimit )
+{
+ char tmp, *w;
+
+ if ( !dest || dest >= destlimit ) {
+ return NULL;
+ }
+
+ tmp = *dest;
+ w = dest;
+
+ while ( *src ) {
+
+ *w++ = *src++;
+
+ if ( w == destlimit ) {
+ /*** restore old situation ***/
+ *dest = tmp;
+ return NULL;
+ }
+ }
+
+ *w = '\0';
+ return w;
+
+}
+
+/*
+ * wg_trim() -- remove whitespace surrounding a string.
+ *
+ * Example: " bla bla bla " becomes "bla bla bla" after trimming.
+ *
+ * ARGUMENTS
+ * - dest : After the trim, this will point to the trimmed string.
+ * Must be preallocated and at least as big as src. You can
+ * use src as dest.
+ * - src : Points to the string to be trimmed.
+ *
+ * RETURNS:
+ * dest
+ */
+char *wg_trim( char *dest, const char *src )
+{
+ char *lastnonspace = &dest[-1];
+ const char *p = src;
+ char *w = dest;
+
+ while ( isspace((int)*p) ) {
+ p++;
+ }
+ while (*p) {
+ if ( !isspace((int)*p) ) {
+ lastnonspace = w;
+ }
+ *w++ = *p++;
+ }
+ lastnonspace[1] = '\0';
+
+ return dest;
+}
+
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000..5dc4656
--- /dev/null
+++ b/src/common.h
@@ -0,0 +1,83 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+/**
+ * common.h -- a mixed bag of helper functions
+ *
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <inttypes.h>
+#include <sys/time.h>
+#include <time.h>
+
+#define WGMIN(x,y) ((x)<=(y)?(x):(y))
+#define WGMAX(x,y) ((x)<=(y)?(y):(x))
+#define __STR__(x) #x
+#define WGSTR(x) __STR__(x)
+
+typedef uint32_t uint4;
+typedef uint16_t uint2;
+typedef uint8_t uchar;
+
+typedef int32_t sint4;
+typedef int16_t sint2;
+typedef int8_t schar;
+
+typedef int8_t boole;
+
+
+typedef struct wgtimer_s {
+ struct timeval start;
+ struct timeval stop;
+} wgtimer_t;
+
+
+extern void *wg_malloc( size_t size );
+extern void *wg_calloc( size_t nmemb, size_t size );
+extern void *wg_zalloc( size_t size );
+extern char* wg_strdup( const char *s );
+extern void* wg_realloc( void *ptr, size_t size ) ;
+extern void wg_free( void *mem );
+
+extern char *wg_getline( char *line, int size, FILE *fp );
+
+extern void wg_timerstart(wgtimer_t *t);
+extern uint4 wg_timerstop(wgtimer_t *t);
+
+extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments );
+extern char *wg_strgmov( char *dest, const char *src, const char *destlimit );
+extern char *wg_trim( char *dest, const char *src );
+
+
+#endif
+
diff --git a/src/constants.h b/src/constants.h
new file mode 100644
index 0000000..0bdae6c
--- /dev/null
+++ b/src/constants.h
@@ -0,0 +1,82 @@
+#ifndef _CONSTANTS_H_
+#define _CONSTANTS_H_
+
+/*
+ * constants.h -- some constants used throughout the code. Not pretty,
+ * but certainly convenient.
+ *
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <limits.h>
+
+
+#define VERSION 2
+#define SUBVERSION 0
+#define DESCRIPTION "out of place"
+
+/* Reported matches are those fingerprints with a score less than best
+ * score * THRESHOLDVALUE (i.e. a THRESHOLDVALUE of 1.03 means matches
+ * must score within 3% from the best score.)
+ */
+#define THRESHOLDVALUE 1.03
+
+/* If more than MAXCANDIDATES matches are found, the classifier reports
+ * unknown, because the input is obviously confusing.
+ */
+#define MAXCANDIDATES 5
+
+/* The size of the buffer used to report the classification.
+ */
+#define MAXOUTPUTSIZE 1024
+
+/* Maximum number of n-grams in a fingerprint */
+#define MAXNGRAMS 400
+
+/* Maximum size of an n-gram? */
+#define MAXNGRAMSIZE 5
+
+/* Which characters are not acceptable in n-grams? */
+#define INVALID(c) (isspace(c) || isdigit(c))
+
+/* Minimum size (in characters) for accepting a document */
+#define MINDOCSIZE 25
+
+/* Maximum penalty for missing an n-gram in fingerprint */
+#define MAXOUTOFPLACE 400
+
+/* Size of hash table is 2^TABLEPOW. */
+#define TABLEPOW 13
+
+#define MAXSCORE INT_MAX
+
+#endif
diff --git a/src/createfp.c b/src/createfp.c
new file mode 100644
index 0000000..1e3e0cd
--- /dev/null
+++ b/src/createfp.c
@@ -0,0 +1,84 @@
+/*
+ * createfprint.c - can be used to create a fingerprint of a document.
+ *
+ * Copyright (c) 2003, WiseGuys Internet B.V.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "fingerprint.h"
+#include "common.h"
+
+#define BLOCKSIZE 4096
+
+char *myread(FILE *fp)
+{
+ char *buf;
+ size_t size = 0;
+ size_t maxsize = BLOCKSIZE*2;
+
+ buf = (char *)wg_malloc( maxsize );
+ do {
+ size_t read = fread( buf+size, 1, BLOCKSIZE, fp );
+ size += read;
+ if ( size + BLOCKSIZE > maxsize ) {
+ maxsize *= 2;
+ buf = (char *)wg_realloc( buf, maxsize );
+ }
+
+ } while (!feof(stdin));
+
+ buf[size] = '\0';
+ buf = (char *)wg_realloc( buf, size+1 );
+
+ return buf;
+}
+
+
+int main( int argc, char **argv )
+{
+ void *h;
+ char *result;
+ wgtimer_t tm;
+ int i;
+ char *buf;
+
+ buf = myread(stdin);
+
+ h = fp_Init(NULL);
+ if ( fp_Create( h, buf, strlen(buf), 400 ) == 0 ) {
+ fprintf(stderr, "There was an error creating the fingerprint\n");
+ exit(-1);
+ }
+ fp_Print(h,stdout);
+ fp_Done(h);
+ wg_free(buf);
+
+ return 0;
+}
diff --git a/src/fingerprint.c b/src/fingerprint.c
new file mode 100644
index 0000000..0d14558
--- /dev/null
+++ b/src/fingerprint.c
@@ -0,0 +1,704 @@
+/**
+ * fingerprint.c -- Routines for creating an n-gram fingerprint of a
+ * buffer.
+ *
+ * Copyright (c) 2003, WiseGuys Internet B.V.
+ * All rights reserved.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * DESCRIPTION
+ *
+ * A fingerprint is a list of most common n-grams, ordered by
+ * frequency. (Note that we can use other strings than n-grams, for
+ * instance entire words.)
+ *
+ * HOW DOES IT WORK?
+ *
+ * - Buffer is sliced up into n-grams
+ * - N-grams are inserted into a hash table that records their frequency
+ * - The table entries are filtered through a N-sized heap to
+ * get the N most frequent n-grams.
+ *
+ * The reason why we go through the trouble of doing a partial
+ * (heap)sort is that a full quicksort behaves horribly on the data:
+ * most n-grams have a very low count, resulting in a data set in
+ * nearly-sorted order. This causes quicksort to behave very badly.
+ * Heapsort, on the other hand, behaves handsomely: worst case is
+ * Mlog(N) for M n-grams filtered through a N-sized heap.
+ *
+ * REVISION HISTORY
+ *
+ * Mar 28, 2003 frank@wise-guys.nl -- created
+ *
+ * TODO:
+ * - put table/heap datastructure in a separate file.
+ * */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "common.h"
+#include "wg_mempool.h"
+#include "constants.h"
+
+
+#define TABLESIZE (1<<TABLEPOW)
+#define TABLEMASK ((TABLESIZE)-1)
+
+typedef struct {
+
+ sint2 rank;
+ char str[MAXNGRAMSIZE+1];
+
+} ngram_t;
+
+typedef struct fp_s {
+
+ const char *name;
+ ngram_t *fprint;
+ uint4 size;
+
+} fp_t;
+
+typedef struct entry_s {
+ char str[MAXNGRAMSIZE+1];
+ unsigned int cnt;
+ struct entry_s *next;
+} entry_t;
+
+typedef struct table_s {
+ void *pool;
+ entry_t **table;
+ entry_t *heap;
+
+ struct table_s *next;
+
+ uint4 heapsize;
+ uint4 size;
+} table_t;
+
+
+
+/*
+ * fast and furious little hash function
+ *
+ * (Note that we could use some kind of rolling checksum, and update it
+ * during n-gram construction)
+ */
+static uint4 simplehash( const char *p, int len )
+{
+ sint4 h = len * 13;
+ while (*p) {
+ h = (h<<5)-h + *p++;
+ }
+ return (uint4)h;
+}
+
+
+/* checks if n-gram lex is a prefix of key and of length len */
+inline int issame( char *lex, char *key, int len )
+{
+ int i;
+ for (i=0; i<len; i++) {
+ if ( key[i] != lex[i] ) {
+ return 0;
+ }
+ }
+ if ( lex[i] != 0 ) {
+ return 0;
+ }
+ return 1;
+}
+
+
+/* increases frequency of ngram(p,len) */
+static inline int increasefreq( table_t *t, char *p, int len )
+{
+ uint4 hash = simplehash( p, len ) & TABLEMASK;
+ entry_t *entry = t->table[ hash ];
+
+ while ( entry ) {
+ if ( issame( entry->str, p, len ) ) {
+ /*** Found it! ***/
+ entry->cnt++;
+ return 1;
+ }
+ else {
+ entry = entry->next;
+ }
+ }
+
+ /*** Not found, so create ***/
+ entry = wgmempool_alloc( t->pool, sizeof(entry_t) );
+ strcpy( entry->str, p );
+ entry->cnt = 1;
+
+ entry->next = t->table[hash];
+ t->table[hash] = entry;
+
+ return 1;
+}
+
+
+/* looks up ngram(p,len) */
+static entry_t *findfreq( table_t *t, char *p, int len )
+{
+ uint4 hash = simplehash( p, len ) & TABLEMASK;
+ entry_t *entry = t->table[ hash ];
+
+ while ( entry ) {
+ if ( issame( entry->str, p, len ) ) {
+ return entry;
+ }
+ else {
+ entry = entry->next;
+ }
+ }
+
+ return NULL;
+}
+
+static void dumptable(table_t *t)
+{
+ int i;
+ for (i=0; i<TABLESIZE; i++) {
+
+ entry_t *p = t->table[i];
+
+ while (p) {
+
+ printf("%5u %s\n", p->cnt, p->str);
+ p = p->next;
+ }
+
+ }
+}
+
+
+#define GREATER(x,y) ((x).cnt > (y).cnt)
+#define LESS(x,y) ((x).cnt < (y).cnt)
+
+inline static void siftup( table_t *t, unsigned int child )
+{
+ entry_t *heap = t->heap;
+ unsigned int parent = (child-1) >> 1;
+ entry_t tmp;
+
+ while ( child > 0 ) {
+ if ( GREATER(heap[parent],heap[child]) ) {
+ memcpy( &tmp, &heap[parent], sizeof(entry_t) );
+ memcpy( &heap[parent], &heap[child], sizeof(entry_t) );
+ memcpy( &heap[child], &tmp, sizeof(entry_t) );
+ }
+ else {
+ return;
+ }
+
+ child = parent;
+ parent = (child-1) >> 1;
+ }
+}
+
+
+inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
+{
+ entry_t *heap = t->heap;
+ unsigned int child = parent*2 + 1;
+ entry_t tmp;
+
+ while ( child < heapsize ) {
+ if ( child+1 < heapsize && GREATER(heap[child], heap[child+1]) ) {
+ child++;
+ }
+ if ( GREATER(heap[parent], heap[child] ) ) {
+ memcpy( &tmp, &heap[parent], sizeof(entry_t) );
+ memcpy( &heap[parent], &heap[child], sizeof(entry_t) );
+ memcpy( &heap[child], &tmp, sizeof(entry_t) );
+ }
+ else {
+ return;
+ }
+ parent = child;
+ child = (parent*2)+1;
+ }
+}
+
+
+static int heapinsert( table_t *t, entry_t *item )
+{
+ entry_t *heap = t->heap;
+
+ /*** Still room for an entry? ***/
+ if (t->size < t->heapsize) {
+ memcpy( &(heap[t->size]), item, sizeof(entry_t));
+ siftup( t, t->size );
+ t->size++;
+ return 0;
+ }
+
+ /*** Worse than the worst performer? ***/
+ if ( LESS(*item, heap[0]) ) {
+ return 0;
+ }
+
+ /*** Insert into heap and reheap ***/
+ memcpy( &(heap[0]), item, sizeof(entry_t));
+ siftdown( t, t->size, 0 );
+ return 0;
+}
+
+
+extern int heapextract( table_t *t, entry_t *item )
+{
+ entry_t *p;
+
+ if (t->size == 0 ) {
+ return 0;
+ }
+
+ p = &(t->heap[0]);
+
+ memcpy( item, p, sizeof( entry_t ));
+ memcpy( &(t->heap[0]), &(t->heap[t->size-1]), sizeof(entry_t) );
+
+ siftdown(t,t->size, 0);
+ t->size--;
+
+ return 1;
+}
+
+
+/*** Makes a heap of all table entries ***/
+static int table2heap(table_t *t)
+{
+ int i;
+ uint4 cnt = 0;
+ uint4 bufsize = 2048;
+
+ /*** Fill result heap ***/
+ for (i=0; i<TABLESIZE; i++) {
+ entry_t *p = t->table[i];
+ while (p) {
+ heapinsert(t, p);
+ p = p->next;
+ }
+ }
+ return 1;
+}
+
+
+static table_t *inittable(uint4 maxngrams)
+{
+ table_t *result = (table_t *)wg_zalloc( sizeof(table_t) );
+ result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE );
+ result->pool = wgmempool_Init( 10000, 10 );
+
+ result->heap = (entry_t *)wg_malloc( sizeof(entry_t) * maxngrams );
+ result->heapsize = maxngrams;
+ result->size = 0;
+
+ return result;
+}
+
+static void tabledone( table_t *t )
+{
+ if (!t) {
+ return;
+ }
+ wgmempool_Done(t->pool);
+ wg_free(t->table);
+ wg_free(t->heap);
+ wg_free(t);
+}
+
+
+extern void *fp_Init(const char *name)
+{
+ fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) );
+
+ if ( name ) {
+ h->name = wg_strdup(name);
+ }
+
+ return (void *)h;
+}
+
+
+extern void fp_Done( void *handle )
+{
+ fp_t *h = (fp_t *)handle;
+
+ if ( h->name ) {
+ wg_free( (void *)h->name );
+ }
+ if ( h->fprint ) {
+ wg_free( h->fprint );
+ }
+
+ wg_free( h );
+}
+
+extern const char *fp_Name( void *handle )
+{
+ fp_t *h = (fp_t *)handle;
+ return h->name;
+}
+
+/**
+ * Function that prepares buffer for n-grammification:
+ * runs of invalid characters are collapsed to a single
+ * underscore.
+ *
+ * Function is implemented as a finite state machine.
+ */
+static char *prepbuffer( const char *src, size_t bufsize )
+{
+ const char *p = src;
+ char *dest = (char *)wg_malloc( bufsize + 3 );
+ char *w = dest;
+ char *wlimit = dest + bufsize + 1;
+
+ START:
+ if ( INVALID(*p) ) {
+ goto SPACE;
+ }
+ else if ( *p == '\0' ) {
+ goto END;
+ }
+
+ *w++ = '_';
+ if ( w == wlimit ) {
+ goto STOP;
+ }
+
+ goto WORD;
+
+
+ SPACE:
+ /*** Inside string of invalid characters ***/
+ p++;
+ if ( INVALID(*p) ) {
+ goto SPACE;
+ }
+ else if ( *p == '\0' ) {
+ goto END;
+ }
+
+ *w++ = '_';
+ if ( w == wlimit ) {
+ goto STOP;
+ }
+
+ goto WORD;
+
+ WORD:
+ /*** Inside string of valid characters ***/
+ *w++ = *p++;
+ if ( w == wlimit ) {
+ goto END;
+ }
+ else if ( INVALID(*p) ) {
+ goto SPACE;
+ }
+ else if ( *p == '\0' ) {
+ goto STOP;
+ }
+ goto WORD;
+
+ END:
+ *w++ = '_';
+
+ STOP:
+ *w++ = '\0';
+
+ /*** Docs that are too small for a fingerprint, are refused ***/
+ if ( w - dest < MINDOCSIZE ) {
+ wg_free(dest);
+ return NULL;
+ }
+
+ return dest;
+}
+
+
+static void createngramtable( table_t *t, const char *buf )
+{
+ char n[MAXNGRAMSIZE+1];
+ const char *p = buf;
+ int i;
+
+ /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/
+ for (;;p++) {
+
+ const char *q = p;
+ char *m = n;
+
+ /*** First char may be an underscore ***/
+ *m++ = *q++;
+ *m = '\0';
+
+ increasefreq( t, n, 1 );
+
+ if ( *q == '\0' ) {
+ return;
+ }
+
+ /*** Let the compiler unroll this ***/
+ for ( i=2; i<=MAXNGRAMSIZE; i++) {
+
+ *m++ = *q;
+ *m = '\0';
+
+ increasefreq( t, n, i );
+
+ if ( *q == '_' ) break;
+ q++;
+ if ( *q == '\0' ) {
+ return;
+ }
+ }
+ }
+ return;
+}
+
+
+static int mystrcmp(const char *a, const char *b)
+{
+ int i;
+ while ( *a && *a == *b ) {
+ a++;
+ b++;
+ }
+ return (*a - *b);
+}
+
+
+static int ngramcmp_str(const void *a, const void *b)
+{
+ ngram_t *x = (ngram_t *)a;
+ ngram_t *y = (ngram_t *)b;
+
+ return mystrcmp( x->str, y->str );
+}
+
+static int ngramcmp_rank(const void *a, const void *b)
+{
+ ngram_t *x = (ngram_t *)a;
+ ngram_t *y = (ngram_t *)b;
+
+ return x->rank - y->rank;
+}
+
+/**
+ * Create a fingerprint:
+ * - record the frequency of each unique n-gram in a hash table
+ * - take the most frequent n-grams
+ * - sort them alphabetically, recording their relative rank
+ */
+extern int fp_Create( void *handle, const char *buffer, uint4 bufsize, uint4 maxngrams )
+{
+ sint4 i = 0;
+ fp_t *h = NULL;
+ table_t *t = NULL;
+ char *tmp = NULL;
+
+ if ( bufsize < MINDOCSIZE ) {
+ return 0;
+ }
+
+ /*** Throw out all invalid chars ***/
+ tmp = prepbuffer( buffer, bufsize );
+ if ( tmp == NULL ) {
+ return 0;
+ }
+
+ h = (fp_t*)handle;
+ t = inittable(maxngrams);
+
+ /*** Create a hash table containing n-gram counts ***/
+ createngramtable(t, tmp);
+
+ /*** Take the top N n-grams and add them to the profile ***/
+ table2heap(t);
+ maxngrams = WGMIN( maxngrams, t->size );
+
+ h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams );
+ h->size = maxngrams;
+
+ /*** Pull n-grams out of heap (backwards) ***/
+ for (i=maxngrams-1; i>=0; i--) {
+
+ entry_t tmp2;
+
+ heapextract(t, &tmp2);
+
+ /*** the string and its rank is all we need ***/
+ strcpy( h->fprint[i].str, tmp2.str );
+ h->fprint[i].rank = i;
+ }
+
+ tabledone(t);
+ wg_free(tmp);
+
+ /*** Sort n-grams alphabetically, for easy comparison ***/
+ qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str );
+ return 1;
+}
+
+extern void fp_Debug( void *handle )
+{
+ fp_t *h = (fp_t *)handle;
+ uint4 i;
+ printf("------ %s --------\n", h->name );
+ for (i=0; i<h->size; i++) {
+ printf("%3u: '%s' [%u]\n", i, h->fprint[i].str, h->fprint[i].rank);
+ }
+
+
+}
+
+extern int fp_Read( void *handle, const char *fname, int maxngrams )
+{
+ fp_t *h = (fp_t *)handle;
+ FILE *fp;
+ char line[1024];
+ int cnt = 0;
+
+ fp = fopen( fname, "r" );
+ if (!fp) {
+#ifdef VERBOSE
+ fprintf( stderr, "Failed to open fingerprint file '%s'\n", fname);
+#endif
+ return 0;
+ }
+
+ h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t));
+
+ while (cnt < maxngrams && wg_getline(line,1024,fp)) {
+
+ char *p;
+
+ wg_trim(line, line);
+
+ p = strpbrk( line, " \t" );
+ if ( p ) {
+ *p = '\0';
+ }
+
+ if ( strlen(line) > MAXNGRAMSIZE ) {
+ continue;
+ }
+
+ strcpy( h->fprint[cnt].str, line );
+ h->fprint[cnt].rank = cnt;
+
+ cnt++;
+ }
+
+ h->size = cnt;
+
+ /*** Sort n-grams, for easy comparison later on ***/
+ qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str );
+
+ fclose(fp);
+
+ return 1;
+}
+
+
+
+extern void fp_Print( void *handle, FILE *fp )
+{
+ uint4 i;
+ fp_t *h = (fp_t *)handle;
+ ngram_t *tmp = wg_malloc( sizeof(ngram_t) * h->size );
+
+ /*** Make a temporary and sort it on rank ***/
+ memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) );
+ qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank );
+
+ for (i=0; i<h->size; i++) {
+ fprintf( fp, "%s\n", tmp[i].str );
+ }
+ wg_free( tmp );
+}
+
+
+
+extern sint4 fp_Compare( void *cat, void *unknown, int cutoff )
+{
+ fp_t *c = (fp_t *)cat;
+ fp_t *u = (fp_t *)unknown;
+ uint4 i = 0;
+ uint4 j = 0;
+ sint4 sum = 0;
+
+ /*** Compare the profiles in mergesort fashion ***/
+ while ( i < c->size && j < u->size ) {
+
+ int cmp = mystrcmp( c->fprint[i].str, u->fprint[j].str );
+
+ if ( cmp < 0 ) {
+ i++;
+ }
+ else if ( cmp == 0 ) {
+ sum += abs( c->fprint[i].rank - u->fprint[j].rank );
+ if ( sum > cutoff ) {
+ return MAXSCORE;
+ }
+ i++;
+ j++;
+ }
+ else {
+ sum += MAXOUTOFPLACE;
+ if ( sum > cutoff ) {
+ return MAXSCORE;
+ }
+ j++;
+ }
+ }
+
+ /*** Process tail of unknown, if any ***/
+ while ( j < u->size ) {
+ sum += MAXOUTOFPLACE;
+ if ( sum > cutoff ) {
+ return MAXSCORE;
+ }
+ j++;
+ }
+
+ return sum;
+
+}
+
+
diff --git a/src/fingerprint.h b/src/fingerprint.h
new file mode 100644
index 0000000..de55fe9
--- /dev/null
+++ b/src/fingerprint.h
@@ -0,0 +1,47 @@
+#ifndef _FINGERPRINT_H_
+#define _FINGERPRINT_H_
+/*
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "common.h"
+
+extern void *fp_Init(const char *name);
+extern void fp_Done( void *handle );
+extern int fp_Create( void *handle, const char *buffer, uint4 bufsize, uint4 maxngrams );
+extern int fp_Read( void *handle, const char *fname, int maxngrams );
+extern sint4 fp_Compare( void *cat, void *unknown, int cutoff );
+extern void fp_Show( void *handle );
+extern const char *fp_Name( void *handle );
+extern void fp_Print( void *handle, FILE *fp );
+
+#endif
diff --git a/src/testtextcat.c b/src/testtextcat.c
new file mode 100644
index 0000000..3fecc94
--- /dev/null
+++ b/src/testtextcat.c
@@ -0,0 +1,98 @@
+/*
+ * testtextcat.c -- a simple commandline classifier. Feed it input on
+ * standard in and it will feed you a classification on standard out.
+ *
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "textcat.h"
+#include "common.h"
+
+#define BLOCKSIZE 4096
+
+char *myread(FILE *fp)
+{
+ char *buf;
+ size_t size = 0;
+ size_t maxsize = BLOCKSIZE*2;
+
+ buf = (char *)wg_malloc( maxsize );
+ do {
+ size_t read = fread( buf+size, 1, BLOCKSIZE, fp );
+ size += read;
+ if ( size + BLOCKSIZE > maxsize ) {
+ maxsize *= 2;
+ buf = (char *)wg_realloc( buf, maxsize );
+ }
+
+ } while (!feof(stdin));
+
+ buf[size] = '\0';
+ buf = (char *)wg_realloc( buf, size+1 );
+
+ return buf;
+}
+
+
+int main( int argc, char **argv )
+{
+ void *h;
+ char *result;
+ wgtimer_t tm;
+ int i;
+ char *buf;
+
+ printf("%s\n", textcat_Version());
+
+ h = textcat_Init( argc>1?argv[1]:"conf.txt" );
+ if ( !h ) {
+ printf("Unable to init. Aborting.\n");
+ exit(-1);
+ }
+
+ buf = myread(stdin);
+
+ wg_timerstart(&tm);
+
+ /*** We only need a little text to determine the language ***/
+ buf[1024] = '\0';
+ result = textcat_Classify( h, buf, strlen(buf)+1 );
+ printf("Result == %s\n", result);
+
+ fprintf(stderr, "That took %u ms.\n", wg_timerstop(&tm)/1000);
+
+ textcat_Done(h);
+
+ wg_free(buf);
+
+ return 0;
+}
diff --git a/src/textcat.c b/src/textcat.c
new file mode 100644
index 0000000..b64734c
--- /dev/null
+++ b/src/textcat.c
@@ -0,0 +1,237 @@
+/**
+ * textcat.c -- routines for categorizing text
+ *
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * DESCRIPTION
+ *
+ * These routines use the N-gram fingerprinting technique as described
+ * in Cavnar and Trenkle, (1994.), N-Gram-Based Text Categorization.
+ * (cf. http://www.nonlineardynamics.com/trenkle/)
+ *
+ * REVISION HISTORY
+ *
+ * Mar 27, 2003 frank@wise-guys.nl -- created
+ *
+ * IMPROVEMENTS:
+ * - If two n-grams have the same frequency count, choose the shortest
+ * - Use a better similarity measure (the article suggests Wilcoxon rank test)
+ * - The profiles are matched one by one, which results in redundant lookups.
+ * - Make the thingy reentrant as well as thread-safe. (Reentrancy is abandoned
+ * by the use of the output buffer in textcat_t.)
+ */
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+#include "fingerprint.h"
+#include "textcat.h"
+#include "constants.h"
+
+
+typedef struct {
+
+ void **fprint;
+ uint4 size;
+ uint4 maxsize;
+
+ char output[MAXOUTPUTSIZE];
+
+} textcat_t;
+
+
+typedef struct {
+ int score;
+ const char *name;
+} candidate_t;
+
+
+static int cmpcandidates(const void *a, const void *b)
+{
+ candidate_t *x = (candidate_t *)a;
+ candidate_t *y = (candidate_t *)b;
+
+ if ( x->score < y->score ) {
+ return -1;
+ }
+ if ( x->score > y->score ) {
+ return 1;
+ }
+ return 0;
+}
+
+
+extern void textcat_Done( void *handle )
+{
+ textcat_t *h = (textcat_t *)handle;
+ uint4 i;
+
+ for (i=0; i<h->size; i++) {
+ fp_Done( h->fprint[i] );
+ }
+ wg_free( h->fprint );
+ wg_free( h );
+
+}
+
+extern void *textcat_Init( const char *conffile )
+{
+ textcat_t *h;
+ char line[1024];
+ FILE *fp;
+
+ fp = fopen( conffile, "r" );
+ if ( !fp ) {
+#ifdef VERBOSE
+ fprintf( stderr, "Failed to open config file '%s'\n", conffile);
+#endif
+ return NULL;
+ }
+
+ h = (textcat_t *)wg_malloc(sizeof(textcat_t));
+ h->size = 0;
+ h->maxsize = 16;
+ h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize );
+
+ while ( wg_getline( line, 1024, fp ) ) {
+ char *p;
+ char *segment[4];
+ double startcnt = 0.0;
+ int res;
+
+ /*** Skip comments ***/
+ if (( p = strchr(line,'#') )) {
+ *p = '\0';
+ }
+ if ((res = wg_split( segment, line, line, 4)) < 2 ) {
+ continue;
+ }
+
+ /*** Ensure enough space ***/
+ if ( h->size == h->maxsize ) {
+ h->maxsize *= 2;
+ h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
+ }
+
+ /*** Load data ***/
+ if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) {
+ goto ERROR;
+ }
+ if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) {
+ textcat_Done(h);
+ goto ERROR;
+ }
+ h->size++;
+ }
+
+ fclose(fp);
+ return h;
+
+ ERROR:
+ fclose(fp);
+ return NULL;
+
+}
+
+
+extern char *textcat_Classify( void *handle, const char *buffer, size_t size )
+{
+ textcat_t *h = (textcat_t *)handle;
+ uint4 i, cnt = 0;
+ int minscore = MAXSCORE;
+ int min = -1;
+ int threshold = minscore;
+ char *result = h->output;
+
+ candidate_t *candidates = (candidate_t *)alloca( sizeof(candidate_t) * h->size );
+
+ void *unknown;
+
+ unknown = fp_Init(NULL);
+ if ( fp_Create( unknown, buffer, size, MAXNGRAMS ) == 0 ) {
+ /*** Too little information ***/
+ result = _TEXTCAT_RESULT_SHORT;
+ goto READY;
+ }
+
+ /*** Calculate the score for each category. ***/
+ for (i=0; i<h->size; i++) {
+ int score = fp_Compare( h->fprint[i], unknown, threshold );
+ candidates[i].score = score;
+ candidates[i].name = fp_Name( h->fprint[i] );
+ if ( score < minscore ) {
+ minscore = score;
+ threshold = (int)( (double)score * THRESHOLDVALUE );
+ }
+ }
+
+ /*** Find the best performers ***/
+ for (i=0; i<h->size; i++) {
+ if ( candidates[i].score < threshold ) {
+
+ if ( ++cnt == MAXCANDIDATES+1 ) {
+ break;
+ }
+
+ memcpy( &candidates[cnt-1], &candidates[i], sizeof(candidate_t) );
+
+ }
+ }
+
+ /*** The verdict ***/
+ if ( cnt == MAXCANDIDATES+1 ) {
+ result = _TEXTCAT_RESULT_UNKOWN;
+ }
+ else {
+ char *p = result;
+ char *plimit = result+MAXOUTPUTSIZE;
+
+ qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates );
+
+ *p = '\0';
+ for (i=0; i<cnt; i++) {
+ p = wg_strgmov( p, "[", plimit );
+ p = wg_strgmov( p, candidates[i].name, plimit );
+ p = wg_strgmov( p, "]", plimit );
+ }
+ }
+ READY:
+ fp_Done(unknown);
+ return result;
+}
+
+
+extern char *textcat_Version()
+{
+ return "TextCat " WGSTR(VERSION) "." WGSTR(SUBVERSION) " (" DESCRIPTION ")";
+}
diff --git a/src/textcat.h b/src/textcat.h
new file mode 100644
index 0000000..746e131
--- /dev/null
+++ b/src/textcat.h
@@ -0,0 +1,80 @@
+#ifndef _TEXTCAT_H_
+#define _TEXTCAT_H_
+/*
+ * textcat.h -- routines for categorizing text
+ *
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+
+#define _TEXTCAT_RESULT_UNKOWN "UNKNOWN"
+#define _TEXTCAT_RESULT_SHORT "SHORT"
+
+
+/**
+ * textcat_Init() - Initialize the text classifier. The textfile
+ * conffile should contain a list of fingerprint filenames and
+ * identification strings for the categories. The filenames should be
+ * reachable from the current working directory. The identification
+ * strings will are used in the classification output.
+ *
+ * Returns: handle on success, NULL on error. (At the moment, the
+ * only way errors can occur, is when the library cannot read the
+ * conffile, or one of the fingerprint files listed in it.)
+ */
+extern void *textcat_Init( const char *conffile );
+
+/**
+ * textcat_Done() - Free up resources for handle
+ */
+extern void textcat_Done( void *handle );
+
+/**
+ * textcat_Classify() - Give the most likely categories for buffer
+ * with length size.
+ *
+ * Returns: string containing a list of category id's, each one
+ * between square brackets, "UNKNOWN" when not recognized, "SHORT" if the
+ * document was too short to make a reliable assessment.
+ *
+ * Performace note: longer buffers take longer to process. However,
+ * for many uses it is not necessary to categorize the whole buffer.
+ * For language classification, a few hundred bytes will suffice.
+ */
+extern char *textcat_Classify( void *handle, const char *buffer, size_t size );
+
+/**
+ * textcat_Version() - Returns a string describing the version of this classifier.
+ */
+extern char *textcat_Version();
+#endif
diff --git a/src/wg_mempool.c b/src/wg_mempool.c
new file mode 100644
index 0000000..d489ffd
--- /dev/null
+++ b/src/wg_mempool.c
@@ -0,0 +1,231 @@
+/*
+ * wg_mempool.c -- Functions for managing a memory pool.
+ *
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include "common.h"
+
+typedef struct memblock_s {
+ char *pool;
+ char *p;
+ char *pend;
+ struct memblock_s *next;
+} memblock_t;
+
+
+typedef struct mempool_s {
+ memblock_t *first; /* linked list of blocks in use */
+ memblock_t *spare; /* linked list of spare blocks */
+ size_t maxallocsize;
+ size_t blocksize;
+} mempool_t;
+
+
+
+static void addblock( mempool_t *h )
+{
+ memblock_t *block;
+
+ /*** Spare blocks from previous round? ***/
+ if ( h->spare ) {
+ /*** Use spare block ***/
+ block = h->spare;
+ h->spare = block->next;
+ }
+ else {
+ /*** Make a new block ***/
+ block = (memblock_t *)wg_malloc( sizeof(memblock_t) );
+ block->pool = (char *)wg_malloc( h->blocksize );
+ }
+
+ block->p = block->pool;
+ block->pend = block->pool + h->blocksize - h->maxallocsize;
+ block->next = h->first;
+ h->first = block;
+}
+
+
+extern void *wgmempool_Init(size_t blocksize, size_t maxstrsize)
+{
+ mempool_t *result = (mempool_t *)wg_malloc( sizeof(mempool_t) );
+
+ result->first = NULL;
+ result->spare = NULL;
+ result->blocksize = blocksize;
+ result->maxallocsize = maxstrsize?(maxstrsize + 1):0;
+ addblock( result );
+
+ return (void *)result;
+}
+
+
+extern void wgmempool_Done( void *handle )
+{
+ mempool_t *h = (mempool_t *)handle;
+
+ memblock_t *p;
+
+ /*** Active blocks ***/
+ p = h->first;
+ while (p) {
+ memblock_t *next = p->next;
+ wg_free( p->pool );
+
+ memset( p, 0, sizeof(memblock_t)); /* for safety */
+ wg_free( p );
+
+ p = next;
+ }
+
+ /*** Spare blocks ***/
+ p = h->spare;
+ while (p) {
+ memblock_t *next = p->next;
+ wg_free( p->pool );
+
+ memset( p, 0, sizeof(memblock_t)); /* for safety */
+ wg_free( p );
+
+ p = next;
+ }
+
+ memset( h, 0, sizeof(mempool_t)); /* for safety */
+ wg_free(h);
+}
+
+extern void wgmempool_Reset( void *handle )
+{
+ mempool_t *h = (mempool_t *)handle;
+ memblock_t *p;
+
+ if ( !h->first ) {
+ return;
+ }
+
+ /*** Find last active block ***/
+ p = h->first;
+ while (p->next) {
+ p = p->next;
+ }
+
+ /*** Append spare list to it ***/
+ p->next = h->spare;
+ h->spare = h->first;
+ h->first = NULL;
+
+ /*** Start with a new block ***/
+ addblock(h);
+}
+
+
+extern void *wgmempool_alloc( void *handle, size_t size )
+{
+ void *result;
+ mempool_t *h = (mempool_t *)handle;
+ memblock_t *block = h->first;
+
+ /*** Too little space left in block? ***/
+ if ( block->p + size > block->pend + h->maxallocsize ) {
+ addblock(h);
+ block = h->first;
+ }
+ result = (void *)block->p;
+ block->p += size;
+ return result;
+}
+
+
+
+extern char *wgmempool_strdup( void *handle, const char *str )
+{
+ char *w, *result;
+ mempool_t *h = (mempool_t *)handle;
+ memblock_t *block = h->first;
+
+ /*** Create extra room? ***/
+ if ( h->maxallocsize ) {
+ if ( block->p >= block->pend ) {
+ addblock(h);
+ block = h->first;
+ }
+ }
+ else if ( block->p + strlen(str) + 1 >= block->pend ) {
+ addblock(h);
+ block = h->first;
+ }
+
+ /*** Enough room, so copy string ***/
+ result = w = block->p;
+ while (*str) {
+ *w++ = *str++;
+ }
+ *w++ = '\0';
+ block->p = w;
+ return result;
+}
+
+
+extern char *wgmempool_getline( void *handle, size_t size, FILE *fp )
+{
+ char *result, *p;
+ mempool_t *h = (mempool_t *)handle;
+ memblock_t *block = h->first;
+
+ /*** Enough space? ***/
+ if ( block->p + size > block->pend + h->maxallocsize ) {
+ addblock(h);
+ block = h->first;
+ }
+
+ result = (char *)block->p;
+ fgets(result, size, fp) ;
+
+ /** end of stream? **/
+ if ( feof( fp ) ) {
+ return NULL;
+ }
+
+ /** find end of line **/
+ p = result;
+ while (*p && *p != '\n' && *p != '\r' ) {
+ p++;
+ }
+ *p++ = '\0';
+
+ block->p = p;
+ return result;
+}
+
+
diff --git a/src/wg_mempool.h b/src/wg_mempool.h
new file mode 100644
index 0000000..73d40af
--- /dev/null
+++ b/src/wg_mempool.h
@@ -0,0 +1,144 @@
+#ifndef _WGMEMPOOL_H_
+#define _WGMEMPOOL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+ * mempool.c -- functions for efficient, pooled memory allocation
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * DESCRIPTION
+ *
+ * These functions can be used to maintain a pool from which memory
+ * can be claimed without performing numerous allocs.
+ *
+ * A memory pool can increase time and space efficiency by factors
+ * in situations where:
+ *
+ * - you need numerous allocations of little chunks of memory, e.g.
+ * a huge list of keywords.
+ * - you need little or no deallocations: preferable just one big
+ * dealloc at the end.
+ *
+ * A memory pool is unfavorable in situations where you constantly
+ * alloc and free memory.
+ *
+ * NOTES
+ *
+ * - A memory pool consists of "blocks" of a fixed size, which are
+ * added as the need for extra memory grows.
+ *
+ * - Deallocation of the individual fragments is not possible. The
+ * pool can only be deallocated in its entirety.
+ *
+ */
+
+
+#include "common.h"
+
+/*
+ * wgmempool_Init() -- intialize a memory pool
+ *
+ * ARGUMENTS
+ *
+ * - blocksize : size of blocks of which pool consists
+ * - maxstrsize:
+ * - > 0 : the maximum size of a string that is copied
+ * with wgmempool_strdup(). Omits the necessity of boundschecking
+ * in wgmempool_strdup(), making is slighlty faster.
+ * - 0 : causes wgmempool_strdup() to do its own boundschecking,
+ * which makes it somewhat slower.
+ *
+ * RETURN VALUE
+ *
+ * mempool handler on success, NULL on error.
+ *
+ */
+extern void *wgmempool_Init(uint4 blocksize, size_t maxstrsize);
+
+
+/*
+ * wgmempool_Done() -- deallocate memory pool in its entirety
+ */
+extern void wgmempool_Done( void *handle );
+
+
+/*
+ * wgmempool_Reset() -- resets a memory pool for reuse
+ *
+ * wgmempool_Reset() preserves already claimed memory for reuse, making
+ * it more time efficient than doing a wgmempool_Done() and wgmempool_Init().
+ */
+extern void wgmempool_Reset( void *handle );
+
+
+/*
+ * wgmempool_alloc() -- Allocate size bytes of memory in mempool
+ *
+ * ARGUMENTS
+ *
+ * - size: number of bytes to claim
+ *
+ * RETURN VALUE
+ *
+ * - pointer to claimed memory on success
+ * - NULL on error
+ */
+extern void *wgmempool_alloc( void *handle, size_t size );
+
+
+/*
+ * wgmempool_strdup() -- Allocate and copy a string to mempool
+ *
+ * ARGUMENTS
+ *
+ * - str : null-terminated string. If maxstrsize > 0 during
+ * init, strlen(str) may not be greater than this value.
+ *
+ * RETURN VALUE
+ *
+ * - pointer to duplicated string on success
+ * - NULL on error
+ */
+extern char *wgmempool_strdup( void *handle, const char* str );
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
+