diff options
-rw-r--r-- | LICENSE | 29 | ||||
-rw-r--r-- | README | 141 | ||||
-rw-r--r-- | TODO | 9 | ||||
l--------- | include/textcat.h | 1 | ||||
-rw-r--r-- | langclass/conf.txt | 89 | ||||
l--------- | lib/libtextcat.a | 1 | ||||
-rw-r--r-- | src/.#fingerprint.c.1.2 | 634 | ||||
-rw-r--r-- | src/Makefile | 62 | ||||
-rw-r--r-- | src/common.c | 376 | ||||
-rw-r--r-- | src/common.h | 83 | ||||
-rw-r--r-- | src/constants.h | 82 | ||||
-rw-r--r-- | src/createfp.c | 84 | ||||
-rw-r--r-- | src/fingerprint.c | 704 | ||||
-rw-r--r-- | src/fingerprint.h | 47 | ||||
-rw-r--r-- | src/testtextcat.c | 98 | ||||
-rw-r--r-- | src/textcat.c | 237 | ||||
-rw-r--r-- | src/textcat.h | 80 | ||||
-rw-r--r-- | src/wg_mempool.c | 231 | ||||
-rw-r--r-- | src/wg_mempool.h | 144 |
19 files changed, 3132 insertions, 0 deletions
@@ -0,0 +1,29 @@ +Copyright (c) 2003, WiseGuys Internet B.V. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +- Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +- Neither the name of the WiseGuys Internet B.V. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. @@ -0,0 +1,141 @@ + + :: libTextCat 2.0 :: + + What is it? + + Libtextcat is a library with functions that implement the + classification technique described in Cavnar & Trenkle, "N-Gram-Based + Text Categorization" [1]. It was primarily developed for language + guessing, a task on which it is known to perform with near-perfect + accuracy. + + The central idea of the Cavnar & Trenkle technique is to calculate a + "fingerprint" of a document with an unknown category, and compare this + with the fingerprints of a number of documents of which the categories + are known. The categories of the closest matches are output as the + classification. A fingerprint is a list of the most frequent n-grams + occurring in a document, ordered by frequency. Fingerprints are + compared with a simple out-of-place metric. See the article for more + details. + + Considerable effort went into making this implementation fast and + efficient. The language guesser processes over 100 documents/second on + a simple PC, which makes it practical for many uses. It was developed + for use in our webcrawler and search engine software, in which it it + handles millions of documents a day. + + Download + + The library is released under the BSD License, which basicly states + that you can do anything you like with it as long as you mention us + and make it clear that this library is covered by the BSD License. It + also exempts us from any liability, should this library eat your hard + disc, kill your cat or classify your attorney's e-mails as spam. + + The current version is 2.0. + + As of yet there is no development version. + + The distribution contains a configuration file for Gertjan van Noord's + language models, which are distributed under the GNU Public License. + Please note that this license does not allow you to distribute them as + a part of closed source software package. In time, we will provide you + with language models under a less restrictive license. + + Installation + + As of now, we have no autoconfig script. A "make all" in the src/ dir + should do the trick. If that doesn't work, you'll have to dive into + the mess we call our Makefile. + + The library is known to compile flawlessly on GNU/Linux for x86. If + you manage to get it working on other systems, please drop us a note, + and we'll proudly add you to this page. + + Quickstart: language guesser + + Assuming that you have successfully compiled the library, you still + need some language models to start guessing languages. If you don't + feel like creating them yourself (cf. [2]Creating your own + fingerprints below), you can use the excellent collection of over 70 + language models provided in Gertjan van Noord's "TextCat" package. We + have provided a configuration file for these models in the langclass + directory. Hack it at will. + + * Download the textcat package at + http://odur.let.rug.nl/~vannoord/TextCat/ + * mkdir ~/text_cat + * cd ~/text_cat + * tar xzf ~/your_download_dir/text_cat.tgz + * ~/libtextcat/src/testtextcat ~/libtextcat/langclass/conf.txt + + Paste some text onto the commandline, and watch it get classified. + + Using the API + + Classifying the language of a textbuffer can be as easy as: + + #include "textcat.h" + ... + void *h = textcat_Init( "conf.txt" ); + ... + printf( "Language: %s\n", textcat_Classify(h, buffer, 400); + ... + textcat_Done(h); + + Creating your own fingerprints + + The createfp program allows you to easily create your own document + fingerprints. Just feed it an example document on standard input, and + store the standard output: + + % createfp < mydocument.txt > myfingerprint.txt + + Put the names of your fingerprints in a configuration file, add some + id's and you're ready to classify. + + Performance tuning + + This library was made with efficiency in mind. There are couple of + parameters you may wish to tweak if you intend to use it for other + tasks than language guessing. + + The most important thing is buffer size. For reliable language + guessing the classifier only needs a couple of hundreds of bytes max. + So don't feed it 100KB of text unless you are creating a fingerprint. + + If you insist on feeding the classifier lots of text, try fiddling + with TABLEPOW, which determines the size of the hash table that is + used to store the n-grams. Making it too small will result in many + hashtable clashes, making it too large will cause wild memory + behaviour and both are bad for the performance. + + Putting the most probable models at the top of the list in your config + file improves performance, because this will raise the threshold for + likely candidates more quickly. + + Since the speed of the classifier is roughly linear with respect to + the number of models, you should consider how many models you really + need. In case of language guessing: do you really want to recognize + every language ever invented? + + References + + [1] The document that started it all can be downloaded at John M. + Trenkle's site: N-Gram-Based Text Categorization + + http://www.novodynamics.com/trenkle/papers/sdair-94-bc.ps.gz + + [2] The Perl implementation by Gertjan van Noord (code + language + models): downloadable from his [7]website + + http://odur.let.rug.nl/~vannoord/TextCat/ + + Contact + + Praise and flames may be directed at us through + libtextcat AT wise-guys.nl. If there is enough interest, we'll whip up + a mailing list. The current project maintainer is Frank Scheelen. + + c. 2003 WiseGuys Internet B.V. + @@ -0,0 +1,9 @@ + +TODO + +- Better Makefile +- Improve error handling and reporting. +- Split off table and heap routines to separate source files. +- Speed optimizations. Some speed might be gained by comparing + the unknown fingerprint to all known fingerprints at once. +- More docs, more comments.
\ No newline at end of file diff --git a/include/textcat.h b/include/textcat.h new file mode 120000 index 0000000..9db5260 --- /dev/null +++ b/include/textcat.h @@ -0,0 +1 @@ +../src/textcat.h
\ No newline at end of file diff --git a/langclass/conf.txt b/langclass/conf.txt new file mode 100644 index 0000000..4eb9f73 --- /dev/null +++ b/langclass/conf.txt @@ -0,0 +1,89 @@ +# +# A sample config file for the language models +# provided with Gertjan van Noords language guesser +# (http://odur.let.rug.nl/~vannoord/TextCat/) +# +# Notes: +# - You may consider eliminating a couple of small languages from this +# list because they cause false positives with big languages and are +# bad for performance. (Do you really want to recognize Drents?) +# - Putting the most probable languages at the top of the list +# improves performance, because this will raise the threshold for +# likely candidates more quickly. +# +LM/afrikaans.lm afrikaans +LM/albanian.lm albanian +LM/amharic-utf.lm amharic-utf +LM/arabic-iso8859_6.lm arabic-iso8859_6 +LM/arabic-windows1256.lm arabic-windows1256 +LM/armenian.lm armenian +LM/basque.lm basque +LM/belarus-windows1251.lm belarus-windows1251 +LM/bosnian.lm bosnian +LM/breton.lm breton +LM/bulgarian-iso8859_5.lm bulgarian-iso8859_5 +LM/catalan.lm catalan +LM/chinese-big5.lm chinese-big5 +LM/chinese-gb2312.lm chinese-gb2312 +LM/croatian-ascii.lm croatian-ascii +LM/czech-iso8859_2.lm czech-iso8859_2 +LM/danish.lm danish +LM/drents.lm drents # Dutch dialect +LM/dutch.lm dutch +LM/english.lm english +LM/esperanto.lm esperanto +LM/estonian.lm estonian +LM/finnish.lm finnish +LM/french.lm french +LM/frisian.lm frisian +LM/georgian.lm georgian +LM/german.lm german +LM/greek-iso8859-7.lm greek-iso8859-7 +LM/hebrew-iso8859_8.lm hebrew-iso8859_8 +LM/hindi.lm hindi +LM/hungarian.lm hungarian +LM/icelandic.lm icelandic +LM/indonesian.lm indonesian +LM/irish.lm irish +LM/italian.lm italian +LM/japanese-euc_jp.lm japanese-euc_jp +LM/japanese-shift_jis.lm japanese-shift_jis +LM/korean.lm korean +LM/latin.lm latin +LM/latvian.lm latvian +LM/lithuanian.lm lithuanian +LM/malay.lm malay +LM/manx.lm manx +LM/marathi.lm marathi +LM/middle_frisian.lm middle_frisian +LM/mingo.lm mingo +LM/nepali.lm nepali +LM/norwegian.lm norwegian +LM/persian.lm persian +LM/polish.lm polish +LM/portuguese.lm portuguese +LM/quechua.lm quechua +LM/romanian.lm romanian +LM/rumantsch.lm rumantsch +LM/russian-iso8859_5.lm russian-iso8859_5 +LM/russian-koi8_r.lm russian-koi8_r +LM/russian-windows1251.lm russian-windows1251 +LM/sanskrit.lm sanskrit +LM/scots.lm scots +LM/scots_gaelic.lm scots_gaelic +LM/serbian-ascii.lm serbian-ascii +LM/slovak-ascii.lm slovak-ascii +LM/slovak-windows1250.lm slovak-windows1250 +LM/slovenian-ascii.lm slovenian-ascii +LM/slovenian-iso8859_2.lm slovenian-iso8859_2 +LM/spanish.lm spanish +LM/swahili.lm swahili +LM/swedish.lm swedish +LM/tagalog.lm tagalog +LM/tamil.lm tamil +LM/thai.lm thai +LM/turkish.lm turkish +LM/ukrainian-koi8_r.lm ukrainian-koi8_r +LM/vietnamese.lm vietnamese +LM/welsh.lm welsh +LM/yiddish-utf.lm yiddish-utf diff --git a/lib/libtextcat.a b/lib/libtextcat.a new file mode 120000 index 0000000..53dc620 --- /dev/null +++ b/lib/libtextcat.a @@ -0,0 +1 @@ +../src/libtextcat.a
\ No newline at end of file diff --git a/src/.#fingerprint.c.1.2 b/src/.#fingerprint.c.1.2 new file mode 100644 index 0000000..19d55d9 --- /dev/null +++ b/src/.#fingerprint.c.1.2 @@ -0,0 +1,634 @@ +/** + * fingerprint.c -- Routines for creating an n-gram fingerprint of a + * buffer. + * + * A fingerprint is a list of most common n-grams, ordered by + * frequency. (Note that we can use other strings than n-grams, for + * instance entire words.) + * + * Mar 28, 2003 frank@wise-guys.nl -- created + * + * TODO: + * - put table/heap datastructure in a separate file. + * */ +#include <string.h> +#include <ctype.h> + +#include "wglib.h" +#include "wg_mempool.h" + +#define INVALID(c) (isspace(c) || isdigit(c)) + +#define TABLEPOW 13 +#define TABLESIZE (1<<TABLEPOW) +#define TABLEMASK ((TABLESIZE)-1) + +#define MAXOUTOFPLACE 400 +#define MINDOCSIZE 25 + +typedef struct { + + sint2 rank; + char str[6]; + +} ngram_t; + +typedef struct fprint_s { + + const char *name; + ngram_t *fprint; + uint4 size; + +} fprint_t; + +typedef struct entry_s { + char str[6]; + unsigned int cnt; + struct entry_s *next; +} entry_t; + +typedef struct table_s { + void *pool; + entry_t **table; + entry_t *heap; + + struct table_s *next; + + uint4 heapsize; + uint4 size; +} table_t; + + + +/* + * fast and furious little hash function + * + * (Note that we could use some kind of rolling checksum, and update it + * during n-gram construction) + */ +static uint4 simplehash( const char *p, int len ) +{ + sint4 h = len * 13; + while (*p) { + h = (h<<5)-h + *p++; + } + return (uint4)h; +} + + +/* checks if n-gram lex is a prefix of key and of length len */ +inline int issame( char *lex, char *key, int len ) +{ + int i; + for (i=0; i<len; i++) { + if ( key[i] != lex[i] ) { + return 0; + } + } + if ( lex[i] != 0 ) { + return 0; + } + return 1; +} + + +/* increases frequency of ngram(p,len) */ +static inline int increasefreq( table_t *t, char *p, int len ) +{ + uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; + + wg_debug(NGRAMDEBUG, "%u-GRAM: %s\n", len,p); + + while ( entry ) { + if ( issame( entry->str, p, len ) ) { + /*** Found it! ***/ + entry->cnt++; + return 1; + } + else { + entry = entry->next; + } + } + + /*** Not found, so create ***/ + entry = wgmempool_alloc( t->pool, sizeof(entry_t) ); + strcpy( entry->str, p ); + entry->cnt = 1; + + entry->next = t->table[hash]; + t->table[hash] = entry; + + return 1; +} + + +/* looks up ngram(p,len) */ +static entry_t *findfreq( table_t *t, char *p, int len ) +{ + uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; + + while ( entry ) { + if ( issame( entry->str, p, len ) ) { + return entry; + } + else { + entry = entry->next; + } + } + + return NULL; +} + +static void dumptable(table_t *t) +{ + int i; + for (i=0; i<TABLESIZE; i++) { + + entry_t *p = t->table[i]; + + while (p) { + + printf("%5u %s\n", p->cnt, p->str); + p = p->next; + } + + } +} + + +#define GREATER(x,y) ((x).cnt > (y).cnt) +#define LESS(x,y) ((x).cnt < (y).cnt) + +inline static void siftup( table_t *t, unsigned int child ) +{ + entry_t *heap = t->heap; + unsigned int parent = (child-1) >> 1; + entry_t tmp; + + while ( child > 0 ) { + if ( GREATER(heap[parent],heap[child]) ) { + memcpy( &tmp, &heap[parent], sizeof(entry_t) ); + memcpy( &heap[parent], &heap[child], sizeof(entry_t) ); + memcpy( &heap[child], &tmp, sizeof(entry_t) ); + } + else { + return; + } + + child = parent; + parent = (child-1) >> 1; + } +} + + +inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) +{ + entry_t *heap = t->heap; + unsigned int child = parent*2 + 1; + entry_t tmp; + + while ( child < heapsize ) { + if ( child+1 < heapsize && GREATER(heap[child], heap[child+1]) ) { + child++; + } + if ( GREATER(heap[parent], heap[child] ) ) { + memcpy( &tmp, &heap[parent], sizeof(entry_t) ); + memcpy( &heap[parent], &heap[child], sizeof(entry_t) ); + memcpy( &heap[child], &tmp, sizeof(entry_t) ); + } + else { + return; + } + parent = child; + child = (parent*2)+1; + } +} + + +static int heapinsert( table_t *t, entry_t *item ) +{ + entry_t *heap = t->heap; + + /*** Still room for an entry? ***/ + if (t->size < t->heapsize) { + memcpy( &(heap[t->size]), item, sizeof(entry_t)); + siftup( t, t->size ); + t->size++; + return 0; + } + + /*** Worse than the worst performer? ***/ + if ( LESS(*item, heap[0]) ) { + return 0; + } + + /*** Insert into heap and reheap ***/ + memcpy( &(heap[0]), item, sizeof(entry_t)); + siftdown( t, t->size, 0 ); + return 0; +} + + +extern int heapextract( table_t *t, entry_t *item ) +{ + entry_t *p; + + if (t->size == 0 ) { + return 0; + } + + p = &(t->heap[0]); + + memcpy( item, p, sizeof( entry_t )); + memcpy( &(t->heap[0]), &(t->heap[t->size-1]), sizeof(entry_t) ); + + siftdown(t,t->size, 0); + t->size--; + + return 1; +} + + +/*** Makes a heap of all table entries ***/ +static int table2heap(table_t *t) +{ + int i; + uint4 cnt = 0; + uint4 bufsize = 2048; + + /*** Fill result heap ***/ + for (i=0; i<TABLESIZE; i++) { + entry_t *p = t->table[i]; + while (p) { + heapinsert(t, p); + p = p->next; + } + } + return 1; +} + + +static table_t *inittable(uint4 maxngrams) +{ + table_t *result = (table_t *)wg_zalloc( sizeof(table_t) ); + result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE ); + result->pool = wgmempool_Init( 10000, 10 ); + + result->heap = (entry_t *)wg_malloc( sizeof(entry_t) * maxngrams ); + result->heapsize = maxngrams; + result->size = 0; + + return result; +} + +static void tabledone( table_t *t ) +{ + if (!t) { + return; + } + wgmempool_Done(t->pool); + wg_free(t->table); + wg_free(t->heap); + wg_free(t); +} + + +extern void *fprint_Init(const char *name) +{ + fprint_t *h = (fprint_t *)wg_zalloc( sizeof(fprint_t) ); + + if ( name ) { + h->name = wg_strdup(name); + } + + return (void *)h; +} + + +extern void fprint_Done( void *handle ) +{ + fprint_t *h = (fprint_t *)handle; + + if ( h->name ) { + wg_free( (void *)h->name ); + } + if ( h->fprint ) { + wg_free( h->fprint ); + } + + wg_free( h ); +} + +extern const char *fprint_Name( void *handle ) +{ + fprint_t *h = (fprint_t *)handle; + return h->name; +} + +/** + * Function that prepares buffer for n-grammification: + * runs of invalid characters are collapsed to a single + * underscore. + * + * Function is implemented as a finite state machine. + */ +static char *prepbuffer( const char *src, size_t bufsize ) +{ + const char *p = src; + char *dest = (char *)wg_malloc( bufsize + 3 ); + char *w = dest; + char *wlimit = dest + bufsize + 1; + + START: + if ( INVALID(*p) ) { + goto SPACE; + } + else if ( *p == '\0' ) { + goto END; + } + + *w++ = '_'; + if ( w == wlimit ) { + goto STOP; + } + + goto WORD; + + + SPACE: + /*** Inside string of invalid characters ***/ + p++; + if ( INVALID(*p) ) { + goto SPACE; + } + else if ( *p == '\0' ) { + goto END; + } + + *w++ = '_'; + if ( w == wlimit ) { + goto STOP; + } + + goto WORD; + + WORD: + /*** Inside string of valid characters ***/ + *w++ = *p++; + if ( w == wlimit ) { + goto END; + } + else if ( INVALID(*p) ) { + goto SPACE; + } + else if ( *p == '\0' ) { + goto STOP; + } + goto WORD; + + END: + *w++ = '_'; + + STOP: + *w++ = '\0'; + + /*** Docs that are too small for a fingerprint, are refused ***/ + if ( w - dest < MINDOCSIZE ) { + wg_free(dest); + return NULL; + } + + return dest; +} + + +static void createngramtable( table_t *t, const char *buf ) +{ + char n[6]; + const char *p = buf; + int i; + + /*** Get all n-grams where 1<=n<=5. Allow underscores only at borders. ***/ + for (;;p++) { + + const char *q = p; + char *m = n; + + /*** First char may be an underscore ***/ + *m++ = *q++; + *m = '\0'; + + increasefreq( t, n, 1 ); + + if ( *q == '\0' ) { + return; + } + + /*** Let the compiler unroll this ***/ + for ( i=2; i<=5; i++) { + + *m++ = *q; + *m = '\0'; + + increasefreq( t, n, i ); + + if ( *q == '_' ) break; + q++; + if ( *q == '\0' ) { + return; + } + } + } + return; +} + + +static int ngramcmp(const void *a, const void *b) +{ + ngram_t *x = (ngram_t *)a; + ngram_t *y = (ngram_t *)b; + + return strcmp( x->str, y->str ); +} + +/** + * Create a fingerprint: + * - record the frequency of each unique n-gram in a hash table + * - take the most frequent n-grams + * - sort them alphabetically, recording their relative rank + */ +extern int fprint_Create( void *handle, const char *buffer, uint4 bufsize, uint4 maxngrams ) +{ + sint4 i; + fprint_t *h = (fprint_t *)handle; + table_t *t = inittable(maxngrams); + char *tmp; + + if ( bufsize < MINDOCSIZE ) { + return 0; + } + + /*** Throw out all invalid chars ***/ + tmp = prepbuffer( buffer, bufsize ); + if ( tmp == NULL ) { + return 0; + } + + /*** Create a hash table containing n-gram counts ***/ + createngramtable(t, tmp); + + /*** Take the top N n-grams and add them to the profile ***/ + table2heap(t); + maxngrams = WGMIN( maxngrams, t->size ); + + h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams ); + h->size = maxngrams; + + /*** Pull n-grams out of heap (backwards) ***/ + for (i=maxngrams-1; i>=0; i--) { + + entry_t tmp; + + heapextract(t, &tmp); + + /*** the string and its rank is all we need ***/ + strcpy( h->fprint[i].str, tmp.str ); + h->fprint[i].rank = i; + } + + tabledone(t); + wg_free(tmp); + + /*** Sort n-grams alphabetically, for easy comparison ***/ + qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp ); + return 1; +} + +extern void fprint_Show( void *handle ) +{ + fprint_t *h = (fprint_t *)handle; + int i; + printf("------ %s --------\n", h->name ); + for (i=0; i<h->size; i++) { + printf("%3u: '%s' [%u]\n", i, h->fprint[i].str, h->fprint[i].rank); + } + + +} + +extern int fprint_Read( void *handle, const char *fname, int maxngrams ) +{ + fprint_t *h = (fprint_t *)handle; + FILE *fp; + char line[1024]; + int cnt = 0; + + fp = fopen( fname, "r" ); + if (!fp) { + return 0; + } + + h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t)); + + while (cnt < maxngrams && wg_getline(line,1024,fp)) { + + char *p; + + wg_trim(line, line); + + p = strpbrk( line, " \t" ); + if ( p ) { + *p = '\0'; + } + + if ( strlen(line) > 5 ) { + continue; + } + + strcpy( h->fprint[cnt].str, line ); + h->fprint[cnt].rank = cnt; + + cnt++; + } + + h->size = cnt; + + /*** Sort n-grams, for easy comparison later on ***/ + qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp ); + + fclose(fp); + + return 1; +} + + +extern sint4 fprint_Compare( void *cat, void *unknown, int cutoff ) +{ + fprint_t *c = (fprint_t *)cat; + fprint_t *u = (fprint_t *)unknown; + int i = 0; + int j = 0; + sint4 sum = 0; + + /*** Compare the profiles in mergesort fashion ***/ + while ( i < c->size && j < u->size ) { + + int cmp = strcmp( c->fprint[i].str, u->fprint[j].str ); + + if ( cmp < 0 ) { + i++; + } + else if ( cmp == 0 ) { + sum += abs( c->fprint[i].rank - u->fprint[j].rank ); + if ( sum > cutoff ) { + return 1000000000; + } + i++; + j++; + } + else { + sum += MAXOUTOFPLACE; + if ( sum > cutoff ) { + return 1000000000; + } + j++; + } + } + + /*** Process tail of unknown, if any ***/ + while ( j < u->size ) { + sum += MAXOUTOFPLACE; + if ( sum > cutoff ) { + return 1000000000; + } + j++; + } + + return sum; + +} + + +#if 0 +int main(int argc, char **argv) +{ + char buf[100 * 1024]; + char tmp[100 * 1024]; + int size; + fprint_t *h; + + wg_meminit(); + { + //size = fread( buf, 1, 100*1024, stdin ); + h = fprint_Init("test"); + buf[size] = '\0'; + //fprint_Create(h, buf, strlen(buf), 100); + fprint_Read( h, "/home/frank/textcat/LM/dutch.lm", 100 ); + fprint_Show(h); + fprint_Done(h); + wg_printmemstats(); + } +} +#endif diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..1253f38 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,62 @@ +# +# Apr 1, 2003 frank@wise-guys.nl +# Apr 2, 2003 gnarl@wise-guys.nl +# +#-------------------------------------------------------------------------- +# Macro Variables +#-------------------------------------------------------------------------- +#MDEBUG =-DMEMDEBUG -DMEMVERBOSE +# +SRC =/home/src +RANLIB =ranlib +RM =/bin/rm -f +CC =gcc +LIBS = +LEX =flex +LDFLAGS =-g +OPTS =-static -O3 -DWGTIMER -D_THREAD_SAFE -D_GNU_SOURCE -march=i686 \ + -funroll-loops --pipe -D__GNU_C__ +VERBOSE = -DVERBOSE +IFLAGS = + +WARNS =-W -Winline -Wall -Wshadow -Wno-unused -Wpointer-arith + +CFLAGS =$(VERBOSE) $(IFLAGS) $(LDFLAGS) -D_GNU_SOURCE -D_REENTRANT -DWGTIMER $(WARNS) $(OPTS) $(MDEBUG) + +OBJECTS2 = fingerprint.o textcat.o common.o wg_mempool.o +OBJECTS = testtextcat.o +OBJECTS3 = createfp.o + +SOURCES = fingerprint.c textcat.c common.c wg_mempool.c testtextcat.c createfp.c + +LIBS1 = -lm + +LIBS = $(LIBS1) + +#-------------------------------------------------------------------------- +# Dependencies +#-------------------------------------------------------------------------- +all: textcat testtextcat createfp + +$(OBJECTS) : Makefile + +textcat: $(OBJECTS2) + $(RM) -f libtextcat.a ; ar cr libtextcat.a $(OBJECTS2) ; $(RANLIB) libtextcat.a + +testtextcat: $(OBJECTS) libtextcat.a + $(CC) $(CFLAGS) -static -o $@ $(OBJECTS) libtextcat.a $(LDFLAGS) $(LIBS) + +createfp: $(OBJECTS3) libtextcat.a + $(CC) $(CFLAGS) -static -o $@ $(OBJECTS3) libtextcat.a $(LDFLAGS) $(LIBS) + +dep: depend +# +depend: + makedepend -- $(CFLAGS) -- $(SOURCES) +# +$(SOURCES): Makefile +# +clean: + $(RM) core *.o *~ testtextcat createfp *.a + +# DO NOT DELETE diff --git a/src/common.c b/src/common.c new file mode 100644 index 0000000..054dae4 --- /dev/null +++ b/src/common.c @@ -0,0 +1,376 @@ +/* + * common.c -- miscellanious helper functions. + * + * Copyright (c) 2003, WiseGuys Internet B.V. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdarg.h> +#include <ctype.h> +#include "common.h" + +extern void wgmem_error( const char *fmt, ... ) +{ + va_list ap; + + fprintf(stderr, "MEMERROR : "); + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + + exit(-1); +} + + +extern void *wg_malloc( size_t size ) +{ + void *result; + if ( !size ) { + wgmem_error( "Error mallocing 0 bytes.\n" ); + } + + result = malloc( size ); + if ( !result ) { + wgmem_error( "Error while mallocing %u bytes.\n", size ); + } + + return result; +} + +extern void *wg_calloc( size_t nmemb, size_t size ) +{ + void *result; + if ( !size || !nmemb ) { + wgmem_error( "Error callocing 0 bytes.\n" ); + } + + result = calloc( nmemb, size ); + if ( !result ) { + wgmem_error( "Error while callocing %u elements of %u bytes.\n", nmemb, size); + } + + return result; +} + +extern void *wg_zalloc( size_t size ) +{ + void *result; + + if (!size) { + wgmem_error( "Error zallocing 0 bytes.\n" ); + } + + result = calloc(1, size); + if ( !result ) { + wgmem_error( "Error while zallocing %u bytes.\n", size ); + } + + return result; +} + +extern char* wg_strdup( const char *s ) +{ + char *result = strdup( s ); + + if ( !result ) { + wgmem_error( "Error while strduping %u bytes.\n", strlen(s) ); + } + + return( result ); +} + +extern void* wg_realloc( void *ptr, size_t size ) +{ + void *result; + + if (!size) { + wgmem_error( "Error reallocing 0 bytes.\n" ); + } + + result = realloc( ptr, size ); + if ( !result ) { + wgmem_error( "Error while reallocing %u bytes.\n", size ); + } + + return( result ); +} + +extern void wg_free( void *mem ) +{ + if ( mem ) { + free(mem); + } +} + +extern char *wg_getline( char *line, int size, FILE *fp ) +{ + char *p; + + if ( fgets(line, size, fp) == NULL ) { + return NULL; + } + + /** kill term null **/ + if ( (p = strpbrk( line, "\n\r" )) ) { + *p = '\0'; + } + + return line; +} + + + +/* + * wg_split: split a line into segments, using whitespace-sequences as separators. + * + * ARGUMENTS: + * - result: + * + * After the split, this array contains pointers to the start of each + * detected segment. Must be preallocated and at least as large as + * maxsegments. The pointers point into the dest buffer. + * + * - dest: + * + * String into which result points as an index. Must be preallocated, and + * at least as big as src. You can use src as dest, but in that case src + * is overwritten! + * + * - src: + * + * The string to split. Sequences of whitespace are treated as separators, unless + * escaped. There are two ways to escape: by using single quotes (anything + * between single quotes is treated as one segment), or by using a backslash + * to escape the next character. The backslash escape works inside quotation + * as well. + * + * Example: + * + * "It\'s very\ easy 'to use WiseGuys\' wg_split()' function" is split into: + * + * "It's" + * "very easy" + * "to use WiseGuys' wg_split()" + * "function" + * + * - maxsegments: + * + * The maximum number of segments. If the splitter runs out of segments, + * the remainder of the string is stored in the last segment. + * + * RETURN VALUE: + * The number of segments found. + */ +unsigned int wg_split( char **result, char *dest, char *src, int maxsegments ) +{ + char *p = src; + char *w = dest; + int cnt = 0; + int state=0; + + if ( maxsegments == 0 ) { + return 0; + } + + maxsegments--; + + while (cnt < maxsegments && *p) { + + switch (state) { + case 0: + /*** Skip spaces ***/ + while ( isspace((int) *p) ) { + p++; + } + state = 1; + + case 1: + /*** Start segment ***/ + result[cnt] = w; + cnt++; + state = 2; + + case 2: + /*** Unquoted segment ***/ + while (*p) { + if ( isspace((int) *p) ) { + *w++ = '\0'; + p++; + state = 0; + break; + } + else if ( *p == '\'' ) { + /*** Start quotation ***/ + p++; + state = 3; + break; + } + else if ( *p == '\\' && p[1] ) { + /*** Escape ***/ + p++; + *w++ = *p++; + } + else { + *w++ = *p++; + } + } + break; + + case 3: + /*** Inside quotes ***/ + while (*p) { + if (*p == '\'') { + p++; + break; + } + else if ( *p == '\\' && p[1] ) { + /*** Escape ***/ + p++; + *w++ = *p++; + } + else { + *w++ = *p++; + } + } + state = 2; + break; + + } + } + + if ( !*p ) { + *w = '\0'; + return cnt; + } + + /*** We ran out of segments; copy the remainder of the string into last segment ***/ + result[cnt++] = w; + while (*p) { + *w++ = *p++; + } + *w = '\0'; + return cnt; +} + + +extern void wg_timerstart(wgtimer_t *t) +{ + gettimeofday( &(t->start), NULL ); +} + + + +extern uint4 wg_timerstop(wgtimer_t *t) +{ + uint4 result; + gettimeofday( &(t->stop), NULL ); + result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 + + (t->stop.tv_usec - t->start.tv_usec); + + t->start.tv_sec = t->stop.tv_sec; + t->start.tv_usec = t->stop.tv_usec; + + return result; +} + + +/** + * wg_strgmov -- a guarded strcpy() variation + * + * copies src to dest (including terminating zero), and returns + * pointer to position of terminating zero in dest. The function is + * guaranteed not to write past destlimit. If the copy couldn't be + * finished, the function returns NULL after restoring the first + * character in dest for your convenience (since this is usually a zero). + */ +char *wg_strgmov( char *dest, const char *src, const char *destlimit ) +{ + char tmp, *w; + + if ( !dest || dest >= destlimit ) { + return NULL; + } + + tmp = *dest; + w = dest; + + while ( *src ) { + + *w++ = *src++; + + if ( w == destlimit ) { + /*** restore old situation ***/ + *dest = tmp; + return NULL; + } + } + + *w = '\0'; + return w; + +} + +/* + * wg_trim() -- remove whitespace surrounding a string. + * + * Example: " bla bla bla " becomes "bla bla bla" after trimming. + * + * ARGUMENTS + * - dest : After the trim, this will point to the trimmed string. + * Must be preallocated and at least as big as src. You can + * use src as dest. + * - src : Points to the string to be trimmed. + * + * RETURNS: + * dest + */ +char *wg_trim( char *dest, const char *src ) +{ + char *lastnonspace = &dest[-1]; + const char *p = src; + char *w = dest; + + while ( isspace((int)*p) ) { + p++; + } + while (*p) { + if ( !isspace((int)*p) ) { + lastnonspace = w; + } + *w++ = *p++; + } + lastnonspace[1] = '\0'; + + return dest; +} + diff --git a/src/common.h b/src/common.h new file mode 100644 index 0000000..5dc4656 --- /dev/null +++ b/src/common.h @@ -0,0 +1,83 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ +/** + * common.h -- a mixed bag of helper functions + * + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <stdio.h> +#include <inttypes.h> +#include <sys/time.h> +#include <time.h> + +#define WGMIN(x,y) ((x)<=(y)?(x):(y)) +#define WGMAX(x,y) ((x)<=(y)?(y):(x)) +#define __STR__(x) #x +#define WGSTR(x) __STR__(x) + +typedef uint32_t uint4; +typedef uint16_t uint2; +typedef uint8_t uchar; + +typedef int32_t sint4; +typedef int16_t sint2; +typedef int8_t schar; + +typedef int8_t boole; + + +typedef struct wgtimer_s { + struct timeval start; + struct timeval stop; +} wgtimer_t; + + +extern void *wg_malloc( size_t size ); +extern void *wg_calloc( size_t nmemb, size_t size ); +extern void *wg_zalloc( size_t size ); +extern char* wg_strdup( const char *s ); +extern void* wg_realloc( void *ptr, size_t size ) ; +extern void wg_free( void *mem ); + +extern char *wg_getline( char *line, int size, FILE *fp ); + +extern void wg_timerstart(wgtimer_t *t); +extern uint4 wg_timerstop(wgtimer_t *t); + +extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments ); +extern char *wg_strgmov( char *dest, const char *src, const char *destlimit ); +extern char *wg_trim( char *dest, const char *src ); + + +#endif + diff --git a/src/constants.h b/src/constants.h new file mode 100644 index 0000000..0bdae6c --- /dev/null +++ b/src/constants.h @@ -0,0 +1,82 @@ +#ifndef _CONSTANTS_H_ +#define _CONSTANTS_H_ + +/* + * constants.h -- some constants used throughout the code. Not pretty, + * but certainly convenient. + * + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <limits.h> + + +#define VERSION 2 +#define SUBVERSION 0 +#define DESCRIPTION "out of place" + +/* Reported matches are those fingerprints with a score less than best + * score * THRESHOLDVALUE (i.e. a THRESHOLDVALUE of 1.03 means matches + * must score within 3% from the best score.) + */ +#define THRESHOLDVALUE 1.03 + +/* If more than MAXCANDIDATES matches are found, the classifier reports + * unknown, because the input is obviously confusing. + */ +#define MAXCANDIDATES 5 + +/* The size of the buffer used to report the classification. + */ +#define MAXOUTPUTSIZE 1024 + +/* Maximum number of n-grams in a fingerprint */ +#define MAXNGRAMS 400 + +/* Maximum size of an n-gram? */ +#define MAXNGRAMSIZE 5 + +/* Which characters are not acceptable in n-grams? */ +#define INVALID(c) (isspace(c) || isdigit(c)) + +/* Minimum size (in characters) for accepting a document */ +#define MINDOCSIZE 25 + +/* Maximum penalty for missing an n-gram in fingerprint */ +#define MAXOUTOFPLACE 400 + +/* Size of hash table is 2^TABLEPOW. */ +#define TABLEPOW 13 + +#define MAXSCORE INT_MAX + +#endif diff --git a/src/createfp.c b/src/createfp.c new file mode 100644 index 0000000..1e3e0cd --- /dev/null +++ b/src/createfp.c @@ -0,0 +1,84 @@ +/* + * createfprint.c - can be used to create a fingerprint of a document. + * + * Copyright (c) 2003, WiseGuys Internet B.V. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "fingerprint.h" +#include "common.h" + +#define BLOCKSIZE 4096 + +char *myread(FILE *fp) +{ + char *buf; + size_t size = 0; + size_t maxsize = BLOCKSIZE*2; + + buf = (char *)wg_malloc( maxsize ); + do { + size_t read = fread( buf+size, 1, BLOCKSIZE, fp ); + size += read; + if ( size + BLOCKSIZE > maxsize ) { + maxsize *= 2; + buf = (char *)wg_realloc( buf, maxsize ); + } + + } while (!feof(stdin)); + + buf[size] = '\0'; + buf = (char *)wg_realloc( buf, size+1 ); + + return buf; +} + + +int main( int argc, char **argv ) +{ + void *h; + char *result; + wgtimer_t tm; + int i; + char *buf; + + buf = myread(stdin); + + h = fp_Init(NULL); + if ( fp_Create( h, buf, strlen(buf), 400 ) == 0 ) { + fprintf(stderr, "There was an error creating the fingerprint\n"); + exit(-1); + } + fp_Print(h,stdout); + fp_Done(h); + wg_free(buf); + + return 0; +} diff --git a/src/fingerprint.c b/src/fingerprint.c new file mode 100644 index 0000000..0d14558 --- /dev/null +++ b/src/fingerprint.c @@ -0,0 +1,704 @@ +/** + * fingerprint.c -- Routines for creating an n-gram fingerprint of a + * buffer. + * + * Copyright (c) 2003, WiseGuys Internet B.V. + * All rights reserved. + * + * THE BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * DESCRIPTION + * + * A fingerprint is a list of most common n-grams, ordered by + * frequency. (Note that we can use other strings than n-grams, for + * instance entire words.) + * + * HOW DOES IT WORK? + * + * - Buffer is sliced up into n-grams + * - N-grams are inserted into a hash table that records their frequency + * - The table entries are filtered through a N-sized heap to + * get the N most frequent n-grams. + * + * The reason why we go through the trouble of doing a partial + * (heap)sort is that a full quicksort behaves horribly on the data: + * most n-grams have a very low count, resulting in a data set in + * nearly-sorted order. This causes quicksort to behave very badly. + * Heapsort, on the other hand, behaves handsomely: worst case is + * Mlog(N) for M n-grams filtered through a N-sized heap. + * + * REVISION HISTORY + * + * Mar 28, 2003 frank@wise-guys.nl -- created + * + * TODO: + * - put table/heap datastructure in a separate file. + * */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> + +#include "common.h" +#include "wg_mempool.h" +#include "constants.h" + + +#define TABLESIZE (1<<TABLEPOW) +#define TABLEMASK ((TABLESIZE)-1) + +typedef struct { + + sint2 rank; + char str[MAXNGRAMSIZE+1]; + +} ngram_t; + +typedef struct fp_s { + + const char *name; + ngram_t *fprint; + uint4 size; + +} fp_t; + +typedef struct entry_s { + char str[MAXNGRAMSIZE+1]; + unsigned int cnt; + struct entry_s *next; +} entry_t; + +typedef struct table_s { + void *pool; + entry_t **table; + entry_t *heap; + + struct table_s *next; + + uint4 heapsize; + uint4 size; +} table_t; + + + +/* + * fast and furious little hash function + * + * (Note that we could use some kind of rolling checksum, and update it + * during n-gram construction) + */ +static uint4 simplehash( const char *p, int len ) +{ + sint4 h = len * 13; + while (*p) { + h = (h<<5)-h + *p++; + } + return (uint4)h; +} + + +/* checks if n-gram lex is a prefix of key and of length len */ +inline int issame( char *lex, char *key, int len ) +{ + int i; + for (i=0; i<len; i++) { + if ( key[i] != lex[i] ) { + return 0; + } + } + if ( lex[i] != 0 ) { + return 0; + } + return 1; +} + + +/* increases frequency of ngram(p,len) */ +static inline int increasefreq( table_t *t, char *p, int len ) +{ + uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; + + while ( entry ) { + if ( issame( entry->str, p, len ) ) { + /*** Found it! ***/ + entry->cnt++; + return 1; + } + else { + entry = entry->next; + } + } + + /*** Not found, so create ***/ + entry = wgmempool_alloc( t->pool, sizeof(entry_t) ); + strcpy( entry->str, p ); + entry->cnt = 1; + + entry->next = t->table[hash]; + t->table[hash] = entry; + + return 1; +} + + +/* looks up ngram(p,len) */ +static entry_t *findfreq( table_t *t, char *p, int len ) +{ + uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; + + while ( entry ) { + if ( issame( entry->str, p, len ) ) { + return entry; + } + else { + entry = entry->next; + } + } + + return NULL; +} + +static void dumptable(table_t *t) +{ + int i; + for (i=0; i<TABLESIZE; i++) { + + entry_t *p = t->table[i]; + + while (p) { + + printf("%5u %s\n", p->cnt, p->str); + p = p->next; + } + + } +} + + +#define GREATER(x,y) ((x).cnt > (y).cnt) +#define LESS(x,y) ((x).cnt < (y).cnt) + +inline static void siftup( table_t *t, unsigned int child ) +{ + entry_t *heap = t->heap; + unsigned int parent = (child-1) >> 1; + entry_t tmp; + + while ( child > 0 ) { + if ( GREATER(heap[parent],heap[child]) ) { + memcpy( &tmp, &heap[parent], sizeof(entry_t) ); + memcpy( &heap[parent], &heap[child], sizeof(entry_t) ); + memcpy( &heap[child], &tmp, sizeof(entry_t) ); + } + else { + return; + } + + child = parent; + parent = (child-1) >> 1; + } +} + + +inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) +{ + entry_t *heap = t->heap; + unsigned int child = parent*2 + 1; + entry_t tmp; + + while ( child < heapsize ) { + if ( child+1 < heapsize && GREATER(heap[child], heap[child+1]) ) { + child++; + } + if ( GREATER(heap[parent], heap[child] ) ) { + memcpy( &tmp, &heap[parent], sizeof(entry_t) ); + memcpy( &heap[parent], &heap[child], sizeof(entry_t) ); + memcpy( &heap[child], &tmp, sizeof(entry_t) ); + } + else { + return; + } + parent = child; + child = (parent*2)+1; + } +} + + +static int heapinsert( table_t *t, entry_t *item ) +{ + entry_t *heap = t->heap; + + /*** Still room for an entry? ***/ + if (t->size < t->heapsize) { + memcpy( &(heap[t->size]), item, sizeof(entry_t)); + siftup( t, t->size ); + t->size++; + return 0; + } + + /*** Worse than the worst performer? ***/ + if ( LESS(*item, heap[0]) ) { + return 0; + } + + /*** Insert into heap and reheap ***/ + memcpy( &(heap[0]), item, sizeof(entry_t)); + siftdown( t, t->size, 0 ); + return 0; +} + + +extern int heapextract( table_t *t, entry_t *item ) +{ + entry_t *p; + + if (t->size == 0 ) { + return 0; + } + + p = &(t->heap[0]); + + memcpy( item, p, sizeof( entry_t )); + memcpy( &(t->heap[0]), &(t->heap[t->size-1]), sizeof(entry_t) ); + + siftdown(t,t->size, 0); + t->size--; + + return 1; +} + + +/*** Makes a heap of all table entries ***/ +static int table2heap(table_t *t) +{ + int i; + uint4 cnt = 0; + uint4 bufsize = 2048; + + /*** Fill result heap ***/ + for (i=0; i<TABLESIZE; i++) { + entry_t *p = t->table[i]; + while (p) { + heapinsert(t, p); + p = p->next; + } + } + return 1; +} + + +static table_t *inittable(uint4 maxngrams) +{ + table_t *result = (table_t *)wg_zalloc( sizeof(table_t) ); + result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE ); + result->pool = wgmempool_Init( 10000, 10 ); + + result->heap = (entry_t *)wg_malloc( sizeof(entry_t) * maxngrams ); + result->heapsize = maxngrams; + result->size = 0; + + return result; +} + +static void tabledone( table_t *t ) +{ + if (!t) { + return; + } + wgmempool_Done(t->pool); + wg_free(t->table); + wg_free(t->heap); + wg_free(t); +} + + +extern void *fp_Init(const char *name) +{ + fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) ); + + if ( name ) { + h->name = wg_strdup(name); + } + + return (void *)h; +} + + +extern void fp_Done( void *handle ) +{ + fp_t *h = (fp_t *)handle; + + if ( h->name ) { + wg_free( (void *)h->name ); + } + if ( h->fprint ) { + wg_free( h->fprint ); + } + + wg_free( h ); +} + +extern const char *fp_Name( void *handle ) +{ + fp_t *h = (fp_t *)handle; + return h->name; +} + +/** + * Function that prepares buffer for n-grammification: + * runs of invalid characters are collapsed to a single + * underscore. + * + * Function is implemented as a finite state machine. + */ +static char *prepbuffer( const char *src, size_t bufsize ) +{ + const char *p = src; + char *dest = (char *)wg_malloc( bufsize + 3 ); + char *w = dest; + char *wlimit = dest + bufsize + 1; + + START: + if ( INVALID(*p) ) { + goto SPACE; + } + else if ( *p == '\0' ) { + goto END; + } + + *w++ = '_'; + if ( w == wlimit ) { + goto STOP; + } + + goto WORD; + + + SPACE: + /*** Inside string of invalid characters ***/ + p++; + if ( INVALID(*p) ) { + goto SPACE; + } + else if ( *p == '\0' ) { + goto END; + } + + *w++ = '_'; + if ( w == wlimit ) { + goto STOP; + } + + goto WORD; + + WORD: + /*** Inside string of valid characters ***/ + *w++ = *p++; + if ( w == wlimit ) { + goto END; + } + else if ( INVALID(*p) ) { + goto SPACE; + } + else if ( *p == '\0' ) { + goto STOP; + } + goto WORD; + + END: + *w++ = '_'; + + STOP: + *w++ = '\0'; + + /*** Docs that are too small for a fingerprint, are refused ***/ + if ( w - dest < MINDOCSIZE ) { + wg_free(dest); + return NULL; + } + + return dest; +} + + +static void createngramtable( table_t *t, const char *buf ) +{ + char n[MAXNGRAMSIZE+1]; + const char *p = buf; + int i; + + /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/ + for (;;p++) { + + const char *q = p; + char *m = n; + + /*** First char may be an underscore ***/ + *m++ = *q++; + *m = '\0'; + + increasefreq( t, n, 1 ); + + if ( *q == '\0' ) { + return; + } + + /*** Let the compiler unroll this ***/ + for ( i=2; i<=MAXNGRAMSIZE; i++) { + + *m++ = *q; + *m = '\0'; + + increasefreq( t, n, i ); + + if ( *q == '_' ) break; + q++; + if ( *q == '\0' ) { + return; + } + } + } + return; +} + + +static int mystrcmp(const char *a, const char *b) +{ + int i; + while ( *a && *a == *b ) { + a++; + b++; + } + return (*a - *b); +} + + +static int ngramcmp_str(const void *a, const void *b) +{ + ngram_t *x = (ngram_t *)a; + ngram_t *y = (ngram_t *)b; + + return mystrcmp( x->str, y->str ); +} + +static int ngramcmp_rank(const void *a, const void *b) +{ + ngram_t *x = (ngram_t *)a; + ngram_t *y = (ngram_t *)b; + + return x->rank - y->rank; +} + +/** + * Create a fingerprint: + * - record the frequency of each unique n-gram in a hash table + * - take the most frequent n-grams + * - sort them alphabetically, recording their relative rank + */ +extern int fp_Create( void *handle, const char *buffer, uint4 bufsize, uint4 maxngrams ) +{ + sint4 i = 0; + fp_t *h = NULL; + table_t *t = NULL; + char *tmp = NULL; + + if ( bufsize < MINDOCSIZE ) { + return 0; + } + + /*** Throw out all invalid chars ***/ + tmp = prepbuffer( buffer, bufsize ); + if ( tmp == NULL ) { + return 0; + } + + h = (fp_t*)handle; + t = inittable(maxngrams); + + /*** Create a hash table containing n-gram counts ***/ + createngramtable(t, tmp); + + /*** Take the top N n-grams and add them to the profile ***/ + table2heap(t); + maxngrams = WGMIN( maxngrams, t->size ); + + h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams ); + h->size = maxngrams; + + /*** Pull n-grams out of heap (backwards) ***/ + for (i=maxngrams-1; i>=0; i--) { + + entry_t tmp2; + + heapextract(t, &tmp2); + + /*** the string and its rank is all we need ***/ + strcpy( h->fprint[i].str, tmp2.str ); + h->fprint[i].rank = i; + } + + tabledone(t); + wg_free(tmp); + + /*** Sort n-grams alphabetically, for easy comparison ***/ + qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); + return 1; +} + +extern void fp_Debug( void *handle ) +{ + fp_t *h = (fp_t *)handle; + uint4 i; + printf("------ %s --------\n", h->name ); + for (i=0; i<h->size; i++) { + printf("%3u: '%s' [%u]\n", i, h->fprint[i].str, h->fprint[i].rank); + } + + +} + +extern int fp_Read( void *handle, const char *fname, int maxngrams ) +{ + fp_t *h = (fp_t *)handle; + FILE *fp; + char line[1024]; + int cnt = 0; + + fp = fopen( fname, "r" ); + if (!fp) { +#ifdef VERBOSE + fprintf( stderr, "Failed to open fingerprint file '%s'\n", fname); +#endif + return 0; + } + + h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t)); + + while (cnt < maxngrams && wg_getline(line,1024,fp)) { + + char *p; + + wg_trim(line, line); + + p = strpbrk( line, " \t" ); + if ( p ) { + *p = '\0'; + } + + if ( strlen(line) > MAXNGRAMSIZE ) { + continue; + } + + strcpy( h->fprint[cnt].str, line ); + h->fprint[cnt].rank = cnt; + + cnt++; + } + + h->size = cnt; + + /*** Sort n-grams, for easy comparison later on ***/ + qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); + + fclose(fp); + + return 1; +} + + + +extern void fp_Print( void *handle, FILE *fp ) +{ + uint4 i; + fp_t *h = (fp_t *)handle; + ngram_t *tmp = wg_malloc( sizeof(ngram_t) * h->size ); + + /*** Make a temporary and sort it on rank ***/ + memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) ); + qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank ); + + for (i=0; i<h->size; i++) { + fprintf( fp, "%s\n", tmp[i].str ); + } + wg_free( tmp ); +} + + + +extern sint4 fp_Compare( void *cat, void *unknown, int cutoff ) +{ + fp_t *c = (fp_t *)cat; + fp_t *u = (fp_t *)unknown; + uint4 i = 0; + uint4 j = 0; + sint4 sum = 0; + + /*** Compare the profiles in mergesort fashion ***/ + while ( i < c->size && j < u->size ) { + + int cmp = mystrcmp( c->fprint[i].str, u->fprint[j].str ); + + if ( cmp < 0 ) { + i++; + } + else if ( cmp == 0 ) { + sum += abs( c->fprint[i].rank - u->fprint[j].rank ); + if ( sum > cutoff ) { + return MAXSCORE; + } + i++; + j++; + } + else { + sum += MAXOUTOFPLACE; + if ( sum > cutoff ) { + return MAXSCORE; + } + j++; + } + } + + /*** Process tail of unknown, if any ***/ + while ( j < u->size ) { + sum += MAXOUTOFPLACE; + if ( sum > cutoff ) { + return MAXSCORE; + } + j++; + } + + return sum; + +} + + diff --git a/src/fingerprint.h b/src/fingerprint.h new file mode 100644 index 0000000..de55fe9 --- /dev/null +++ b/src/fingerprint.h @@ -0,0 +1,47 @@ +#ifndef _FINGERPRINT_H_ +#define _FINGERPRINT_H_ +/* + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "common.h" + +extern void *fp_Init(const char *name); +extern void fp_Done( void *handle ); +extern int fp_Create( void *handle, const char *buffer, uint4 bufsize, uint4 maxngrams ); +extern int fp_Read( void *handle, const char *fname, int maxngrams ); +extern sint4 fp_Compare( void *cat, void *unknown, int cutoff ); +extern void fp_Show( void *handle ); +extern const char *fp_Name( void *handle ); +extern void fp_Print( void *handle, FILE *fp ); + +#endif diff --git a/src/testtextcat.c b/src/testtextcat.c new file mode 100644 index 0000000..3fecc94 --- /dev/null +++ b/src/testtextcat.c @@ -0,0 +1,98 @@ +/* + * testtextcat.c -- a simple commandline classifier. Feed it input on + * standard in and it will feed you a classification on standard out. + * + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "textcat.h" +#include "common.h" + +#define BLOCKSIZE 4096 + +char *myread(FILE *fp) +{ + char *buf; + size_t size = 0; + size_t maxsize = BLOCKSIZE*2; + + buf = (char *)wg_malloc( maxsize ); + do { + size_t read = fread( buf+size, 1, BLOCKSIZE, fp ); + size += read; + if ( size + BLOCKSIZE > maxsize ) { + maxsize *= 2; + buf = (char *)wg_realloc( buf, maxsize ); + } + + } while (!feof(stdin)); + + buf[size] = '\0'; + buf = (char *)wg_realloc( buf, size+1 ); + + return buf; +} + + +int main( int argc, char **argv ) +{ + void *h; + char *result; + wgtimer_t tm; + int i; + char *buf; + + printf("%s\n", textcat_Version()); + + h = textcat_Init( argc>1?argv[1]:"conf.txt" ); + if ( !h ) { + printf("Unable to init. Aborting.\n"); + exit(-1); + } + + buf = myread(stdin); + + wg_timerstart(&tm); + + /*** We only need a little text to determine the language ***/ + buf[1024] = '\0'; + result = textcat_Classify( h, buf, strlen(buf)+1 ); + printf("Result == %s\n", result); + + fprintf(stderr, "That took %u ms.\n", wg_timerstop(&tm)/1000); + + textcat_Done(h); + + wg_free(buf); + + return 0; +} diff --git a/src/textcat.c b/src/textcat.c new file mode 100644 index 0000000..b64734c --- /dev/null +++ b/src/textcat.c @@ -0,0 +1,237 @@ +/** + * textcat.c -- routines for categorizing text + * + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * DESCRIPTION + * + * These routines use the N-gram fingerprinting technique as described + * in Cavnar and Trenkle, (1994.), N-Gram-Based Text Categorization. + * (cf. http://www.nonlineardynamics.com/trenkle/) + * + * REVISION HISTORY + * + * Mar 27, 2003 frank@wise-guys.nl -- created + * + * IMPROVEMENTS: + * - If two n-grams have the same frequency count, choose the shortest + * - Use a better similarity measure (the article suggests Wilcoxon rank test) + * - The profiles are matched one by one, which results in redundant lookups. + * - Make the thingy reentrant as well as thread-safe. (Reentrancy is abandoned + * by the use of the output buffer in textcat_t.) + */ +#include <stdlib.h> +#include <string.h> + +#include "common.h" +#include "fingerprint.h" +#include "textcat.h" +#include "constants.h" + + +typedef struct { + + void **fprint; + uint4 size; + uint4 maxsize; + + char output[MAXOUTPUTSIZE]; + +} textcat_t; + + +typedef struct { + int score; + const char *name; +} candidate_t; + + +static int cmpcandidates(const void *a, const void *b) +{ + candidate_t *x = (candidate_t *)a; + candidate_t *y = (candidate_t *)b; + + if ( x->score < y->score ) { + return -1; + } + if ( x->score > y->score ) { + return 1; + } + return 0; +} + + +extern void textcat_Done( void *handle ) +{ + textcat_t *h = (textcat_t *)handle; + uint4 i; + + for (i=0; i<h->size; i++) { + fp_Done( h->fprint[i] ); + } + wg_free( h->fprint ); + wg_free( h ); + +} + +extern void *textcat_Init( const char *conffile ) +{ + textcat_t *h; + char line[1024]; + FILE *fp; + + fp = fopen( conffile, "r" ); + if ( !fp ) { +#ifdef VERBOSE + fprintf( stderr, "Failed to open config file '%s'\n", conffile); +#endif + return NULL; + } + + h = (textcat_t *)wg_malloc(sizeof(textcat_t)); + h->size = 0; + h->maxsize = 16; + h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize ); + + while ( wg_getline( line, 1024, fp ) ) { + char *p; + char *segment[4]; + double startcnt = 0.0; + int res; + + /*** Skip comments ***/ + if (( p = strchr(line,'#') )) { + *p = '\0'; + } + if ((res = wg_split( segment, line, line, 4)) < 2 ) { + continue; + } + + /*** Ensure enough space ***/ + if ( h->size == h->maxsize ) { + h->maxsize *= 2; + h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); + } + + /*** Load data ***/ + if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) { + goto ERROR; + } + if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) { + textcat_Done(h); + goto ERROR; + } + h->size++; + } + + fclose(fp); + return h; + + ERROR: + fclose(fp); + return NULL; + +} + + +extern char *textcat_Classify( void *handle, const char *buffer, size_t size ) +{ + textcat_t *h = (textcat_t *)handle; + uint4 i, cnt = 0; + int minscore = MAXSCORE; + int min = -1; + int threshold = minscore; + char *result = h->output; + + candidate_t *candidates = (candidate_t *)alloca( sizeof(candidate_t) * h->size ); + + void *unknown; + + unknown = fp_Init(NULL); + if ( fp_Create( unknown, buffer, size, MAXNGRAMS ) == 0 ) { + /*** Too little information ***/ + result = _TEXTCAT_RESULT_SHORT; + goto READY; + } + + /*** Calculate the score for each category. ***/ + for (i=0; i<h->size; i++) { + int score = fp_Compare( h->fprint[i], unknown, threshold ); + candidates[i].score = score; + candidates[i].name = fp_Name( h->fprint[i] ); + if ( score < minscore ) { + minscore = score; + threshold = (int)( (double)score * THRESHOLDVALUE ); + } + } + + /*** Find the best performers ***/ + for (i=0; i<h->size; i++) { + if ( candidates[i].score < threshold ) { + + if ( ++cnt == MAXCANDIDATES+1 ) { + break; + } + + memcpy( &candidates[cnt-1], &candidates[i], sizeof(candidate_t) ); + + } + } + + /*** The verdict ***/ + if ( cnt == MAXCANDIDATES+1 ) { + result = _TEXTCAT_RESULT_UNKOWN; + } + else { + char *p = result; + char *plimit = result+MAXOUTPUTSIZE; + + qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates ); + + *p = '\0'; + for (i=0; i<cnt; i++) { + p = wg_strgmov( p, "[", plimit ); + p = wg_strgmov( p, candidates[i].name, plimit ); + p = wg_strgmov( p, "]", plimit ); + } + } + READY: + fp_Done(unknown); + return result; +} + + +extern char *textcat_Version() +{ + return "TextCat " WGSTR(VERSION) "." WGSTR(SUBVERSION) " (" DESCRIPTION ")"; +} diff --git a/src/textcat.h b/src/textcat.h new file mode 100644 index 0000000..746e131 --- /dev/null +++ b/src/textcat.h @@ -0,0 +1,80 @@ +#ifndef _TEXTCAT_H_ +#define _TEXTCAT_H_ +/* + * textcat.h -- routines for categorizing text + * + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include <stdio.h> + +#define _TEXTCAT_RESULT_UNKOWN "UNKNOWN" +#define _TEXTCAT_RESULT_SHORT "SHORT" + + +/** + * textcat_Init() - Initialize the text classifier. The textfile + * conffile should contain a list of fingerprint filenames and + * identification strings for the categories. The filenames should be + * reachable from the current working directory. The identification + * strings will are used in the classification output. + * + * Returns: handle on success, NULL on error. (At the moment, the + * only way errors can occur, is when the library cannot read the + * conffile, or one of the fingerprint files listed in it.) + */ +extern void *textcat_Init( const char *conffile ); + +/** + * textcat_Done() - Free up resources for handle + */ +extern void textcat_Done( void *handle ); + +/** + * textcat_Classify() - Give the most likely categories for buffer + * with length size. + * + * Returns: string containing a list of category id's, each one + * between square brackets, "UNKNOWN" when not recognized, "SHORT" if the + * document was too short to make a reliable assessment. + * + * Performace note: longer buffers take longer to process. However, + * for many uses it is not necessary to categorize the whole buffer. + * For language classification, a few hundred bytes will suffice. + */ +extern char *textcat_Classify( void *handle, const char *buffer, size_t size ); + +/** + * textcat_Version() - Returns a string describing the version of this classifier. + */ +extern char *textcat_Version(); +#endif diff --git a/src/wg_mempool.c b/src/wg_mempool.c new file mode 100644 index 0000000..d489ffd --- /dev/null +++ b/src/wg_mempool.c @@ -0,0 +1,231 @@ +/* + * wg_mempool.c -- Functions for managing a memory pool. + * + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <string.h> +#include "common.h" + +typedef struct memblock_s { + char *pool; + char *p; + char *pend; + struct memblock_s *next; +} memblock_t; + + +typedef struct mempool_s { + memblock_t *first; /* linked list of blocks in use */ + memblock_t *spare; /* linked list of spare blocks */ + size_t maxallocsize; + size_t blocksize; +} mempool_t; + + + +static void addblock( mempool_t *h ) +{ + memblock_t *block; + + /*** Spare blocks from previous round? ***/ + if ( h->spare ) { + /*** Use spare block ***/ + block = h->spare; + h->spare = block->next; + } + else { + /*** Make a new block ***/ + block = (memblock_t *)wg_malloc( sizeof(memblock_t) ); + block->pool = (char *)wg_malloc( h->blocksize ); + } + + block->p = block->pool; + block->pend = block->pool + h->blocksize - h->maxallocsize; + block->next = h->first; + h->first = block; +} + + +extern void *wgmempool_Init(size_t blocksize, size_t maxstrsize) +{ + mempool_t *result = (mempool_t *)wg_malloc( sizeof(mempool_t) ); + + result->first = NULL; + result->spare = NULL; + result->blocksize = blocksize; + result->maxallocsize = maxstrsize?(maxstrsize + 1):0; + addblock( result ); + + return (void *)result; +} + + +extern void wgmempool_Done( void *handle ) +{ + mempool_t *h = (mempool_t *)handle; + + memblock_t *p; + + /*** Active blocks ***/ + p = h->first; + while (p) { + memblock_t *next = p->next; + wg_free( p->pool ); + + memset( p, 0, sizeof(memblock_t)); /* for safety */ + wg_free( p ); + + p = next; + } + + /*** Spare blocks ***/ + p = h->spare; + while (p) { + memblock_t *next = p->next; + wg_free( p->pool ); + + memset( p, 0, sizeof(memblock_t)); /* for safety */ + wg_free( p ); + + p = next; + } + + memset( h, 0, sizeof(mempool_t)); /* for safety */ + wg_free(h); +} + +extern void wgmempool_Reset( void *handle ) +{ + mempool_t *h = (mempool_t *)handle; + memblock_t *p; + + if ( !h->first ) { + return; + } + + /*** Find last active block ***/ + p = h->first; + while (p->next) { + p = p->next; + } + + /*** Append spare list to it ***/ + p->next = h->spare; + h->spare = h->first; + h->first = NULL; + + /*** Start with a new block ***/ + addblock(h); +} + + +extern void *wgmempool_alloc( void *handle, size_t size ) +{ + void *result; + mempool_t *h = (mempool_t *)handle; + memblock_t *block = h->first; + + /*** Too little space left in block? ***/ + if ( block->p + size > block->pend + h->maxallocsize ) { + addblock(h); + block = h->first; + } + result = (void *)block->p; + block->p += size; + return result; +} + + + +extern char *wgmempool_strdup( void *handle, const char *str ) +{ + char *w, *result; + mempool_t *h = (mempool_t *)handle; + memblock_t *block = h->first; + + /*** Create extra room? ***/ + if ( h->maxallocsize ) { + if ( block->p >= block->pend ) { + addblock(h); + block = h->first; + } + } + else if ( block->p + strlen(str) + 1 >= block->pend ) { + addblock(h); + block = h->first; + } + + /*** Enough room, so copy string ***/ + result = w = block->p; + while (*str) { + *w++ = *str++; + } + *w++ = '\0'; + block->p = w; + return result; +} + + +extern char *wgmempool_getline( void *handle, size_t size, FILE *fp ) +{ + char *result, *p; + mempool_t *h = (mempool_t *)handle; + memblock_t *block = h->first; + + /*** Enough space? ***/ + if ( block->p + size > block->pend + h->maxallocsize ) { + addblock(h); + block = h->first; + } + + result = (char *)block->p; + fgets(result, size, fp) ; + + /** end of stream? **/ + if ( feof( fp ) ) { + return NULL; + } + + /** find end of line **/ + p = result; + while (*p && *p != '\n' && *p != '\r' ) { + p++; + } + *p++ = '\0'; + + block->p = p; + return result; +} + + diff --git a/src/wg_mempool.h b/src/wg_mempool.h new file mode 100644 index 0000000..73d40af --- /dev/null +++ b/src/wg_mempool.h @@ -0,0 +1,144 @@ +#ifndef _WGMEMPOOL_H_ +#define _WGMEMPOOL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * mempool.c -- functions for efficient, pooled memory allocation + * + * THE BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * DESCRIPTION + * + * These functions can be used to maintain a pool from which memory + * can be claimed without performing numerous allocs. + * + * A memory pool can increase time and space efficiency by factors + * in situations where: + * + * - you need numerous allocations of little chunks of memory, e.g. + * a huge list of keywords. + * - you need little or no deallocations: preferable just one big + * dealloc at the end. + * + * A memory pool is unfavorable in situations where you constantly + * alloc and free memory. + * + * NOTES + * + * - A memory pool consists of "blocks" of a fixed size, which are + * added as the need for extra memory grows. + * + * - Deallocation of the individual fragments is not possible. The + * pool can only be deallocated in its entirety. + * + */ + + +#include "common.h" + +/* + * wgmempool_Init() -- intialize a memory pool + * + * ARGUMENTS + * + * - blocksize : size of blocks of which pool consists + * - maxstrsize: + * - > 0 : the maximum size of a string that is copied + * with wgmempool_strdup(). Omits the necessity of boundschecking + * in wgmempool_strdup(), making is slighlty faster. + * - 0 : causes wgmempool_strdup() to do its own boundschecking, + * which makes it somewhat slower. + * + * RETURN VALUE + * + * mempool handler on success, NULL on error. + * + */ +extern void *wgmempool_Init(uint4 blocksize, size_t maxstrsize); + + +/* + * wgmempool_Done() -- deallocate memory pool in its entirety + */ +extern void wgmempool_Done( void *handle ); + + +/* + * wgmempool_Reset() -- resets a memory pool for reuse + * + * wgmempool_Reset() preserves already claimed memory for reuse, making + * it more time efficient than doing a wgmempool_Done() and wgmempool_Init(). + */ +extern void wgmempool_Reset( void *handle ); + + +/* + * wgmempool_alloc() -- Allocate size bytes of memory in mempool + * + * ARGUMENTS + * + * - size: number of bytes to claim + * + * RETURN VALUE + * + * - pointer to claimed memory on success + * - NULL on error + */ +extern void *wgmempool_alloc( void *handle, size_t size ); + + +/* + * wgmempool_strdup() -- Allocate and copy a string to mempool + * + * ARGUMENTS + * + * - str : null-terminated string. If maxstrsize > 0 during + * init, strlen(str) may not be greater than this value. + * + * RETURN VALUE + * + * - pointer to duplicated string on success + * - NULL on error + */ +extern char *wgmempool_strdup( void *handle, const char* str ); + + +#ifdef __cplusplus +} +#endif + + +#endif + |