summaryrefslogtreecommitdiff
path: root/src/textcat.c
diff options
context:
space:
mode:
authorFrank Scheelen <fscheelen@wise-guys.nl>2003-05-15 16:03:51 +0100
committerCaolán McNamara <caolanm@redhat.com>2011-09-19 14:12:01 +0100
commitf5af079ca969a2948b865c0342a22299de5afcde (patch)
tree790c49261fa5fc2cb1c5154af3526d4a6ccd99c7 /src/textcat.c
Released libtextcat 2.0 under BSD license
Diffstat (limited to 'src/textcat.c')
-rw-r--r--src/textcat.c237
1 files changed, 237 insertions, 0 deletions
diff --git a/src/textcat.c b/src/textcat.c
new file mode 100644
index 0000000..b64734c
--- /dev/null
+++ b/src/textcat.c
@@ -0,0 +1,237 @@
+/**
+ * textcat.c -- routines for categorizing text
+ *
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * DESCRIPTION
+ *
+ * These routines use the N-gram fingerprinting technique as described
+ * in Cavnar and Trenkle, (1994.), N-Gram-Based Text Categorization.
+ * (cf. http://www.nonlineardynamics.com/trenkle/)
+ *
+ * REVISION HISTORY
+ *
+ * Mar 27, 2003 frank@wise-guys.nl -- created
+ *
+ * IMPROVEMENTS:
+ * - If two n-grams have the same frequency count, choose the shortest
+ * - Use a better similarity measure (the article suggests Wilcoxon rank test)
+ * - The profiles are matched one by one, which results in redundant lookups.
+ * - Make the thingy reentrant as well as thread-safe. (Reentrancy is abandoned
+ * by the use of the output buffer in textcat_t.)
+ */
+#include <stdlib.h>
+#include <string.h>
+
+#include "common.h"
+#include "fingerprint.h"
+#include "textcat.h"
+#include "constants.h"
+
+
+typedef struct {
+
+ void **fprint;
+ uint4 size;
+ uint4 maxsize;
+
+ char output[MAXOUTPUTSIZE];
+
+} textcat_t;
+
+
+typedef struct {
+ int score;
+ const char *name;
+} candidate_t;
+
+
+static int cmpcandidates(const void *a, const void *b)
+{
+ candidate_t *x = (candidate_t *)a;
+ candidate_t *y = (candidate_t *)b;
+
+ if ( x->score < y->score ) {
+ return -1;
+ }
+ if ( x->score > y->score ) {
+ return 1;
+ }
+ return 0;
+}
+
+
+extern void textcat_Done( void *handle )
+{
+ textcat_t *h = (textcat_t *)handle;
+ uint4 i;
+
+ for (i=0; i<h->size; i++) {
+ fp_Done( h->fprint[i] );
+ }
+ wg_free( h->fprint );
+ wg_free( h );
+
+}
+
+extern void *textcat_Init( const char *conffile )
+{
+ textcat_t *h;
+ char line[1024];
+ FILE *fp;
+
+ fp = fopen( conffile, "r" );
+ if ( !fp ) {
+#ifdef VERBOSE
+ fprintf( stderr, "Failed to open config file '%s'\n", conffile);
+#endif
+ return NULL;
+ }
+
+ h = (textcat_t *)wg_malloc(sizeof(textcat_t));
+ h->size = 0;
+ h->maxsize = 16;
+ h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize );
+
+ while ( wg_getline( line, 1024, fp ) ) {
+ char *p;
+ char *segment[4];
+ double startcnt = 0.0;
+ int res;
+
+ /*** Skip comments ***/
+ if (( p = strchr(line,'#') )) {
+ *p = '\0';
+ }
+ if ((res = wg_split( segment, line, line, 4)) < 2 ) {
+ continue;
+ }
+
+ /*** Ensure enough space ***/
+ if ( h->size == h->maxsize ) {
+ h->maxsize *= 2;
+ h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
+ }
+
+ /*** Load data ***/
+ if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) {
+ goto ERROR;
+ }
+ if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) {
+ textcat_Done(h);
+ goto ERROR;
+ }
+ h->size++;
+ }
+
+ fclose(fp);
+ return h;
+
+ ERROR:
+ fclose(fp);
+ return NULL;
+
+}
+
+
+extern char *textcat_Classify( void *handle, const char *buffer, size_t size )
+{
+ textcat_t *h = (textcat_t *)handle;
+ uint4 i, cnt = 0;
+ int minscore = MAXSCORE;
+ int min = -1;
+ int threshold = minscore;
+ char *result = h->output;
+
+ candidate_t *candidates = (candidate_t *)alloca( sizeof(candidate_t) * h->size );
+
+ void *unknown;
+
+ unknown = fp_Init(NULL);
+ if ( fp_Create( unknown, buffer, size, MAXNGRAMS ) == 0 ) {
+ /*** Too little information ***/
+ result = _TEXTCAT_RESULT_SHORT;
+ goto READY;
+ }
+
+ /*** Calculate the score for each category. ***/
+ for (i=0; i<h->size; i++) {
+ int score = fp_Compare( h->fprint[i], unknown, threshold );
+ candidates[i].score = score;
+ candidates[i].name = fp_Name( h->fprint[i] );
+ if ( score < minscore ) {
+ minscore = score;
+ threshold = (int)( (double)score * THRESHOLDVALUE );
+ }
+ }
+
+ /*** Find the best performers ***/
+ for (i=0; i<h->size; i++) {
+ if ( candidates[i].score < threshold ) {
+
+ if ( ++cnt == MAXCANDIDATES+1 ) {
+ break;
+ }
+
+ memcpy( &candidates[cnt-1], &candidates[i], sizeof(candidate_t) );
+
+ }
+ }
+
+ /*** The verdict ***/
+ if ( cnt == MAXCANDIDATES+1 ) {
+ result = _TEXTCAT_RESULT_UNKOWN;
+ }
+ else {
+ char *p = result;
+ char *plimit = result+MAXOUTPUTSIZE;
+
+ qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates );
+
+ *p = '\0';
+ for (i=0; i<cnt; i++) {
+ p = wg_strgmov( p, "[", plimit );
+ p = wg_strgmov( p, candidates[i].name, plimit );
+ p = wg_strgmov( p, "]", plimit );
+ }
+ }
+ READY:
+ fp_Done(unknown);
+ return result;
+}
+
+
+extern char *textcat_Version()
+{
+ return "TextCat " WGSTR(VERSION) "." WGSTR(SUBVERSION) " (" DESCRIPTION ")";
+}