indent -bap -bli0 -i4 -l79 -ncs -npcs -npsl -fca -lc79 -fc1 -ts4 -nut

author: Giulio Paci <giuliopaci@gmail.com> 2012-01-09 12:41:14 +0000
committer: Caolán McNamara <caolanm@redhat.com> 2012-01-09 12:41:14 +0000
commit: c29914910dbb446b1ac99cb4a716ee483853d9b9 (patch)
tree: 1bab96a21d281857ece24b7a5a0316ec315cb98e
parent: 4561963bf5b7806e77404104c27e16929784eb51 (diff)
5 files changed, 245 insertions, 231 deletions
diff --git a/src/fingerprint.c b/src/fingerprint.c
index dce3bd9..a59ee30 100644
--- a/src/fingerprint.c
+++ b/src/fingerprint.c
@@ -438,10 +438,10 @@ static void createngramtable(table_t * t, const char *buf)
         char *m = n;
 
         /*** First char may be an underscore ***/
-        decay = utf8_charcopy(q, m); /* [modified] previously *q++ = *m++ */
+        decay = utf8_charcopy(q, m);    /* [modified] previously *q++ = *m++ */
 
-        q += decay;  /* [modified] */
-        m += decay; /* [modified] */
+        q += decay;             /* [modified] */
+        m += decay;             /* [modified] */
         *m = '\0';
 
         increasefreq(t, n, 1);
@@ -452,7 +452,7 @@ static void createngramtable(table_t * t, const char *buf)
         /*** Let the compiler unroll this ***/
         for (i = 2; i <= MAXNGRAMSYMBOL; i++)
         {
-            decay = utf8_charcopy(q, m); /* [modified] like above */
+            decay = utf8_charcopy(q, m);    /* [modified] like above */
             m += decay;
             *m = '\0';
 
@@ -465,9 +465,9 @@ static void createngramtable(table_t * t, const char *buf)
                 return;
         }
 
-        p = utf8_next_char(p); /* [modified] */
+        p = utf8_next_char(p);  /* [modified] */
     }
-	return;
+    return;
 }
 
 static int mystrcmp(const char *a, const char *b)
diff --git a/src/textcat.c b/src/textcat.c
index 8a28956..399b962 100644
--- a/src/textcat.c
+++ b/src/textcat.c
@@ -75,7 +75,7 @@ typedef struct
     uint4 maxsize;
 
     char output[MAXOUTPUTSIZE];
-	candidate_t *tmp_candidates;
+    candidate_t *tmp_candidates;
 
 } textcat_t;
 
@@ -97,9 +97,10 @@ extern void textcat_Done(void *handle)
     {
         fp_Done(h->fprint[i]);
     }
-	if(h->tmp_candidates != NULL) {
-		textcat_ReleaseClassifyFullOutput(h, h->tmp_candidates);
-	}
+    if (h->tmp_candidates != NULL)
+    {
+        textcat_ReleaseClassifyFullOutput(h, h->tmp_candidates);
+    }
     free(h->fprint);
     free(h->fprint_disable);
     free(h);
@@ -119,9 +120,9 @@ extern void *textcat_Init(const char *conffile)
 extern void *special_textcat_Init(const char *conffile, const char *prefix)
 {
     textcat_t *h;
-	char *finger_print_file_name;
-	size_t finger_print_file_name_size;
-	size_t prefix_size;
+    char *finger_print_file_name;
+    size_t finger_print_file_name_size;
+    size_t prefix_size;
     char line[1024];
     FILE *fp;
 
@@ -138,15 +139,17 @@ extern void *special_textcat_Init(const char *conffile, const char *prefix)
     h->size = 0;
     h->maxsize = 16;
     h->fprint = (void **)malloc(sizeof(void *) * h->maxsize);
-    h->fprint_disable = (unsigned char *)malloc(sizeof(unsigned char) * h->maxsize);
+    h->fprint_disable =
+        (unsigned char *)malloc(sizeof(unsigned char) * h->maxsize);
     /* added to store the state of languages */
-	h->tmp_candidates = NULL;
+    h->tmp_candidates = NULL;
 
-	prefix_size = strlen(prefix);
-	finger_print_file_name_size = prefix_size + 1;
-	finger_print_file_name = (char*)malloc( sizeof(char) * ( finger_print_file_name_size +1024 ) );
-	finger_print_file_name[0] = '\0';
-	strcat(finger_print_file_name, prefix);
+    prefix_size = strlen(prefix);
+    finger_print_file_name_size = prefix_size + 1;
+    finger_print_file_name =
+        (char *)malloc(sizeof(char) * (finger_print_file_name_size + 1024));
+    finger_print_file_name[0] = '\0';
+    strcat(finger_print_file_name, prefix);
 
     while (wg_getline(line, 1024, fp))
     {
@@ -172,8 +175,7 @@ extern void *special_textcat_Init(const char *conffile, const char *prefix)
                 (void **)realloc(h->fprint, sizeof(void *) * h->maxsize);
             h->fprint_disable =
                 (unsigned char *)realloc(h->fprint_disable,
-                                            sizeof(unsigned char) *
-                                            h->maxsize);
+                                         sizeof(unsigned char) * h->maxsize);
         }
 
         /*** Load data ***/
@@ -182,37 +184,40 @@ extern void *special_textcat_Init(const char *conffile, const char *prefix)
             goto BAILOUT;
         }
 
-		while( prefix_size + strlen(segment[0]) > finger_print_file_name_size )
-		{
-			char *tmp;
-			size_t tmp_size = finger_print_file_name_size * 2;
-			tmp = (char *)realloc(finger_print_file_name, sizeof(char)*(tmp_size+1) );
-			if( tmp == NULL )
-			{
-				free( finger_print_file_name );
-				finger_print_file_name_size = 0;
-				goto BAILOUT;
-			}
-			else
-			{
-				finger_print_file_name = tmp;
-				finger_print_file_name_size = tmp_size;
-			}
-		}
+        while (prefix_size + strlen(segment[0]) > finger_print_file_name_size)
+        {
+            char *tmp;
+            size_t tmp_size = finger_print_file_name_size * 2;
+            tmp =
+                (char *)realloc(finger_print_file_name,
+                                sizeof(char) * (tmp_size + 1));
+            if (tmp == NULL)
+            {
+                free(finger_print_file_name);
+                finger_print_file_name_size = 0;
+                goto BAILOUT;
+            }
+            else
+            {
+                finger_print_file_name = tmp;
+                finger_print_file_name_size = tmp_size;
+            }
+        }
         finger_print_file_name[prefix_size] = '\0';
         strcat(finger_print_file_name, segment[0]);
 
         if (fp_Read(h->fprint[h->size], finger_print_file_name, 400) == 0)
         {
-			textcat_Done(h);
+            textcat_Done(h);
             goto BAILOUT;
         }
         h->fprint_disable[h->size] = 0xF0;  /* 0xF0 is the code for enabled
-                                               languages, 0x0F is for disabled */
+                                               languages, 0x0F is for disabled 
+                                             */
         h->size++;
     }
 
-	free( finger_print_file_name );
+    free(finger_print_file_name);
 
     fclose(fp);
     return h;
@@ -223,79 +228,83 @@ extern void *special_textcat_Init(const char *conffile, const char *prefix)
 
 }
 
-extern candidate_t *textcat_GetClassifyFullOutput( void *handle )
+extern candidate_t *textcat_GetClassifyFullOutput(void *handle)
 {
-	textcat_t *h = (textcat_t *)handle;
-	return (candidate_t *) malloc( sizeof(candidate_t) * h->size );
+    textcat_t *h = (textcat_t *) handle;
+    return (candidate_t *) malloc(sizeof(candidate_t) * h->size);
 }
 
-extern void textcat_ReleaseClassifyFullOutput( void *handle, candidate_t *candidates )
+extern void textcat_ReleaseClassifyFullOutput(void *handle,
+                                              candidate_t * candidates)
 {
-	if(candidates != NULL) {
-		free(candidates);
-	}
+    if (candidates != NULL)
+    {
+        free(candidates);
+    }
 }
 
-extern char *textcat_Classify( void *handle, const char *buffer, size_t size )
+extern char *textcat_Classify(void *handle, const char *buffer, size_t size)
 {
     textcat_t *h = (textcat_t *) handle;
     char *result = h->output;
-	uint4 i, cnt;
+    uint4 i, cnt;
 
-	if( h->tmp_candidates == NULL)
-	{
-		h->tmp_candidates = textcat_GetClassifyFullOutput( h );
-	}
+    if (h->tmp_candidates == NULL)
+    {
+        h->tmp_candidates = textcat_GetClassifyFullOutput(h);
+    }
 
-	cnt = textcat_ClassifyFull( h, buffer, size, h->tmp_candidates );
+    cnt = textcat_ClassifyFull(h, buffer, size, h->tmp_candidates);
 
-	switch(cnt){
-        case TEXTCAT_RESULT_UNKOWN:
-		result = _TEXTCAT_RESULT_UNKOWN;
-                break;
-        case TEXTCAT_RESULT_SHORT:
-		result = _TEXTCAT_RESULT_SHORT;
-                break;
-        default:
-	{
-        const char *plimit = result + MAXOUTPUTSIZE;
-		char *p = result;
-
-		*p = '\0';
-        for (i = 0; i < cnt; i++)
+    switch (cnt)
+    {
+    case TEXTCAT_RESULT_UNKOWN:
+        result = _TEXTCAT_RESULT_UNKOWN;
+        break;
+    case TEXTCAT_RESULT_SHORT:
+        result = _TEXTCAT_RESULT_SHORT;
+        break;
+    default:
         {
-            p = wg_strgmov(p, "[", plimit);
-            p = wg_strgmov(p, h->tmp_candidates[i].name, plimit);
-            p = wg_strgmov(p, "]", plimit);
-        }
-	}
+            const char *plimit = result + MAXOUTPUTSIZE;
+            char *p = result;
+
+            *p = '\0';
+            for (i = 0; i < cnt; i++)
+            {
+                p = wg_strgmov(p, "[", plimit);
+                p = wg_strgmov(p, h->tmp_candidates[i].name, plimit);
+                p = wg_strgmov(p, "]", plimit);
+            }
         }
+    }
 
     return result;
 }
 
 
-extern int textcat_ClassifyFull( void *handle, const char *buffer, size_t size, candidate_t *candidates )
+extern int textcat_ClassifyFull(void *handle, const char *buffer, size_t size,
+                                candidate_t * candidates)
 {
-	textcat_t *h = (textcat_t *)handle;
-	uint4 i, cnt = 0;
-	int minscore = MAXSCORE;
-	int threshold = minscore;
+    textcat_t *h = (textcat_t *) handle;
+    uint4 i, cnt = 0;
+    int minscore = MAXSCORE;
+    int threshold = minscore;
 
-	void *unknown;
+    void *unknown;
 
-	unknown = fp_Init(NULL);
+    unknown = fp_Init(NULL);
     if (fp_Create(unknown, buffer, size, MAXNGRAMS, MINDOCSIZE) == 0)
     {
-		/*** Too little information ***/
-		fp_Done(unknown);
-		return TEXTCAT_RESULT_SHORT ;
-	}
+        /*** Too little information ***/
+        fp_Done(unknown);
+        return TEXTCAT_RESULT_SHORT;
+    }
 
-	/*** Calculate the score for each category. ***/
+    /*** Calculate the score for each category. ***/
     for (i = 0; i < h->size; i++)
     {
-		int score;
+        int score;
         if (h->fprint_disable[i] & 0x0F)
         {                       /* if this language is disabled */
             score = MAXSCORE;
@@ -304,42 +313,42 @@ extern int textcat_ClassifyFull( void *handle, const char *buffer, size_t size,
         {
             score = fp_Compare(h->fprint[i], unknown, threshold);
             /* printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score); */
-		}
-		candidates[i].score = score;
+        }
+        candidates[i].score = score;
         candidates[i].name = fp_Name(h->fprint[i]);
         if (score < minscore)
         {
-			minscore = score;
+            minscore = score;
             threshold = (int)((double)score * THRESHOLDVALUE);
-		}
-	}
+        }
+    }
 
-	/*** Find the best performers ***/
+    /*** Find the best performers ***/
     for (i = 0, cnt = 0; i < h->size; i++)
     {
         if (candidates[i].score < threshold)
         {
             if (++cnt == MAXCANDIDATES + 1)
             {
-				break;
-			}
+                break;
+            }
 
             memcpy(&candidates[cnt - 1], &candidates[i], sizeof(candidate_t));
 
-		}
-	}
+        }
+    }
 
-	fp_Done(unknown);
-	/*** The verdict ***/
+    fp_Done(unknown);
+    /*** The verdict ***/
     if (cnt == MAXCANDIDATES + 1)
     {
-		return TEXTCAT_RESULT_UNKOWN;
-	}
+        return TEXTCAT_RESULT_UNKOWN;
+    }
     else
     {
         qsort(candidates, cnt, sizeof(candidate_t), cmpcandidates);
-		return cnt;
-	}
+        return cnt;
+    }
 }
 
 extern const char *textcat_Version(void)
diff --git a/src/textcat.h b/src/textcat.h
index 9b52063..a1e1770 100644
--- a/src/textcat.h
+++ b/src/textcat.h
@@ -48,90 +48,94 @@ extern "C"
 {
 #endif
 
-typedef struct {
-	int score;
-	const char *name;
-} candidate_t;
-
-
-/**
- * textcat_Init() - Initialize the text classifier. The textfile
- * conffile should contain a list of fingerprint filenames and
- * identification strings for the categories.  The filenames should be
- * reachable from the current working directory. The identification
- * strings will are used in the classification output.
- * 
- * Returns: handle on success, NULL on error. (At the moment, the
- * only way errors can occur, is when the library cannot read the
- * conffile, or one of the fingerprint files listed in it.)
- *
- * Replace older function (and has exacly the same behaviour)
- * see below
- */
+    typedef struct
+    {
+        int score;
+        const char *name;
+    } candidate_t;
+
+    /**
+     * textcat_Init() - Initialize the text classifier. The textfile
+     * conffile should contain a list of fingerprint filenames and
+     * identification strings for the categories.  The filenames should be
+     * reachable from the current working directory. The identification
+     * strings will be used in the classification output.
+     * 
+     * Returns: handle on success, NULL on error. (At the moment, the
+     * only way errors can occur, is when the library cannot read the
+     * conffile, or one of the fingerprint files listed in it.)
+     *
+     * Replaces the older function (and has exactly the same behaviour)
+     * see below
+     */
     extern void *textcat_Init(const char *conffile);
 
-/**
- * special_textcat_Init() - Initialize the text classifier. This function
- * prepare the classifier as needed by OpenOffice.org. The textfile
- * conffile should contain a list of utf8 fingerprint filenames and
- * identification strings for the categories.prefix will be
- * prepended to the filenames to locate the files. The identification
- * strings will be used in the classification output.
- * 
- * Returns: handle on success, NULL on error. (At the moment, the
- * only way errors can occur, is when the library cannot read the
- * conffile, or one of the fingerprint files listed in it.)
- */
+    /**
+     * special_textcat_Init() - Initialize the text classifier. This function
+     * prepares the classifier as needed by OpenOffice.org. The textfile
+     * conffile should contain a list of utf8 fingerprint filenames and
+     * identification strings for the categories. prefix will be
+     * prepended to the filenames to locate the files. The identification
+     * strings will be used in the classification output.
+     * 
+     * Returns: handle on success, NULL on error. (At the moment, the
+     * only way errors can occur, is when the library cannot read the
+     * conffile, or one of the fingerprint files listed in it.)
+     */
     extern void *special_textcat_Init(const char *conffile,
                                       const char *prefix);
 
-/**
- * textcat_Done() - Free up resources for handle
- */
+    /**
+     * textcat_Done() - Free up resources for handle
+     */
     extern void textcat_Done(void *handle);
 
-/**
- * textcat_Classify() - Give the most likely categories for buffer
- * with length size.
- *
- * Returns: string containing a list of category id's, each one
- * between square brackets, "UNKNOWN" when not recognized, "SHORT" if the
- * document was too short to make a reliable assessment.
- *
- * Performace note: longer buffers take longer to process. However,
- * for many uses it is not necessary to categorize the whole buffer.
- * For language classification, a few hundred bytes will suffice.  
- */
+    /**
+     * textcat_Classify() - Give the most likely categories for buffer
+     * with length size.
+     *
+     * Returns: string containing a list of category id's, each one
+     * between square brackets, "UNKNOWN" when not recognized, "SHORT" if the
+     * document was too short to make a reliable assessment.
+     *
+     * Performance note: longer buffers take longer to process. However,
+     * for many uses it is not necessary to categorize the whole buffer.
+     * For language classification, a few hundred bytes will suffice.  
+     */
     extern char *textcat_Classify(void *handle, const char *buffer,
                                   size_t size);
 
 
-/**
- * textcat_GetClassifyFullOutput() - Create a classifier output handler
- */
-extern candidate_t *textcat_GetClassifyFullOutput( void *handle );
-
-/**
- * textcat_ReleaseClassifyFullOutput() - Free up resources for the classifier output handler
- */
-extern void textcat_ReleaseClassifyFullOutput( void *handle, candidate_t *candidates );
-
-/**
- * textcat_ClassifyFull() - Give the most likely categories for buffer
- * with length size.
- *
- * Returns: the numbers of results.
- *
- * Performace note: longer buffers take longer to process. However,
- * for many uses it is not necessary to categorize the whole buffer.
- * For language classification, a few hundred bytes will suffice.
- */
-extern int textcat_ClassifyFull( void *handle, const char *buffer, size_t size, candidate_t *candidates );
-
-
-/**
- * textcat_Version() - Returns a string describing the version of this classifier.
- */
+    /**
+     * textcat_GetClassifyFullOutput() - Create a classifier output handler
+     */
+    extern candidate_t *textcat_GetClassifyFullOutput(void *handle);
+
+    /**
+     * textcat_ReleaseClassifyFullOutput() - Free up resources for the
+     * classifier output handler
+     */
+    extern void textcat_ReleaseClassifyFullOutput(void *handle,
+                                                  candidate_t * candidates);
+
+    /**
+     * textcat_ClassifyFull() - Give the most likely categories for buffer
+     * with length size.
+     *
+     * Returns: the numbers of results.
+     *
+     * Performance note: longer buffers take longer to process. However,
+     * for many uses it is not necessary to categorize the whole buffer.
+     * For language classification, a few hundred bytes will suffice.
+     */
+    extern int textcat_ClassifyFull(void *handle, const char *buffer,
+                                    size_t size, candidate_t * candidates);
+
+
+    /**
+     * textcat_Version() - Returns a string describing the version of this
+     * classifier.
+     */
     extern const char *textcat_Version(void);
 
 #ifdef __cplusplus
diff --git a/src/utf8misc.c b/src/utf8misc.c
index e0b151a..046d96b 100644
--- a/src/utf8misc.c
+++ b/src/utf8misc.c
@@ -53,22 +53,25 @@
 #define WEIGHT_MASK 0x00
 #endif
 
-const char* utf8_next_char(const char *str)
+const char *utf8_next_char(const char *str)
 {
     if (*str & ESCAPE_MASK)
     {
-        /* if the first bit of the current char is 1
-         * then *str is an escape character
+        /* 
+         * if the first bit of the current char is 1 then *str is an escape
+         * character
          */
         char escape_char = ((*str & WEIGHT_MASK) << 1);
 
-        /* and we use it to count (by bit translation) following characters
+        /* 
+         * and we use it to count (by bit translation) following characters
          * (only the weightest part)
          */
         while (escape_char & ESCAPE_MASK && *str)
         {
-            /* every step, we move the byte of 1 bit left, 
-             * when first bit is 0, it's finished
+            /* 
+             * every step, we move the byte of 1 bit left, when first bit is 0,
+             * it's finished
              */
             escape_char = escape_char << 1;
             ++str;
@@ -76,8 +79,9 @@ const char* utf8_next_char(const char *str)
     }
     if (*str)
     {
-        /* finaly, if we are not on the \0 character,
-         * we jump to the next character
+        /* 
+         * finally, if we are not on the \0 character, we jump to the next
+         * character
          */
         ++str;
     }
@@ -88,22 +92,21 @@ int utf8_charcopy(const char *str, char *dest)
 {
 
     int pointer = 0;
+    /* if the first bit of the current char is 1 */
     if (str[pointer] & ESCAPE_MASK)
-    {                           /* if the first bit of the current char is 1 */
-
-        /* then str[pointer] is an escape character */
-
-        char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /* and we use
-                                                                   it to count 
-                                                                   following
-                                                                   characters
-                                                                   (only the
-                                                                   weightest
-                                                                   part) */
+    {
+        /* 
+         * then str[pointer] is an escape character and we use it to count
+         * following characters (only the weightest part)
+         */
+        char escape_char = ((str[pointer] & WEIGHT_MASK) << 1);
 
+        /* 
+         * every step, we move the byte of 1 bit left, when first bit is 0,
+         * it's finished
+         */
         while (escape_char & ESCAPE_MASK && str[pointer])
-        {                       /* every step, we move the byte of 1 bit left, 
-                                   when first bit is 0, it's finished */
+        {
             dest[pointer] = str[pointer];
             escape_char = escape_char << 1;
             ++pointer;
@@ -127,19 +130,15 @@ int utf8_issame(char *lex, char *key, int len)
     while (char_counter < len)
     {
 
+        /* if the first bit of the current char is 1 */
         if (key[pointer] & ESCAPE_MASK)
-        {                       /* if the first bit of the current char is 1 */
-
-            /* then key[pointer] is an escap character */
-
-            char escape_char = ((key[pointer] & WEIGHT_MASK) << 1); /* and we
-                                                                       use it
-                                                                       to
-                                                                       count
-                                                                       (only
-                                                                       the
-                                                                       weightest 
-                                                                       part) */
+        {
+            /* 
+             * then key[pointer] is an escape character and we use it to count
+             * (only the weightest part)
+             */
+
+            char escape_char = ((key[pointer] & WEIGHT_MASK) << 1);
 
             while (escape_char & ESCAPE_MASK && key[pointer] == lex[pointer])
             {
diff --git a/src/utf8misc.h b/src/utf8misc.h
index 46df7fd..2dc0628 100644
--- a/src/utf8misc.h
+++ b/src/utf8misc.h
@@ -42,32 +42,34 @@ extern "C"
 {
 #endif
 
-/* 
- * Is used to jump to the next start of char
- * of course it's only usefull when encoding is utf-8
- * This function have been added by Jocelyn Merand to use libtextcat in OOo
- */
-const char* utf8_next_char(const char *str);
+    /* 
+     * Is used to jump to the next start of char
+     * of course it's only useful when encoding is utf-8
+     * This function has been added by Jocelyn Merand to use libtextcat in OOo
+     */
+    const char *utf8_next_char(const char *str);
 
-/* Copy the char in str to dest of course it's only usefull when encoding is
-   utf8 and the symbol is encoded with more than 1 char return the number of
-   char jumped This function have been added by Jocelyn Merand to use
-   libtextcat in OOo */
-int utf8_charcopy(const char *str, char *dest);
+    /* 
+     * Copy the char in str to dest. Of course it's only useful when encoding
+     * is utf8 and the symbol is encoded with more than 1 char. Returns the
+     * number of chars jumped. This function has been added by Jocelyn Merand
+     * to use libtextcat in OOo.
+     */
+    int utf8_charcopy(const char *str, char *dest);
 
 
-/* checks if n-gram lex is a prefix of key and of length len
- * len is the number of unicode code points
- * strlen("€") == 3 but len == 1
- */
-int utf8_issame(char *lex, char *key, int len);
+    /* 
+     * checks if n-gram lex is a prefix of key and of length len; len is the
+     * number of unicode code points: strlen("€") == 3 but len == 1
+     */
+    int utf8_issame(char *lex, char *key, int len);
 
 
-/*
- * len is the number of unicode code points
- * strlen("€") == 3 but len == 1
- */
-extern int utf8_strlen(const char *str);
+    /* 
+     * len is the number of unicode code points
+     * strlen("€") == 3 but len == 1
+     */
+    extern int utf8_strlen(const char *str);
 #ifdef __cplusplus
 }
 #endif
author	Giulio Paci <giuliopaci@gmail.com>	2012-01-09 12:41:14 +0000
committer	Caolán McNamara <caolanm@redhat.com>	2012-01-09 12:41:14 +0000
commit	c29914910dbb446b1ac99cb4a716ee483853d9b9 (patch)
tree	1bab96a21d281857ece24b7a5a0316ec315cb98e
parent	4561963bf5b7806e77404104c27e16929784eb51 (diff)