summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Giulio Paci <giuliopaci@gmail.com> 2012-01-09 12:41:14 +0000
committer Caolán McNamara <caolanm@redhat.com> 2012-01-09 12:41:14 +0000
commit c29914910dbb446b1ac99cb4a716ee483853d9b9 (patch)
tree 1bab96a21d281857ece24b7a5a0316ec315cb98e /src
parent 4561963bf5b7806e77404104c27e16929784eb51 (diff)
indent -bap -bli0 -i4 -l79 -ncs -npcs -npsl -fca -lc79 -fc1 -ts4 -nut
Diffstat (limited to 'src')
-rw-r--r-- src/fingerprint.c 12
-rw-r--r-- src/textcat.c 205
-rw-r--r-- src/textcat.h 150
-rw-r--r-- src/utf8misc.c 65
-rw-r--r-- src/utf8misc.h 44
5 files changed, 245 insertions, 231 deletions
diff --git a/src/fingerprint.c b/src/fingerprint.c
index dce3bd9..a59ee30 100644
--- a/src/fingerprint.c
+++ b/src/fingerprint.c
@@ -438,10 +438,10 @@ static void createngramtable(table_t * t, const char *buf)
char *m = n;
/*** First char may be an underscore ***/
- decay = utf8_charcopy(q, m); /* [modified] previously *q++ = *m++ */
+ decay = utf8_charcopy(q, m); /* [modified] previously *q++ = *m++ */
- q += decay; /* [modified] */
- m += decay; /* [modified] */
+ q += decay; /* [modified] */
+ m += decay; /* [modified] */
*m = '\0';
increasefreq(t, n, 1);
@@ -452,7 +452,7 @@ static void createngramtable(table_t * t, const char *buf)
/*** Let the compiler unroll this ***/
for (i = 2; i <= MAXNGRAMSYMBOL; i++)
{
- decay = utf8_charcopy(q, m); /* [modified] like above */
+ decay = utf8_charcopy(q, m); /* [modified] like above */
m += decay;
*m = '\0';
@@ -465,9 +465,9 @@ static void createngramtable(table_t * t, const char *buf)
return;
}
- p = utf8_next_char(p); /* [modified] */
+ p = utf8_next_char(p); /* [modified] */
}
- return;
+ return;
}
static int mystrcmp(const char *a, const char *b)
diff --git a/src/textcat.c b/src/textcat.c
index 8a28956..399b962 100644
--- a/src/textcat.c
+++ b/src/textcat.c
@@ -75,7 +75,7 @@ typedef struct
uint4 maxsize;
char output[MAXOUTPUTSIZE];
- candidate_t *tmp_candidates;
+ candidate_t *tmp_candidates;
} textcat_t;
@@ -97,9 +97,10 @@ extern void textcat_Done(void *handle)
{
fp_Done(h->fprint[i]);
}
- if(h->tmp_candidates != NULL) {
- textcat_ReleaseClassifyFullOutput(h, h->tmp_candidates);
- }
+ if (h->tmp_candidates != NULL)
+ {
+ textcat_ReleaseClassifyFullOutput(h, h->tmp_candidates);
+ }
free(h->fprint);
free(h->fprint_disable);
free(h);
@@ -119,9 +120,9 @@ extern void *textcat_Init(const char *conffile)
extern void *special_textcat_Init(const char *conffile, const char *prefix)
{
textcat_t *h;
- char *finger_print_file_name;
- size_t finger_print_file_name_size;
- size_t prefix_size;
+ char *finger_print_file_name;
+ size_t finger_print_file_name_size;
+ size_t prefix_size;
char line[1024];
FILE *fp;
@@ -138,15 +139,17 @@ extern void *special_textcat_Init(const char *conffile, const char *prefix)
h->size = 0;
h->maxsize = 16;
h->fprint = (void **)malloc(sizeof(void *) * h->maxsize);
- h->fprint_disable = (unsigned char *)malloc(sizeof(unsigned char) * h->maxsize);
+ h->fprint_disable =
+ (unsigned char *)malloc(sizeof(unsigned char) * h->maxsize);
/* added to store the state of languages */
- h->tmp_candidates = NULL;
+ h->tmp_candidates = NULL;
- prefix_size = strlen(prefix);
- finger_print_file_name_size = prefix_size + 1;
- finger_print_file_name = (char*)malloc( sizeof(char) * ( finger_print_file_name_size +1024 ) );
- finger_print_file_name[0] = '\0';
- strcat(finger_print_file_name, prefix);
+ prefix_size = strlen(prefix);
+ finger_print_file_name_size = prefix_size + 1;
+ finger_print_file_name =
+ (char *)malloc(sizeof(char) * (finger_print_file_name_size + 1024));
+ finger_print_file_name[0] = '\0';
+ strcat(finger_print_file_name, prefix);
while (wg_getline(line, 1024, fp))
{
@@ -172,8 +175,7 @@ extern void *special_textcat_Init(const char *conffile, const char *prefix)
(void **)realloc(h->fprint, sizeof(void *) * h->maxsize);
h->fprint_disable =
(unsigned char *)realloc(h->fprint_disable,
- sizeof(unsigned char) *
- h->maxsize);
+ sizeof(unsigned char) * h->maxsize);
}
/*** Load data ***/
@@ -182,37 +184,40 @@ extern void *special_textcat_Init(const char *conffile, const char *prefix)
goto BAILOUT;
}
- while( prefix_size + strlen(segment[0]) > finger_print_file_name_size )
- {
- char *tmp;
- size_t tmp_size = finger_print_file_name_size * 2;
- tmp = (char *)realloc(finger_print_file_name, sizeof(char)*(tmp_size+1) );
- if( tmp == NULL )
- {
- free( finger_print_file_name );
- finger_print_file_name_size = 0;
- goto BAILOUT;
- }
- else
- {
- finger_print_file_name = tmp;
- finger_print_file_name_size = tmp_size;
- }
- }
+ while (prefix_size + strlen(segment[0]) > finger_print_file_name_size)
+ {
+ char *tmp;
+ size_t tmp_size = finger_print_file_name_size * 2;
+ tmp =
+ (char *)realloc(finger_print_file_name,
+ sizeof(char) * (tmp_size + 1));
+ if (tmp == NULL)
+ {
+ free(finger_print_file_name);
+ finger_print_file_name_size = 0;
+ goto BAILOUT;
+ }
+ else
+ {
+ finger_print_file_name = tmp;
+ finger_print_file_name_size = tmp_size;
+ }
+ }
finger_print_file_name[prefix_size] = '\0';
strcat(finger_print_file_name, segment[0]);
if (fp_Read(h->fprint[h->size], finger_print_file_name, 400) == 0)
{
- textcat_Done(h);
+ textcat_Done(h);
goto BAILOUT;
}
h->fprint_disable[h->size] = 0xF0; /* 0xF0 is the code for enabled
- languages, 0x0F is for disabled */
+ languages, 0x0F is for disabled
+ */
h->size++;
}
- free( finger_print_file_name );
+ free(finger_print_file_name);
fclose(fp);
return h;
@@ -223,79 +228,83 @@ extern void *special_textcat_Init(const char *conffile, const char *prefix)
}
-extern candidate_t *textcat_GetClassifyFullOutput( void *handle )
+extern candidate_t *textcat_GetClassifyFullOutput(void *handle)
{
- textcat_t *h = (textcat_t *)handle;
- return (candidate_t *) malloc( sizeof(candidate_t) * h->size );
+ textcat_t *h = (textcat_t *) handle;
+ return (candidate_t *) malloc(sizeof(candidate_t) * h->size);
}
-extern void textcat_ReleaseClassifyFullOutput( void *handle, candidate_t *candidates )
+extern void textcat_ReleaseClassifyFullOutput(void *handle,
+ candidate_t * candidates)
{
- if(candidates != NULL) {
- free(candidates);
- }
+ if (candidates != NULL)
+ {
+ free(candidates);
+ }
}
-extern char *textcat_Classify( void *handle, const char *buffer, size_t size )
+extern char *textcat_Classify(void *handle, const char *buffer, size_t size)
{
textcat_t *h = (textcat_t *) handle;
char *result = h->output;
- uint4 i, cnt;
+ uint4 i, cnt;
- if( h->tmp_candidates == NULL)
- {
- h->tmp_candidates = textcat_GetClassifyFullOutput( h );
- }
+ if (h->tmp_candidates == NULL)
+ {
+ h->tmp_candidates = textcat_GetClassifyFullOutput(h);
+ }
- cnt = textcat_ClassifyFull( h, buffer, size, h->tmp_candidates );
+ cnt = textcat_ClassifyFull(h, buffer, size, h->tmp_candidates);
- switch(cnt){
- case TEXTCAT_RESULT_UNKOWN:
- result = _TEXTCAT_RESULT_UNKOWN;
- break;
- case TEXTCAT_RESULT_SHORT:
- result = _TEXTCAT_RESULT_SHORT;
- break;
- default:
- {
- const char *plimit = result + MAXOUTPUTSIZE;
- char *p = result;
-
- *p = '\0';
- for (i = 0; i < cnt; i++)
+ switch (cnt)
+ {
+ case TEXTCAT_RESULT_UNKOWN:
+ result = _TEXTCAT_RESULT_UNKOWN;
+ break;
+ case TEXTCAT_RESULT_SHORT:
+ result = _TEXTCAT_RESULT_SHORT;
+ break;
+ default:
{
- p = wg_strgmov(p, "[", plimit);
- p = wg_strgmov(p, h->tmp_candidates[i].name, plimit);
- p = wg_strgmov(p, "]", plimit);
- }
- }
+ const char *plimit = result + MAXOUTPUTSIZE;
+ char *p = result;
+
+ *p = '\0';
+ for (i = 0; i < cnt; i++)
+ {
+ p = wg_strgmov(p, "[", plimit);
+ p = wg_strgmov(p, h->tmp_candidates[i].name, plimit);
+ p = wg_strgmov(p, "]", plimit);
+ }
}
+ }
return result;
}
-extern int textcat_ClassifyFull( void *handle, const char *buffer, size_t size, candidate_t *candidates )
+extern int textcat_ClassifyFull(void *handle, const char *buffer, size_t size,
+ candidate_t * candidates)
{
- textcat_t *h = (textcat_t *)handle;
- uint4 i, cnt = 0;
- int minscore = MAXSCORE;
- int threshold = minscore;
+ textcat_t *h = (textcat_t *) handle;
+ uint4 i, cnt = 0;
+ int minscore = MAXSCORE;
+ int threshold = minscore;
- void *unknown;
+ void *unknown;
- unknown = fp_Init(NULL);
+ unknown = fp_Init(NULL);
if (fp_Create(unknown, buffer, size, MAXNGRAMS, MINDOCSIZE) == 0)
{
- /*** Too little information ***/
- fp_Done(unknown);
- return TEXTCAT_RESULT_SHORT ;
- }
+ /*** Too little information ***/
+ fp_Done(unknown);
+ return TEXTCAT_RESULT_SHORT;
+ }
- /*** Calculate the score for each category. ***/
+ /*** Calculate the score for each category. ***/
for (i = 0; i < h->size; i++)
{
- int score;
+ int score;
if (h->fprint_disable[i] & 0x0F)
{ /* if this language is disabled */
score = MAXSCORE;
@@ -304,42 +313,42 @@ extern int textcat_ClassifyFull( void *handle, const char *buffer, size_t size,
{
score = fp_Compare(h->fprint[i], unknown, threshold);
/* printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score); */
- }
- candidates[i].score = score;
+ }
+ candidates[i].score = score;
candidates[i].name = fp_Name(h->fprint[i]);
if (score < minscore)
{
- minscore = score;
+ minscore = score;
threshold = (int)((double)score * THRESHOLDVALUE);
- }
- }
+ }
+ }
- /*** Find the best performers ***/
+ /*** Find the best performers ***/
for (i = 0, cnt = 0; i < h->size; i++)
{
if (candidates[i].score < threshold)
{
if (++cnt == MAXCANDIDATES + 1)
{
- break;
- }
+ break;
+ }
memcpy(&candidates[cnt - 1], &candidates[i], sizeof(candidate_t));
- }
- }
+ }
+ }
- fp_Done(unknown);
- /*** The verdict ***/
+ fp_Done(unknown);
+ /*** The verdict ***/
if (cnt == MAXCANDIDATES + 1)
{
- return TEXTCAT_RESULT_UNKOWN;
- }
+ return TEXTCAT_RESULT_UNKOWN;
+ }
else
{
qsort(candidates, cnt, sizeof(candidate_t), cmpcandidates);
- return cnt;
- }
+ return cnt;
+ }
}
extern const char *textcat_Version(void)
diff --git a/src/textcat.h b/src/textcat.h
index 9b52063..a1e1770 100644
--- a/src/textcat.h
+++ b/src/textcat.h
@@ -48,90 +48,94 @@ extern "C"
{
#endif
-typedef struct {
- int score;
- const char *name;
-} candidate_t;
-
-
-/**
- * textcat_Init() - Initialize the text classifier. The textfile
- * conffile should contain a list of fingerprint filenames and
- * identification strings for the categories. The filenames should be
- * reachable from the current working directory. The identification
- * strings will are used in the classification output.
- *
- * Returns: handle on success, NULL on error. (At the moment, the
- * only way errors can occur, is when the library cannot read the
- * conffile, or one of the fingerprint files listed in it.)
- *
- * Replace older function (and has exacly the same behaviour)
- * see below
- */
+ typedef struct
+ {
+ int score;
+ const char *name;
+ } candidate_t;
+
+ /**
+ * textcat_Init() - Initialize the text classifier. The textfile
+ * conffile should contain a list of fingerprint filenames and
+ * identification strings for the categories. The filenames should be
+ * reachable from the current working directory. The identification
+ * strings will are used in the classification output.
+ *
+ * Returns: handle on success, NULL on error. (At the moment, the
+ * only way errors can occur, is when the library cannot read the
+ * conffile, or one of the fingerprint files listed in it.)
+ *
+ * Replace older function (and has exacly the same behaviour)
+ * see below
+ */
extern void *textcat_Init(const char *conffile);
-/**
- * special_textcat_Init() - Initialize the text classifier. This function
- * prepare the classifier as needed by OpenOffice.org. The textfile
- * conffile should contain a list of utf8 fingerprint filenames and
- * identification strings for the categories.prefix will be
- * prepended to the filenames to locate the files. The identification
- * strings will be used in the classification output.
- *
- * Returns: handle on success, NULL on error. (At the moment, the
- * only way errors can occur, is when the library cannot read the
- * conffile, or one of the fingerprint files listed in it.)
- */
+ /**
+ * special_textcat_Init() - Initialize the text classifier. This function
+ * prepare the classifier as needed by OpenOffice.org. The textfile
+ * conffile should contain a list of utf8 fingerprint filenames and
+ * identification strings for the categories.prefix will be
+ * prepended to the filenames to locate the files. The identification
+ * strings will be used in the classification output.
+ *
+ * Returns: handle on success, NULL on error. (At the moment, the
+ * only way errors can occur, is when the library cannot read the
+ * conffile, or one of the fingerprint files listed in it.)
+ */
extern void *special_textcat_Init(const char *conffile,
const char *prefix);
-/**
- * textcat_Done() - Free up resources for handle
- */
+ /**
+ * textcat_Done() - Free up resources for handle
+ */
extern void textcat_Done(void *handle);
-/**
- * textcat_Classify() - Give the most likely categories for buffer
- * with length size.
- *
- * Returns: string containing a list of category id's, each one
- * between square brackets, "UNKNOWN" when not recognized, "SHORT" if the
- * document was too short to make a reliable assessment.
- *
- * Performace note: longer buffers take longer to process. However,
- * for many uses it is not necessary to categorize the whole buffer.
- * For language classification, a few hundred bytes will suffice.
- */
+ /**
+ * textcat_Classify() - Give the most likely categories for buffer
+ * with length size.
+ *
+ * Returns: string containing a list of category id's, each one
+ * between square brackets, "UNKNOWN" when not recognized, "SHORT" if the
+ * document was too short to make a reliable assessment.
+ *
+ * Performace note: longer buffers take longer to process. However,
+ * for many uses it is not necessary to categorize the whole buffer.
+ * For language classification, a few hundred bytes will suffice.
+ */
extern char *textcat_Classify(void *handle, const char *buffer,
size_t size);
-/**
- * textcat_GetClassifyFullOutput() - Create a classifier output handler
- */
-extern candidate_t *textcat_GetClassifyFullOutput( void *handle );
-
-/**
- * textcat_ReleaseClassifyFullOutput() - Free up resources for the classifier output handler
- */
-extern void textcat_ReleaseClassifyFullOutput( void *handle, candidate_t *candidates );
-
-/**
- * textcat_ClassifyFull() - Give the most likely categories for buffer
- * with length size.
- *
- * Returns: the numbers of results.
- *
- * Performace note: longer buffers take longer to process. However,
- * for many uses it is not necessary to categorize the whole buffer.
- * For language classification, a few hundred bytes will suffice.
- */
-extern int textcat_ClassifyFull( void *handle, const char *buffer, size_t size, candidate_t *candidates );
-
-
-/**
- * textcat_Version() - Returns a string describing the version of this classifier.
- */
+ /**
+ * textcat_GetClassifyFullOutput() - Create a classifier output handler
+ */
+ extern candidate_t *textcat_GetClassifyFullOutput(void *handle);
+
+ /**
+ * textcat_ReleaseClassifyFullOutput() - Free up resources for the
+ * classifier output handler
+ */
+ extern void textcat_ReleaseClassifyFullOutput(void *handle,
+ candidate_t * candidates);
+
+ /**
+ * textcat_ClassifyFull() - Give the most likely categories for buffer
+ * with length size.
+ *
+ * Returns: the numbers of results.
+ *
+ * Performace note: longer buffers take longer to process. However,
+ * for many uses it is not necessary to categorize the whole buffer.
+ * For language classification, a few hundred bytes will suffice.
+ */
+ extern int textcat_ClassifyFull(void *handle, const char *buffer,
+ size_t size, candidate_t * candidates);
+
+
+ /**
+ * textcat_Version() - Returns a string describing the version of this
+ * classifier.
+ */
extern const char *textcat_Version(void);
#ifdef __cplusplus
diff --git a/src/utf8misc.c b/src/utf8misc.c
index e0b151a..046d96b 100644
--- a/src/utf8misc.c
+++ b/src/utf8misc.c
@@ -53,22 +53,25 @@
#define WEIGHT_MASK 0x00
#endif
-const char* utf8_next_char(const char *str)
+const char *utf8_next_char(const char *str)
{
if (*str & ESCAPE_MASK)
{
- /* if the first bit of the current char is 1
- * then *str is an escape character
+ /*
+ * if the first bit of the current char is 1 then *str is an escape
+ * character
*/
char escape_char = ((*str & WEIGHT_MASK) << 1);
- /* and we use it to count (by bit translation) following characters
+ /*
+ * and we use it to count (by bit translation) following characters
* (only the weightest part)
*/
while (escape_char & ESCAPE_MASK && *str)
{
- /* every step, we move the byte of 1 bit left,
- * when first bit is 0, it's finished
+ /*
+ * every step, we move the byte of 1 bit left, when first bit is 0,
+ * it's finished
*/
escape_char = escape_char << 1;
++str;
@@ -76,8 +79,9 @@ const char* utf8_next_char(const char *str)
}
if (*str)
{
- /* finaly, if we are not on the \0 character,
- * we jump to the next character
+ /*
+ * finally, if we are not on the \0 character, we jump to the next
+ * character
*/
++str;
}
@@ -88,22 +92,21 @@ int utf8_charcopy(const char *str, char *dest)
{
int pointer = 0;
+ /* if the first bit of the current char is 1 */
if (str[pointer] & ESCAPE_MASK)
- { /* if the first bit of the current char is 1 */
-
- /* then str[pointer] is an escape character */
-
- char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /* and we use
- it to count
- following
- characters
- (only the
- weightest
- part) */
+ {
+ /*
+ * then str[pointer] is an escape character and we use it to count
+ * following characters (only the weightest part)
+ */
+ char escape_char = ((str[pointer] & WEIGHT_MASK) << 1);
+ /*
+ * every step, we move the byte of 1 bit left, when first bit is 0,
+ * it's finished
+ */
while (escape_char & ESCAPE_MASK && str[pointer])
- { /* every step, we move the byte of 1 bit left,
- when first bit is 0, it's finished */
+ {
dest[pointer] = str[pointer];
escape_char = escape_char << 1;
++pointer;
@@ -127,19 +130,15 @@ int utf8_issame(char *lex, char *key, int len)
while (char_counter < len)
{
+ /* if the first bit of the current char is 1 */
if (key[pointer] & ESCAPE_MASK)
- { /* if the first bit of the current char is 1 */
-
- /* then key[pointer] is an escap character */
-
- char escape_char = ((key[pointer] & WEIGHT_MASK) << 1); /* and we
- use it
- to
- count
- (only
- the
- weightest
- part) */
+ {
+ /*
+ * then key[pointer] is an escape character and we use it to count
+ * (only the weightest part)
+ */
+
+ char escape_char = ((key[pointer] & WEIGHT_MASK) << 1);
while (escape_char & ESCAPE_MASK && key[pointer] == lex[pointer])
{
diff --git a/src/utf8misc.h b/src/utf8misc.h
index 46df7fd..2dc0628 100644
--- a/src/utf8misc.h
+++ b/src/utf8misc.h
@@ -42,32 +42,34 @@ extern "C"
{
#endif
-/*
- * Is used to jump to the next start of char
- * of course it's only usefull when encoding is utf-8
- * This function have been added by Jocelyn Merand to use libtextcat in OOo
- */
-const char* utf8_next_char(const char *str);
+ /*
+ * Is used to jump to the next start of char
+ * of course it's only usefull when encoding is utf-8
+ * This function have been added by Jocelyn Merand to use libtextcat in OOo
+ */
+ const char *utf8_next_char(const char *str);
-/* Copy the char in str to dest of course it's only usefull when encoding is
- utf8 and the symbol is encoded with more than 1 char return the number of
- char jumped This function have been added by Jocelyn Merand to use
- libtextcat in OOo */
-int utf8_charcopy(const char *str, char *dest);
+ /*
+ * Copy the char in str to dest of course it's only usefull when encoding
+ * is utf8 and the symbol is encoded with more than 1 char return the
+ * number of char jumped This function have been added by Jocelyn Merand to
+ * use libtextcat in OOo
+ */
+ int utf8_charcopy(const char *str, char *dest);
-/* checks if n-gram lex is a prefix of key and of length len
- * len is the number of unicode code points
- * strlen("€") == 3 but len == 1
- */
-int utf8_issame(char *lex, char *key, int len);
+ /*
+ * checks if n-gram lex is a prefix of key and of length len len is the
+ * number of unicode code points strlen("€") == 3 but len == 1
+ */
+ int utf8_issame(char *lex, char *key, int len);
-/*
- * len is the number of unicode code points
- * strlen("€") == 3 but len == 1
- */
-extern int utf8_strlen(const char *str);
+ /*
+ * len is the number of unicode code points
+ * strlen("€") == 3 but len == 1
+ */
+ extern int utf8_strlen(const char *str);
#ifdef __cplusplus
}
#endif