diff options
author | Giulio Paci <giuliopaci@gmail.com> | 2012-01-09 12:41:14 +0000 |
---|---|---|
committer | Caolán McNamara <caolanm@redhat.com> | 2012-01-09 12:41:14 +0000 |
commit | c29914910dbb446b1ac99cb4a716ee483853d9b9 (patch) | |
tree | 1bab96a21d281857ece24b7a5a0316ec315cb98e /src | |
parent | 4561963bf5b7806e77404104c27e16929784eb51 (diff) |
indent -bap -bli0 -i4 -l79 -ncs -npcs -npsl -fca -lc79 -fc1 -ts4 -nut
Diffstat (limited to 'src')
-rw-r--r-- | src/fingerprint.c | 12 | ||||
-rw-r--r-- | src/textcat.c | 205 | ||||
-rw-r--r-- | src/textcat.h | 150 | ||||
-rw-r--r-- | src/utf8misc.c | 65 | ||||
-rw-r--r-- | src/utf8misc.h | 44 |
5 files changed, 245 insertions, 231 deletions
diff --git a/src/fingerprint.c b/src/fingerprint.c index dce3bd9..a59ee30 100644 --- a/src/fingerprint.c +++ b/src/fingerprint.c @@ -438,10 +438,10 @@ static void createngramtable(table_t * t, const char *buf) char *m = n; /*** First char may be an underscore ***/ - decay = utf8_charcopy(q, m); /* [modified] previously *q++ = *m++ */ + decay = utf8_charcopy(q, m); /* [modified] previously *q++ = *m++ */ - q += decay; /* [modified] */ - m += decay; /* [modified] */ + q += decay; /* [modified] */ + m += decay; /* [modified] */ *m = '\0'; increasefreq(t, n, 1); @@ -452,7 +452,7 @@ static void createngramtable(table_t * t, const char *buf) /*** Let the compiler unroll this ***/ for (i = 2; i <= MAXNGRAMSYMBOL; i++) { - decay = utf8_charcopy(q, m); /* [modified] like above */ + decay = utf8_charcopy(q, m); /* [modified] like above */ m += decay; *m = '\0'; @@ -465,9 +465,9 @@ static void createngramtable(table_t * t, const char *buf) return; } - p = utf8_next_char(p); /* [modified] */ + p = utf8_next_char(p); /* [modified] */ } - return; + return; } static int mystrcmp(const char *a, const char *b) diff --git a/src/textcat.c b/src/textcat.c index 8a28956..399b962 100644 --- a/src/textcat.c +++ b/src/textcat.c @@ -75,7 +75,7 @@ typedef struct uint4 maxsize; char output[MAXOUTPUTSIZE]; - candidate_t *tmp_candidates; + candidate_t *tmp_candidates; } textcat_t; @@ -97,9 +97,10 @@ extern void textcat_Done(void *handle) { fp_Done(h->fprint[i]); } - if(h->tmp_candidates != NULL) { - textcat_ReleaseClassifyFullOutput(h, h->tmp_candidates); - } + if (h->tmp_candidates != NULL) + { + textcat_ReleaseClassifyFullOutput(h, h->tmp_candidates); + } free(h->fprint); free(h->fprint_disable); free(h); @@ -119,9 +120,9 @@ extern void *textcat_Init(const char *conffile) extern void *special_textcat_Init(const char *conffile, const char *prefix) { textcat_t *h; - char *finger_print_file_name; - size_t finger_print_file_name_size; - size_t prefix_size; + char *finger_print_file_name; + size_t finger_print_file_name_size; + size_t prefix_size; char line[1024]; FILE *fp; @@ -138,15 +139,17 @@ extern void *special_textcat_Init(const char *conffile, const char *prefix) h->size = 0; h->maxsize = 16; h->fprint = (void **)malloc(sizeof(void *) * h->maxsize); - h->fprint_disable = (unsigned char *)malloc(sizeof(unsigned char) * h->maxsize); + h->fprint_disable = + (unsigned char *)malloc(sizeof(unsigned char) * h->maxsize); /* added to store the state of languages */ - h->tmp_candidates = NULL; + h->tmp_candidates = NULL; - prefix_size = strlen(prefix); - finger_print_file_name_size = prefix_size + 1; - finger_print_file_name = (char*)malloc( sizeof(char) * ( finger_print_file_name_size +1024 ) ); - finger_print_file_name[0] = '\0'; - strcat(finger_print_file_name, prefix); + prefix_size = strlen(prefix); + finger_print_file_name_size = prefix_size + 1; + finger_print_file_name = + (char *)malloc(sizeof(char) * (finger_print_file_name_size + 1024)); + finger_print_file_name[0] = '\0'; + strcat(finger_print_file_name, prefix); while (wg_getline(line, 1024, fp)) { @@ -172,8 +175,7 @@ extern void *special_textcat_Init(const char *conffile, const char *prefix) (void **)realloc(h->fprint, sizeof(void *) * h->maxsize); h->fprint_disable = (unsigned char *)realloc(h->fprint_disable, - sizeof(unsigned char) * - h->maxsize); + sizeof(unsigned char) * h->maxsize); } /*** Load data ***/ @@ -182,37 +184,40 @@ extern void *special_textcat_Init(const char *conffile, const char *prefix) goto BAILOUT; } - while( prefix_size + strlen(segment[0]) > finger_print_file_name_size ) - { - char *tmp; - size_t tmp_size = finger_print_file_name_size * 2; - tmp = (char *)realloc(finger_print_file_name, sizeof(char)*(tmp_size+1) ); - if( tmp == NULL ) - { - free( finger_print_file_name ); - finger_print_file_name_size = 0; - goto BAILOUT; - } - else - { - finger_print_file_name = tmp; - finger_print_file_name_size = tmp_size; - } - } + while (prefix_size + strlen(segment[0]) > finger_print_file_name_size) + { + char *tmp; + size_t tmp_size = finger_print_file_name_size * 2; + tmp = + (char *)realloc(finger_print_file_name, + sizeof(char) * (tmp_size + 1)); + if (tmp == NULL) + { + free(finger_print_file_name); + finger_print_file_name_size = 0; + goto BAILOUT; + } + else + { + finger_print_file_name = tmp; + finger_print_file_name_size = tmp_size; + } + } finger_print_file_name[prefix_size] = '\0'; strcat(finger_print_file_name, segment[0]); if (fp_Read(h->fprint[h->size], finger_print_file_name, 400) == 0) { - textcat_Done(h); + textcat_Done(h); goto BAILOUT; } h->fprint_disable[h->size] = 0xF0; /* 0xF0 is the code for enabled - languages, 0x0F is for disabled */ + languages, 0x0F is for disabled + */ h->size++; } - free( finger_print_file_name ); + free(finger_print_file_name); fclose(fp); return h; @@ -223,79 +228,83 @@ extern void *special_textcat_Init(const char *conffile, const char *prefix) } -extern candidate_t *textcat_GetClassifyFullOutput( void *handle ) +extern candidate_t *textcat_GetClassifyFullOutput(void *handle) { - textcat_t *h = (textcat_t *)handle; - return (candidate_t *) malloc( sizeof(candidate_t) * h->size ); + textcat_t *h = (textcat_t *) handle; + return (candidate_t *) malloc(sizeof(candidate_t) * h->size); } -extern void textcat_ReleaseClassifyFullOutput( void *handle, candidate_t *candidates ) +extern void textcat_ReleaseClassifyFullOutput(void *handle, + candidate_t * candidates) { - if(candidates != NULL) { - free(candidates); - } + if (candidates != NULL) + { + free(candidates); + } } -extern char *textcat_Classify( void *handle, const char *buffer, size_t size ) +extern char *textcat_Classify(void *handle, const char *buffer, size_t size) { textcat_t *h = (textcat_t *) handle; char *result = h->output; - uint4 i, cnt; + uint4 i, cnt; - if( h->tmp_candidates == NULL) - { - h->tmp_candidates = textcat_GetClassifyFullOutput( h ); - } + if (h->tmp_candidates == NULL) + { + h->tmp_candidates = textcat_GetClassifyFullOutput(h); + } - cnt = textcat_ClassifyFull( h, buffer, size, h->tmp_candidates ); + cnt = textcat_ClassifyFull(h, buffer, size, h->tmp_candidates); - switch(cnt){ - case TEXTCAT_RESULT_UNKOWN: - result = _TEXTCAT_RESULT_UNKOWN; - break; - case TEXTCAT_RESULT_SHORT: - result = _TEXTCAT_RESULT_SHORT; - break; - default: - { - const char *plimit = result + MAXOUTPUTSIZE; - char *p = result; - - *p = '\0'; - for (i = 0; i < cnt; i++) + switch (cnt) + { + case TEXTCAT_RESULT_UNKOWN: + result = _TEXTCAT_RESULT_UNKOWN; + break; + case TEXTCAT_RESULT_SHORT: + result = _TEXTCAT_RESULT_SHORT; + break; + default: { - p = wg_strgmov(p, "[", plimit); - p = wg_strgmov(p, h->tmp_candidates[i].name, plimit); - p = wg_strgmov(p, "]", plimit); - } - } + const char *plimit = result + MAXOUTPUTSIZE; + char *p = result; + + *p = '\0'; + for (i = 0; i < cnt; i++) + { + p = wg_strgmov(p, "[", plimit); + p = wg_strgmov(p, h->tmp_candidates[i].name, plimit); + p = wg_strgmov(p, "]", plimit); + } } + } return result; } -extern int textcat_ClassifyFull( void *handle, const char *buffer, size_t size, candidate_t *candidates ) +extern int textcat_ClassifyFull(void *handle, const char *buffer, size_t size, + candidate_t * candidates) { - textcat_t *h = (textcat_t *)handle; - uint4 i, cnt = 0; - int minscore = MAXSCORE; - int threshold = minscore; + textcat_t *h = (textcat_t *) handle; + uint4 i, cnt = 0; + int minscore = MAXSCORE; + int threshold = minscore; - void *unknown; + void *unknown; - unknown = fp_Init(NULL); + unknown = fp_Init(NULL); if (fp_Create(unknown, buffer, size, MAXNGRAMS, MINDOCSIZE) == 0) { - /*** Too little information ***/ - fp_Done(unknown); - return TEXTCAT_RESULT_SHORT ; - } + /*** Too little information ***/ + fp_Done(unknown); + return TEXTCAT_RESULT_SHORT; + } - /*** Calculate the score for each category. ***/ + /*** Calculate the score for each category. ***/ for (i = 0; i < h->size; i++) { - int score; + int score; if (h->fprint_disable[i] & 0x0F) { /* if this language is disabled */ score = MAXSCORE; @@ -304,42 +313,42 @@ extern int textcat_ClassifyFull( void *handle, const char *buffer, size_t size, { score = fp_Compare(h->fprint[i], unknown, threshold); /* printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score); */ - } - candidates[i].score = score; + } + candidates[i].score = score; candidates[i].name = fp_Name(h->fprint[i]); if (score < minscore) { - minscore = score; + minscore = score; threshold = (int)((double)score * THRESHOLDVALUE); - } - } + } + } - /*** Find the best performers ***/ + /*** Find the best performers ***/ for (i = 0, cnt = 0; i < h->size; i++) { if (candidates[i].score < threshold) { if (++cnt == MAXCANDIDATES + 1) { - break; - } + break; + } memcpy(&candidates[cnt - 1], &candidates[i], sizeof(candidate_t)); - } - } + } + } - fp_Done(unknown); - /*** The verdict ***/ + fp_Done(unknown); + /*** The verdict ***/ if (cnt == MAXCANDIDATES + 1) { - return TEXTCAT_RESULT_UNKOWN; - } + return TEXTCAT_RESULT_UNKOWN; + } else { qsort(candidates, cnt, sizeof(candidate_t), cmpcandidates); - return cnt; - } + return cnt; + } } extern const char *textcat_Version(void) diff --git a/src/textcat.h b/src/textcat.h index 9b52063..a1e1770 100644 --- a/src/textcat.h +++ b/src/textcat.h @@ -48,90 +48,94 @@ extern "C" { #endif -typedef struct { - int score; - const char *name; -} candidate_t; - - -/** - * textcat_Init() - Initialize the text classifier. The textfile - * conffile should contain a list of fingerprint filenames and - * identification strings for the categories. The filenames should be - * reachable from the current working directory. The identification - * strings will are used in the classification output. - * - * Returns: handle on success, NULL on error. (At the moment, the - * only way errors can occur, is when the library cannot read the - * conffile, or one of the fingerprint files listed in it.) - * - * Replace older function (and has exacly the same behaviour) - * see below - */ + typedef struct + { + int score; + const char *name; + } candidate_t; + + /** + * textcat_Init() - Initialize the text classifier. The textfile + * conffile should contain a list of fingerprint filenames and + * identification strings for the categories. The filenames should be + * reachable from the current working directory. The identification + * strings will are used in the classification output. + * + * Returns: handle on success, NULL on error. (At the moment, the + * only way errors can occur, is when the library cannot read the + * conffile, or one of the fingerprint files listed in it.) + * + * Replace older function (and has exacly the same behaviour) + * see below + */ extern void *textcat_Init(const char *conffile); -/** - * special_textcat_Init() - Initialize the text classifier. This function - * prepare the classifier as needed by OpenOffice.org. The textfile - * conffile should contain a list of utf8 fingerprint filenames and - * identification strings for the categories.prefix will be - * prepended to the filenames to locate the files. The identification - * strings will be used in the classification output. - * - * Returns: handle on success, NULL on error. (At the moment, the - * only way errors can occur, is when the library cannot read the - * conffile, or one of the fingerprint files listed in it.) - */ + /** + * special_textcat_Init() - Initialize the text classifier. This function + * prepare the classifier as needed by OpenOffice.org. The textfile + * conffile should contain a list of utf8 fingerprint filenames and + * identification strings for the categories.prefix will be + * prepended to the filenames to locate the files. The identification + * strings will be used in the classification output. + * + * Returns: handle on success, NULL on error. (At the moment, the + * only way errors can occur, is when the library cannot read the + * conffile, or one of the fingerprint files listed in it.) + */ extern void *special_textcat_Init(const char *conffile, const char *prefix); -/** - * textcat_Done() - Free up resources for handle - */ + /** + * textcat_Done() - Free up resources for handle + */ extern void textcat_Done(void *handle); -/** - * textcat_Classify() - Give the most likely categories for buffer - * with length size. - * - * Returns: string containing a list of category id's, each one - * between square brackets, "UNKNOWN" when not recognized, "SHORT" if the - * document was too short to make a reliable assessment. - * - * Performace note: longer buffers take longer to process. However, - * for many uses it is not necessary to categorize the whole buffer. - * For language classification, a few hundred bytes will suffice. - */ + /** + * textcat_Classify() - Give the most likely categories for buffer + * with length size. + * + * Returns: string containing a list of category id's, each one + * between square brackets, "UNKNOWN" when not recognized, "SHORT" if the + * document was too short to make a reliable assessment. + * + * Performace note: longer buffers take longer to process. However, + * for many uses it is not necessary to categorize the whole buffer. + * For language classification, a few hundred bytes will suffice. + */ extern char *textcat_Classify(void *handle, const char *buffer, size_t size); -/** - * textcat_GetClassifyFullOutput() - Create a classifier output handler - */ -extern candidate_t *textcat_GetClassifyFullOutput( void *handle ); - -/** - * textcat_ReleaseClassifyFullOutput() - Free up resources for the classifier output handler - */ -extern void textcat_ReleaseClassifyFullOutput( void *handle, candidate_t *candidates ); - -/** - * textcat_ClassifyFull() - Give the most likely categories for buffer - * with length size. - * - * Returns: the numbers of results. - * - * Performace note: longer buffers take longer to process. However, - * for many uses it is not necessary to categorize the whole buffer. - * For language classification, a few hundred bytes will suffice. - */ -extern int textcat_ClassifyFull( void *handle, const char *buffer, size_t size, candidate_t *candidates ); - - -/** - * textcat_Version() - Returns a string describing the version of this classifier. - */ + /** + * textcat_GetClassifyFullOutput() - Create a classifier output handler + */ + extern candidate_t *textcat_GetClassifyFullOutput(void *handle); + + /** + * textcat_ReleaseClassifyFullOutput() - Free up resources for the + * classifier output handler + */ + extern void textcat_ReleaseClassifyFullOutput(void *handle, + candidate_t * candidates); + + /** + * textcat_ClassifyFull() - Give the most likely categories for buffer + * with length size. + * + * Returns: the numbers of results. + * + * Performace note: longer buffers take longer to process. However, + * for many uses it is not necessary to categorize the whole buffer. + * For language classification, a few hundred bytes will suffice. + */ + extern int textcat_ClassifyFull(void *handle, const char *buffer, + size_t size, candidate_t * candidates); + + + /** + * textcat_Version() - Returns a string describing the version of this + * classifier. + */ extern const char *textcat_Version(void); #ifdef __cplusplus diff --git a/src/utf8misc.c b/src/utf8misc.c index e0b151a..046d96b 100644 --- a/src/utf8misc.c +++ b/src/utf8misc.c @@ -53,22 +53,25 @@ #define WEIGHT_MASK 0x00 #endif -const char* utf8_next_char(const char *str) +const char *utf8_next_char(const char *str) { if (*str & ESCAPE_MASK) { - /* if the first bit of the current char is 1 - * then *str is an escape character + /* + * if the first bit of the current char is 1 then *str is an escape + * character */ char escape_char = ((*str & WEIGHT_MASK) << 1); - /* and we use it to count (by bit translation) following characters + /* + * and we use it to count (by bit translation) following characters * (only the weightest part) */ while (escape_char & ESCAPE_MASK && *str) { - /* every step, we move the byte of 1 bit left, - * when first bit is 0, it's finished + /* + * every step, we move the byte of 1 bit left, when first bit is 0, + * it's finished */ escape_char = escape_char << 1; ++str; @@ -76,8 +79,9 @@ const char* utf8_next_char(const char *str) } if (*str) { - /* finaly, if we are not on the \0 character, - * we jump to the next character + /* + * finally, if we are not on the \0 character, we jump to the next + * character */ ++str; } @@ -88,22 +92,21 @@ int utf8_charcopy(const char *str, char *dest) { int pointer = 0; + /* if the first bit of the current char is 1 */ if (str[pointer] & ESCAPE_MASK) - { /* if the first bit of the current char is 1 */ - - /* then str[pointer] is an escape character */ - - char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /* and we use - it to count - following - characters - (only the - weightest - part) */ + { + /* + * then str[pointer] is an escape character and we use it to count + * following characters (only the weightest part) + */ + char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); + /* + * every step, we move the byte of 1 bit left, when first bit is 0, + * it's finished + */ while (escape_char & ESCAPE_MASK && str[pointer]) - { /* every step, we move the byte of 1 bit left, - when first bit is 0, it's finished */ + { dest[pointer] = str[pointer]; escape_char = escape_char << 1; ++pointer; @@ -127,19 +130,15 @@ int utf8_issame(char *lex, char *key, int len) while (char_counter < len) { + /* if the first bit of the current char is 1 */ if (key[pointer] & ESCAPE_MASK) - { /* if the first bit of the current char is 1 */ - - /* then key[pointer] is an escap character */ - - char escape_char = ((key[pointer] & WEIGHT_MASK) << 1); /* and we - use it - to - count - (only - the - weightest - part) */ + { + /* + * then key[pointer] is an escape character and we use it to count + * (only the weightest part) + */ + + char escape_char = ((key[pointer] & WEIGHT_MASK) << 1); while (escape_char & ESCAPE_MASK && key[pointer] == lex[pointer]) { diff --git a/src/utf8misc.h b/src/utf8misc.h index 46df7fd..2dc0628 100644 --- a/src/utf8misc.h +++ b/src/utf8misc.h @@ -42,32 +42,34 @@ extern "C" { #endif -/* - * Is used to jump to the next start of char - * of course it's only usefull when encoding is utf-8 - * This function have been added by Jocelyn Merand to use libtextcat in OOo - */ -const char* utf8_next_char(const char *str); + /* + * Is used to jump to the next start of char + * of course it's only usefull when encoding is utf-8 + * This function have been added by Jocelyn Merand to use libtextcat in OOo + */ + const char *utf8_next_char(const char *str); -/* Copy the char in str to dest of course it's only usefull when encoding is - utf8 and the symbol is encoded with more than 1 char return the number of - char jumped This function have been added by Jocelyn Merand to use - libtextcat in OOo */ -int utf8_charcopy(const char *str, char *dest); + /* + * Copy the char in str to dest of course it's only usefull when encoding + * is utf8 and the symbol is encoded with more than 1 char return the + * number of char jumped This function have been added by Jocelyn Merand to + * use libtextcat in OOo + */ + int utf8_charcopy(const char *str, char *dest); -/* checks if n-gram lex is a prefix of key and of length len - * len is the number of unicode code points - * strlen("€") == 3 but len == 1 - */ -int utf8_issame(char *lex, char *key, int len); + /* + * checks if n-gram lex is a prefix of key and of length len len is the + * number of unicode code points strlen("€") == 3 but len == 1 + */ + int utf8_issame(char *lex, char *key, int len); -/* - * len is the number of unicode code points - * strlen("€") == 3 but len == 1 - */ -extern int utf8_strlen(const char *str); + /* + * len is the number of unicode code points + * strlen("€") == 3 but len == 1 + */ + extern int utf8_strlen(const char *str); #ifdef __cplusplus } #endif |