diff options
-rw-r--r-- | Makefile.am | 3 | ||||
-rw-r--r-- | libexttextcat.vapi | 34 | ||||
-rw-r--r-- | src/libexttextcat.map | 3 | ||||
-rw-r--r-- | src/textcat.c | 134 | ||||
-rw-r--r-- | src/textcat.h | 32 |
5 files changed, 156 insertions, 50 deletions
diff --git a/Makefile.am b/Makefile.am index b72d630..912003d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -5,4 +5,7 @@ SUBDIRS = src langclass pkgconfdir = $(libdir)/pkgconfig pkgconf_DATA = libexttextcat.pc +vapidir = $(datadir)/vala/vapi +vapi_DATA = libexttextcat.vapi + EXTRA_DIST = ChangeLog LICENSE README README.libtextcat TODO langclass diff --git a/libexttextcat.vapi b/libexttextcat.vapi new file mode 100644 index 0000000..562313c --- /dev/null +++ b/libexttextcat.vapi @@ -0,0 +1,34 @@ +/* libtextcat.vapi generated by vapigen-0.12, do not modify. */ + +[CCode (cprefix = "textcat_", lower_case_cprefix = "textcat_")] +namespace TextCat { + [Compact] + [CCode (cname="candidate_t",cheader_filename = "textcat.h")] + public struct candidate { + public weak string name; + public int score; + } + [Compact] + [CCode (cname="void",cheader_filename = "textcat.h", free_function="textcat_Done")] + public class Classifier { + [CCode (cname = "textcat_Classify", cheader_filename = "textcat.h")] + public unowned string classify (string buffer, size_t size); + [CCode (cname = "textcat_ClassifyFull", cheader_filename = "textcat.h")] + public int classify_full (string buffer, size_t size, candidate* candidates); + [CCode (cname = "textcat_GetClassifyFullOutput", cheader_filename = "textcat.h")] + public unowned candidate* get_classify_full_output (); + [CCode (cname = "textcat_ReleaseClassifyFullOutput", cheader_filename = "textcat.h")] + public void release_classify_full_output (candidate* candidates); + [CCode (cname = "textcat_Init", cheader_filename = "textcat.h")] + public Classifier (string conffile); + [CCode (cname = "textcat_InitWithPath", cheader_filename = "textcat.h")] + public Classifier.init_with_path (string conffile, string prefix, bool utfaware); + + } + [CCode (cheader_filename = "textcat.h")] + public const int TEXTCAT_RESULT_SHORT; + [CCode (cheader_filename = "textcat.h")] + public const int TEXTCAT_RESULT_UNKOWN; + [CCode (cname = "textcat_Version", 
cheader_filename = "textcat.h")] + public static unowned string version (); +} diff --git a/src/libexttextcat.map b/src/libexttextcat.map index 4edf79b..81785b9 100644 --- a/src/libexttextcat.map +++ b/src/libexttextcat.map @@ -11,6 +11,9 @@ wgmempool_strdup special_textcat_Init textcat_Classify + textcat_ClassifyFull + textcat_ReleaseClassifyFullOutput + textcat_GetClassifyFullOutput textcat_Done textcat_Init textcat_Version diff --git a/src/textcat.c b/src/textcat.c index 2479019..41a3446 100644 --- a/src/textcat.c +++ b/src/textcat.c @@ -73,17 +73,11 @@ typedef struct uint4 maxsize; char output[MAXOUTPUTSIZE]; + candidate_t *tmp_candidates; } textcat_t; -typedef struct -{ - int score; - const char *name; -} candidate_t; - - static int cmpcandidates(const void *a, const void *b) { const candidate_t *x = (const candidate_t *)a; @@ -101,6 +95,9 @@ extern void textcat_Done(void *handle) { fp_Done(h->fprint[i]); } + if(h->tmp_candidates != NULL) { + textcat_ReleaseClassifyFullOutput(h, h->tmp_candidates); + } free(h->fprint); free(h->fprint_disable); free(h); @@ -138,6 +135,7 @@ extern void *special_textcat_Init(const char *conffile, const char *prefix) h->fprint = (void **)malloc(sizeof(void *) * h->maxsize); h->fprint_disable = (unsigned char *)malloc(sizeof(unsigned char) * h->maxsize); /* added to store the state of languages */ + h->tmp_candidates = NULL; while (wg_getline(line, 1024, fp)) { char *p; @@ -201,30 +199,79 @@ extern void *special_textcat_Init(const char *conffile, const char *prefix) } -extern char *textcat_Classify(void *handle, const char *buffer, size_t size) +extern candidate_t *textcat_GetClassifyFullOutput( void *handle ) +{ + textcat_t *h = (textcat_t *)handle; + return (candidate_t *) malloc( sizeof(candidate_t) * h->size ); +} + +extern void textcat_ReleaseClassifyFullOutput( void *handle, candidate_t *candidates ) +{ + if(candidates != NULL) { + free(candidates); + } +} + +extern char *textcat_Classify( void *handle, const char *buffer, 
size_t size ) { textcat_t *h = (textcat_t *) handle; - int minscore = MAXSCORE; - int threshold = minscore; char *result = h->output; - void *unknown; - uint4 i, cnt; + uint4 i, cnt; + + if( h->tmp_candidates == NULL) + { + h->tmp_candidates = textcat_GetClassifyFullOutput( h ); + } + + cnt = textcat_ClassifyFull( h, buffer, size, h->tmp_candidates ); - candidate_t *candidates = - (candidate_t *) malloc(sizeof(candidate_t) * h->size); + switch(cnt){ + case TEXTCAT_RESULT_UNKOWN: + result = _TEXTCAT_RESULT_UNKOWN; + break; + case TEXTCAT_RESULT_SHORT: + result = _TEXTCAT_RESULT_SHORT; + break; + default: + { + const char *plimit = result + MAXOUTPUTSIZE; + char *p = result; + + *p = '\0'; + for (i = 0; i < cnt; i++) + { + p = wg_strgmov(p, "[", plimit); + p = wg_strgmov(p, h->tmp_candidates[i].name, plimit); + p = wg_strgmov(p, "]", plimit); + } + } + } - unknown = fp_Init(NULL); + return result; +} + + +extern int textcat_ClassifyFull( void *handle, const char *buffer, size_t size, candidate_t *candidates ) +{ + textcat_t *h = (textcat_t *)handle; + uint4 i, cnt = 0; + int minscore = MAXSCORE; + int threshold = minscore; + + void *unknown; + + unknown = fp_Init(NULL); if (fp_Create(unknown, buffer, size, MAXNGRAMS) == 0) { - /*** Too little information ***/ - result = _TEXTCAT_RESULT_SHORT; - goto READY; - } + /*** Too little information ***/ + fp_Done(unknown); + return TEXTCAT_RESULT_SHORT ; + } - /*** Calculate the score for each category. ***/ + /*** Calculate the score for each category. 
***/ for (i = 0; i < h->size; i++) { - int score; + int score; if (h->fprint_disable[i] & 0x0F) { /* if this language is disabled */ score = MAXSCORE; @@ -233,55 +280,42 @@ extern char *textcat_Classify(void *handle, const char *buffer, size_t size) { score = fp_Compare(h->fprint[i], unknown, threshold); /* printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score); */ - } - candidates[i].score = score; + } + candidates[i].score = score; candidates[i].name = fp_Name(h->fprint[i]); if (score < minscore) { - minscore = score; + minscore = score; threshold = (int)((double)score * THRESHOLDVALUE); - } - } + } + } - /*** Find the best performers ***/ + /*** Find the best performers ***/ for (i = 0, cnt = 0; i < h->size; i++) { if (candidates[i].score < threshold) { if (++cnt == MAXCANDIDATES + 1) { - break; - } + break; + } memcpy(&candidates[cnt - 1], &candidates[i], sizeof(candidate_t)); - } - } + } + } - /*** The verdict ***/ + fp_Done(unknown); + /*** The verdict ***/ if (cnt == MAXCANDIDATES + 1) { - result = _TEXTCAT_RESULT_UNKOWN; - } + return TEXTCAT_RESULT_UNKOWN; + } else { - const char *plimit = result + MAXOUTPUTSIZE; - char *p = result; - qsort(candidates, cnt, sizeof(candidate_t), cmpcandidates); - - *p = '\0'; - for (i = 0; i < cnt; i++) - { - p = wg_strgmov(p, "[", plimit); - p = wg_strgmov(p, candidates[i].name, plimit); - p = wg_strgmov(p, "]", plimit); - } - } - READY: - fp_Done(unknown); - free(candidates); - return result; + return cnt; + } } extern const char *textcat_Version(void) diff --git a/src/textcat.h b/src/textcat.h index aabc74f..f335a5b 100644 --- a/src/textcat.h +++ b/src/textcat.h @@ -40,12 +40,20 @@ #define _TEXTCAT_RESULT_UNKOWN "UNKNOWN" #define _TEXTCAT_RESULT_SHORT "SHORT" +#define TEXTCAT_RESULT_UNKOWN 0 +#define TEXTCAT_RESULT_SHORT -2 #ifdef __cplusplus extern "C" { #endif +typedef struct { + int score; + const char *name; +} candidate_t; + + /** * textcat_Init() - Initialize the text classifier. 
The textfile * conffile should contain a list of fingerprint filenames and @@ -90,6 +98,30 @@ extern "C" extern char *textcat_Classify(void *handle, const char *buffer, size_t size); + +/** + * textcat_GetClassifyFullOutput() - Create a classifier output handler + */ +extern candidate_t *textcat_GetClassifyFullOutput( void *handle ); + +/** + * textcat_ReleaseClassifyFullOutput() - Free up resources for the classifier output handler + */ +extern void textcat_ReleaseClassifyFullOutput( void *handle, candidate_t *candidates ); + +/** + * textcat_ClassifyFull() - Give the most likely categories for buffer + * with length size. + * + * Returns: the number of results. + * + * Performance note: longer buffers take longer to process. However, + * for many uses it is not necessary to categorize the whole buffer. + * For language classification, a few hundred bytes will suffice. + */ +extern int textcat_ClassifyFull( void *handle, const char *buffer, size_t size, candidate_t *candidates ); + + /** * textcat_Version() - Returns a string describing the version of this classifier. */ |