summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile.am3
-rw-r--r--libexttextcat.vapi34
-rw-r--r--src/libexttextcat.map3
-rw-r--r--src/textcat.c134
-rw-r--r--src/textcat.h32
5 files changed, 156 insertions, 50 deletions
diff --git a/Makefile.am b/Makefile.am
index b72d630..912003d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -5,4 +5,7 @@ SUBDIRS = src langclass
pkgconfdir = $(libdir)/pkgconfig
pkgconf_DATA = libexttextcat.pc
+vapidir = $(datadir)/vala/vapi
+vapi_DATA = libexttextcat.vapi
+
EXTRA_DIST = ChangeLog LICENSE README README.libtextcat TODO langclass
diff --git a/libexttextcat.vapi b/libexttextcat.vapi
new file mode 100644
index 0000000..562313c
--- /dev/null
+++ b/libexttextcat.vapi
@@ -0,0 +1,34 @@
+/* libtextcat.vapi generated by vapigen-0.12, do not modify. */
+
+[CCode (cprefix = "textcat_", lower_case_cprefix = "textcat_")]
+namespace TextCat {
+ [Compact]
+ [CCode (cname="candidate_t",cheader_filename = "textcat.h")]
+ public struct candidate {
+ public weak string name;
+ public int score;
+ }
+ [Compact]
+ [CCode (cname="void",cheader_filename = "textcat.h", free_function="textcat_Done")]
+ public class Classifier {
+ [CCode (cname = "textcat_Classify", cheader_filename = "textcat.h")]
+ public unowned string classify (string buffer, size_t size);
+ [CCode (cname = "textcat_ClassifyFull", cheader_filename = "textcat.h")]
+ public int classify_full (string buffer, size_t size, candidate* candidates);
+ [CCode (cname = "textcat_GetClassifyFullOutput", cheader_filename = "textcat.h")]
+ public unowned candidate* get_classify_full_output ();
+ [CCode (cname = "textcat_ReleaseClassifyFullOutput", cheader_filename = "textcat.h")]
+ public void release_classify_full_output (candidate* candidates);
+ [CCode (cname = "textcat_Init", cheader_filename = "textcat.h")]
+ public Classifier (string conffile);
+ [CCode (cname = "textcat_InitWithPath", cheader_filename = "textcat.h")]
+ public Classifier.init_with_path (string conffile, string prefix, bool utfaware);
+
+ }
+ [CCode (cheader_filename = "textcat.h")]
+ public const int TEXTCAT_RESULT_SHORT;
+ [CCode (cheader_filename = "textcat.h")]
+ public const int TEXTCAT_RESULT_UNKOWN;
+ [CCode (cname = "textcat_Version", cheader_filename = "textcat.h")]
+ public static unowned string version ();
+}
diff --git a/src/libexttextcat.map b/src/libexttextcat.map
index 4edf79b..81785b9 100644
--- a/src/libexttextcat.map
+++ b/src/libexttextcat.map
@@ -11,6 +11,9 @@
wgmempool_strdup
special_textcat_Init
textcat_Classify
+ textcat_ClassifyFull
+ textcat_ReleaseClassifyFullOutput
+ textcat_GetClassifyFullOutput
textcat_Done
textcat_Init
textcat_Version
diff --git a/src/textcat.c b/src/textcat.c
index 2479019..41a3446 100644
--- a/src/textcat.c
+++ b/src/textcat.c
@@ -73,17 +73,11 @@ typedef struct
uint4 maxsize;
char output[MAXOUTPUTSIZE];
+ candidate_t *tmp_candidates;
} textcat_t;
-typedef struct
-{
- int score;
- const char *name;
-} candidate_t;
-
-
static int cmpcandidates(const void *a, const void *b)
{
const candidate_t *x = (const candidate_t *)a;
@@ -101,6 +95,9 @@ extern void textcat_Done(void *handle)
{
fp_Done(h->fprint[i]);
}
+ if(h->tmp_candidates != NULL) {
+ textcat_ReleaseClassifyFullOutput(h, h->tmp_candidates);
+ }
free(h->fprint);
free(h->fprint_disable);
free(h);
@@ -138,6 +135,7 @@ extern void *special_textcat_Init(const char *conffile, const char *prefix)
h->fprint = (void **)malloc(sizeof(void *) * h->maxsize);
h->fprint_disable = (unsigned char *)malloc(sizeof(unsigned char) * h->maxsize);
/* added to store the state of languages */
+ h->tmp_candidates = NULL;
while (wg_getline(line, 1024, fp))
{
char *p;
@@ -201,30 +199,79 @@ extern void *special_textcat_Init(const char *conffile, const char *prefix)
}
-extern char *textcat_Classify(void *handle, const char *buffer, size_t size)
+extern candidate_t *textcat_GetClassifyFullOutput( void *handle )
+{
+ textcat_t *h = (textcat_t *)handle;
+ return (candidate_t *) malloc( sizeof(candidate_t) * h->size );
+}
+
+extern void textcat_ReleaseClassifyFullOutput( void *handle, candidate_t *candidates )
+{
+ if(candidates != NULL) {
+ free(candidates);
+ }
+}
+
+extern char *textcat_Classify( void *handle, const char *buffer, size_t size )
{
textcat_t *h = (textcat_t *) handle;
- int minscore = MAXSCORE;
- int threshold = minscore;
char *result = h->output;
- void *unknown;
- uint4 i, cnt;
+ uint4 i, cnt;
+
+ if( h->tmp_candidates == NULL)
+ {
+ h->tmp_candidates = textcat_GetClassifyFullOutput( h );
+ }
+
+ cnt = textcat_ClassifyFull( h, buffer, size, h->tmp_candidates );
- candidate_t *candidates =
- (candidate_t *) malloc(sizeof(candidate_t) * h->size);
+ switch(cnt){
+ case TEXTCAT_RESULT_UNKOWN:
+ result = _TEXTCAT_RESULT_UNKOWN;
+ break;
+ case TEXTCAT_RESULT_SHORT:
+ result = _TEXTCAT_RESULT_SHORT;
+ break;
+ default:
+ {
+ const char *plimit = result + MAXOUTPUTSIZE;
+ char *p = result;
+
+ *p = '\0';
+ for (i = 0; i < cnt; i++)
+ {
+ p = wg_strgmov(p, "[", plimit);
+ p = wg_strgmov(p, h->tmp_candidates[i].name, plimit);
+ p = wg_strgmov(p, "]", plimit);
+ }
+ }
+ }
- unknown = fp_Init(NULL);
+ return result;
+}
+
+
+extern int textcat_ClassifyFull( void *handle, const char *buffer, size_t size, candidate_t *candidates )
+{
+ textcat_t *h = (textcat_t *)handle;
+ uint4 i, cnt = 0;
+ int minscore = MAXSCORE;
+ int threshold = minscore;
+
+ void *unknown;
+
+ unknown = fp_Init(NULL);
if (fp_Create(unknown, buffer, size, MAXNGRAMS) == 0)
{
- /*** Too little information ***/
- result = _TEXTCAT_RESULT_SHORT;
- goto READY;
- }
+ /*** Too little information ***/
+ fp_Done(unknown);
+ return TEXTCAT_RESULT_SHORT ;
+ }
- /*** Calculate the score for each category. ***/
+ /*** Calculate the score for each category. ***/
for (i = 0; i < h->size; i++)
{
- int score;
+ int score;
if (h->fprint_disable[i] & 0x0F)
{ /* if this language is disabled */
score = MAXSCORE;
@@ -233,55 +280,42 @@ extern char *textcat_Classify(void *handle, const char *buffer, size_t size)
{
score = fp_Compare(h->fprint[i], unknown, threshold);
/* printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score); */
- }
- candidates[i].score = score;
+ }
+ candidates[i].score = score;
candidates[i].name = fp_Name(h->fprint[i]);
if (score < minscore)
{
- minscore = score;
+ minscore = score;
threshold = (int)((double)score * THRESHOLDVALUE);
- }
- }
+ }
+ }
- /*** Find the best performers ***/
+ /*** Find the best performers ***/
for (i = 0, cnt = 0; i < h->size; i++)
{
if (candidates[i].score < threshold)
{
if (++cnt == MAXCANDIDATES + 1)
{
- break;
- }
+ break;
+ }
memcpy(&candidates[cnt - 1], &candidates[i], sizeof(candidate_t));
- }
- }
+ }
+ }
- /*** The verdict ***/
+ fp_Done(unknown);
+ /*** The verdict ***/
if (cnt == MAXCANDIDATES + 1)
{
- result = _TEXTCAT_RESULT_UNKOWN;
- }
+ return TEXTCAT_RESULT_UNKOWN;
+ }
else
{
- const char *plimit = result + MAXOUTPUTSIZE;
- char *p = result;
-
qsort(candidates, cnt, sizeof(candidate_t), cmpcandidates);
-
- *p = '\0';
- for (i = 0; i < cnt; i++)
- {
- p = wg_strgmov(p, "[", plimit);
- p = wg_strgmov(p, candidates[i].name, plimit);
- p = wg_strgmov(p, "]", plimit);
- }
- }
- READY:
- fp_Done(unknown);
- free(candidates);
- return result;
+ return cnt;
+ }
}
extern const char *textcat_Version(void)
diff --git a/src/textcat.h b/src/textcat.h
index aabc74f..f335a5b 100644
--- a/src/textcat.h
+++ b/src/textcat.h
@@ -40,12 +40,20 @@
#define _TEXTCAT_RESULT_UNKOWN "UNKNOWN"
#define _TEXTCAT_RESULT_SHORT "SHORT"
+#define TEXTCAT_RESULT_UNKOWN 0
+#define TEXTCAT_RESULT_SHORT -2
#ifdef __cplusplus
extern "C"
{
#endif
+typedef struct {
+ int score;
+ const char *name;
+} candidate_t;
+
+
/**
* textcat_Init() - Initialize the text classifier. The textfile
* conffile should contain a list of fingerprint filenames and
@@ -90,6 +98,30 @@ extern "C"
extern char *textcat_Classify(void *handle, const char *buffer,
size_t size);
+
+/**
+ * textcat_GetClassifyFullOutput() - Allocate a candidate_t array with one slot per category loaded in handle; free it with textcat_ReleaseClassifyFullOutput().
+ */
+extern candidate_t *textcat_GetClassifyFullOutput( void *handle );
+
+/**
+ * textcat_ReleaseClassifyFullOutput() - Free up resources for the classifier output handler
+ */
+extern void textcat_ReleaseClassifyFullOutput( void *handle, candidate_t *candidates );
+
+/**
+ * textcat_ClassifyFull() - Give the most likely categories for buffer
+ * with length size.
+ *
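+ * Returns: the number of results written to candidates, TEXTCAT_RESULT_SHORT if the buffer holds too little information, or TEXTCAT_RESULT_UNKOWN if too many candidates match.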
+ *
+ * Performance note: longer buffers take longer to process. However,
+ * for many uses it is not necessary to categorize the whole buffer.
+ * For language classification, a few hundred bytes will suffice.
+ */
+extern int textcat_ClassifyFull( void *handle, const char *buffer, size_t size, candidate_t *candidates );
+
+
/**
* textcat_Version() - Returns a string describing the version of this classifier.
*/