summaryrefslogtreecommitdiff
path: root/src/textcat.h
blob: f335a5becffe4e066a31493b2c159290d57ea444 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
#ifndef _TEXTCAT_H_
#define _TEXTCAT_H_
/* 
 * textcat.h -- routines for categorizing text
 *
 * Copyright (C) 2003 WiseGuys Internet B.V.
 *
 * THE BSD LICENSE
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the
 * distribution.
 * 
 * - Neither the name of the WiseGuys Internet B.V. nor the names of
 * its contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <stddef.h>             /* size_t, used by the classify prototypes */

#include "exttextcat-version.h"

/*
 * Special classification results.
 *
 * The _TEXTCAT_RESULT_* string constants are the texts that
 * textcat_Classify() is documented to return when the input is not
 * recognized ("UNKNOWN") or is too short for a reliable assessment
 * ("SHORT").
 *
 * The TEXTCAT_RESULT_* integer constants are the numeric counterparts.
 * NOTE(review): presumably these are the special return values of
 * textcat_ClassifyFull() -- confirm against the implementation.
 *
 * The "UNKOWN" spelling in the macro names is historical; it is kept
 * as-is because the names are part of the public interface.
 *
 * The negative value is parenthesized so that the macro always expands
 * to a single primary expression, whatever context it is used in.
 */
#define _TEXTCAT_RESULT_UNKOWN        "UNKNOWN"
#define _TEXTCAT_RESULT_SHORT         "SHORT"
#define TEXTCAT_RESULT_UNKOWN        0
#define TEXTCAT_RESULT_SHORT         (-2)

#ifdef __cplusplus
extern "C"
{
#endif

/**
 * candidate_t - One entry in the result array used by
 * textcat_GetClassifyFullOutput(), textcat_ClassifyFull() and
 * textcat_ReleaseClassifyFullOutput().
 *
 * NOTE(review): the exact meaning of the fields is not visible from
 * this header; the descriptions below are inferred from the names and
 * should be confirmed against the implementation.
 */
typedef struct {
    int score;          /* presumably the match score of this candidate */
    const char *name;   /* presumably the category identification string */
} candidate_t;


/**
 * textcat_Init() - Initialize the text classifier.
 *
 * The text file conffile should contain a list of fingerprint filenames
 * and identification strings for the categories.  The filenames should
 * be reachable from the current working directory.  The identification
 * strings are used in the classification output.
 *
 * Returns: handle on success, NULL on error.  (At the moment, the only
 * way errors can occur is when the library cannot read the conffile, or
 * one of the fingerprint files listed in it.)  Resources held by the
 * handle are freed with textcat_Done().
 *
 * This replaces the older function of the same name and has exactly the
 * same behaviour; see special_textcat_Init() below for the variant that
 * also takes a fingerprint directory prefix.
 */
    extern void *textcat_Init(const char *conffile);

/**
 * special_textcat_Init() - Initialize the text classifier with an
 * explicit fingerprint directory.
 *
 * Originally this function had only one parameter (conffile).  It was
 * modified because OOo must be able to load an alternative database.
 * Basically, prefix is the directory path where the fingerprints are
 * stored.
 *
 * NOTE(review): the return value is presumably the same as for
 * textcat_Init() (handle on success, NULL on error) -- confirm against
 * the implementation.
 */
    extern void *special_textcat_Init(const char *conffile,
                                      const char *prefix);

/**
 * textcat_Done() - Free up resources for handle.
 *
 * NOTE(review): handle is presumably a value previously returned by
 * textcat_Init() or special_textcat_Init() -- confirm whether passing
 * NULL is allowed.
 */
    extern void textcat_Done(void *handle);

/**
 * textcat_Classify() - Give the most likely categories for buffer
 * with length size.
 *
 * Returns: string containing a list of category ids, each one between
 * square brackets; "UNKNOWN" when not recognized; "SHORT" if the
 * document was too short to make a reliable assessment.  (The two
 * special strings are also available as _TEXTCAT_RESULT_UNKOWN and
 * _TEXTCAT_RESULT_SHORT.)
 *
 * NOTE(review): ownership and lifetime of the returned string are not
 * stated in this header -- confirm against the implementation before
 * freeing or retaining it.
 *
 * Performance note: longer buffers take longer to process.  However,
 * for many uses it is not necessary to categorize the whole buffer.
 * For language classification, a few hundred bytes will suffice.
 */
    extern char *textcat_Classify(void *handle, const char *buffer,
                                  size_t size);


/**
 * textcat_GetClassifyFullOutput() - Create a classifier output handler
 * for handle.
 *
 * Returns: a pointer to candidate_t storage, to be passed to
 * textcat_ClassifyFull() and later given back with
 * textcat_ReleaseClassifyFullOutput().
 *
 * NOTE(review): the number of candidate_t entries behind the returned
 * pointer and the failure behaviour are not visible from this header --
 * confirm against the implementation.
 */
extern candidate_t *textcat_GetClassifyFullOutput( void *handle );

/**
 * textcat_ReleaseClassifyFullOutput() - Free up resources for the
 * classifier output handler.
 *
 * NOTE(review): candidates is presumably a pointer previously obtained
 * from textcat_GetClassifyFullOutput() for the same handle -- confirm.
 */
extern void textcat_ReleaseClassifyFullOutput( void *handle, candidate_t *candidates );

/**
 * textcat_ClassifyFull() - Give the most likely categories for buffer
 * with length size, writing them into candidates.
 *
 * Returns: the number of results.
 *
 * NOTE(review): the special values TEXTCAT_RESULT_UNKOWN (0) and
 * TEXTCAT_RESULT_SHORT (-2) are presumably returned when the text is
 * not recognized or is too short, mirroring textcat_Classify() --
 * confirm against the implementation.  candidates is presumably the
 * storage obtained from textcat_GetClassifyFullOutput().
 *
 * Performance note: longer buffers take longer to process.  However,
 * for many uses it is not necessary to categorize the whole buffer.
 * For language classification, a few hundred bytes will suffice.
 */
extern int textcat_ClassifyFull( void *handle, const char *buffer, size_t size, candidate_t *candidates );


/**
 * textcat_Version() - Return a string describing the version of this
 * classifier.
 *
 * The result is declared const char *, so callers must not modify it.
 */
    extern const char *textcat_Version(void);

#ifdef __cplusplus
}
#endif
#endif

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */