diff options
author | Rüdiger Timm <rt@openoffice.org> | 2008-04-11 06:21:37 +0000 |
---|---|---|
committer | Rüdiger Timm <rt@openoffice.org> | 2008-04-11 06:21:37 +0000 |
commit | 8589b8ab91264d3a1ba9094b7f6f2148b9019cca (patch) | |
tree | 8d360a41877a66e783585a4db8f648fe32df44ca /libtextcat | |
parent | 85d79126c11f934c707e63847403c2d70edee626 (diff) |
INTEGRATION: CWS changefileheader (1.7.6); FILE MERGED
2008/03/31 13:19:05 rt 1.7.6.1: #i87441# Change license header to LPGL v3.
Diffstat (limited to 'libtextcat')
-rw-r--r-- | libtextcat/libtextcat-2.2.patch | 3863 |
1 files changed, 1587 insertions, 2276 deletions
diff --git a/libtextcat/libtextcat-2.2.patch b/libtextcat/libtextcat-2.2.patch index ef574892c170..c9ce4add875c 100644 --- a/libtextcat/libtextcat-2.2.patch +++ b/libtextcat/libtextcat-2.2.patch @@ -1,2276 +1,1587 @@ -*** misc/libtextcat-2.2/configure Thu May 22 13:39:55 2003 ---- misc/build/libtextcat-2.2/configure Tue Nov 27 13:51:14 2007 -*************** -*** 5391,5397 **** - allow_undefined_flag= - no_undefined_flag= - need_lib_prefix=unknown -! need_version=unknown - # when you set need_version to no, make sure it does not cause -set_version - # flags to be left without arguments - archive_cmds= ---- 5391,5398 ---- - allow_undefined_flag= - no_undefined_flag= - need_lib_prefix=unknown -! #need_version=unknown -! need_version=no - # when you set need_version to no, make sure it does not cause -set_version - # flags to be left without arguments - archive_cmds= -*************** -*** 5785,5791 **** - # cross-compilation, but unfortunately the echo tests do not - # yet detect zsh echo's removal of \ escapes. Also zsh mangles - # `"' quotes if we put them in here... so don't! -! archive_cmds='$CC -r -keep_private_externs -nostdlib -o ${lib}-master.o $libobjs && $CC $(test .$module = .yes && echo -bundle || echo -dynamiclib) $allow_undefined_flag -o $lib ${lib}-master.o $deplibs$linker_flags $(test .$module != .yes && echo -install_name $rpath/$soname $verstring)' - # We need to add '_' to the symbols in $export_symbols first - #archive_expsym_cmds="$archive_cmds"' && strip -s $export_symbols' - hardcode_direct=yes ---- 5786,5792 ---- - # cross-compilation, but unfortunately the echo tests do not - # yet detect zsh echo's removal of \ escapes. Also zsh mangles - # `"' quotes if we put them in here... so don't! -! 
archive_cmds='$CC -r -keep_private_externs -nostdlib -o ${lib}-master.o $libobjs && $CC $(test .$module = .yes && echo -bundle || echo -dynamiclib) $allow_undefined_flag -o $lib ${lib}-master.o $deplibs$compiler_flags $(test .$module != .yes && echo -install_name $rpath/$soname $verstring)' - # We need to add '_' to the symbols in $export_symbols first - #archive_expsym_cmds="$archive_cmds"' && strip -s $export_symbols' - hardcode_direct=yes -*************** -*** 6280,6286 **** - ;; - - freebsd*) -! objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo aout` - version_type=freebsd-$objformat - case $version_type in - freebsd-elf*) ---- 6281,6287 ---- - ;; - - freebsd*) -! objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo elf` - version_type=freebsd-$objformat - case $version_type in - freebsd-elf*) -*** misc/libtextcat-2.2/src/Makefile.in Thu May 22 13:39:52 2003 ---- misc/build/libtextcat-2.2/src/Makefile.in Tue Nov 27 13:49:17 2007 -*************** -*** 124,143 **** - target_vendor = @target_vendor@ - AUTOMAKE_OPTIONS = 1.4 foreign - -! WARNS = -W -Wall -Wshadow -Wpointer-arith -! IFLAGS = -! FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE - VERBOSE = -DVERBOSE - AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS) - AM_LDFLAGS = -g - - noinst_HEADERS = \ -! common.h constants.h fingerprint.h textcat.h wg_mempool.h - - - lib_LTLIBRARIES = libtextcat.la - libtextcat_la_SOURCES = \ -! common.c fingerprint.c textcat.c wg_mempool.c - - - bin_PROGRAMS = createfp ---- 124,143 ---- - target_vendor = @target_vendor@ - AUTOMAKE_OPTIONS = 1.4 foreign - -! #WARNS = -W -Wall -Wshadow -Wpointer-arith -! IFLAGS = -! #FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE - VERBOSE = -DVERBOSE - AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS) - AM_LDFLAGS = -g - - noinst_HEADERS = \ -! common.h constants.h fingerprint.h textcat.h wg_mempool.h utf8misc.h - - - lib_LTLIBRARIES = libtextcat.la - libtextcat_la_SOURCES = \ -! 
common.c fingerprint.c textcat.c wg_mempool.c utf8misc.c - - - bin_PROGRAMS = createfp -*************** -*** 156,162 **** - libtextcat_la_LDFLAGS = - libtextcat_la_LIBADD = - am_libtextcat_la_OBJECTS = common.lo fingerprint.lo textcat.lo \ -! wg_mempool.lo - libtextcat_la_OBJECTS = $(am_libtextcat_la_OBJECTS) - bin_PROGRAMS = createfp$(EXEEXT) - noinst_PROGRAMS = testtextcat$(EXEEXT) ---- 156,162 ---- - libtextcat_la_LDFLAGS = - libtextcat_la_LIBADD = - am_libtextcat_la_OBJECTS = common.lo fingerprint.lo textcat.lo \ -! wg_mempool.lo utf8misc.lo - libtextcat_la_OBJECTS = $(am_libtextcat_la_OBJECTS) - bin_PROGRAMS = createfp$(EXEEXT) - noinst_PROGRAMS = testtextcat$(EXEEXT) -*************** -*** 177,183 **** - @AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/common.Plo ./$(DEPDIR)/createfp.Po \ - @AMDEP_TRUE@ ./$(DEPDIR)/fingerprint.Plo \ - @AMDEP_TRUE@ ./$(DEPDIR)/testtextcat.Po ./$(DEPDIR)/textcat.Plo \ -! @AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo - COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ - $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) - LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \ ---- 177,184 ---- - @AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/common.Plo ./$(DEPDIR)/createfp.Po \ - @AMDEP_TRUE@ ./$(DEPDIR)/fingerprint.Plo \ - @AMDEP_TRUE@ ./$(DEPDIR)/testtextcat.Po ./$(DEPDIR)/textcat.Plo \ -! @AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo \ -! @AMDEP_TRUE@ ./$(DEPDIR)/utf8misc.Plo - COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ - $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) - LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \ -*************** -*** 213,219 **** - @rm -f stamp-h1 - cd $(top_builddir) && $(SHELL) ./config.status src/config.h - -! $(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4) - cd $(top_srcdir) && $(AUTOHEADER) - touch $(srcdir)/config.h.in - ---- 214,220 ---- - @rm -f stamp-h1 - cd $(top_builddir) && $(SHELL) ./config.status src/config.h - -! 
$(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4) - cd $(top_srcdir) && $(AUTOHEADER) - touch $(srcdir)/config.h.in - -*************** -*** 247,254 **** - echo "rm -f \"$${dir}/so_locations\""; \ - rm -f "$${dir}/so_locations"; \ - done -! libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES) -! $(LINK) -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS) - binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) - install-binPROGRAMS: $(bin_PROGRAMS) - @$(NORMAL_INSTALL) ---- 248,255 ---- - echo "rm -f \"$${dir}/so_locations\""; \ - rm -f "$${dir}/so_locations"; \ - done -! libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES) -! $(LINK) -avoid-version -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS) - binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) - install-binPROGRAMS: $(bin_PROGRAMS) - @$(NORMAL_INSTALL) -*************** -*** 285,294 **** - echo " rm -f $$p $$f"; \ - rm -f $$p $$f ; \ - done -! createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES) - @rm -f createfp$(EXEEXT) - $(LINK) $(createfp_LDFLAGS) $(createfp_OBJECTS) $(createfp_LDADD) $(LIBS) -! testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES) - @rm -f testtextcat$(EXEEXT) - $(LINK) $(testtextcat_LDFLAGS) $(testtextcat_OBJECTS) $(testtextcat_LDADD) $(LIBS) - ---- 286,295 ---- - echo " rm -f $$p $$f"; \ - rm -f $$p $$f ; \ - done -! createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES) - @rm -f createfp$(EXEEXT) - $(LINK) $(createfp_LDFLAGS) $(createfp_OBJECTS) $(createfp_LDADD) $(LIBS) -! 
testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES) - @rm -f testtextcat$(EXEEXT) - $(LINK) $(testtextcat_LDFLAGS) $(testtextcat_OBJECTS) $(testtextcat_LDADD) $(LIBS) - -*************** -*** 304,309 **** ---- 305,311 ---- - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/testtextcat.Po@am__quote@ - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/textcat.Plo@am__quote@ - @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/wg_mempool.Plo@am__quote@ -+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utf8misc.Plo@am__quote@ - - distclean-depend: - -rm -rf ./$(DEPDIR) -*** misc/libtextcat-2.2/src/common.c Thu May 22 13:32:43 2003 ---- misc/build/libtextcat-2.2/src/common.c Tue Nov 27 13:49:17 2007 -*************** -*** 3,25 **** - * - * Copyright (c) 2003, WiseGuys Internet B.V. - * All rights reserved. -! * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: -! * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. -! * - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the - * distribution. -! * - * - Neither the name of the WiseGuys Internet B.V. nor the names of - * its contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. -! * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ---- 3,25 ---- - * - * Copyright (c) 2003, WiseGuys Internet B.V. - * All rights reserved. -! 
* - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: -! * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. -! * - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the - * distribution. -! * - * - Neither the name of the WiseGuys Internet B.V. nor the names of - * its contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. -! * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -*************** -*** 114,124 **** - wgmem_error( "Error while strduping %u bytes.\n", strlen(s) ); - } - -! return( result ); - } - -! extern void* wg_realloc( void *ptr, size_t size ) -! { - void *result; - - if (!size) { ---- 114,124 ---- - wgmem_error( "Error while strduping %u bytes.\n", strlen(s) ); - } - -! return( result ); - } - -! extern void* wg_realloc( void *ptr, size_t size ) -! { - void *result; - - if (!size) { -*************** -*** 131,137 **** - wgmem_error( "Error while reallocing %u bytes.\n", size ); - } - -! return( result ); - } - - extern void wg_free( void *mem ) ---- 131,137 ---- - wgmem_error( "Error while reallocing %u bytes.\n", size ); - } - -! return( result ); - } - - extern void wg_free( void *mem ) -*************** -*** 148,159 **** - if ( fgets(line, size, fp) == NULL ) { - return NULL; - } -! - /** kill term null **/ - if ( (p = strpbrk( line, "\n\r" )) ) { - *p = '\0'; -! } -! - return line; - } - ---- 148,159 ---- - if ( fgets(line, size, fp) == NULL ) { - return NULL; - } -! 
- /** kill term null **/ - if ( (p = strpbrk( line, "\n\r" )) ) { - *p = '\0'; -! } -! - return line; - } - -*************** -*** 164,202 **** - * - * ARGUMENTS: - * - result: -! * - * After the split, this array contains pointers to the start of each - * detected segment. Must be preallocated and at least as large as - * maxsegments. The pointers point into the dest buffer. -! * -! * - dest: -! * - * String into which result points as an index. Must be preallocated, and - * at least as big as src. You can use src as dest, but in that case src - * is overwritten! -! * -! * - src: -! * - * The string to split. Sequences of whitespace are treated as separators, unless - * escaped. There are two ways to escape: by using single quotes (anything - * between single quotes is treated as one segment), or by using a backslash - * to escape the next character. The backslash escape works inside quotation - * as well. -! * - * Example: -! * - * "It\'s very\ easy 'to use WiseGuys\' wg_split()' function" is split into: -! * - * "It's" - * "very easy" - * "to use WiseGuys' wg_split()" - * "function" -! * -! * - maxsegments: -! * - * The maximum number of segments. If the splitter runs out of segments, - * the remainder of the string is stored in the last segment. -! * - * RETURN VALUE: - * The number of segments found. - */ ---- 164,202 ---- - * - * ARGUMENTS: - * - result: -! * - * After the split, this array contains pointers to the start of each - * detected segment. Must be preallocated and at least as large as - * maxsegments. The pointers point into the dest buffer. -! * -! * - dest: -! * - * String into which result points as an index. Must be preallocated, and - * at least as big as src. You can use src as dest, but in that case src - * is overwritten! -! * -! * - src: -! * - * The string to split. Sequences of whitespace are treated as separators, unless - * escaped. 
There are two ways to escape: by using single quotes (anything - * between single quotes is treated as one segment), or by using a backslash - * to escape the next character. The backslash escape works inside quotation - * as well. -! * - * Example: -! * - * "It\'s very\ easy 'to use WiseGuys\' wg_split()' function" is split into: -! * - * "It's" - * "very easy" - * "to use WiseGuys' wg_split()" - * "function" -! * -! * - maxsegments: -! * - * The maximum number of segments. If the splitter runs out of segments, - * the remainder of the string is stored in the last segment. -! * - * RETURN VALUE: - * The number of segments found. - */ -*************** -*** 218,229 **** - switch (state) { - case 0: - /*** Skip spaces ***/ -! while ( isspace((int) *p) ) { - p++; - } - state = 1; - -! case 1: - /*** Start segment ***/ - result[cnt] = w; - cnt++; ---- 218,229 ---- - switch (state) { - case 0: - /*** Skip spaces ***/ -! while ( isspace((unsigned char) *p) ) { - p++; - } - state = 1; - -! case 1: - /*** Start segment ***/ - result[cnt] = w; - cnt++; -*************** -*** 232,243 **** - case 2: - /*** Unquoted segment ***/ - while (*p) { -! if ( isspace((int) *p) ) { - *w++ = '\0'; - p++; - state = 0; - break; -! } - else if ( *p == '\'' ) { - /*** Start quotation ***/ - p++; ---- 232,243 ---- - case 2: - /*** Unquoted segment ***/ - while (*p) { -! if ( isspace((unsigned char) *p) ) { - *w++ = '\0'; - p++; - state = 0; - break; -! 
} - else if ( *p == '\'' ) { - /*** Start quotation ***/ - p++; -*************** -*** 292,308 **** - } - - - extern void wg_timerstart(wgtimer_t *t) - { -- #ifdef HAVE_GETTIMEOFDAY - gettimeofday( &(t->start), NULL ); -- #endif - } - - - extern uint4 wg_timerstop(wgtimer_t *t) - { -- #ifdef HAVE_GETTIMEOFDAY - uint4 result; - gettimeofday( &(t->stop), NULL ); - result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 + ---- 292,308 ---- - } - - -+ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ - extern void wg_timerstart(wgtimer_t *t) - { - gettimeofday( &(t->start), NULL ); - } -+ #endif /* TL : no struct timeval under Win32 */ - - -+ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ - extern uint4 wg_timerstop(wgtimer_t *t) - { - uint4 result; - gettimeofday( &(t->stop), NULL ); - result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 + -*************** -*** 312,336 **** - t->start.tv_usec = t->stop.tv_usec; - - return result; -- #else -- return 0; -- #endif - } - - - /** - * wg_strgmov -- a guarded strcpy() variation -! * - * copies src to dest (including terminating zero), and returns - * pointer to position of terminating zero in dest. The function is - * guaranteed not to write past destlimit. If the copy couldn't be -! * finished, the function returns NULL after restoring the first -! * character in dest for your convenience (since this is usually a zero). - */ - char *wg_strgmov( char *dest, const char *src, const char *destlimit ) - { - char tmp, *w; -! - if ( !dest || dest >= destlimit ) { - return NULL; - } ---- 312,334 ---- - t->start.tv_usec = t->stop.tv_usec; - - return result; - } -+ #endif /* TL : no struct timeval under Win32 */ - - - /** - * wg_strgmov -- a guarded strcpy() variation -! * - * copies src to dest (including terminating zero), and returns - * pointer to position of terminating zero in dest. The function is - * guaranteed not to write past destlimit. If the copy couldn't be -! 
* finished, the function returns NULL after restoring the first -! * character in dest for your convenience (since this is usually a zero). - */ - char *wg_strgmov( char *dest, const char *src, const char *destlimit ) - { - char tmp, *w; -! - if ( !dest || dest >= destlimit ) { - return NULL; - } -*************** -*** 355,361 **** - } - - /* -! * wg_trim() -- remove whitespace surrounding a string. - * - * Example: " bla bla bla " becomes "bla bla bla" after trimming. - * ---- 353,359 ---- - } - - /* -! * wg_trim() -- remove whitespace surrounding a string. - * - * Example: " bla bla bla " becomes "bla bla bla" after trimming. - * -*************** -*** 373,384 **** - char *lastnonspace = &dest[-1]; - const char *p = src; - char *w = dest; -! -! while ( isspace((int)*p) ) { - p++; - } - while (*p) { -! if ( !isspace((int)*p) ) { - lastnonspace = w; - } - *w++ = *p++; ---- 371,382 ---- - char *lastnonspace = &dest[-1]; - const char *p = src; - char *w = dest; -! -! while ( isspace((unsigned char)*p) ) { - p++; - } - while (*p) { -! if ( !isspace((unsigned char)*p) ) { - lastnonspace = w; - } - *w++ = *p++; -*** misc/libtextcat-2.2/src/common.h Thu May 22 15:02:29 2003 ---- misc/build/libtextcat-2.2/src/common.h Tue Nov 27 13:49:17 2007 -*************** -*** 1,28 **** - #ifndef _COMMON_H_ - #define _COMMON_H_ - /** -! * common.h -- a mixed bag of helper functions - * - * Copyright (C) 2003 WiseGuys Internet B.V. - * - * THE BSD LICENSE -! * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: -! * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. -! * - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the - * distribution. -! 
* - * - Neither the name of the WiseGuys Internet B.V. nor the names of - * its contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. -! * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ---- 1,28 ---- - #ifndef _COMMON_H_ - #define _COMMON_H_ - /** -! * common.h -- a mixed bag of helper functions - * - * Copyright (C) 2003 WiseGuys Internet B.V. - * - * THE BSD LICENSE -! * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: -! * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. -! * - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the - * distribution. -! * - * - Neither the name of the WiseGuys Internet B.V. nor the names of - * its contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. -! 
* - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -*************** -*** 86,95 **** ---- 86,97 ---- - typedef char boole; - #endif - -+ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ - typedef struct wgtimer_s { - struct timeval start; - struct timeval stop; - } wgtimer_t; -+ #endif /* TL : no struct timeval under Win32 */ - - - extern void *wg_malloc( size_t size ); -*************** -*** 101,113 **** - - extern char *wg_getline( char *line, int size, FILE *fp ); - - extern void wg_timerstart(wgtimer_t *t); - extern uint4 wg_timerstop(wgtimer_t *t); - - extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments ); - extern char *wg_strgmov( char *dest, const char *src, const char *destlimit ); - extern char *wg_trim( char *dest, const char *src ); - -! - #endif - ---- 103,117 ---- - - extern char *wg_getline( char *line, int size, FILE *fp ); - -+ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ - extern void wg_timerstart(wgtimer_t *t); - extern uint4 wg_timerstop(wgtimer_t *t); -+ #endif /* TL : no struct timeval under Win32 */ - - extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments ); - extern char *wg_strgmov( char *dest, const char *src, const char *destlimit ); - extern char *wg_trim( char *dest, const char *src ); - -! - #endif - -*** misc/libtextcat-2.2/src/constants.h Thu May 22 13:32:43 2003 ---- misc/build/libtextcat-2.2/src/constants.h Tue Nov 27 13:49:17 2007 -*************** -*** 39,44 **** ---- 39,46 ---- - */ - #include <limits.h> - -+ #define _UTF8_ -+ - #define DESCRIPTION "out of place" - - /* Reported matches are those fingerprints with a score less than best -*************** -*** 59,72 **** - /* Maximum number of n-grams in a fingerprint */ - #define MAXNGRAMS 400 - -! 
/* Maximum size of an n-gram? */ -! #define MAXNGRAMSIZE 5 - - /* Which characters are not acceptable in n-grams? */ -! #define INVALID(c) (isspace((int)c) || isdigit((int)c)) - - /* Minimum size (in characters) for accepting a document */ -! #define MINDOCSIZE 25 - - /* Maximum penalty for missing an n-gram in fingerprint */ - #define MAXOUTOFPLACE 400 ---- 61,81 ---- - /* Maximum number of n-grams in a fingerprint */ - #define MAXNGRAMS 400 - -! /* Maximum number of character of an n-gram? */ -! #define MAXNGRAMSYMBOL 5 - -+ /* Maximum size of the string representing an n-gram (must be greater than number of symbol) */ -+ #ifdef _UTF8_ -+ #define MAXNGRAMSIZE 20 -+ #else -+ #define MAXNGRAMSIZE MAXNGRAMSYMBOL -+ #endif -+ - /* Which characters are not acceptable in n-grams? */ -! #define INVALID(c) (isspace((unsigned char)c) || isdigit((unsigned char)c)) - - /* Minimum size (in characters) for accepting a document */ -! #define MINDOCSIZE 6 - - /* Maximum penalty for missing an n-gram in fingerprint */ - #define MAXOUTOFPLACE 400 -*************** -*** 76,79 **** ---- 85,91 ---- - - #define MAXSCORE INT_MAX - -+ /* where the fingerprints files are stored */ -+ #define DEFAULT_FINGERPRINTS_PATH "" -+ - #endif -*** misc/libtextcat-2.2/src/fingerprint.c Thu May 22 13:32:43 2003 ---- misc/build/libtextcat-2.2/src/fingerprint.c Tue Nov 27 13:49:18 2007 -*************** -*** 6,28 **** - * All rights reserved. - * - * THE BSD LICENSE -! * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: -! * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. -! * - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the - * distribution. -! 
* - * - Neither the name of the WiseGuys Internet B.V. nor the names of - * its contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. -! * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ---- 6,28 ---- - * All rights reserved. - * - * THE BSD LICENSE -! * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: -! * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. -! * - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the - * distribution. -! * - * - Neither the name of the WiseGuys Internet B.V. nor the names of - * its contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. -! * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -*************** -*** 51,57 **** - * The reason why we go through the trouble of doing a partial - * (heap)sort is that a full quicksort behaves horribly on the data: - * most n-grams have a very low count, resulting in a data set in -! * nearly-sorted order. This causes quicksort to behave very badly. - * Heapsort, on the other hand, behaves handsomely: worst case is - * Mlog(N) for M n-grams filtered through a N-sized heap. 
- * ---- 51,57 ---- - * The reason why we go through the trouble of doing a partial - * (heap)sort is that a full quicksort behaves horribly on the data: - * most n-grams have a very low count, resulting in a data set in -! * nearly-sorted order. This causes quicksort to behave very badly. - * Heapsort, on the other hand, behaves handsomely: worst case is - * Mlog(N) for M n-grams filtered through a N-sized heap. - * -*************** -*** 63,68 **** ---- 63,72 ---- - * - put table/heap datastructure in a separate file. - */ - -+ #ifndef _UTF8_ -+ #define _UTF8_ -+ #endif -+ - #include "config.h" - #include <stdio.h> - #ifdef HAVE_STDLIB_H -*************** -*** 80,89 **** ---- 84,95 ---- - #include "wg_mempool.h" - #include "constants.h" - -+ #include "utf8misc.h" - - #define TABLESIZE (1<<TABLEPOW) - #define TABLEMASK ((TABLESIZE)-1) - -+ - typedef struct { - - sint2 rank; -*************** -*** 96,102 **** - const char *name; - ngram_t *fprint; - uint4 size; -! - } fp_t; - - typedef struct entry_s { ---- 102,108 ---- - const char *name; - ngram_t *fprint; - uint4 size; -! - } fp_t; - - typedef struct entry_s { -*************** -*** 105,117 **** - struct entry_s *next; - } entry_t; - -! typedef struct table_s { - void *pool; - entry_t **table; - entry_t *heap; - - struct table_s *next; -! - uint4 heapsize; - uint4 size; - } table_t; ---- 111,123 ---- - struct entry_s *next; - } entry_t; - -! typedef struct table_s { - void *pool; - entry_t **table; - entry_t *heap; - - struct table_s *next; -! - uint4 heapsize; - uint4 size; - } table_t; -*************** -*** 122,128 **** - * fast and furious little hash function - * - * (Note that we could use some kind of rolling checksum, and update it -! * during n-gram construction) - */ - static uint4 simplehash( const char *p, int len ) - { ---- 128,134 ---- - * fast and furious little hash function - * - * (Note that we could use some kind of rolling checksum, and update it -! 
* during n-gram construction) - */ - static uint4 simplehash( const char *p, int len ) - { -*************** -*** 134,162 **** - } - - -- /* checks if n-gram lex is a prefix of key and of length len */ -- inline int issame( char *lex, char *key, int len ) -- { -- int i; -- for (i=0; i<len; i++) { -- if ( key[i] != lex[i] ) { -- return 0; -- } -- } -- if ( lex[i] != 0 ) { -- return 0; -- } -- return 1; -- } - -- - /* increases frequency of ngram(p,len) */ -! static inline int increasefreq( table_t *t, char *p, int len ) -! { -! uint4 hash = simplehash( p, len ) & TABLEMASK; - entry_t *entry = t->table[ hash ]; -! -! while ( entry ) { - if ( issame( entry->str, p, len ) ) { - /*** Found it! ***/ - entry->cnt++; ---- 140,153 ---- - } - - - - /* increases frequency of ngram(p,len) */ -! static int increasefreq( table_t *t, char *p, int len ) -! { -! uint4 hash = simplehash( p, len ) & TABLEMASK; - entry_t *entry = t->table[ hash ]; -! -! while ( entry ) { - if ( issame( entry->str, p, len ) ) { - /*** Found it! ***/ - entry->cnt++; -*************** -*** 168,174 **** - } - - /*** Not found, so create ***/ -! entry = wgmempool_alloc( t->pool, sizeof(entry_t) ); - strcpy( entry->str, p ); - entry->cnt = 1; - ---- 159,165 ---- - } - - /*** Not found, so create ***/ -! entry = (entry_t*)(wgmempool_alloc( t->pool, sizeof(entry_t) )); - strcpy( entry->str, p ); - entry->cnt = 1; - -*************** -*** 181,192 **** - #if 0 - - /* looks up ngram(p,len) */ -! static entry_t *findfreq( table_t *t, char *p, int len ) -! { -! uint4 hash = simplehash( p, len ) & TABLEMASK; - entry_t *entry = t->table[ hash ]; -! -! while ( entry ) { - if ( issame( entry->str, p, len ) ) { - return entry; - } ---- 172,183 ---- - #if 0 - - /* looks up ngram(p,len) */ -! static entry_t *findfreq( table_t *t, char *p, int len ) -! { -! uint4 hash = simplehash( p, len ) & TABLEMASK; - entry_t *entry = t->table[ hash ]; -! -! 
while ( entry ) { - if ( issame( entry->str, p, len ) ) { - return entry; - } -*************** -*** 219,225 **** - #define GREATER(x,y) ((x).cnt > (y).cnt) - #define LESS(x,y) ((x).cnt < (y).cnt) - -! inline static void siftup( table_t *t, unsigned int child ) - { - entry_t *heap = t->heap; - unsigned int parent = (child-1) >> 1; ---- 210,216 ---- - #define GREATER(x,y) ((x).cnt > (y).cnt) - #define LESS(x,y) ((x).cnt < (y).cnt) - -! static void siftup( table_t *t, unsigned int child ) - { - entry_t *heap = t->heap; - unsigned int parent = (child-1) >> 1; -*************** -*** 241,247 **** - } - - -! inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) - { - entry_t *heap = t->heap; - unsigned int child = parent*2 + 1; ---- 232,238 ---- - } - - -! static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) - { - entry_t *heap = t->heap; - unsigned int child = parent*2 + 1; -*************** -*** 273,279 **** - if (t->size < t->heapsize) { - memcpy( &(heap[t->size]), item, sizeof(entry_t)); - siftup( t, t->size ); -! t->size++; - return 0; - } - ---- 264,270 ---- - if (t->size < t->heapsize) { - memcpy( &(heap[t->size]), item, sizeof(entry_t)); - siftup( t, t->size ); -! t->size++; - return 0; - } - -*************** -*** 316,333 **** - - /*** Fill result heap ***/ - for (i=0; i<TABLESIZE; i++) { -! entry_t *p = t->table[i]; - while (p) { - heapinsert(t, p); - p = p->next; - } -! } - return 1; - } - - - static table_t *inittable(uint4 maxngrams) -! { - table_t *result = (table_t *)wg_zalloc( sizeof(table_t) ); - result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE ); - result->pool = wgmempool_Init( 10000, 10 ); ---- 307,324 ---- - - /*** Fill result heap ***/ - for (i=0; i<TABLESIZE; i++) { -! entry_t *p = t->table[i]; - while (p) { - heapinsert(t, p); - p = p->next; - } -! } - return 1; - } - - - static table_t *inittable(uint4 maxngrams) -! 
{ - table_t *result = (table_t *)wg_zalloc( sizeof(table_t) ); - result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE ); - result->pool = wgmempool_Init( 10000, 10 ); -*************** -*** 347,353 **** - wgmempool_Done(t->pool); - wg_free(t->table); - wg_free(t->heap); -! wg_free(t); - } - - ---- 338,344 ---- - wgmempool_Done(t->pool); - wg_free(t->table); - wg_free(t->heap); -! wg_free(t); - } - - -*************** -*** 354,360 **** - extern void *fp_Init(const char *name) - { - fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) ); -! - if ( name ) { - h->name = wg_strdup(name); - } ---- 345,351 ---- - extern void *fp_Init(const char *name) - { - fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) ); -! - if ( name ) { - h->name = wg_strdup(name); - } -*************** -*** 458,478 **** - return dest; - } - -! - static void createngramtable( table_t *t, const char *buf ) - { - char n[MAXNGRAMSIZE+1]; - const char *p = buf; - int i; - - /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/ -! for (;;p++) { - -! const char *q = p; - char *m = n; - - /*** First char may be an underscore ***/ -! *m++ = *q++; - *m = '\0'; - - increasefreq( t, n, 1 ); ---- 449,475 ---- - return dest; - } - -! /** -! * this function extract all n-gram from past buffer and put them into the table "t" -! * [modified] by Jocelyn Merand to accept utf-8 multi-character symbols to be used in OpenOffice -! */ - static void createngramtable( table_t *t, const char *buf ) - { - char n[MAXNGRAMSIZE+1]; - const char *p = buf; - int i; -+ int pointer = 0; - - /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/ -! while(1) { - -! const char *q = &p[pointer]; /*[modified] previously p++ above (for(;;p++)) now, it's pointer wich is increased so we have to get the new pointer on the buffer*/ - char *m = n; - - /*** First char may be an underscore ***/ -! int decay = charcopy(q, m); /*[modified] previously *q++ = *m++*/ -! 
q = &(p[pointer+decay]); /*[modified] the old copying method do not manage multi-character symbols*/ -! m += decay; /*[modified]*/ - *m = '\0'; - - increasefreq( t, n, 1 ); -*************** -*** 482,500 **** - } - - /*** Let the compiler unroll this ***/ -! for ( i=2; i<=MAXNGRAMSIZE; i++) { - -! *m++ = *q; - *m = '\0'; - - increasefreq( t, n, i ); - - if ( *q == '_' ) break; -! q++; - if ( *q == '\0' ) { - return; - } - } - } - return; - } ---- 479,500 ---- - } - - /*** Let the compiler unroll this ***/ -! for ( i=2; i<=MAXNGRAMSYMBOL; i++) { - -! decay = charcopy(q, m); /*[modified] like above*/ -! m += decay; - *m = '\0'; - - increasefreq( t, n, i ); - - if ( *q == '_' ) break; -! q += decay; - if ( *q == '\0' ) { - return; - } - } -+ -+ pointer = nextcharstart(p,pointer); /*[modified] p[pointer] must point on the next start of symbol, but whith utf next start is not surely next char*/ - } - return; - } -*************** -*** 514,520 **** - { - ngram_t *x = (ngram_t *)a; - ngram_t *y = (ngram_t *)b; -! - return mystrcmp( x->str, y->str ); - } - ---- 514,520 ---- - { - ngram_t *x = (ngram_t *)a; - ngram_t *y = (ngram_t *)b; -! - return mystrcmp( x->str, y->str ); - } - -*************** -*** 522,533 **** - { - ngram_t *x = (ngram_t *)a; - ngram_t *y = (ngram_t *)b; -! - return x->rank - y->rank; - } - - /** -! * Create a fingerprint: - * - record the frequency of each unique n-gram in a hash table - * - take the most frequent n-grams - * - sort them alphabetically, recording their relative rank ---- 522,533 ---- - { - ngram_t *x = (ngram_t *)a; - ngram_t *y = (ngram_t *)b; -! - return x->rank - y->rank; - } - - /** -! * Create a fingerprint: - * - record the frequency of each unique n-gram in a hash table - * - take the most frequent n-grams - * - sort them alphabetically, recording their relative rank -*************** -*** 544,563 **** - } - - /*** Throw out all invalid chars ***/ -! 
tmp = prepbuffer( buffer, bufsize ); - if ( tmp == NULL ) { - return 0; - } -- - h = (fp_t*)handle; - t = inittable(maxngrams); - - /*** Create a hash table containing n-gram counts ***/ - createngramtable(t, tmp); -! - /*** Take the top N n-grams and add them to the profile ***/ -! table2heap(t); -! maxngrams = WGMIN( maxngrams, t->size ); - - h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams ); - h->size = maxngrams; ---- 544,564 ---- - } - - /*** Throw out all invalid chars ***/ -! tmp = prepbuffer( buffer, bufsize ); -! /*printf("Cleaned buffer : %s\n",tmp);*/ - if ( tmp == NULL ) { - return 0; - } - h = (fp_t*)handle; - t = inittable(maxngrams); -+ /*printf("Table initialized\n");*/ - - /*** Create a hash table containing n-gram counts ***/ - createngramtable(t, tmp); -! /*printf("Table created\n");*/ - /*** Take the top N n-grams and add them to the profile ***/ -! table2heap(t); -! maxngrams = WGMIN( maxngrams, t->size ); - - h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams ); - h->size = maxngrams; -*************** -*** 568,574 **** - entry_t tmp2; - - heapextract(t, &tmp2); -! - /*** the string and its rank is all we need ***/ - strcpy( h->fprint[i].str, tmp2.str ); - h->fprint[i].rank = i; ---- 569,575 ---- - entry_t tmp2; - - heapextract(t, &tmp2); -! - /*** the string and its rank is all we need ***/ - strcpy( h->fprint[i].str, tmp2.str ); - h->fprint[i].rank = i; -*************** -*** 578,584 **** - wg_free(tmp); - - /*** Sort n-grams alphabetically, for easy comparison ***/ -! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); - return 1; - } - ---- 579,585 ---- - wg_free(tmp); - - /*** Sort n-grams alphabetically, for easy comparison ***/ -! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); - return 1; - } - -*************** -*** 608,614 **** - #endif - return 0; - } -! 
- h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t)); - - while (cnt < maxngrams && wg_getline(line,1024,fp)) { ---- 609,615 ---- - #endif - return 0; - } -! - h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t)); - - while (cnt < maxngrams && wg_getline(line,1024,fp)) { -*************** -*** 635,641 **** - h->size = cnt; - - /*** Sort n-grams, for easy comparison later on ***/ -! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); - - fclose(fp); - ---- 636,642 ---- - h->size = cnt; - - /*** Sort n-grams, for easy comparison later on ***/ -! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); - - fclose(fp); - -*************** -*** 648,661 **** - { - uint4 i; - fp_t *h = (fp_t *)handle; -! ngram_t *tmp = wg_malloc( sizeof(ngram_t) * h->size ); -! - /*** Make a temporary and sort it on rank ***/ - memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) ); -! qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank ); - - for (i=0; i<h->size; i++) { -! fprintf( fp, "%s\n", tmp[i].str ); - } - wg_free( tmp ); - } ---- 649,663 ---- - { - uint4 i; - fp_t *h = (fp_t *)handle; -! ngram_t *tmp = (ngram_t*)wg_malloc( sizeof(ngram_t) * h->size ); -! - /*** Make a temporary and sort it on rank ***/ - memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) ); -! qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank ); - - for (i=0; i<h->size; i++) { -! /*fprintf( fp, "%s\t%i\n", tmp[i].str, tmp[i].rank );*/ -! fprintf( fp, "%s\n", tmp[i].str); - } - wg_free( tmp ); - } -*************** -*** 669,675 **** - uint4 i = 0; - uint4 j = 0; - sint4 sum = 0; -! - /*** Compare the profiles in mergesort fashion ***/ - while ( i < c->size && j < u->size ) { - ---- 671,677 ---- - uint4 i = 0; - uint4 j = 0; - sint4 sum = 0; -! - /*** Compare the profiles in mergesort fashion ***/ - while ( i < c->size && j < u->size ) { - -*************** -*** 705,711 **** - } - - return sum; -! - } - - ---- 707,713 ---- - } - - return sum; -! 
- } - - -*** misc/libtextcat-2.2/src/fingerprint.h Mon May 19 14:16:31 2003 ---- misc/build/libtextcat-2.2/src/fingerprint.h Tue Nov 27 13:49:18 2007 -*************** -*** 41,47 **** ---- 41,53 ---- - extern int fp_Read( void *handle, const char *fname, int maxngrams ); - extern sint4 fp_Compare( void *cat, void *unknown, int cutoff ); - extern void fp_Show( void *handle ); -+ #ifdef __cplusplus -+ extern "C" { -+ #endif - extern const char *fp_Name( void *handle ); -+ #ifdef __cplusplus -+ } -+ #endif - extern void fp_Print( void *handle, FILE *fp ); - - #endif -*** misc/libtextcat-2.2/src/libtextcat.map Tue Nov 27 13:51:28 2007 ---- misc/build/libtextcat-2.2/src/libtextcat.map Tue Nov 27 13:49:18 2007 -*************** -*** 1 **** -! dummy ---- 1,40 ---- -! { -! global: -! charcopy -! issame -! nextcharstart -! utfstrlen -! wgmempool_Done -! wgmempool_Init -! wgmempool_Reset -! wgmempool_alloc -! wgmempool_getline -! wgmempool_strdup -! special_textcat_Init -! textcat_Classify -! textcat_Done -! textcat_Init -! textcat_Version -! fp_Compare -! fp_Create -! fp_Debug -! fp_Done -! fp_Init -! fp_Name -! fp_Print -! fp_Read -! heapextract -! wg_calloc -! wg_free -! wg_getline -! wg_malloc -! wg_split -! wg_strdup -! wg_strgmov -! wg_trim -! wg_zalloc -! wgmem_error -! -! local: -! *; -! } -*** misc/libtextcat-2.2/src/makefile.mk Tue Nov 27 13:51:28 2007 ---- misc/build/libtextcat-2.2/src/makefile.mk Tue Nov 27 13:49:18 2007 -*************** -*** 1 **** -! dummy ---- 1,92 ---- -! #************************************************************************* -! # -! # $RCSfile: libtextcat-2.2.patch,v $ -! # -! # $Revision: 1.7 $ -! # -! # last change: $Author: obo $ $Date: 2008-01-04 15:02:30 $ -! # -! #* The Contents of this file are made available subject to -! #* the terms of GNU Lesser General Public License Version 2.1. -! #* -! #* -! #* GNU Lesser General Public License Version 2.1 -! #* ============================================= -! 
#* Copyright 2005 by Sun Microsystems, Inc. -! #* 901 San Antonio Road, Palo Alto, CA 94303, USA -! #* -! #* This library is free software; you can redistribute it and/or -! #* modify it under the terms of the GNU Lesser General Public -! #* License version 2.1, as published by the Free Software Foundation. -! #* -! #* This library is distributed in the hope that it will be useful, -! #* but WITHOUT ANY WARRANTY; without even the implied warranty of -! #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -! #* Lesser General Public License for more details. -! #* -! #* You should have received a copy of the GNU Lesser General Public -! #* License along with this library; if not, write to the Free Software -! #* Foundation, Inc., 59 Temple Place, Suite 330, Boston, -! #* MA 02111-1307 USA -! #* -! #************************************************************************* -! -! PRJ = ..$/..$/..$/..$/.. -! -! PRJNAME = libtextcat -! TARGET = libtextcat -! CFLAGSCALL=gsd -! -! USE_DEFFILE=TRUE -! EXTERNAL_WARNINGS_NOT_ERRORS := TRUE -! -! .INCLUDE : settings.mk -! -! # --- Files -------------------------------------------------------- -! -! # !! not to be compiled because those belong to a stand alone programs: !! -! # $(SLO)$/createfp.obj\ -! # $(SLO)$/testtextcat.obj -! -! SLOFILES= \ -! $(SLO)$/common.obj\ -! $(SLO)$/fingerprint.obj\ -! $(SLO)$/textcat.obj\ -! $(SLO)$/wg_mempool.obj\ -! $(SLO)$/utf8misc.obj -! -! #SHL1TARGET= $(TARGET)$(UPD)$(DLLPOSTFIX) -! SHL1TARGET= $(TARGET) -! -! SHL1STDLIBS= -! -! # build DLL -! SHL1LIBS= $(SLB)$/$(TARGET).lib -! SHL1IMPLIB= i$(TARGET) -! SHL1DEPN= $(SHL1LIBS) -! SHL1DEF= $(MISC)$/$(SHL1TARGET).def -! -! # build DEF file -! DEF1NAME= $(SHL1TARGET) -! DEF1DEPN=$(MISC)$/$(SHL1TARGET).flt -! -! SHL1VERSIONMAP= libtextcat.map -! -! # --- Targets ------------------------------------------------------ -! -! .INCLUDE : target.mk -! -! # copy hand supplied configuration file for Win32 builds to the file -! 
# which is included in the source code -! $(SLOFILES) : config.h -! config.h : -! $(GNUCOPY) $(OUT)$/misc$/build$/libtextcat-2.2$/src$/win32_config.h $(OUT)$/misc$/build$/libtextcat-2.2$/src$/config.h -! -! -! $(MISC)$/$(SHL1TARGET).flt: makefile.mk -! @echo ------------------------------ -! @echo Making: $@ -! @echo Imp>$@ -! @echo __CT>>$@ -! @echo _real>>$@ -! @echo unnamed>>$@ -*** misc/libtextcat-2.2/src/textcat.c Thu May 22 13:32:43 2003 ---- misc/build/libtextcat-2.2/src/textcat.c Tue Nov 27 13:49:18 2007 -*************** -*** 4,26 **** - * Copyright (C) 2003 WiseGuys Internet B.V. - * - * THE BSD LICENSE -! * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: -! * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. -! * - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the - * distribution. -! * - * - Neither the name of the WiseGuys Internet B.V. nor the names of - * its contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. -! * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ---- 4,26 ---- - * Copyright (C) 2003 WiseGuys Internet B.V. - * - * THE BSD LICENSE -! * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: -! * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. -! 
* - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the - * distribution. -! * - * - Neither the name of the WiseGuys Internet B.V. nor the names of - * its contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. -! * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -*************** -*** 74,79 **** ---- 74,80 ---- - typedef struct { - - void **fprint; -+ char *fprint_disable; - uint4 size; - uint4 maxsize; - -*************** -*** 112,122 **** - fp_Done( h->fprint[i] ); - } - wg_free( h->fprint ); - wg_free( h ); - - } - -! extern void *textcat_Init( const char *conffile ) - { - textcat_t *h; - char line[1024]; ---- 113,133 ---- - fp_Done( h->fprint[i] ); - } - wg_free( h->fprint ); -+ wg_free( h->fprint_disable ); - wg_free( h ); - - } - -! /** Replaces older function */ -! extern void *textcat_Init( const char *conffile ){ -! return special_textcat_Init( conffile, DEFAULT_FINGERPRINTS_PATH ); -! } -! -! /** -! * Originaly this function had only one parameter (conffile) it has been modified since OOo use -! * Basicaly prefix is the directory path where fingerprints are stored -! */ -! extern void *special_textcat_Init( const char *conffile, const char *prefix ) - { - textcat_t *h; - char line[1024]; -*************** -*** 134,144 **** - h->size = 0; - h->maxsize = 16; - h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize ); - - while ( wg_getline( line, 1024, fp ) ) { - char *p; - char *segment[4]; -! 
int res; - - /*** Skip comments ***/ - #ifdef HAVE_STRCHR ---- 145,157 ---- - h->size = 0; - h->maxsize = 16; - h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize ); -+ h->fprint_disable = (char *)wg_malloc( sizeof(char*) * h->maxsize ); /*added to store the state of languages*/ - - while ( wg_getline( line, 1024, fp ) ) { - char *p; - char *segment[4]; -! char finger_print_file_name[512]; -! int res; - - /*** Skip comments ***/ - #ifdef HAVE_STRCHR -*************** -*** 156,162 **** - /*** Ensure enough space ***/ - if ( h->size == h->maxsize ) { - h->maxsize *= 2; -! h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); - } - - /*** Load data ***/ ---- 169,176 ---- - /*** Ensure enough space ***/ - if ( h->size == h->maxsize ) { - h->maxsize *= 2; -! h->fprint = (void **)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); -! h->fprint_disable = (char *)wg_realloc( h->fprint_disable, sizeof(char*) * h->maxsize ); - } - - /*** Load data ***/ -*************** -*** 163,172 **** - if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) { - goto ERROR; - } -! if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) { - textcat_Done(h); - goto ERROR; -! } - h->size++; - } - ---- 177,191 ---- - if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) { - goto ERROR; - } -! finger_print_file_name[0] = '\0'; -! strcat(finger_print_file_name, prefix); -! strcat(finger_print_file_name, segment[0]); -! -! if ( fp_Read( h->fprint[h->size], finger_print_file_name, 400 ) == 0 ) { - textcat_Done(h); - goto ERROR; -! } -! h->fprint_disable[h->size] = 0xF0; /*0xF0 is the code for enabled languages, 0x0F is for disabled*/ - h->size++; - } - -*************** -*** 203,213 **** - result = _TEXTCAT_RESULT_SHORT; - goto READY; - } -! - /*** Calculate the score for each category. ***/ - for (i=0; i<h->size; i++) { -! int score = fp_Compare( h->fprint[i], unknown, threshold ); -! 
candidates[i].score = score; - candidates[i].name = fp_Name( h->fprint[i] ); - if ( score < minscore ) { - minscore = score; ---- 222,239 ---- - result = _TEXTCAT_RESULT_SHORT; - goto READY; - } -! - /*** Calculate the score for each category. ***/ - for (i=0; i<h->size; i++) { -! int score; -! if(h->fprint_disable[i] & 0x0F){ /*if this language is disabled*/ -! score = MAXSCORE; -! } -! else{ -! score = fp_Compare( h->fprint[i], unknown, threshold ); -! /*printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score);*/ -! } -! candidates[i].score = score; - candidates[i].name = fp_Name( h->fprint[i] ); - if ( score < minscore ) { - minscore = score; -*************** -*** 218,224 **** - /*** Find the best performers ***/ - for (i=0; i<h->size; i++) { - if ( candidates[i].score < threshold ) { -- - if ( ++cnt == MAXCANDIDATES+1 ) { - break; - } ---- 244,249 ---- -*************** -*** 235,241 **** - else { - char *p = result; - char *plimit = result+MAXOUTPUTSIZE; -! - qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates ); - - *p = '\0'; ---- 260,266 ---- - else { - char *p = result; - char *plimit = result+MAXOUTPUTSIZE; -! - qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates ); - - *p = '\0'; -*************** -*** 247,253 **** - } - READY: - fp_Done(unknown); -! #ifdef SHOULD_FREE - free(candidates); - #undef SHOULD_FREE - #endif ---- 272,278 ---- - } - READY: - fp_Done(unknown); -! #ifdef SHOULD_FREE - free(candidates); - #undef SHOULD_FREE - #endif -*** misc/libtextcat-2.2/src/textcat.h Mon May 19 14:16:31 2003 ---- misc/build/libtextcat-2.2/src/textcat.h Tue Nov 27 13:49:18 2007 -*************** -*** 40,45 **** ---- 40,48 ---- - #define _TEXTCAT_RESULT_UNKOWN "UNKNOWN" - #define _TEXTCAT_RESULT_SHORT "SHORT" - -+ #ifdef __cplusplus -+ extern "C" { -+ #endif - - /** - * textcat_Init() - Initialize the text classifier. The textfile -*************** -*** 51,60 **** ---- 54,72 ---- - * Returns: handle on success, NULL on error. 
(At the moment, the - * only way errors can occur, is when the library cannot read the - * conffile, or one of the fingerprint files listed in it.) -+ * -+ * Replace older function (and has exacly the same behaviour) -+ * see below - */ - extern void *textcat_Init( const char *conffile ); - - /** -+ * Originaly this function had only one parameter (conffile) it has been modified since OOo must be able to load alternativ DB -+ * Basicaly prefix is the directory path where fingerprints are stored -+ */ -+ extern void *special_textcat_Init( const char *conffile, const char *prefix ); -+ -+ /** - * textcat_Done() - Free up resources for handle - */ - extern void textcat_Done( void *handle ); -*************** -*** 77,80 **** ---- 89,96 ---- - * textcat_Version() - Returns a string describing the version of this classifier. - */ - extern char *textcat_Version(); -+ -+ #ifdef __cplusplus -+ } -+ #endif - #endif -*** misc/libtextcat-2.2/src/utf8misc.c Tue Nov 27 13:51:28 2007 ---- misc/build/libtextcat-2.2/src/utf8misc.c Tue Nov 27 13:49:18 2007 -*************** -*** 1 **** -! dummy ---- 1,132 ---- -! /*************************************************************************** -! * Copyright (C) 2006 by Jocelyn Merand * -! * joc.mer@gmail.com * -! * * -! * THE BSD LICENSE -! * -! * Redistribution and use in source and binary forms, with or without -! * modification, are permitted provided that the following conditions -! * are met: -! * -! * - Redistributions of source code must retain the above copyright -! * notice, this list of conditions and the following disclaimer. -! * -! * - Redistributions in binary form must reproduce the above copyright -! * notice, this list of conditions and the following disclaimer in the -! * documentation and/or other materials provided with the -! * distribution. -! * -! * - Neither the name of the WiseGuys Internet B.V. nor the names of -! * its contributors may be used to endorse or promote products derived -! 
* from this software without specific prior written permission. -! * -! * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -! * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -! * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -! * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -! * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -! * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -! * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -! * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -! * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -! * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -! * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -! ***************************************************************************/ -! -! #ifndef _UTF8_MISC_H_ -! #include "utf8misc.h" -! #endif -! -! -! int nextcharstart(const char *str, int position){ -! int pointer = position; -! -! if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ -! -! /*then str[pointer] is an escape character*/ -! -! char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count (by bit translation) following characters (only the weightest part)*/ -! -! while(escape_char & ESCAPE_MASK && str[pointer]){/*every step, we move the byte of 1 bit left, when first bit is 0, it's finished*/ -! escape_char = escape_char <<1; -! ++pointer; -! } -! } -! if(str[pointer]){ /*finaly, if we are not on the \0 character, we jump to the next character*/ -! ++pointer; -! } -! return pointer; -! } -! -! -! int charcopy(const char *str, char *dest){ -! -! int pointer = 0; -! if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ -! -! /*then str[pointer] is an escape character*/ -! -! 
char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count following characters (only the weightest part)*/ -! -! while(escape_char & ESCAPE_MASK && str[pointer]){ /*every step, we move the byte of 1 bit left, when first bit is 0, it's finished*/ -! dest[pointer] = str[pointer]; -! escape_char = escape_char <<1; -! ++pointer; -! } -! } -! if(str[pointer]){ -! dest[pointer] = str[pointer]; -! ++pointer; -! } -! -! return pointer; -! } -! -! -! int issame( char *lex, char *key, int len ) -! { -! /*printf("[%s] prefix of [%s] with length %i", lex, key, len);*/ -! int char_counter = 0; -! int pointer = 0; -! while(char_counter < len) { -! -! if(key[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ -! -! /*then key[pointer] is an escap character*/ -! -! char escape_char = ((key[pointer] & WEIGHT_MASK) << 1); /*and we use it to count (only the weightest part)*/ -! -! while(escape_char & ESCAPE_MASK && key[pointer] == lex[pointer] ){ -! escape_char = escape_char <<1; -! ++pointer; -! } -! } -! ++char_counter; /*and we are on a new utf8 character*/ -! if ( key[pointer] != lex[pointer] ) { -! return 0; -! /*printf(" NO\n", lex, key, len);*/ -! } -! ++pointer; -! } -! if ( lex[pointer] != '\0' ) { -! return 0; -! /*printf(" NO\n");*/ -! } -! -! /*printf(" YES\n");*/ -! -! return 1; -! } -! -! -! extern int utfstrlen(const char* str){ -! int char_counter = 0; -! int pointer = 0; -! while(str[pointer]) { -! pointer = nextcharstart(str, pointer); -! -! ++char_counter; /*and we are on a new utf8 character*/ -! } -! return char_counter; -! } -! -*** misc/libtextcat-2.2/src/utf8misc.h Tue Nov 27 13:51:28 2007 ---- misc/build/libtextcat-2.2/src/utf8misc.h Tue Nov 27 13:49:18 2007 -*************** -*** 1 **** -! dummy ---- 1,88 ---- -! /*************************************************************************** -! * Copyright (C) 2006 by Jocelyn Merand * -! * joc.mer@gmail.com * -! * * -! * THE BSD LICENSE -! * -! 
* Redistribution and use in source and binary forms, with or without -! * modification, are permitted provided that the following conditions -! * are met: -! * -! * - Redistributions of source code must retain the above copyright -! * notice, this list of conditions and the following disclaimer. -! * -! * - Redistributions in binary form must reproduce the above copyright -! * notice, this list of conditions and the following disclaimer in the -! * documentation and/or other materials provided with the -! * distribution. -! * -! * - Neither the name of the WiseGuys Internet B.V. nor the names of -! * its contributors may be used to endorse or promote products derived -! * from this software without specific prior written permission. -! * -! * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -! * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -! * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -! * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -! * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -! * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -! * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -! * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -! * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -! * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -! * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -! ***************************************************************************/ -! -! #ifndef _UTF8_MISC_H_ -! #define _UTF8_MISC_H_ -! -! /** -! * These variables are used in character processing functions -! * These have been added to manage utf-8 symbols, particularly escape chars -! */ -! #ifdef _UTF8_ -! #define ESCAPE_MASK 0x80 -! #define WEIGHT_MASK 0xF0 -! #else -! #define ESCAPE_MASK 0xFF -! #define WEIGHT_MASK 0x00 -! #endif -! 
-! -! /* -! * Is used to jump to the next start of char -! * of course it's only usefull when encoding is utf-8 -! * This function have been added by Jocelyn Merand to use libtextcat in OOo -! */ -! int nextcharstart(const char *str, int position); -! -! -! /*Copy the char in str to dest -! * of course it's only usefull when encoding is utf8 and the symbol is encoded with more than 1 char -! * return the number of char jumped -! * This function have been added by Jocelyn Merand to use libtextcat in OOo -! */ -! int charcopy(const char *str, char *dest); -! -! -! /* checks if n-gram lex is a prefix of key and of length len -! * if _UTF8_ is defined, it uses escap characters and len is not realy the length of lex -! * in this case, len is the number of utf-8 char strlen("€") == 3 but len == 1 -! */ -! int issame( char *lex, char *key, int len ); -! -! -! /* Counts the number of characters -! * if _UTF8_ is defined, it uses escap characters and the result is not realy the length of str -! * in this case, the result is the number of utf-8 char strlen("€") == 3 but utfstrlen("€") == 1 -! */ -! #ifdef __cplusplus -! extern "C" { -! #endif -! extern int utfstrlen(const char* str); -! #ifdef __cplusplus -! } -! #endif -! -! #endif -! -*** misc/libtextcat-2.2/src/win32_config.h Tue Nov 27 13:51:28 2007 ---- misc/build/libtextcat-2.2/src/win32_config.h Tue Nov 27 13:49:18 2007 -*************** -*** 1 **** -! dummy ---- 1,136 ---- -! /* src/config.h. Generated by configure. */ -! /* src/config.h.in. Generated from configure.ac by autoheader. */ -! -! /* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP -! systems. This function is required for `alloca.c' support on those systems. -! */ -! /* #undef CRAY_STACKSEG_END */ -! -! /* Define to 1 if using `alloca.c'. */ -! /* #undef C_ALLOCA */ -! -! /* Define to 1 if you have `alloca', as a function or macro. */ -! /* #undef HAVE_ALLOCA */ -! -! 
/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix). -! */ -! /* #undef HAVE_ALLOCA_H */ -! -! /* Define to 1 if you have the <dlfcn.h> header file. */ -! #define HAVE_DLFCN_H 1 -! -! /* Define to 1 if you don't have `vprintf' but do have `_doprnt.' */ -! /* #undef HAVE_DOPRNT */ -! -! /* Define to 1 if you have the `gettimeofday' function. */ -! /* #undef HAVE_GETTIMEOFDAY */ -! -! /* Define to 1 if you have the <inttypes.h> header file. */ -! /* #undef HAVE_INTTYPES_H */ -! -! /* Define to 1 if you have the <limits.h> header file. */ -! #define HAVE_LIMITS_H 1 -! -! /* Define to 1 if your system has a GNU libc compatible `malloc' function, and -! to 0 otherwise. */ -! #define HAVE_MALLOC 1 -! -! /* Define to 1 if you have the <memory.h> header file. */ -! #define HAVE_MEMORY_H 1 -! -! /* Define to 1 if you have the `memset' function. */ -! #define HAVE_MEMSET 1 -! -! /* Define to 1 if your system has a GNU libc compatible `realloc' function, -! and to 0 otherwise. */ -! #define HAVE_REALLOC 1 -! -! /* Define to 1 if you have the <stdint.h> header file. */ -! /* #undef HAVE_STDINT_H */ -! -! /* Define to 1 if you have the <stdlib.h> header file. */ -! #define HAVE_STDLIB_H 1 -! -! /* Define to 1 if you have the `strchr' function. */ -! #define HAVE_STRCHR 1 -! -! /* Define to 1 if you have the `strdup' function. */ -! #define HAVE_STRDUP 1 -! -! /* Define to 1 if you have the <strings.h> header file. */ -! /* #undef HAVE_STRINGS_H */ -! -! /* Define to 1 if you have the <string.h> header file. */ -! #define HAVE_STRING_H 1 -! -! /* Define to 1 if you have the `strpbrk' function. */ -! #define HAVE_STRPBRK 1 -! -! /* Define to 1 if you have the <sys/stat.h> header file. */ -! #define HAVE_SYS_STAT_H 1 -! -! /* Define to 1 if you have the <sys/time.h> header file. */ -! /* #undef HAVE_SYS_TIME_H */ -! -! /* Define to 1 if you have the <sys/types.h> header file. */ -! #define HAVE_SYS_TYPES_H 1 -! -! 
/* Define to 1 if you have the <unistd.h> header file. */ -! #define HAVE_UNISTD_H 1 -! -! /* Define to 1 if you have the `vprintf' function. */ -! #define HAVE_VPRINTF 1 -! -! /* Name of package */ -! #define PACKAGE "libtextcat" -! -! /* Define to the address where bug reports for this package should be sent. */ -! #define PACKAGE_BUGREPORT "" -! -! /* Define to the full name of this package. */ -! #define PACKAGE_NAME "libtextcat" -! -! /* Define to the full name and version of this package. */ -! #define PACKAGE_STRING "libtextcat 2.2" -! -! /* Define to the one symbol short name of this package. */ -! #define PACKAGE_TARNAME "libtextcat" -! -! /* Define to the version of this package. */ -! #define PACKAGE_VERSION "2.2" -! -! /* If using the C implementation of alloca, define if you know the -! direction of stack growth for your system; otherwise it will be -! automatically deduced at run-time. -! STACK_DIRECTION > 0 => grows toward higher addresses -! STACK_DIRECTION < 0 => grows toward lower addresses -! STACK_DIRECTION = 0 => direction of growth unknown */ -! /* #undef STACK_DIRECTION */ -! -! /* Define to 1 if you have the ANSI C header files. */ -! #define STDC_HEADERS 1 -! -! /* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */ -! #define TIME_WITH_SYS_TIME 1 -! -! /* Define to 1 if your <sys/time.h> declares `struct tm'. */ -! /* #undef TM_IN_SYS_TIME */ -! -! /* Version number of package */ -! #define VERSION "2.2" -! -! /* Define to empty if `const' does not conform to ANSI C. */ -! /* #undef const */ -! -! /* Define as `__inline' if that's what the C compiler calls it, or to nothing -! if it is not supported. */ -! /* #undef inline */ -! -! /* Define to rpl_malloc if the replacement function should be used. */ -! /* #undef malloc */ -! -! /* Define to rpl_realloc if the replacement function should be used. */ -! /* #undef realloc */ -! -! /* Define to `unsigned' if <sys/types.h> does not define. */ -! 
/* #undef size_t */ +--- misc/libtextcat-2.2/configure Thu May 22 13:39:55 2003 ++++ misc/build/libtextcat-2.2/configure Mon Mar 31 11:29:14 2008 +@@ -5391,7 +5391,8 @@ + allow_undefined_flag= + no_undefined_flag= + need_lib_prefix=unknown +-need_version=unknown ++#need_version=unknown ++need_version=no + # when you set need_version to no, make sure it does not cause -set_version + # flags to be left without arguments + archive_cmds= +@@ -5785,7 +5786,7 @@ + # cross-compilation, but unfortunately the echo tests do not + # yet detect zsh echo's removal of \ escapes. Also zsh mangles + # `"' quotes if we put them in here... so don't! +- archive_cmds='$CC -r -keep_private_externs -nostdlib -o ${lib}-master.o $libobjs && $CC $(test .$module = .yes && echo -bundle || echo -dynamiclib) $allow_undefined_flag -o $lib ${lib}-master.o $deplibs$linker_flags $(test .$module != .yes && echo -install_name $rpath/$soname $verstring)' ++ archive_cmds='$CC -r -keep_private_externs -nostdlib -o ${lib}-master.o $libobjs && $CC $(test .$module = .yes && echo -bundle || echo -dynamiclib) $allow_undefined_flag -o $lib ${lib}-master.o $deplibs$compiler_flags $(test .$module != .yes && echo -install_name $rpath/$soname $verstring)' + # We need to add '_' to the symbols in $export_symbols first + #archive_expsym_cmds="$archive_cmds"' && strip -s $export_symbols' + hardcode_direct=yes +@@ -6280,7 +6281,7 @@ + ;; + + freebsd*) +- objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo aout` ++ objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo elf` + version_type=freebsd-$objformat + case $version_type in + freebsd-elf*) +--- misc/libtextcat-2.2/src/Makefile.in Thu May 22 13:39:52 2003 ++++ misc/build/libtextcat-2.2/src/Makefile.in Mon Mar 31 11:29:14 2008 +@@ -124,20 +124,20 @@ + target_vendor = @target_vendor@ + AUTOMAKE_OPTIONS = 1.4 foreign + +-WARNS = -W -Wall -Wshadow -Wpointer-arith +-IFLAGS = +-FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE 
++#WARNS = -W -Wall -Wshadow -Wpointer-arith ++IFLAGS = ++#FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE + VERBOSE = -DVERBOSE + AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS) + AM_LDFLAGS = -g + + noinst_HEADERS = \ +- common.h constants.h fingerprint.h textcat.h wg_mempool.h ++ common.h constants.h fingerprint.h textcat.h wg_mempool.h utf8misc.h + + + lib_LTLIBRARIES = libtextcat.la + libtextcat_la_SOURCES = \ +- common.c fingerprint.c textcat.c wg_mempool.c ++ common.c fingerprint.c textcat.c wg_mempool.c utf8misc.c + + + bin_PROGRAMS = createfp +@@ -156,7 +156,7 @@ + libtextcat_la_LDFLAGS = + libtextcat_la_LIBADD = + am_libtextcat_la_OBJECTS = common.lo fingerprint.lo textcat.lo \ +- wg_mempool.lo ++ wg_mempool.lo utf8misc.lo + libtextcat_la_OBJECTS = $(am_libtextcat_la_OBJECTS) + bin_PROGRAMS = createfp$(EXEEXT) + noinst_PROGRAMS = testtextcat$(EXEEXT) +@@ -177,7 +177,8 @@ + @AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/common.Plo ./$(DEPDIR)/createfp.Po \ + @AMDEP_TRUE@ ./$(DEPDIR)/fingerprint.Plo \ + @AMDEP_TRUE@ ./$(DEPDIR)/testtextcat.Po ./$(DEPDIR)/textcat.Plo \ +-@AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo ++@AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo \ ++@AMDEP_TRUE@ ./$(DEPDIR)/utf8misc.Plo + COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) + LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \ +@@ -213,7 +214,7 @@ + @rm -f stamp-h1 + cd $(top_builddir) && $(SHELL) ./config.status src/config.h + +-$(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4) ++$(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOHEADER) + touch $(srcdir)/config.h.in + +@@ -247,8 +248,8 @@ + echo "rm -f \"$${dir}/so_locations\""; \ + rm -f "$${dir}/so_locations"; \ + done +-libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES) +- $(LINK) -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS) 
++libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES) ++ $(LINK) -avoid-version -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS) + binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) + install-binPROGRAMS: $(bin_PROGRAMS) + @$(NORMAL_INSTALL) +@@ -285,10 +286,10 @@ + echo " rm -f $$p $$f"; \ + rm -f $$p $$f ; \ + done +-createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES) ++createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES) + @rm -f createfp$(EXEEXT) + $(LINK) $(createfp_LDFLAGS) $(createfp_OBJECTS) $(createfp_LDADD) $(LIBS) +-testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES) ++testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES) + @rm -f testtextcat$(EXEEXT) + $(LINK) $(testtextcat_LDFLAGS) $(testtextcat_OBJECTS) $(testtextcat_LDADD) $(LIBS) + +@@ -304,6 +305,7 @@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/testtextcat.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/textcat.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/wg_mempool.Plo@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utf8misc.Plo@am__quote@ + + distclean-depend: + -rm -rf ./$(DEPDIR) +--- misc/libtextcat-2.2/src/common.c Thu May 22 13:32:43 2003 ++++ misc/build/libtextcat-2.2/src/common.c Mon Mar 31 11:29:14 2008 +@@ -3,23 +3,23 @@ + * + * Copyright (c) 2003, WiseGuys Internet B.V. + * All rights reserved. +- * ++ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +- * ++ * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+- * ++ * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +- * ++ * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +- * ++ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +@@ -114,11 +114,11 @@ + wgmem_error( "Error while strduping %u bytes.\n", strlen(s) ); + } + +- return( result ); ++ return( result ); + } + +-extern void* wg_realloc( void *ptr, size_t size ) +-{ ++extern void* wg_realloc( void *ptr, size_t size ) ++{ + void *result; + + if (!size) { +@@ -131,7 +131,7 @@ + wgmem_error( "Error while reallocing %u bytes.\n", size ); + } + +- return( result ); ++ return( result ); + } + + extern void wg_free( void *mem ) +@@ -148,12 +148,12 @@ + if ( fgets(line, size, fp) == NULL ) { + return NULL; + } +- ++ + /** kill term null **/ + if ( (p = strpbrk( line, "\n\r" )) ) { + *p = '\0'; +- } +- ++ } ++ + return line; + } + +@@ -164,39 +164,39 @@ + * + * ARGUMENTS: + * - result: +- * ++ * + * After the split, this array contains pointers to the start of each + * detected segment. Must be preallocated and at least as large as + * maxsegments. The pointers point into the dest buffer. +- * +- * - dest: +- * ++ * ++ * - dest: ++ * + * String into which result points as an index. Must be preallocated, and + * at least as big as src. You can use src as dest, but in that case src + * is overwritten! +- * +- * - src: +- * ++ * ++ * - src: ++ * + * The string to split. Sequences of whitespace are treated as separators, unless + * escaped. 
There are two ways to escape: by using single quotes (anything + * between single quotes is treated as one segment), or by using a backslash + * to escape the next character. The backslash escape works inside quotation + * as well. +- * ++ * + * Example: +- * ++ * + * "It\'s very\ easy 'to use WiseGuys\' wg_split()' function" is split into: +- * ++ * + * "It's" + * "very easy" + * "to use WiseGuys' wg_split()" + * "function" +- * +- * - maxsegments: +- * ++ * ++ * - maxsegments: ++ * + * The maximum number of segments. If the splitter runs out of segments, + * the remainder of the string is stored in the last segment. +- * ++ * + * RETURN VALUE: + * The number of segments found. + */ +@@ -218,12 +218,12 @@ + switch (state) { + case 0: + /*** Skip spaces ***/ +- while ( isspace((int) *p) ) { ++ while ( isspace((unsigned char) *p) ) { + p++; + } + state = 1; + +- case 1: ++ case 1: + /*** Start segment ***/ + result[cnt] = w; + cnt++; +@@ -232,12 +232,12 @@ + case 2: + /*** Unquoted segment ***/ + while (*p) { +- if ( isspace((int) *p) ) { ++ if ( isspace((unsigned char) *p) ) { + *w++ = '\0'; + p++; + state = 0; + break; +- } ++ } + else if ( *p == '\'' ) { + /*** Start quotation ***/ + p++; +@@ -292,17 +292,17 @@ + } + + ++#ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ + extern void wg_timerstart(wgtimer_t *t) + { +-#ifdef HAVE_GETTIMEOFDAY + gettimeofday( &(t->start), NULL ); +-#endif + } ++#endif /* TL : no struct timeval under Win32 */ + + ++#ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ + extern uint4 wg_timerstop(wgtimer_t *t) + { +-#ifdef HAVE_GETTIMEOFDAY + uint4 result; + gettimeofday( &(t->stop), NULL ); + result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 + +@@ -312,25 +312,23 @@ + t->start.tv_usec = t->stop.tv_usec; + + return result; +-#else +- return 0; +-#endif + } ++#endif /* TL : no struct timeval under Win32 */ + + + /** + * wg_strgmov -- a guarded strcpy() variation +- * ++ * + * copies src to dest 
(including terminating zero), and returns + * pointer to position of terminating zero in dest. The function is + * guaranteed not to write past destlimit. If the copy couldn't be +- * finished, the function returns NULL after restoring the first +- * character in dest for your convenience (since this is usually a zero). ++ * finished, the function returns NULL after restoring the first ++ * character in dest for your convenience (since this is usually a zero). + */ + char *wg_strgmov( char *dest, const char *src, const char *destlimit ) + { + char tmp, *w; +- ++ + if ( !dest || dest >= destlimit ) { + return NULL; + } +@@ -355,7 +353,7 @@ + } + + /* +- * wg_trim() -- remove whitespace surrounding a string. ++ * wg_trim() -- remove whitespace surrounding a string. + * + * Example: " bla bla bla " becomes "bla bla bla" after trimming. + * +@@ -373,12 +371,12 @@ + char *lastnonspace = &dest[-1]; + const char *p = src; + char *w = dest; +- +- while ( isspace((int)*p) ) { ++ ++ while ( isspace((unsigned char)*p) ) { + p++; + } + while (*p) { +- if ( !isspace((int)*p) ) { ++ if ( !isspace((unsigned char)*p) ) { + lastnonspace = w; + } + *w++ = *p++; +--- misc/libtextcat-2.2/src/common.h Thu May 22 15:02:29 2003 ++++ misc/build/libtextcat-2.2/src/common.h Mon Mar 31 11:29:14 2008 +@@ -1,28 +1,28 @@ + #ifndef _COMMON_H_ + #define _COMMON_H_ + /** +- * common.h -- a mixed bag of helper functions ++ * common.h -- a mixed bag of helper functions + * + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE +- * ++ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +- * ++ * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+- * ++ * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +- * ++ * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +- * ++ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +@@ -86,10 +86,12 @@ + typedef char boole; + #endif + ++#ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ + typedef struct wgtimer_s { + struct timeval start; + struct timeval stop; + } wgtimer_t; ++#endif /* TL : no struct timeval under Win32 */ + + + extern void *wg_malloc( size_t size ); +@@ -101,13 +103,15 @@ + + extern char *wg_getline( char *line, int size, FILE *fp ); + ++#ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ + extern void wg_timerstart(wgtimer_t *t); + extern uint4 wg_timerstop(wgtimer_t *t); ++#endif /* TL : no struct timeval under Win32 */ + + extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments ); + extern char *wg_strgmov( char *dest, const char *src, const char *destlimit ); + extern char *wg_trim( char *dest, const char *src ); + +- ++ + #endif + +--- misc/libtextcat-2.2/src/constants.h Thu May 22 13:32:43 2003 ++++ misc/build/libtextcat-2.2/src/constants.h Mon Mar 31 11:29:14 2008 +@@ -39,6 +39,8 @@ + */ + #include <limits.h> + ++#define _UTF8_ ++ + #define DESCRIPTION "out of place" + + /* Reported matches are those fingerprints with a score less than best +@@ -59,14 +61,21 @@ + /* Maximum number of n-grams in a fingerprint */ + #define MAXNGRAMS 400 + +-/* Maximum size of an n-gram? 
*/ +-#define MAXNGRAMSIZE 5 ++/* Maximum number of character of an n-gram? */ ++#define MAXNGRAMSYMBOL 5 ++ ++/* Maximum size of the string representing an n-gram (must be greater than number of symbol) */ ++#ifdef _UTF8_ ++#define MAXNGRAMSIZE 20 ++#else ++#define MAXNGRAMSIZE MAXNGRAMSYMBOL ++#endif + + /* Which characters are not acceptable in n-grams? */ +-#define INVALID(c) (isspace((int)c) || isdigit((int)c)) ++#define INVALID(c) (isspace((unsigned char)c) || isdigit((unsigned char)c)) + + /* Minimum size (in characters) for accepting a document */ +-#define MINDOCSIZE 25 ++#define MINDOCSIZE 6 + + /* Maximum penalty for missing an n-gram in fingerprint */ + #define MAXOUTOFPLACE 400 +@@ -75,5 +84,8 @@ + #define TABLEPOW 13 + + #define MAXSCORE INT_MAX ++ ++/* where the fingerprints files are stored */ ++#define DEFAULT_FINGERPRINTS_PATH "" + + #endif +--- misc/libtextcat-2.2/src/fingerprint.c Thu May 22 13:32:43 2003 ++++ misc/build/libtextcat-2.2/src/fingerprint.c Mon Mar 31 11:29:14 2008 +@@ -6,23 +6,23 @@ + * All rights reserved. + * + * THE BSD LICENSE +- * ++ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +- * ++ * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +- * ++ * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +- * ++ * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+- * ++ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +@@ -51,7 +51,7 @@ + * The reason why we go through the trouble of doing a partial + * (heap)sort is that a full quicksort behaves horribly on the data: + * most n-grams have a very low count, resulting in a data set in +- * nearly-sorted order. This causes quicksort to behave very badly. ++ * nearly-sorted order. This causes quicksort to behave very badly. + * Heapsort, on the other hand, behaves handsomely: worst case is + * Mlog(N) for M n-grams filtered through a N-sized heap. + * +@@ -63,6 +63,10 @@ + * - put table/heap datastructure in a separate file. + */ + ++#ifndef _UTF8_ ++#define _UTF8_ ++#endif ++ + #include "config.h" + #include <stdio.h> + #ifdef HAVE_STDLIB_H +@@ -80,10 +84,12 @@ + #include "wg_mempool.h" + #include "constants.h" + ++#include "utf8misc.h" + + #define TABLESIZE (1<<TABLEPOW) + #define TABLEMASK ((TABLESIZE)-1) + ++ + typedef struct { + + sint2 rank; +@@ -96,7 +102,7 @@ + const char *name; + ngram_t *fprint; + uint4 size; +- ++ + } fp_t; + + typedef struct entry_s { +@@ -105,13 +111,13 @@ + struct entry_s *next; + } entry_t; + +-typedef struct table_s { ++typedef struct table_s { + void *pool; + entry_t **table; + entry_t *heap; + + struct table_s *next; +- ++ + uint4 heapsize; + uint4 size; + } table_t; +@@ -122,7 +128,7 @@ + * fast and furious little hash function + * + * (Note that we could use some kind of rolling checksum, and update it +- * during n-gram construction) ++ * during n-gram construction) + */ + static uint4 simplehash( const char *p, int len ) + { +@@ -134,29 +140,14 @@ + } + + +-/* checks if n-gram lex is a prefix of key and of length len */ +-inline int issame( char *lex, char *key, int len ) +-{ +- int i; +- for (i=0; i<len; i++) { +- if ( key[i] != lex[i] ) { +- return 0; +- } +- } +- if ( 
lex[i] != 0 ) { +- return 0; +- } +- return 1; +-} +- + + /* increases frequency of ngram(p,len) */ +-static inline int increasefreq( table_t *t, char *p, int len ) +-{ +- uint4 hash = simplehash( p, len ) & TABLEMASK; ++static int increasefreq( table_t *t, char *p, int len ) ++{ ++ uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; +- +- while ( entry ) { ++ ++ while ( entry ) { + if ( issame( entry->str, p, len ) ) { + /*** Found it! ***/ + entry->cnt++; +@@ -168,7 +159,7 @@ + } + + /*** Not found, so create ***/ +- entry = wgmempool_alloc( t->pool, sizeof(entry_t) ); ++ entry = (entry_t*)(wgmempool_alloc( t->pool, sizeof(entry_t) )); + strcpy( entry->str, p ); + entry->cnt = 1; + +@@ -181,12 +172,12 @@ + #if 0 + + /* looks up ngram(p,len) */ +-static entry_t *findfreq( table_t *t, char *p, int len ) +-{ +- uint4 hash = simplehash( p, len ) & TABLEMASK; ++static entry_t *findfreq( table_t *t, char *p, int len ) ++{ ++ uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; +- +- while ( entry ) { ++ ++ while ( entry ) { + if ( issame( entry->str, p, len ) ) { + return entry; + } +@@ -219,7 +210,7 @@ + #define GREATER(x,y) ((x).cnt > (y).cnt) + #define LESS(x,y) ((x).cnt < (y).cnt) + +-inline static void siftup( table_t *t, unsigned int child ) ++static void siftup( table_t *t, unsigned int child ) + { + entry_t *heap = t->heap; + unsigned int parent = (child-1) >> 1; +@@ -241,7 +232,7 @@ + } + + +-inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) ++static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) + { + entry_t *heap = t->heap; + unsigned int child = parent*2 + 1; +@@ -273,7 +264,7 @@ + if (t->size < t->heapsize) { + memcpy( &(heap[t->size]), item, sizeof(entry_t)); + siftup( t, t->size ); +- t->size++; ++ t->size++; + return 0; + } + +@@ -316,18 +307,18 @@ + + /*** Fill result heap ***/ + for (i=0; i<TABLESIZE; i++) { +- entry_t *p = t->table[i]; ++ 
entry_t *p = t->table[i]; + while (p) { + heapinsert(t, p); + p = p->next; + } +- } ++ } + return 1; + } + + + static table_t *inittable(uint4 maxngrams) +-{ ++{ + table_t *result = (table_t *)wg_zalloc( sizeof(table_t) ); + result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE ); + result->pool = wgmempool_Init( 10000, 10 ); +@@ -347,14 +338,14 @@ + wgmempool_Done(t->pool); + wg_free(t->table); + wg_free(t->heap); +- wg_free(t); ++ wg_free(t); + } + + + extern void *fp_Init(const char *name) + { + fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) ); +- ++ + if ( name ) { + h->name = wg_strdup(name); + } +@@ -458,21 +449,27 @@ + return dest; + } + +- ++/** ++* this function extract all n-gram from past buffer and put them into the table "t" ++* [modified] by Jocelyn Merand to accept utf-8 multi-character symbols to be used in OpenOffice ++*/ + static void createngramtable( table_t *t, const char *buf ) + { + char n[MAXNGRAMSIZE+1]; + const char *p = buf; + int i; ++ int pointer = 0; + + /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. 
***/ +- for (;;p++) { ++ while(1) { + +- const char *q = p; ++ const char *q = &p[pointer]; /*[modified] previously p++ above (for(;;p++)) now, it's pointer wich is increased so we have to get the new pointer on the buffer*/ + char *m = n; + + /*** First char may be an underscore ***/ +- *m++ = *q++; ++ int decay = charcopy(q, m); /*[modified] previously *q++ = *m++*/ ++ q = &(p[pointer+decay]); /*[modified] the old copying method do not manage multi-character symbols*/ ++ m += decay; /*[modified]*/ + *m = '\0'; + + increasefreq( t, n, 1 ); +@@ -482,19 +479,22 @@ + } + + /*** Let the compiler unroll this ***/ +- for ( i=2; i<=MAXNGRAMSIZE; i++) { ++ for ( i=2; i<=MAXNGRAMSYMBOL; i++) { + +- *m++ = *q; ++ decay = charcopy(q, m); /*[modified] like above*/ ++ m += decay; + *m = '\0'; + + increasefreq( t, n, i ); + + if ( *q == '_' ) break; +- q++; ++ q += decay; + if ( *q == '\0' ) { + return; + } + } ++ ++ pointer = nextcharstart(p,pointer); /*[modified] p[pointer] must point on the next start of symbol, but whith utf next start is not surely next char*/ + } + return; + } +@@ -514,7 +514,7 @@ + { + ngram_t *x = (ngram_t *)a; + ngram_t *y = (ngram_t *)b; +- ++ + return mystrcmp( x->str, y->str ); + } + +@@ -522,12 +522,12 @@ + { + ngram_t *x = (ngram_t *)a; + ngram_t *y = (ngram_t *)b; +- ++ + return x->rank - y->rank; + } + + /** +- * Create a fingerprint: ++ * Create a fingerprint: + * - record the frequency of each unique n-gram in a hash table + * - take the most frequent n-grams + * - sort them alphabetically, recording their relative rank +@@ -544,20 +544,21 @@ + } + + /*** Throw out all invalid chars ***/ +- tmp = prepbuffer( buffer, bufsize ); ++ tmp = prepbuffer( buffer, bufsize ); ++ /*printf("Cleaned buffer : %s\n",tmp);*/ + if ( tmp == NULL ) { + return 0; + } +- + h = (fp_t*)handle; + t = inittable(maxngrams); ++ /*printf("Table initialized\n");*/ + + /*** Create a hash table containing n-gram counts ***/ + createngramtable(t, tmp); +- ++ /*printf("Table 
created\n");*/ + /*** Take the top N n-grams and add them to the profile ***/ +- table2heap(t); +- maxngrams = WGMIN( maxngrams, t->size ); ++ table2heap(t); ++ maxngrams = WGMIN( maxngrams, t->size ); + + h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams ); + h->size = maxngrams; +@@ -568,7 +569,7 @@ + entry_t tmp2; + + heapextract(t, &tmp2); +- ++ + /*** the string and its rank is all we need ***/ + strcpy( h->fprint[i].str, tmp2.str ); + h->fprint[i].rank = i; +@@ -578,7 +579,7 @@ + wg_free(tmp); + + /*** Sort n-grams alphabetically, for easy comparison ***/ +- qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); ++ qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); + return 1; + } + +@@ -608,7 +609,7 @@ + #endif + return 0; + } +- ++ + h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t)); + + while (cnt < maxngrams && wg_getline(line,1024,fp)) { +@@ -635,7 +636,7 @@ + h->size = cnt; + + /*** Sort n-grams, for easy comparison later on ***/ +- qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); ++ qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); + + fclose(fp); + +@@ -648,14 +649,15 @@ + { + uint4 i; + fp_t *h = (fp_t *)handle; +- ngram_t *tmp = wg_malloc( sizeof(ngram_t) * h->size ); +- ++ ngram_t *tmp = (ngram_t*)wg_malloc( sizeof(ngram_t) * h->size ); ++ + /*** Make a temporary and sort it on rank ***/ + memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) ); +- qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank ); ++ qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank ); + + for (i=0; i<h->size; i++) { +- fprintf( fp, "%s\n", tmp[i].str ); ++ /*fprintf( fp, "%s\t%i\n", tmp[i].str, tmp[i].rank );*/ ++ fprintf( fp, "%s\n", tmp[i].str); + } + wg_free( tmp ); + } +@@ -669,7 +671,7 @@ + uint4 i = 0; + uint4 j = 0; + sint4 sum = 0; +- ++ + /*** Compare the profiles in mergesort fashion ***/ + while ( i < c->size && j < u->size ) { + +@@ -705,7 +707,7 @@ + } + + return sum; +- ++ + } + + +--- 
misc/libtextcat-2.2/src/fingerprint.h Mon May 19 14:16:31 2003 ++++ misc/build/libtextcat-2.2/src/fingerprint.h Mon Mar 31 11:29:14 2008 +@@ -41,7 +41,13 @@ + extern int fp_Read( void *handle, const char *fname, int maxngrams ); + extern sint4 fp_Compare( void *cat, void *unknown, int cutoff ); + extern void fp_Show( void *handle ); ++#ifdef __cplusplus ++extern "C" { ++#endif + extern const char *fp_Name( void *handle ); ++#ifdef __cplusplus ++} ++#endif + extern void fp_Print( void *handle, FILE *fp ); + + #endif +--- misc/libtextcat-2.2/src/libtextcat.map Mon Mar 31 11:30:06 2008 ++++ misc/build/libtextcat-2.2/src/libtextcat.map Mon Mar 31 11:29:14 2008 +@@ -1 +1,40 @@ +-dummy ++{ ++ global: ++ charcopy ++ issame ++ nextcharstart ++ utfstrlen ++ wgmempool_Done ++ wgmempool_Init ++ wgmempool_Reset ++ wgmempool_alloc ++ wgmempool_getline ++ wgmempool_strdup ++ special_textcat_Init ++ textcat_Classify ++ textcat_Done ++ textcat_Init ++ textcat_Version ++ fp_Compare ++ fp_Create ++ fp_Debug ++ fp_Done ++ fp_Init ++ fp_Name ++ fp_Print ++ fp_Read ++ heapextract ++ wg_calloc ++ wg_free ++ wg_getline ++ wg_malloc ++ wg_split ++ wg_strdup ++ wg_strgmov ++ wg_trim ++ wg_zalloc ++ wgmem_error ++ ++ local: ++ *; ++} +--- misc/libtextcat-2.2/src/makefile.mk Mon Mar 31 11:30:06 2008 ++++ misc/build/libtextcat-2.2/src/makefile.mk Mon Mar 31 11:29:42 2008 +@@ -1 +1,90 @@ +-dummy ++#************************************************************************* ++# ++# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++# ++# Copyright 2008 by Sun Microsystems, Inc. ++# ++# OpenOffice.org - a multi-platform office productivity suite ++# ++# $RCSfile: libtextcat-2.2.patch,v $ ++# ++# $Revision: 1.8 $ ++# ++# This file is part of OpenOffice.org. ++# ++# OpenOffice.org is free software: you can redistribute it and/or modify ++# it under the terms of the GNU Lesser General Public License version 3 ++# only, as published by the Free Software Foundation. 
++# ++# OpenOffice.org is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU Lesser General Public License version 3 for more details ++# (a copy is included in the LICENSE file that accompanied this code). ++# ++# You should have received a copy of the GNU Lesser General Public License ++# version 3 along with OpenOffice.org. If not, see ++# <http://www.openoffice.org/license.html> ++# for a copy of the LGPLv3 License. ++# ++#************************************************************************* ++ ++PRJ = ..$/..$/..$/..$/.. ++ ++PRJNAME = libtextcat ++TARGET = libtextcat ++CFLAGSCALL=gsd ++ ++USE_DEFFILE=TRUE ++EXTERNAL_WARNINGS_NOT_ERRORS := TRUE ++ ++.INCLUDE : settings.mk ++ ++# --- Files -------------------------------------------------------- ++ ++# !! not to be compiled because those belong to a stand alone programs: !! ++# $(SLO)$/createfp.obj\ ++# $(SLO)$/testtextcat.obj ++ ++SLOFILES= \ ++ $(SLO)$/common.obj\ ++ $(SLO)$/fingerprint.obj\ ++ $(SLO)$/textcat.obj\ ++ $(SLO)$/wg_mempool.obj\ ++ $(SLO)$/utf8misc.obj ++ ++#SHL1TARGET= $(TARGET)$(UPD)$(DLLPOSTFIX) ++SHL1TARGET= $(TARGET) ++ ++SHL1STDLIBS= ++ ++# build DLL ++SHL1LIBS= $(SLB)$/$(TARGET).lib ++SHL1IMPLIB= i$(TARGET) ++SHL1DEPN= $(SHL1LIBS) ++SHL1DEF= $(MISC)$/$(SHL1TARGET).def ++ ++# build DEF file ++DEF1NAME= $(SHL1TARGET) ++DEF1DEPN=$(MISC)$/$(SHL1TARGET).flt ++ ++SHL1VERSIONMAP= libtextcat.map ++ ++# --- Targets ------------------------------------------------------ ++ ++.INCLUDE : target.mk ++ ++# copy hand supplied configuration file for Win32 builds to the file ++# which is included in the source code ++$(SLOFILES) : config.h ++config.h : ++ $(GNUCOPY) $(OUT)$/misc$/build$/libtextcat-2.2$/src$/win32_config.h $(OUT)$/misc$/build$/libtextcat-2.2$/src$/config.h ++ ++ ++$(MISC)$/$(SHL1TARGET).flt: makefile.mk ++ @echo ------------------------------ ++ @echo 
Making: $@ ++ @echo Imp>$@ ++ @echo __CT>>$@ ++ @echo _real>>$@ ++ @echo unnamed>>$@ +--- misc/libtextcat-2.2/src/textcat.c Thu May 22 13:32:43 2003 ++++ misc/build/libtextcat-2.2/src/textcat.c Mon Mar 31 11:29:14 2008 +@@ -4,23 +4,23 @@ + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE +- * ++ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +- * ++ * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +- * ++ * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +- * ++ * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+- * ++ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +@@ -74,6 +74,7 @@ + typedef struct { + + void **fprint; ++ char *fprint_disable; + uint4 size; + uint4 maxsize; + +@@ -112,11 +113,21 @@ + fp_Done( h->fprint[i] ); + } + wg_free( h->fprint ); ++ wg_free( h->fprint_disable ); + wg_free( h ); + + } + +-extern void *textcat_Init( const char *conffile ) ++/** Replaces older function */ ++extern void *textcat_Init( const char *conffile ){ ++ return special_textcat_Init( conffile, DEFAULT_FINGERPRINTS_PATH ); ++} ++ ++/** ++ * Originaly this function had only one parameter (conffile) it has been modified since OOo use ++ * Basicaly prefix is the directory path where fingerprints are stored ++ */ ++extern void *special_textcat_Init( const char *conffile, const char *prefix ) + { + textcat_t *h; + char line[1024]; +@@ -134,11 +145,13 @@ + h->size = 0; + h->maxsize = 16; + h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize ); ++ h->fprint_disable = (char *)wg_malloc( sizeof(char*) * h->maxsize ); /*added to store the state of languages*/ + + while ( wg_getline( line, 1024, fp ) ) { + char *p; + char *segment[4]; +- int res; ++ char finger_print_file_name[512]; ++ int res; + + /*** Skip comments ***/ + #ifdef HAVE_STRCHR +@@ -156,17 +169,23 @@ + /*** Ensure enough space ***/ + if ( h->size == h->maxsize ) { + h->maxsize *= 2; +- h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); ++ h->fprint = (void **)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); ++ h->fprint_disable = (char *)wg_realloc( h->fprint_disable, sizeof(char*) * h->maxsize ); + } + + /*** Load data ***/ + if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) { + goto ERROR; + } +- if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) { ++ finger_print_file_name[0] = '\0'; ++ 
strcat(finger_print_file_name, prefix); ++ strcat(finger_print_file_name, segment[0]); ++ ++ if ( fp_Read( h->fprint[h->size], finger_print_file_name, 400 ) == 0 ) { + textcat_Done(h); + goto ERROR; +- } ++ } ++ h->fprint_disable[h->size] = 0xF0; /*0xF0 is the code for enabled languages, 0x0F is for disabled*/ + h->size++; + } + +@@ -203,11 +222,18 @@ + result = _TEXTCAT_RESULT_SHORT; + goto READY; + } +- ++ + /*** Calculate the score for each category. ***/ + for (i=0; i<h->size; i++) { +- int score = fp_Compare( h->fprint[i], unknown, threshold ); +- candidates[i].score = score; ++ int score; ++ if(h->fprint_disable[i] & 0x0F){ /*if this language is disabled*/ ++ score = MAXSCORE; ++ } ++ else{ ++ score = fp_Compare( h->fprint[i], unknown, threshold ); ++ /*printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score);*/ ++ } ++ candidates[i].score = score; + candidates[i].name = fp_Name( h->fprint[i] ); + if ( score < minscore ) { + minscore = score; +@@ -218,7 +244,6 @@ + /*** Find the best performers ***/ + for (i=0; i<h->size; i++) { + if ( candidates[i].score < threshold ) { +- + if ( ++cnt == MAXCANDIDATES+1 ) { + break; + } +@@ -235,7 +260,7 @@ + else { + char *p = result; + char *plimit = result+MAXOUTPUTSIZE; +- ++ + qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates ); + + *p = '\0'; +@@ -247,7 +272,7 @@ + } + READY: + fp_Done(unknown); +-#ifdef SHOULD_FREE ++#ifdef SHOULD_FREE + free(candidates); + #undef SHOULD_FREE + #endif +--- misc/libtextcat-2.2/src/textcat.h Mon May 19 14:16:31 2003 ++++ misc/build/libtextcat-2.2/src/textcat.h Mon Mar 31 11:29:14 2008 +@@ -40,6 +40,9 @@ + #define _TEXTCAT_RESULT_UNKOWN "UNKNOWN" + #define _TEXTCAT_RESULT_SHORT "SHORT" + ++#ifdef __cplusplus ++extern "C" { ++#endif + + /** + * textcat_Init() - Initialize the text classifier. The textfile +@@ -51,10 +54,19 @@ + * Returns: handle on success, NULL on error. 
(At the moment, the + * only way errors can occur, is when the library cannot read the + * conffile, or one of the fingerprint files listed in it.) ++ * ++ * Replace older function (and has exacly the same behaviour) ++ * see below + */ + extern void *textcat_Init( const char *conffile ); + + /** ++ * Originaly this function had only one parameter (conffile) it has been modified since OOo must be able to load alternativ DB ++ * Basicaly prefix is the directory path where fingerprints are stored ++ */ ++extern void *special_textcat_Init( const char *conffile, const char *prefix ); ++ ++/** + * textcat_Done() - Free up resources for handle + */ + extern void textcat_Done( void *handle ); +@@ -77,4 +89,8 @@ + * textcat_Version() - Returns a string describing the version of this classifier. + */ + extern char *textcat_Version(); ++ ++#ifdef __cplusplus ++} ++#endif + #endif +--- misc/libtextcat-2.2/src/utf8misc.c Mon Mar 31 11:30:06 2008 ++++ misc/build/libtextcat-2.2/src/utf8misc.c Mon Mar 31 11:29:14 2008 +@@ -1 +1,132 @@ +-dummy ++/*************************************************************************** ++ * Copyright (C) 2006 by Jocelyn Merand * ++ * joc.mer@gmail.com * ++ * * ++ * THE BSD LICENSE ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the ++ * distribution. ++ * ++ * - Neither the name of the WiseGuys Internet B.V. nor the names of ++ * its contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
/*
 * (remainder of the BSD license header of src/utf8misc.c)
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * utf8misc.c - character-wise helpers for (optionally) UTF-8 encoded strings.
 *
 * ESCAPE_MASK and WEIGHT_MASK normally come from utf8misc.h. The header is
 * pulled in whenever it can be found; the fallback definitions further down
 * only take effect when it cannot, which lets this file be compiled on its
 * own (for example in a unit test).
 */
#ifndef _UTF8_MISC_H_
#  ifdef __has_include
#    if __has_include("utf8misc.h")
#      include "utf8misc.h"
#    endif
#  else
#    include "utf8misc.h"
#  endif
#endif

/* Fallback masks; they must stay identical to the ones in utf8misc.h. */
#ifndef ESCAPE_MASK
#  ifdef _UTF8_
#    define ESCAPE_MASK 0x80
#    define WEIGHT_MASK 0xF0
#  else
#    define ESCAPE_MASK 0xFF
#    define WEIGHT_MASK 0x00
#  endif
#endif

/*
 * Builds the shift register used to count the bytes that follow lead byte c:
 * the high nibble of c moved one bit to the left. Every set top bit that is
 * subsequently shifted out stands for one more byte of the same character.
 *
 * The value is kept in an unsigned char on purpose: with a plain (possibly
 * signed) char the register becomes negative as soon as its top bit is set,
 * and left-shifting a negative value is undefined behaviour in C.
 */
static unsigned char utf8_follow_counter(char c)
{
    return (unsigned char)(((unsigned char)c & WEIGHT_MASK) << 1);
}

/*
 * Returns the index of the byte at which the character following the one at
 * str[position] starts.
 *
 * The scan never moves past the terminating NUL: if str[position] is NUL, or
 * the string ends in the middle of a multi-byte character, the index of the
 * NUL is returned.
 */
int nextcharstart(const char *str, int position)
{
    int pointer = position;

    if (str[pointer] & ESCAPE_MASK) {
        /* str[pointer] announces a multi-byte character */
        unsigned char escape_char = utf8_follow_counter(str[pointer]);

        /* one step per announced byte; stop early at the end of the string */
        while ((escape_char & ESCAPE_MASK) && str[pointer]) {
            escape_char = (unsigned char)(escape_char << 1);
            ++pointer;
        }
    }
    if (str[pointer]) {
        /* not on the terminator: step over the last byte of the character */
        ++pointer;
    }
    return pointer;
}

/*
 * Copies the first character of str (all of its bytes) to dest.
 *
 * Returns the number of bytes copied, 0 when str is empty. dest is NOT
 * NUL-terminated by this function; the caller has to provide enough room
 * and terminate the buffer if needed.
 */
int charcopy(const char *str, char *dest)
{
    int pointer = 0;

    if (str[pointer] & ESCAPE_MASK) {
        /* str[pointer] announces a multi-byte character */
        unsigned char escape_char = utf8_follow_counter(str[pointer]);

        while ((escape_char & ESCAPE_MASK) && str[pointer]) {
            dest[pointer] = str[pointer];
            escape_char = (unsigned char)(escape_char << 1);
            ++pointer;
        }
    }
    if (str[pointer]) {
        dest[pointer] = str[pointer];
        ++pointer;
    }

    return pointer;
}

/*
 * Checks whether the n-gram lex consists of exactly the first len characters
 * of key. len counts characters, not bytes.
 *
 * lex must be NUL-terminated. Returns 1 on a match and 0 otherwise; in
 * particular 0 is returned when both strings end before len characters have
 * been compared (the scan used to run past the terminators in that case).
 */
int issame( char *lex, char *key, int len )
{
    int char_counter = 0;
    int pointer = 0;

    while (char_counter < len) {

        if (key[pointer] & ESCAPE_MASK) {
            /* key[pointer] announces a multi-byte character */
            unsigned char escape_char = utf8_follow_counter(key[pointer]);

            /* walk over the leading bytes as long as both strings agree,
               but never beyond the end of key */
            while ((escape_char & ESCAPE_MASK)
                   && key[pointer] != '\0'
                   && key[pointer] == lex[pointer]) {
                escape_char = (unsigned char)(escape_char << 1);
                ++pointer;
            }
        }
        ++char_counter; /* one more character has been looked at */
        if (key[pointer] != lex[pointer]) {
            return 0;
        }
        if (key[pointer] == '\0') {
            /* both strings are exhausted: lex has fewer than len characters */
            return 0;
        }
        ++pointer;
    }
    if (lex[pointer] != '\0') {
        /* lex is longer than len characters */
        return 0;
    }

    return 1;
}

/*
 * Counts the characters (not the bytes) of the NUL-terminated string str.
 */
extern int utfstrlen(const char* str)
{
    int char_counter = 0;
    int pointer = 0;

    while (str[pointer]) {
        pointer = nextcharstart(str, pointer);
        ++char_counter; /* we are on a new character */
    }
    return char_counter;
}

/*
 * Next file in the patch: src/utf8misc.h (it replaces the "dummy" placeholder).
 *
 * Copyright (C) 2006 by Jocelyn Merand
 * joc.mer@gmail.com
 *
 * THE BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met (the conditions and the disclaimer follow):
 */
Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the ++ * distribution. ++ * ++ * - Neither the name of the WiseGuys Internet B.V. nor the names of ++ * its contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*
 * utf8misc.h - character-wise helpers for (optionally) UTF-8 encoded strings.
 * (The BSD license header of this file ends immediately before this point.)
 */

#ifndef _UTF8_MISC_H_
#define _UTF8_MISC_H_

/**
 * Masks used by the character processing functions.
 *
 * With _UTF8_ defined, ESCAPE_MASK selects the top bit of a byte (set on every
 * byte of a multi-byte character) and WEIGHT_MASK selects the high nibble of a
 * lead byte, from which the number of following bytes is derived.
 *
 * Without _UTF8_, WEIGHT_MASK is zero, so no following bytes are ever counted
 * and every byte is treated as one character.
 */
#ifdef _UTF8_
#define ESCAPE_MASK 0x80
#define WEIGHT_MASK 0xF0
#else
#define ESCAPE_MASK 0xFF
#define WEIGHT_MASK 0x00
#endif

/*
 * All functions are defined in C (utf8misc.c), so the whole set of prototypes
 * is given C linkage for C++ callers, not only utfstrlen.
 */
#ifdef __cplusplus
extern "C" {
#endif

/*
 * Jumps to the start of the next character: returns the byte index at which
 * the character following the one at str[position] begins.
 * Only makes a difference to a plain increment when the encoding is UTF-8.
 * This function has been added by Jocelyn Merand to use libtextcat in OOo.
 */
int nextcharstart(const char *str, int position);


/*
 * Copies the first character of str to dest.
 * Only makes a difference to a plain byte copy when the encoding is UTF-8 and
 * the symbol is encoded with more than one byte.
 * Returns the number of bytes copied (i.e. how far to advance in str).
 * This function has been added by Jocelyn Merand to use libtextcat in OOo.
 */
int charcopy(const char *str, char *dest);


/*
 * Checks whether the n-gram lex is a prefix of key and of length len.
 * If _UTF8_ is defined, len is not the byte length of lex but its number of
 * UTF-8 characters: strlen("€") == 3 but len == 1.
 */
int issame( char *lex, char *key, int len );


/*
 * Counts the number of characters in str.
 * If _UTF8_ is defined, the result is not the byte length of str but its
 * number of UTF-8 characters: strlen("€") == 3 but utfstrlen("€") == 1.
 */
extern int utfstrlen(const char* str);

#ifdef __cplusplus
}
#endif

#endif

/*
 * Next file in the patch: src/win32_config.h (it replaces the "dummy"
 * placeholder). Its contents start with the two generated-file notices below.
 */
/* src/config.h.  Generated by configure.  */
/* src/config.h.in.  Generated from configure.ac by autoheader.  */
*/ ++ ++/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP ++ systems. This function is required for `alloca.c' support on those systems. ++ */ ++/* #undef CRAY_STACKSEG_END */ ++ ++/* Define to 1 if using `alloca.c'. */ ++/* #undef C_ALLOCA */ ++ ++/* Define to 1 if you have `alloca', as a function or macro. */ ++/* #undef HAVE_ALLOCA */ ++ ++/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix). ++ */ ++/* #undef HAVE_ALLOCA_H */ ++ ++/* Define to 1 if you have the <dlfcn.h> header file. */ ++#define HAVE_DLFCN_H 1 ++ ++/* Define to 1 if you don't have `vprintf' but do have `_doprnt.' */ ++/* #undef HAVE_DOPRNT */ ++ ++/* Define to 1 if you have the `gettimeofday' function. */ ++/* #undef HAVE_GETTIMEOFDAY */ ++ ++/* Define to 1 if you have the <inttypes.h> header file. */ ++/* #undef HAVE_INTTYPES_H */ ++ ++/* Define to 1 if you have the <limits.h> header file. */ ++#define HAVE_LIMITS_H 1 ++ ++/* Define to 1 if your system has a GNU libc compatible `malloc' function, and ++ to 0 otherwise. */ ++#define HAVE_MALLOC 1 ++ ++/* Define to 1 if you have the <memory.h> header file. */ ++#define HAVE_MEMORY_H 1 ++ ++/* Define to 1 if you have the `memset' function. */ ++#define HAVE_MEMSET 1 ++ ++/* Define to 1 if your system has a GNU libc compatible `realloc' function, ++ and to 0 otherwise. */ ++#define HAVE_REALLOC 1 ++ ++/* Define to 1 if you have the <stdint.h> header file. */ ++/* #undef HAVE_STDINT_H */ ++ ++/* Define to 1 if you have the <stdlib.h> header file. */ ++#define HAVE_STDLIB_H 1 ++ ++/* Define to 1 if you have the `strchr' function. */ ++#define HAVE_STRCHR 1 ++ ++/* Define to 1 if you have the `strdup' function. */ ++#define HAVE_STRDUP 1 ++ ++/* Define to 1 if you have the <strings.h> header file. */ ++/* #undef HAVE_STRINGS_H */ ++ ++/* Define to 1 if you have the <string.h> header file. */ ++#define HAVE_STRING_H 1 ++ ++/* Define to 1 if you have the `strpbrk' function. 
*/ ++#define HAVE_STRPBRK 1 ++ ++/* Define to 1 if you have the <sys/stat.h> header file. */ ++#define HAVE_SYS_STAT_H 1 ++ ++/* Define to 1 if you have the <sys/time.h> header file. */ ++/* #undef HAVE_SYS_TIME_H */ ++ ++/* Define to 1 if you have the <sys/types.h> header file. */ ++#define HAVE_SYS_TYPES_H 1 ++ ++/* Define to 1 if you have the <unistd.h> header file. */ ++#define HAVE_UNISTD_H 1 ++ ++/* Define to 1 if you have the `vprintf' function. */ ++#define HAVE_VPRINTF 1 ++ ++/* Name of package */ ++#define PACKAGE "libtextcat" ++ ++/* Define to the address where bug reports for this package should be sent. */ ++#define PACKAGE_BUGREPORT "" ++ ++/* Define to the full name of this package. */ ++#define PACKAGE_NAME "libtextcat" ++ ++/* Define to the full name and version of this package. */ ++#define PACKAGE_STRING "libtextcat 2.2" ++ ++/* Define to the one symbol short name of this package. */ ++#define PACKAGE_TARNAME "libtextcat" ++ ++/* Define to the version of this package. */ ++#define PACKAGE_VERSION "2.2" ++ ++/* If using the C implementation of alloca, define if you know the ++ direction of stack growth for your system; otherwise it will be ++ automatically deduced at run-time. ++ STACK_DIRECTION > 0 => grows toward higher addresses ++ STACK_DIRECTION < 0 => grows toward lower addresses ++ STACK_DIRECTION = 0 => direction of growth unknown */ ++/* #undef STACK_DIRECTION */ ++ ++/* Define to 1 if you have the ANSI C header files. */ ++#define STDC_HEADERS 1 ++ ++/* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */ ++#define TIME_WITH_SYS_TIME 1 ++ ++/* Define to 1 if your <sys/time.h> declares `struct tm'. */ ++/* #undef TM_IN_SYS_TIME */ ++ ++/* Version number of package */ ++#define VERSION "2.2" ++ ++/* Define to empty if `const' does not conform to ANSI C. */ ++/* #undef const */ ++ ++/* Define as `__inline' if that's what the C compiler calls it, or to nothing ++ if it is not supported. 
*/ ++/* #undef inline */ ++ ++/* Define to rpl_malloc if the replacement function should be used. */ ++/* #undef malloc */ ++ ++/* Define to rpl_realloc if the replacement function should be used. */ ++/* #undef realloc */ ++ ++/* Define to `unsigned' if <sys/types.h> does not define. */ ++/* #undef size_t */ |