diff options
author | kramm <kramm> | 2008-04-05 07:27:03 +0000 |
---|---|---|
committer | kramm <kramm> | 2008-04-05 07:27:03 +0000 |
commit | 8154e11e1c06aefe18c16b33f2b12d6de21273a4 (patch) | |
tree | 30afac30be87bde486481ec954f131afcfa95c3b /lib/gocr | |
parent | e8fe2f290123fc66181709a8a5263ad9e91c6939 (diff) |
(patched) gocr-0.44
Diffstat (limited to 'lib/gocr')
-rw-r--r-- | lib/gocr/box.c | 369 | ||||
-rw-r--r-- | lib/gocr/database.c | 451 | ||||
-rw-r--r-- | lib/gocr/detect.c | 943 | ||||
-rw-r--r-- | lib/gocr/gocr.h | 286 | ||||
-rw-r--r-- | lib/gocr/job.c | 83 | ||||
-rw-r--r-- | lib/gocr/lines.c | 348 | ||||
-rw-r--r-- | lib/gocr/list.c | 334 | ||||
-rw-r--r-- | lib/gocr/list.h | 90 | ||||
-rw-r--r-- | lib/gocr/ocr0.c | 6591 | ||||
-rw-r--r-- | lib/gocr/ocr0.h | 63 | ||||
-rw-r--r-- | lib/gocr/ocr0n.c | 1254 | ||||
-rw-r--r-- | lib/gocr/ocr1.c | 84 | ||||
-rw-r--r-- | lib/gocr/ocr1.h | 3 | ||||
-rw-r--r-- | lib/gocr/otsu.c | 284 | ||||
-rw-r--r-- | lib/gocr/otsu.h | 23 | ||||
-rw-r--r-- | lib/gocr/output.c | 193 | ||||
-rw-r--r-- | lib/gocr/output.h | 36 | ||||
-rw-r--r-- | lib/gocr/pgm2asc.c | 2875 | ||||
-rw-r--r-- | lib/gocr/pgm2asc.h | 106 | ||||
-rw-r--r-- | lib/gocr/pixel.c | 537 | ||||
-rw-r--r-- | lib/gocr/pnm.h | 24 | ||||
-rw-r--r-- | lib/gocr/progress.c | 87 | ||||
-rw-r--r-- | lib/gocr/progress.h | 42 | ||||
-rw-r--r-- | lib/gocr/remove.c | 687 | ||||
-rw-r--r-- | lib/gocr/unicode.c | 1314 | ||||
-rw-r--r-- | lib/gocr/unicode.h | 1257 |
26 files changed, 18364 insertions, 0 deletions
diff --git a/lib/gocr/box.c b/lib/gocr/box.c new file mode 100644 index 00000000..f0cc989f --- /dev/null +++ b/lib/gocr/box.c @@ -0,0 +1,369 @@ +/* +This is a Optical-Character-Recognition program +Copyright (C) 2000-2007 Joerg Schulenburg + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + see README for EMAIL address + + */ + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <string.h> +/* do we need #include <math.h>? conflicts with INFINITY in unicode.h */ +#include "gocr.h" +#include "pgm2asc.h" + +/* for sorting letters by position on the image +/ ToDo: - use function same line like this or include lines.m1 etc. */ +int box_gt(struct box *box1, struct box *box2) { + // box1 after box2 ? + if (box1->line > box2->line) + return 1; + if (box1->line < box2->line) + return 0; + if (box1->x0 > box2->x1) // before + return 1; + if (box1->x1 < box2->x0) // before + return 0; + if (box1->x0 > box2->x0) // before, overlapping! + return 1; + + return 0; +} + +/* --- copy part of pix p into new pix b ---- len=10000 + * Returns: 0 on success, 1 on error. 
+ * naming it as copybox isnt very clever, because it dont have to do with the + * char boxes (struct box) + */ +int copybox (pix * p, int x0, int y0, int dx, int dy, pix * b, int len) { + int x, y; + + /* test boundaries */ + if (b->p == NULL || dx < 0 || dy < 0 || dx * dy > len) { + fprintf(stderr, " error-copybox x=%5d %5d d=%5d %5d\n", x0, y0, dx, dy); + return 1; + } + + b->x = dx; + b->y = dy; + b->bpp = 1; +#ifdef FASTER_INCOMPLETE + for (y = 0; y < dy; y++) + memcpy(&pixel_atp(b, 0, y), &pixel_atp(p, x0, y + y0 ), dx); + // and unmark pixels +#else + for (y = 0; y < dy; y++) + for (x = 0; x < dx; x++) + pixel_atp(b, x, y) = getpixel(p, x + x0, y + y0); +#endif + + return 0; +} + +/* reset table of alternative chars (and free memory) */ +int reset_box_ac(struct box *box){ + int i; + for (i=0; i<box->num_ac; i++) + if (box->tas[i]) { + /* fprintf(stderr,"DBG free_s[%d] %p %s\n",i,box->tas[i],box->tas[i]); */ + free(box->tas[i]); + box->tas[i]=0; /* prevent double freeing */ + } + box->num_ac=0; /* mark as freed */ + return 0; +} + +/* ini or copy a box: get memory for box and initialize the memory */ +struct box *malloc_box (struct box *inibox) { + struct box *buf; + int i; + + buf = (struct box *) malloc(sizeof(struct box)); + if (!buf) + return NULL; + if (inibox) { + memcpy(buf, inibox, sizeof(struct box)); + /* only pointer are copied, we want to copy the contents too */ + for (i=0;i<inibox->num_ac;i++) { + if (inibox->tas[i]) { + buf->tas[i]=(char *)malloc(strlen(inibox->tas[i])+1); + memcpy(buf->tas[i], inibox->tas[i], strlen(inibox->tas[i])+1); + } + } + } + else { /* ToDo: init it */ + buf->num_ac=0; + buf->num_frames=0; + } + /* fprintf(stderr,"\nDBG ini_box %p",buf); */ + return buf; +} + +/* free memory of box */ +int free_box (struct box *box) { + if (!box) return 0; + /* fprintf(stderr,"DBG free_box %p\n",box); out_x(box); */ + reset_box_ac(box); /* free alternative char table */ + free(box); /* free the box memory */ + return 0; +} + +/* 
simplify the vectorgraph, + * but what is the best way? + * a) melting two neighbouring vectors with nearly same direction? + * (nearest angle to pi) + * b) melting three neigbours with smallest area? + * ToDo: + * mode = 0 - only lossless + * mode = 1 - reduce one vector, smallest possible loss + * mode = 2 - remove jitter (todo, or somewhere else) + * ToDo: include also loop around (last - first element) + * ToDo: reduce by 10..50% + */ +int reduce_vectors ( struct box *box1, int mode ) { + int i1, i2, nx, ny, mx, my, len, + minlen=1024, /* minlength of to neighbouring vectors */ + besti1=0, /* frame for best reduction */ + besti2=2; /* vector replacing its predecessor */ + double sprod, maxsprod=-1; + if (mode!=1) fprintf(stderr,"ERR not supported yet, ToDo\n"); + for (i2=1,i1=0; i1<box1->num_frames; i1++) { /* every frame */ + for (;i2<box1->num_frame_vectors[i1]-1; i2++) { /* every vector */ + /* predecessor n */ + nx = box1->frame_vector[i2-0][0] - box1->frame_vector[i2-1][0]; + ny = box1->frame_vector[i2-0][1] - box1->frame_vector[i2-1][1]; + /* successor m */ + mx = box1->frame_vector[i2+1][0] - box1->frame_vector[i2-0][0]; + my = box1->frame_vector[i2+1][1] - box1->frame_vector[i2-0][1]; + /* angle is w = a*b/(|a|*|b|) = 1 means parallel */ + /* normalized: minimize w^2 = (a*b/(|a|*|b|)-1)^2 */ + /* -1=90grd, 0=0grd, -2=180grd */ + sprod = /* fabs */(abs(nx*mx+ny*my)*(nx*mx+ny*my) + /(1.*(nx*nx+ny*ny)*(mx*mx+my*my))-1); + /* we dont include math.h because INFINITY conflicts to unicode,h */ + if (sprod<0) sprod=-sprod; + len = (mx*mx+my*my)*(nx*nx+ny*ny); /* sum lengths^2 */ +// ..c ###c ... .. ... +// .b. len=2+2 #b.. len=2+5 #bc len=1+2 bc len=1+1 b#a len=4+5 +// a.. spr=0 a... spr=1/10 a.. spr=1/4 a. spr=1 ##c spr=9/5 +// + if ( len* sprod* sprod* sprod* sprod + <minlen*maxsprod*maxsprod*maxsprod*maxsprod + || maxsprod<0) /* Bad! ToDo! 
*/ + { maxsprod=sprod; besti1=i1; besti2=i2; minlen=len; } + } + } + if (box1->num_frames>0) + for (i2=besti2; i2<box1->num_frame_vectors[ box1->num_frames-1 ]-1; i2++) { + box1->frame_vector[i2][0]=box1->frame_vector[i2+1][0]; + box1->frame_vector[i2][1]=box1->frame_vector[i2+1][1]; + } + for (i1=besti1; i1<box1->num_frames; i1++) + box1->num_frame_vectors[i1]--; +// fprintf(stderr,"\nDBG_reduce_vectors i= %d nv= %d sprod=%f len2=%d\n# ...", +// besti2,box1->num_frame_vectors[ box1->num_frames-1 ],maxsprod,minlen); +// out_x(box1); + return 0; +} + +/* add the contents of box2 to box1 + * especially add vectors of box2 to box1 + */ +int merge_boxes( struct box *box1, struct box *box2 ) { + int i1, i2, i3, i4; + struct box tmpbox, *bsmaller, *bbigger; /* for mixing and sorting */ + /* DEBUG, use valgrind to check uninitialized memory */ +#if 0 + fprintf(stderr,"\nDBG merge_boxes_input:"); out_x(box1); out_x(box2); +#endif + /* pair distance is to expendable, taking borders is easier */ + if ((box2->x1 - box2->x0)*(box2->y1 - box2->y0) + >(box1->x1 - box1->x0)*(box1->y1 - box1->y0)) { + bbigger=box2; bsmaller=box1; } + else { + bbigger=box1; bsmaller=box2; } + /* ToDo: does not work if a third box is added */ + if (box2->y0>box1->y1 || box2->y1<box1->y0 + || box2->x0>box1->x1 || box2->x1<box1->x0) { + box1->num_boxes += box2->num_boxes; /* num seperate objects 2=ij */ + } else { + if (box2->num_boxes>box1->num_boxes) box1->num_boxes=box2->num_boxes; + box1->num_subboxes += box2->num_subboxes+1; /* num holes 1=abdepq 2=B */ + } + box1->dots += box2->dots; /* num i-dots */ + if ( box2->x0 < box1->x0 ) box1->x0 = box2->x0; + if ( box2->x1 > box1->x1 ) box1->x1 = box2->x1; + if ( box2->y0 < box1->y0 ) box1->y0 = box2->y0; + if ( box2->y1 > box1->y1 ) box1->y1 = box2->y1; + i1 = i2 = 0; + if (bbigger->num_frames) + i1 = bbigger->num_frame_vectors[ bbigger->num_frames - 1 ]; + if (bsmaller->num_frames) + i2 = bsmaller->num_frame_vectors[ bsmaller->num_frames - 1 ]; + 
while (i1+i2 > MaxFrameVectors) { + if (i1>i2) { reduce_vectors( bbigger, 1 ); i1--; } + else { reduce_vectors( bsmaller, 1 ); i2--; } + } + /* if i1+i2>MaxFrameVectors simplify the vectorgraph */ + /* if sum num_frames>MaxNumFrames through shortest graph away and warn */ + /* first copy the bigger box */ + memcpy(&tmpbox, bbigger, sizeof(struct box)); + /* attach the smaller box */ + for (i4=i3=0; i3<bsmaller->num_frames; i3++) { + if (tmpbox.num_frames>=MaxNumFrames) break; + + for (; i4<bsmaller->num_frame_vectors[i3]; i4++) { + memcpy(tmpbox.frame_vector[i1], + bsmaller->frame_vector[i4],2*sizeof(int)); + i1++; + } + tmpbox.num_frame_vectors[ tmpbox.num_frames ] = i1; + tmpbox.frame_vol[ tmpbox.num_frames ] = bsmaller->frame_vol[ i3 ]; + tmpbox.frame_per[ tmpbox.num_frames ] = bsmaller->frame_per[ i3 ]; + tmpbox.num_frames++; + if (tmpbox.num_frames>=MaxNumFrames) { + if (JOB->cfg.verbose) + fprintf(stderr,"\nDBG merge_boxes MaxNumFrames reached"); + break; + } + } + /* copy tmpbox to destination */ + box1->num_frames = tmpbox.num_frames; + memcpy(box1->num_frame_vectors, + tmpbox.num_frame_vectors,sizeof(int)*MaxNumFrames); + memcpy(box1->frame_vol, + tmpbox.frame_vol,sizeof(int)*MaxNumFrames); + memcpy(box1->frame_per, + tmpbox.frame_per,sizeof(int)*MaxNumFrames); + memcpy(box1->frame_vector, + tmpbox.frame_vector,sizeof(int)*2*MaxFrameVectors); +#if 0 + if (JOB->cfg.verbose) + fprintf(stderr,"\nDBG merge_boxes_result:"); out_x(box1); +#endif + return 0; +} + +/* used for division of glued chars + * after a box is splitted into 2, where vectors are copied to both, + * vectors outside the new box are cutted and thrown away, + * later replaced by + * - 1st remove outside vectors with outside neighbours (complete frames?) 
+ * add vector on outside vector with inside neighbours + * care about connections through box between outside vectors + * - 2nd reduce outside crossings (inclusive splitting frames if necessary) + * depending on direction (rotation) of outside connections + * - 3th shift outside vectors to crossing points + * - split add this points, connect only in-out...out-in, + * - cutting can result in more objects + * ToDo: dont connect --1---2--------3----4-- new-y1 (inside above not drawn) + * \ \->>>>-/ / outside + * \----<<<<-----/ old-y1 + * |======| subtractable? + * + * only connect --1---2--------3----4-- new-y1 + * \>>/ \>>>/ old-y1 outside + * + * ToDo: new vol, per + */ +int cut_box( struct box *box1) { + int i1, i2, i3, i4, x, y, lx, ly, dbg=0; + if (JOB->cfg.verbose) dbg=1; // debug level, enlarge to get more output + if (dbg) fprintf(stderr,"\n cut box x= %3d %3d", box1->x0, box1->y0); + /* check if complete frames are outside the box */ + for (i1=0; i1<box1->num_frames; i1++){ + if (dbg>2) fprintf(stderr,"\n checking frame %d outside", i1); + i2 = ((i1)?box1->num_frame_vectors[ i1-1 ]:0); // this frame + i3 = box1->num_frame_vectors[ i1 ]; // next frame + for (i4=i2; i4 < i3; i4++) { + x = box1->frame_vector[i4][0]; + y = box1->frame_vector[i4][1]; + /* break, if one vector is lying inside */ + if (x>=box1->x0 && x<=box1->x1 && y>=box1->y0 && y<=box1->y1) break; + } + if (i4==i3) { /* all vectors outside */ + if (dbg>1) fprintf(stderr,"\n remove frame %d",i1); + /* replace all frames i1,i1+1,... by i1+1,i1+2,... 
*/ + /* replace (x,y) pairs first */ + for (i4=i2; i4<box1->num_frame_vectors[ box1->num_frames-1 ]-(i3-i2); + i4++) { + box1->frame_vector[i4][0] = box1->frame_vector[i4+i3-i2][0]; + box1->frame_vector[i4][1] = box1->frame_vector[i4+i3-i2][1]; + } + /* replace the num_frame_vectors */ + for (i4=i1; i4<box1->num_frames-1; i4++) + box1->num_frame_vectors[ i4 ] = + box1->num_frame_vectors[ i4+1 ]-(i3-i2); + box1->num_frames--; i1--; + } + } + /* remove vectors outside the box */ + i3=0; + for (i1=0; i1<box1->num_frames; i1++){ + if (dbg>2) fprintf(stderr,"\n check cutting vectors on frame %d", i1); + x = box1->frame_vector[0][0]; /* last x */ + y = box1->frame_vector[0][1]; /* last y */ + /* ToDo: start inside to get a closed object */ + if (x<box1->x0 || x>box1->x1 || y<box1->y0 || y>box1->y1) i3=1; + for (i2=0; i2<box1->num_frame_vectors[ i1 ]; i2++) { + lx = x; /* last x */ + ly = y; /* last y */ + x = box1->frame_vector[i2][0]; + y = box1->frame_vector[i2][1]; + // fprintf(stderr,"DBG LEV3 i2= %3d xy= %3d %3d",i2,x,y); + /* check if outside */ + if (x<box1->x0 || x>box1->x1 || y<box1->y0 || y>box1->y1) { + /* replace by nearest point at border, ToDo: better crossingpoint */ + if (i3==0) { /* wrong if it starts outside */ + if (x < box1->x0) x = box1->frame_vector[i2][0] = box1->x0; + if (x > box1->x1) x = box1->frame_vector[i2][0] = box1->x1; + if (y < box1->y0) y = box1->frame_vector[i2][1] = box1->y0; + if (y > box1->y1) y = box1->frame_vector[i2][1] = box1->y1; + } else { + /* remove vector */ + if (dbg>1) fprintf(stderr,"\n remove vector[%d][%d] x= %2d %2d",i1,i2,x-box1->x0,y-box1->y0); + for (i4=i2;i4<box1->num_frame_vectors[ box1->num_frames-1 ]-1;i4++) { + box1->frame_vector[i4][0] = box1->frame_vector[i4+1][0]; + box1->frame_vector[i4][1] = box1->frame_vector[i4+1][1]; + } + for (i4=i1; i4<box1->num_frames; i4++) + box1->num_frame_vectors[ i4 ]--; + i2--; /* next element is shiftet now, setting back the counter */ + } + i3++; + // fprintf(stderr," outside 
i3= %d\n",i3); + continue; + } + // fprintf(stderr," inside i3= %d",i3); + if (i3) { /* ToDo: better crossing point last vector and border */ + if (lx < box1->x0) lx = box1->x0; + if (lx > box1->x1) lx = box1->x1; + if (ly < box1->y0) ly = box1->y0; + if (ly > box1->y1) ly = box1->y1; + x = box1->frame_vector[i2][0] = lx; + y = box1->frame_vector[i2][1] = ly; + i3 = 0; + } + // fprintf(stderr," xy= %3d %3d\n",x,y); + } + } + //if (dbg>2) { fprintf(stderr,"\nDBG cut_box_result:"); out_x(box1); } + return 0; +} + diff --git a/lib/gocr/database.c b/lib/gocr/database.c new file mode 100644 index 00000000..21a4f02a --- /dev/null +++ b/lib/gocr/database.c @@ -0,0 +1,451 @@ +/* +This is a Optical-Character-Recognition program +Copyright (C) 2000-2006 Joerg Schulenburg + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + see README for EMAIL address + */ + +#include <stdio.h> +#include <stdlib.h> +#include "gocr.h" +#include "pnm.h" +#include "pgm2asc.h" +#include <string.h> +#include <time.h> + +#define Blen 256 + +// load boxes from database into boxlist (for faster access) +// used as alternate engine, comparing chars with database +int load_db(void) { + FILE *f1; + char s1[Blen+1], + s2[Blen+1] = "./db/", /* ToDo: replace by constant! 
by configure */ + *s3; + int i, j, ii, i2, line; + struct box *box1; + pix *pp; + + if( JOB->cfg.db_path ) strncpy(s2,JOB->cfg.db_path,Blen-1); + i2=strlen(s2); + if (JOB->cfg.verbose) + fprintf(stderr, "# load database %s %s ... ",s2,JOB->cfg.db_path); + + strncpy(s2+i2,"db.lst",Blen-i2);s2[Blen]=0; + f1 = fopen(s2, "r"); + if (!f1) { + fprintf(stderr, " DB %s not found\n",s2); + return 1; + } + + line = 0; /* line counter for better error report */ + for (ii = 0; !feof(f1); ii++) { +/* bbg: should write a better input routine */ + if (!fgets(s1, Blen, f1)) break; line++; + j = strlen(s1); + /* remove carriage return sequences from line */ + while (j > 0 && (s1[j - 1] == '\r' || s1[j - 1] == '\n')) + s1[--j] = 0; + if (!j) continue; /* skip empty line */ + if (s1[0]=='#') continue; /* skip comments (v0.44) */ + /* copy file name */ + for (i = 0; i < j && i+i2 < Blen && strchr(" \t,;",s1[i]) == 0; i++) + s2[i2 + i] = s1[i]; + s2[i2+i]=0; + /* skip spaces */ + for (; i < j && strchr(" \t",s1[i]) != 0; i++); + /* by now: read pix, fill box, goto next ??? */ + pp = (pix *)malloc(sizeof(pix)); + if( !pp ) fprintf(stderr,"malloc error in load_db pix\n"); + + //readpgm(s2, pp, 0 * JOB->cfg.verbose); + fprintf(stderr, "Can't call readpgm()\n"); + + box1 = (struct box *)malloc_box(NULL); + if(!box1) fprintf(stderr,"malloc error in load_db box1\n"); + box1->x0 = 0; + box1->x1 = pp->x-1; // white border 1 pixel width + box1->y0 = 0; + box1->y1 = pp->y-1; + box1->x = 1; + box1->y = 1; + box1->dots = 0; + box1->c = 0; + box1->modifier = 0; /* ToDo: obsolete */ + box1->tas[0]=NULL; + box1->tac[0]=0; + box1->wac[0]=100; /* really 100% sure? 
*/ + box1->num_ac=1; + if (s1[i]=='"'){ /* parse a string */ + j=strrchr(s1+i+1,'"')-(s1+i+1); /* we only look for first and last "" */ + if (j>=1) { + s3=(char *)malloc(j+1); + if (!s3) fprintf (stderr, "malloc error in load_db s3\n"); + if (s3) { + memcpy(s3,s1+i+1,j); + s3[j]=0; + box1->tas[0]=s3; + // fprintf(stderr,"\nstring=%s",s3); + } + } else { fprintf(stderr,"load_db: string parse error L%d\n",line); } + } else { + box1->tac[0] = box1->c = s1[i]; /* try to interpret as ASCII */ + /* we can live without hexcode in future if we use UTF8-strings */ + s3=s1+i; + j=strtol( s1+i, &s3, 16); /* try to read 4 to 8 digit hex unicode */ + /* if its an hexcode, ASCII interpretation is overwritten */ + if( j && i+3<=Blen && s3-s1-i>3 ) box1->tac[0] = box1->c = j; + // fprintf(stderr,"\nhexcode=%04x=%04x %d",(int)j,(int)box1->c,s3-s1-i); + } + box1->num = 0; + box1->line = -1; + box1->m1 = 0; /* ToDo: should be given too in the database! */ + box1->m2 = 0; + box1->m3 = 0; + box1->m4 = 0; + box1->p = pp; + list_app(&JOB->tmp.dblist, box1); // append to list +#if 0 + out_x(box1); +#endif + } + fclose(f1); + if (JOB->cfg.verbose) + fprintf(stderr, " %d chars loaded\n", ii); + return 0; +} + +// expand database from box/boxlist name=db_$utime.pbm +// this is added in version v0.3.3 +int store_db(struct box *box1) { + FILE *f1; + char s2[Blen+1] = "./db/", s3[Blen+1]; + int i2, dx, dy; + pix b; /* temporary mini page */ + + if( JOB->cfg.db_path ) strncpy(s2,JOB->cfg.db_path,Blen-1); + i2=strlen(s2); + + /* name generation can cause problems, if called twice within a second */ + if (box1->num_ac && box1->tas[0]) + sprintf(s3,"db_%04x_%lu.pbm", (unsigned int)box1->tas[0][0], + (unsigned long)time(NULL)); + else + sprintf(s3,"db_%04x_%lu.pbm", (unsigned int)box1->c, + (unsigned long)time(NULL)); + /* ToDo: the file name may be not unique */ + strncpy(s2+i2,"db.lst",Blen-i2);s2[Blen]=0; + f1 = fopen(s2, "a"); + if (!f1) { + fprintf(stderr, " could not access %s\n",s2); + return 
1; + } + strncpy(s2+i2,s3,strlen(s3)); s2[i2+strlen(s3)]=0; + /* store image and infos about the char */ + /* ToDo: store the vector list instead of the pixelarray */ + + if (JOB->cfg.verbose) + fprintf(stderr, "store_db: add file %s to database\n#",s3); + dx=box1->x1-box1->x0+1; + dy=box1->y1-box1->y0+1; + b.p = (unsigned char *) malloc( dx * dy ); + if( !b.p ){ + fprintf( stderr, "\nFATAL: malloc failed, skip store_db" ); + return 2; + } + if (copybox(box1->p, box1->x0, box1->y0, dx, dy, &b, dx * dy)) + return -1; + + //writepbm(s2,&b); /* What is to do on error? */ + + free(b.p); + + /* store the database line */ + /* some infos about box1->m1,..,m4 should added (base line, high etc.) */ + if (box1->num_ac && box1->tas[0]) { + fprintf(f1, "%s \"%s\"\n",s3,box1->tas[0]); + /* ToDo: what if tas contains '"'? */ + } else { + if( (box1->c >= '0' && box1->c <= '9') + || (box1->c >= 'A' && box1->c <= 'Z') + || (box1->c >= 'a' && box1->c <= 'z') ) + fprintf(f1, "%s %c\n",s3,(char)box1->c); + else { + if (((box1->c)>>16)>>16) + fprintf(f1, "%s %08x\n",s3,(unsigned int)box1->c); + else + fprintf(f1, "%s %04x\n",s3,(unsigned int)box1->c); + } + } + fclose(f1); + return 0; +} + +/* function is only for user prompt on console to identify chars + it prints out a part of pixmap b at point x0,y0 to stderr + using dots .,; if no pixel, and @xoO for pixels + */ +void out_env(struct box *px ){ + int x0,y0,x1,y1,dx,dy,x,y,x2,y2,yy0,tx,ty,i,cs; + char c1, c2; pix *b; + cs=JOB->cfg.cs; + yy0=px->y0; + { /* overwrite rest of arguments */ + b=px->p; + x0=px->x0; x1=px->x1; dx=x1-x0+1; + y0=px->y0; y1=px->y1; dy=y1-y0+1; + y0-=2; y1+=2; + if (px->m4 && y0>px->m1) y0=px->m1; + if (px->m4 && y1<px->m4) y1=px->m4; + if (x1-x0+1<52) { x0-=10; x1+=10; } /* fragment? expand frame */ + if (x1-x0+1<52) { x0-=10; x1+=10; } /* fragment? expand frame */ + if (x1-x0+1<62) { x0-=5; x1+=5; } + if (y1-y0+1<10) { y0-= 4; y1+= 4; } /* fragment? 
*/ + if (x0<0) x0=0; if (x1>=b->x) x1=b->x-1; + if (y0<0) y0=0; if (y1>=b->y) y1=b->y-1; + dx=x1-x0+1; + dy=y1-y0+1; yy0=y0; + fprintf(stderr,"\n# show box + environment"); + fprintf(stderr,"\n# show box x= %4d %4d d= %3d %3d r= %d %d", + px->x0, px->y0, px->x1 - px->x0 + 1, px->y1 - px->y0 + 1, + px->x - px->x0, px->y - px->y0); + if (px->num_ac){ /* output table of chars and its probabilities */ + fprintf(stderr,"\n# list box char: "); + for(i=0;i<px->num_ac && i<NumAlt;i++) + /* output the (xml-)string (picture position, barcodes, glyphs, ...) */ + if (px->tas[i]) + fprintf(stderr," %s(%d)", px->tas[i] ,px->wac[i]); + else + fprintf(stderr," %s(%d)",decode(px->tac[i],ASCII),px->wac[i]); + } + fprintf(stderr,"\n"); + if (px->dots && px->m2 && px->m1<y0) { yy0=px->m1; dy=px->y1-yy0+1; } + } + tx=dx/80+1; + ty=dy/40+1; // step, usually 1, but greater on large maps + fprintf(stderr,"# show pattern x= %4d %4d d= %3d %3d t= %d %d\n", + x0,y0,dx,dy,tx,ty); + if (dx>0) + for(y=yy0;y<yy0+dy;y+=ty) { /* reduce the output to max 78x40 */ + + /* image is the boxframe + environment in the original bitmap */ + for(x=x0;x<x0+dx;x+=tx){ /* by merging sub-pixels */ + c1='.'; + for(y2=y;y2<y+ty && y2<y0+dy;y2++) /* sub-pixels */ + for(x2=x;x2<x+tx && x2<x0+dx;x2++) + { if((getpixel(b,x2,y2)<cs)) c1='#'; } + // show pixels outside the box thinner/weaker + if (x+tx-1 < px->x0 || x > px->x1 + || y+ty-1 < px->y0 || y > px->y1) c1=((c1=='#')?'O':','); + fprintf(stderr,"%c", c1 ); + } + + c1=c2=' '; + /* mark lines with < */ + if (px) if (y==px->m1 || y==px->m2 || y==px->m3 || y==px->m4) c1='<'; + if (y==px->y0 || y==px->y1) c2='-'; /* boxmarks */ + fprintf(stderr,"%c%c\n",c1,c2); + } +} + + +/* +// second variant, for database (with slightly other behaviour) +// new variant +// look at the environment of the pixel too (contrast etc.) +// detailed analysis only of diff pixels! 
+// +// 100% * distance, 0 is best fit +// = similarity of 2 chars for recognition of noisy chars +// weigth of pixels with only one same neighbour set to 0 +// look at contours too! + ToDo: especially on small boxes distance should only be 0 if + characters are 100% identical! +*/ +// #define DEBUG 2 +int distance2( pix *p1, struct box *box1, + pix *p2, struct box *box2, int cs){ + int rc=0,x,y,v1,v2,i1,i2,rgood=0,rbad=0, + x1,y1,x2,y2,dx,dy,dx1,dy1,dx2,dy2,tx,ty; +#if DEBUG == 2 + if(JOB->cfg.verbose) + fprintf(stderr," DEBUG: distance2\n"); +#endif + x1=box1->x0;y1=box1->y0;x2=box2->x0;y2=box2->y0; + dx1=box1->x1-box1->x0+1; dx2=box2->x1-box2->x0+1; dx=((dx1>dx2)?dx1:dx2);dx=dx1; + dy1=box1->y1-box1->y0+1; dy2=box2->y1-box2->y0+1; dy=((dy1>dy2)?dy1:dy2);dy=dy1; + if(abs(dx1-dx2)>1+dx/16 || abs(dy1-dy2)>1+dy/16) rbad++; // how to weight? + // compare relations to baseline and upper line + if(box1->m4>0 && box2->m4>0){ // used ??? + if(2*box1->y1>box1->m3+box1->m4 && 2*box2->y1<box2->m3+box2->m4) rbad+=128; + if(2*box1->y0>box1->m1+box1->m2 && 2*box2->y0<box2->m1+box2->m2) rbad+=128; + } + tx=dx/16; if(dx<17)tx=1; // raster + ty=dy/32; if(dy<33)ty=1; + // compare pixels + for( y=0;y<dy;y+=ty ) + for( x=0;x<dx;x+=tx ) { // try global shift too ??? + v1=((getpixel(p1,x1+x*dx1/dx,y1+y*dy1/dy)<cs)?1:0); i1=8; // better gray? + v2=((getpixel(p2,x2+x*dx2/dx,y2+y*dy2/dy)<cs)?1:0); i2=8; // better gray? + if(v1==v2) { rgood+=16; continue; } // all things are right! + // what about different pixel??? + // test overlapp of surounding pixels ??? 
+ v1=1; rbad+=4; + v1=-1; + for(i1=-1;i1<2;i1++) + for(i2=-1;i2<2;i2++)if(i1!=0 || i2!=0){ + if( ((getpixel(p1,x1+x*dx1/dx+i1*(1+dx1/32),y1+y*dy1/dy+i2*(1+dy1/32))<cs)?1:0) + !=((getpixel(p2,x2+x*dx2/dx+i1*(1+dx2/32),y2+y*dy2/dy+i2*(1+dy2/32))<cs)?1:0) ) v1++; + } + if(v1>0) + rbad+=16*v1; + } + if(rgood+rbad) rc= 100*rbad/(rgood+rbad); else rc=99; + /* if width/high is not correct add badness */ + rc += ( abs(dx1*dy2-dx2*dy1) * 10 ) / (dy1*dy2); + if (rc>100) rc=100; + if(/* rc<10 && */ JOB->cfg.verbose /* &1024 */){ +#if DEBUG == 2 + fprintf(stderr," distance2 rc=%d rgood=%d rbad=%d\n",rc,rgood,rbad); +// out_b(NULL,p1,box1->x0,box1->y0,box1->x1-box1->x0+1, +// box1->y1-box1->y0+1,cs); +// out_b(NULL,p2,box2->x0,box2->y0,box2->x1-box2->x0+1, +// box2->y1-box2->y0+1,cs); + out_x(box1); + out_x(box2); +#endif + } + return rc; +} + +wchar_t ocr_db(struct box *box1) { + int dd = 1000, dist = 1000; + wchar_t c = UNKNOWN; + char buf[200]; + Box *box2, *box3; + + if (!list_empty(&JOB->tmp.dblist)){ + box3 = (Box *)list_get_header(&JOB->tmp.dblist); + if(JOB->cfg.verbose) + fprintf(stderr,"\n#DEBUG: ocr_db (%d,%d) ",box1->x0, box1->y0); + + for_each_data(&JOB->tmp.dblist) { + box2 = (Box *)list_get_current(&JOB->tmp.dblist); + /* do preselect!!! distance() slowly */ + dd = distance2( box2->p, box2, box1->p, box1, JOB->cfg.cs); + if (dd <= dist) { /* new best fit */ + dist = dd; + box3 = box2; /* box3 is a pointer and not copied box2 */ + + if (dist<100 && 100-dist > JOB->cfg.certainty) { + /* some deviation of the pattern is tolerated */ + int i, wa; + for (i=0;i<box3->num_ac;i++) { + wa = (100-dist)*box3->wac[i]/100; /* weight *= (100-dist) */ + if (box3->tas[i]) setas(box1,box3->tas[i],wa); + else setac(box1,box3->tac[i],wa); + } + if (box3->num_ac) c=box3->tac[0]; /* 0 for strings (!UNKNOWN) */ + if (JOB->cfg.verbose) + fprintf(stderr, " dist=%4d c= %c 0x%02x %s wc= %3d", dist, + ((box3->c>32 && box3->c<127) ? 
(char) box3->c : '.'), + (int)box3->c, ((box3->tas[0])?box3->tas[0]:""), box3->wac[0]); + } + if (dd<=0 && ((box3->num_ac && box3->tas[0]) || box3->c >= 128 + || !strchr ("l1|I0O", box3->c))) + break; /* speedup if found */ + } + } end_for_each(&JOB->tmp.dblist); + + } + + if( (JOB->cfg.mode&128) != 0 && c == UNKNOWN ) { /* prompt the user */ + /* should the output go to stderr or special pipe??? */ + int utf8_ok=0; /* trigger this flag if input is ok */ + int i, endchar; /* index */ + out_env(box1); /* old: out_x(box1); */ + fprintf(stderr,"The above pattern was not recognized.\n" + "Enter UTF8 char or string for above pattern. Leave empty if unsure.\n" + "Press RET at the end (ALT+RET to store into RAM only) : " + ); /* ToDo: empty + alt-return (0x1b 0x0a) for help? ^a for skip all */ + /* UTF-8 (man 7 utf-8): + * 7bit = 0xxxxxxx (0000-007F) + * 11bit = 110xxxxx 10xxxxxx (0080-07FF) + * 16bit = 1110xxxx 10xxxxxx 10xxxxxx (0800-FFFF) + * 21bit = 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 26bit = 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 31bit = 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + */ + buf[0]=0; + /* shift/ctrl/altgr-enter acts like enter or ^j or ^m, + * alt-enter returns 0x1b 0x0a and returns from fgets() + * ^d (EOF) returns (nil) from fgets() + * x+(2*)ctrl-d returns from fgets() without returning a 0x0a + * if not UTF-input-mode, we are in trouble? 
+ * ^a=0x01, ^b=0x02, ^e=05, ..., ToDo: meaning of no-input or <=space + */ + fgets(buf,200,stdin); /* including \n=0x0a */ + dd=strlen(buf); + /* output hexcode if verbose set */ + if (JOB->cfg.verbose) { + fprintf(stderr, "\n# fgets [%d]:", dd); + for(i=0; i<dd; i++) + fprintf(stderr, " %02x", (unsigned)((unsigned char)buf[i])); + fprintf(stderr, "\n#"); + } + /* we dont accept chars which could destroy database file */ + for (i=0; i<dd; i++) if (buf[i]<32) break; + endchar=buf[i]; /* last char is 0x0a (ret) 0x00 (EOF) or 0x1b (alt+ret) */ + if (endchar==0x01) { i=0;JOB->cfg.mode&=~128; } /* skip all */ + buf[dd=i]=0; /* replace final 0x0a or other special codes */ + if (dd==1 && !(buf[0]&128)) { c=buf[0]; utf8_ok=1; } /* single char */ + if (dd>1 && dd<7) { /* try to decode single wide char (utf8) */ + int u0, u1; /* define UTF8-start sequences, u0=0bits u1=1bits */ + u0= 1<<(7-dd); /* compute start byte from UTF8-length */ + u1=255&~((1<<(8-dd))-1); + for (i=1;i<dd;i++) if ((buf[i]&0xc0)!=0x80) break; /* 10xxxxxx */ + if (i==dd && (buf[0]&(u0|u1))==u1) { utf8_ok=1; + c=buf[0]&(u0-1); /* 11..0x.. */ + for (i=1;i<dd;i++) { c<<=6; c|=buf[i]&0x3F; } /* 10xxxxxx */ + } + } + if (dd>0){ /* ToDo: skip space and tab too? */ + if (utf8_ok==1) { setac(box1, c, 100); } /* store single wchar */ + if (utf8_ok==0) { /* store a string of chars (UTF8-string) */ + c='_'; /* what should we do with c? probably a bad idea? */ + setas(box1, buf, 100); + } + /* decide between + * 0) just help gocr to find the results and (dont remember, 0x01) + * 1) help and remember in the same run (store to memory, 0x1b) + * 2) expand the database (dont store ugly chars to the database!) + */ + if (endchar!=0x01){ /* ^a before hit return */ + /* is there a reason to dont store to memory? 
*/ + list_app(&JOB->tmp.dblist, box1); /* append to list for 1+2 */ + } + if (endchar!=0x01 && endchar!=0x1b){ + store_db(box1); /* store to disk for 2 */ + } + if (JOB->cfg.verbose) + fprintf(stderr, " got char= %c 16bit= 0x%04x string= \"%s\"\n", + ((c>32 && c<127)?(char)c:'.'), (int)c, buf); + } + } + + return c; +} diff --git a/lib/gocr/detect.c b/lib/gocr/detect.c new file mode 100644 index 00000000..bfd3ec9e --- /dev/null +++ b/lib/gocr/detect.c @@ -0,0 +1,943 @@ +/* +This is a Optical-Character-Recognition program +Copyright (C) 2000-2007 Joerg Schulenburg + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + check README for my email address +*/ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <ctype.h> // toupper, tolower +#include "pgm2asc.h" +#include "gocr.h" + +// ----- detect lines --------------- +/* suggestion: Fourier transform and set line frequency where the + amplitude has a maximum (JS: slow and not smarty enough). 
option: range for line numbers 1..1000 or similar
   todo: look for thickest line, and divide if thickness=2*mean_thickness
   Set these elements of the box structs:

   m1 <-- top of upper case letters and (bdfhkl) (can differ)
   m2 <-- top of letters (acegmnopqrsuvwxyz)
   m3 <-- baseline
   m4 <-- bottom of hanging letters (gqpy)

   performance can be improved by working with a temporary
   list of boxes of the special text line

   - Jun23,00 more robustness of m3 (test liebfrau1)
   - Feb01,02 more robustness of m4 (test s46_084.pgm)
   - Dec03,12 fix problems with footnotes
   ToDo:
   - generate lists of boxes per line (faster access)
   - use statistics
   - for each box look at it neighbours and set box-m1..m4
   - m[1..4].max .min if m4.min-m3.max<1 probability lower
 */
/*
 * detect_lines1 - find text lines inside the region (x0,y0,dx,dy).
 *
 * Walks JOB->res.boxlist several times per candidate line and appends one
 * entry per detected line to JOB->res.lines (m1..m4, x0, x1, pitch, wt),
 * starting at index lines->num.  If lines->num is 0 on entry, a dummy
 * line 0 is created first.  lines->num is updated at the end.
 *
 *   p            image; only p->x is read here (for the dummy line)
 *   x0,y0,dx,dy  region to scan; boxes outside are ignored
 *
 * Returns 0 on every path (also when the region is lower than 4 pixels,
 * when no usable box is found, or when the mean box height is below 4).
 *
 * Local conventions used below:
 *   my   mean height of the reference boxes in the region
 *   yy   per-box skew correction: lines->dy * box->x0 / lines->dx (0 if dx==0)
 *   jj   first a counter, later a flag set: bit0 = "same high" (m1~m2),
 *        bit1 = "same base" (m3~m4)
 *
 * NOTE(review): j2 and mc are assigned but never read in this function.
 * NOTE(review): i is compared with MAXlines only after lines->...[i] has
 *   been written; if lines->num can already be MAXlines on entry the
 *   stores use index MAXlines -- confirm against the array sizes in gocr.h.
 */
int detect_lines1(pix * p, int x0, int y0, int dx, int dy)
{
  int i, jj, j2, y, yy, my, mi, mc, i1, i2, i3, i4,
      m1, m2, m3, m4, ma1, ma2, ma3, ma4, m3pre, m4pre;
  struct box *box2, *box3; /* box3 is for verbose / debugging */
  struct tlines *lines = &JOB->res.lines;

  /* ToDo: optional read line-data from external source??? */
  if (lines->num == 0) { // initialize one dummy-line for pictures etc.
    lines->m4[0] = 0;
    lines->m3[0] = 0;
    lines->m2[0] = 0;
    lines->m1[0] = 0;
    lines->x0[0] = p->x; /* expand to left end during detection */
    lines->x1[0] = 0; /* expand to right end */
    lines->pitch[0] = JOB->cfg.spc; /* default word pitch */
    lines->mono[0] = 0; /* default spacing = prop */
    lines->num++;
  }
  i = lines->num;
  if (dy < 4)
    return 0; /* image is to low for latin chars */
  my = jj = 0;
  // get the mean height of all hollow chars
  // (better than mean value of everything including bg-pattern or dust?)
  for_each_data(&(JOB->res.boxlist)) {
    box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
    if ( box2->c != PICTURE
      && box2->num_frames>1 && box2->num_frames<3 /* 1 or 2 holes */
      && box2->y0 >= y0 && box2->y1 <= y0 + dy
      && box2->x0 >= x0 && box2->x1 <= x0 + dx
      && box2->frame_vol[0]>0
      && box2->frame_vol[1]<0
      ) {
      jj++;
      my += box2->y1 - box2->y0 + 1;
    }
  } end_for_each(&(JOB->res.boxlist));
  if (jj==0) {
    // get the mean height of all chars
    // (fallback: no box with exactly two frames was found in the region)
    for_each_data(&(JOB->res.boxlist)) {
      box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
      if ( box2->c != PICTURE
        && box2->y1 - box2->y0 + 1 >= 4 /* 4x6 font */
        && box2->y0 >= y0 && box2->y1 <= y0 + dy
        && box2->x0 >= x0 && box2->x1 <= x0 + dx ) {
        jj++;
        my += box2->y1 - box2->y0 + 1;
      }
    } end_for_each(&(JOB->res.boxlist));
  }
  if (jj == 0)
    return 0; /* no chars detected */


  /* ToDo: a better way could be to mark good boxes (of typical high a-zA-Z0-9)
   * first and handle only marked boxes for line scan, exclude ?!,.:;etc
   * but without setect the chars itself (using good statistics)
   * see adjust_text_lines()
   */
  my /= jj; /* we only care about chars with high arround my */
  if (JOB->cfg.verbose & 16)
    fprintf(stderr,"\n# detect_lines1(%d %d %d %d) vvv&16 chars=%d my=%d\n# ",
      x0, y0, dx, dy, jj, my);
  // "my" is the average over the whole image (bad, if different fontsizes)

  if (my < 4)
    return 0; /* mean high is to small => error */

  m4pre=m3pre=y0; /* lower bond of upper line */
  // better function for scanning line around a letter ???
  // or define lines around known chars "eaTmM"
  /* one iteration per candidate line; y is advanced past each found line */
  for (j2 = y = y0; y < y0 + dy; y++) {
    // look for max. of upper and lower bound of next line
    m1 = y0 + dy;
    jj = 0;
#if 1
    /* this is only for test runs */
    if (JOB->cfg.verbose & 16)
      fprintf(stderr,"searching new line %d\n# ",i /* lines->num */);
#endif

    box3 = NULL; /* mark the most upper box starting next line */
    // find highest point of next line => store to m1-min (m1>=y)
    // only objects greater 2/3*my and smaller 3*my are allowed
    // a higher "!" at end of line can result in a to low m1
    for_each_data(&(JOB->res.boxlist)) {
      box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
      if (box2->line>0 || box2->c == PICTURE) continue;
      if (lines->dx)
        yy = lines->dy * box2->x0 / (lines->dx); /* correct crooked lines */
      else yy=0;
      if ( box2->y0 >= y + yy && box2->y1 < y0 + dy // lower than y
        && box2->x0 >= x0 && box2->x1 < x0 + dx // within box ?
        && box2->c != PICTURE // no picture
        && box2->num_boxes <= 1 // ignore 2 for "!?i" 3 for "ä"
        && 3 * (box2->y1 - box2->y0) > 2 * my // not to small
        && (box2->y1 - box2->y0) < 3 * my // not to big
        && (box2->y1 - box2->y0) > 4) // minimum absolute size
      {
        if (box2->y0 < m1 + yy) {
          m1 = box2->y0 - yy; /* highest upper boundary */
          box3 = box2;
        }
        // fprintf(stderr,"\n %3d %3d %+3d %d m1= %3d",
        // box2->x0, box2->y0, box2->y1 - box2->y0 + 1, box2->num_boxes, m1);
      }
    } end_for_each(&(JOB->res.boxlist));
    if (!box3 || m1 >= y0+dy) break; /* no further line found */
    if (JOB->cfg.verbose & 16)
      fprintf(stderr," most upper box at new line xy= %4d %4d %+4d %+4d\n# ",
        box3->x0, box3->y0, box3->x1-box3->x0, box3->y1-box3->y0);

    // at the moment values depend from single chars, which can
    // result in bad values (ex: 4x6 /\=)
    // ToDo: 2) mean size of next line (store list of y0,y1)
    // ToDo: 3) count num0[(y0-m1)*16/my], num1[(y1-m1)*16/my]
    // ToDo: or down-top search horizontal nerarest neighbours
    lines->x0[i] = x0 + dx - 1; /* expand during operation to left end */
    lines->x1[i] = x0; /* expand to the right end of line */
    /* start values: m2 and m4 grow from m1, m3 shrinks from m1+2*my,
       mi (top of i-dots) shrinks from m1+my */
    m4=m2=m1; mi=m1+my; m3=m1+2*my; jj=0;
    // find limits for upper bound, base line and ground line
    // m2-max m3-min m4-max
    for_each_data(&(JOB->res.boxlist)) {
      box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
      if (box2->line>0 || box2->c == PICTURE) continue;
      if ( box2->y0 < y0 || box2->y1 >= y0 + dy
        || box2->x0 < x0 || box2->x1 >= x0 + dx ) continue; // out of image
      if (lines->dx) yy = lines->dy * box2->x0 / (lines->dx);
      else yy=0;
      /* check for ij-dots, used if chars of same high */
      if ( box2->y0 >= y + yy
        && box2->y0 >= y
        && (box2->y1 - box2->y0) < my
        && box2->y1 < m1 + yy + my/4
        && box2->y0 < mi + yy ) {
        mi = box2->y0 - yy; /* highest upper boundary i-dot */
      }
      // fprintf(stderr,"\n check %3d %3d-%3d y=%d yy=%d m1=%d", box2->x0, box2->y0, box2->y1, y, yy, m1);
      /* get m2-max m3-min m4-max */
      if ( box2->y0 >= y + yy // lower than y
        && 3 * (box2->y1 - box2->y0 + 1) > 2 * my // right size ?
        && (box2->y1 - box2->y0 + 1) < 3 * my // font mix, size = 2.6*my
        && (box2->y1 - box2->y0 + 1) > 3 // 4x6 lowercase=4
        && box2->y0 >= m1 // in m1 range?
        && box2->y0 <= m1 + yy + 9 * my / 8 // my can be to small if mixed
        // ToDo: we need a better (local?) algorithm for big headlines > 2*my
        && box2->y1 <= m1 + yy + 3 * my
        && box2->y1 >= m1 + yy + my / 2
        // lines can differ in high, my may be to small (smaller headlines)
        && box2->y0+box2->y1 <= 2*box3->y1
        )
      {
        jj++; // count chars for debugging purpose
        if (box2->y0 > m2 + yy) {
          m2 = box2->y0 - yy; /* highest upper boundary */
          if (JOB->cfg.verbose & 16)
            fprintf(stderr," set m2= %d yy= %d\n# ",m2, yy);
        }
        if (box2->y1 > m4 + yy && (my>6 || box2->y1 < m3+my)) {
          m4 = box2->y1 - yy; /* lowest lower boundary, small font lines can touch */
        }
        if ( box2->y1 < m3 + yy
          && ( ( 2*box2->y1 > m2+ m4+yy && m2>m1)
            || ( 4*box2->y1 > m1+3*m4+yy) ) ) // care for TeX: \(^1\)Footnote 2003
          /* "'!?" could cause trouble here, therefore this lines */
          /* ToDo: get_bw costs time, check pre and next */
          if( get_bw(box2->x0,box2->x1,box2->y1+1 ,box2->y1+my/2,box2->p,JOB->cfg.cs,1) == 0
           || get_bw(box2->x0,box2->x1,box2->y1+my/2,box2->y1+my/2,box2->p,JOB->cfg.cs,1) == 1
           || num_cross(box2->x0,box2->x1,(box2->y0+box2->y1)/2,(box2->y0+box2->y1)/2,box2->p,JOB->cfg.cs)>2 )
          {
            m3 = box2->y1 - yy; /* highest lower boundary */
            // printf("\n# set1 m3 m=%3d %+2d %+2d %+2d",m1,m2-m1,m3-m1,m4-m1);
            // out_x(box2);
          }
        if (box2->y0 + box2->y1 > 2*(m3 + yy)
          && box2->y1 < m4 + yy - my/4 -1
          && box2->y1 >= (m2 + m4)/2 // care for TeX: \(^1\)Footnote 2003
          && m2 > m1 ) // be sure to not use ', m2 must be ok
        {
          m3 = box2->y1 - yy; /* highest lower boundary */
          // printf("\n# set2 m3 m=%3d %+2d %+2d %+2d",m1,m2-m1,m3-m1,m4-m1);
          // out_x(box2);
        }
        if (box2->x1>lines->x1[i]) lines->x1[i] = box2->x1; /* right end */
        if (box2->x0<lines->x0[i]) lines->x0[i] = box2->x0; /* left end */
        // printf(" m=%3d %+2d %+2d %+2d yy=%3d\n",m1,m2-m1,m3-m1,m4-m1,yy);
      }
    } end_for_each(&(JOB->res.boxlist));

#if 1
    /* this is only for test runs */
    if (JOB->cfg.verbose & 16)
      fprintf(stderr," step 1 y=%4d m= %4d %+3d %+3d %+3d"
        " my=%2d chars=%3d\n# ",
        y, m1, m2-m1, m3-m1, m4-m1, my, jj);
#endif

    if (m3 == m1)
      break;
#if 1 /* make averages about the line */
    // same again better estimation
    mc = (3 * m3 + m1) / 4; /* lower center ? */
    /* ma1..ma4 accumulate box edges nearest to m1..m4, i1..i4 count them */
    ma1 = ma2 = ma3 = ma4 = i1 = i2 = i3 = i4 = jj = 0;
    for_each_data(&(JOB->res.boxlist)) {
      box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
      if (box2->line>0 || box2->c == PICTURE) continue;
      if (lines->dx) yy = lines->dy * box2->x0 / (lines->dx); else yy=0;
      if (box2->y0 >= y + yy && box2->y1 < y0 + dy // lower than y
        && box2->x0 >= x0 && box2->x1 < x0 + dx // in box ?
        && box2->c != PICTURE // no picture
        && 2 * (box2->y1 - box2->y0) > my // right size ?
        && (box2->y1 - box2->y0) < 4 * my) {
        if ( box2->y0 - yy >= m1-my/4
          && box2->y0 - yy <= m2+my/4
          && box2->y1 - yy >= m3-my/4
          && box2->y1 - yy <= m4+my/4 ) { /* its within allowed range! */
          // jj++; // not used
          if (abs(box2->y0 - yy - m1) <= abs(box2->y0 - yy - m2))
            { i1++; ma1 += box2->y0 - yy; }
          else { i2++; ma2 += box2->y0 - yy; }
          if (abs(box2->y1 - yy - m3) < abs(box2->y1 - yy - m4))
            { i3++; ma3 += box2->y1 - yy; }
          else { i4++; ma4 += box2->y1 - yy; }
          if (box2->x1>lines->x1[i]) lines->x1[i] = box2->x1; /* right end */
          if (box2->x0<lines->x0[i]) lines->x0[i] = box2->x0; /* left end */
        }
      }
    } end_for_each(&(JOB->res.boxlist));

    if (i1) m1 = (ma1+i1/2) / i1; /* best rounded */
    if (i2) m2 = (ma2+i2/2) / i2;
    if (i3) m3 = (ma3+i3-1) / i3; /* round up */
    if (i4) m4 = (ma4+i4-1) / i4;
    // printf("\n# .. set3 m3 m=%3d %+2d %+2d %+2d",m1,m2-m1,m3-m1,m4-m1);

#endif

    /* expand right and left end of line */
    for_each_data(&(JOB->res.boxlist)) {
      box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
      if (box2->line>0 || box2->c == PICTURE) continue;
      if (lines->dx) yy = lines->dy * box2->x0 / (lines->dx); else yy=0;
      if ( box2->y0 >= y0 && box2->y1 < y0 + dy
        && box2->x0 >= x0 && box2->x1 < x0 + dx // in box ?
        && box2->c != PICTURE // no picture
        && box2->y0 >= m1-1
        && box2->y0 <= m4
        && box2->y1 >= m1
        && box2->y1 <= m4+1 ) { /* its within line */
        if (box2->x1>lines->x1[i]) lines->x1[i] = box2->x1; /* right end */
        if (box2->x0<lines->x0[i]) lines->x0[i] = box2->x0; /* left end */
      }
    } end_for_each(&(JOB->res.boxlist));

#if 1
    /* this is only for test runs */
    if (JOB->cfg.verbose & 16)
      fprintf(stderr," step 2 y=%4d m= %4d %+3d %+3d %+3d\n# ",
        y,m1,m2-m1,m3-m1,m4-m1);
#endif

    if (m4 == m1) {
      if(m3+m4>2*y) y = (m4+m3)/2; /* lower end may overlap the next line */
      continue;
    }
    /* rate the line: start at 100 and take 75% for every missing distinction */
    jj=0;
    lines->wt[i] = 100;
    if (5 * (m2 - m1 +1) < m3 - m2 || (m2 - m1) < 2) jj|=1; /* same high */
    if (5 * (m4 - m3 +1) < m3 - m2 || (m4 - m3) < 1) jj|=2; /* same base */
    if (jj&1) lines->wt[i] = 75*lines->wt[i]/100;
    if (jj&2) lines->wt[i] = 75*lines->wt[i]/100;
    if (jj>0 && JOB->cfg.verbose) {
      fprintf(stderr," trouble on line %d, wt*100= %d\n",i,lines->wt[i]);
      fprintf(stderr,"# m= %4d %+3d %+3d %+3d\n",m1,m2-m1,m3-m1,m4-m1);
      fprintf(stderr,"# i= %3d %3d %3d %3d (counts)\n",i1,i2,i3,i4);
      if (jj==3) fprintf(stderr,"# all boxes of same high!\n# ");
      if (jj==1) fprintf(stderr,"# all boxes of same upper bound!\n# ");
      if (jj==2) fprintf(stderr,"# all boxes of same lower bound!\n# ");
    }
    /* ToDo: check for dots ij,. to get the missing information */
#if 1
    /* jj=3: ABCDEF123456 or mnmno or gqpy or lkhfdtb => we are in trouble */
    if (jj==3 && (m4-m1)>my) { jj=0; m2=m1+my/8+1; m4=m3+my/8+1; } /* ABC123 */
    /* using idots, may fail on "ABCDEFGÄÜÖ" */
    if (jj==3 && mi>0 && mi<m1 && mi>m4pre) { jj=2; m1=mi; } /* use ij dots */
    if (jj==1 && m2-(m3-m2)/4>m3pre ) { /* expect: acegmnopqrsuvwxyz */
      if (m1-m4pre<m4-m1) /* fails for 0123ABCD+Q$ */
        m1 = ( m2 + m4pre ) / 2 ;
      else
        m1 = ( m2 - (m3 - m2) / 4 );
    }
    if (jj==3)
      m2 = m1 + (m3 - m1) / 4 + 1; /* expect: 0123456789ABCDEF */
    if ( (m2 - m1) < 2)
      m2 = m1 + 2; /* font hight < 8 pixel ? */
    if (jj&2)
      m4 = m3 + (m4 - m1) / 4 + 1; /* chars have same lower base */
    if (jj>0 && JOB->cfg.verbose & 16) {
      fprintf(stderr," m= %4d %+2d %+2d %+2d my= %4d\n# ",
        m1, m2-m1, m3-m1, m4-m1, my);
    }
#endif


    { // empty space between lines
      lines->m4[i] = m4;
      lines->m3[i] = m3;
      lines->m2[i] = m2;
      lines->m1[i] = m1;
      /* NOTE(review): pitch[i] is stored twice, so the cfg.spc value is
         overwritten by 0; the dummy-line setup above stores mono[0]=0 at
         the matching place -- possibly mono[i] was intended, confirm */
      lines->pitch[i] = JOB->cfg.spc; /* default word pitch */
      lines->pitch[i] = 0; /* default spacing */
      if (JOB->cfg.verbose & 16)
        fprintf(stderr, " m= %4d %+3d %+3d %+3d w= %d (line=%d)\n# ",
          m1, m2 - m1, m3 - m1, m4 - m1, lines->wt[i], i);
      /* the entry is only kept (i advanced) if the line is higher than 4 */
      if (i < MAXlines && m4 - m1 > 4)
        i++;
      if (i >= MAXlines) {
        fprintf(stderr, "Warning: lines>MAXlines\n");
        break;
      }
    }
    if (m3+m4>2*y) y = (m3+m4)/2; /* lower end may overlap the next line */
    /* the else-assignments to m3/m4 are overwritten at the start of the
       next iteration before they are read */
    if (m3>m3pre) m3pre = m3; else m3=y0; /* set for next-line scan */
    if (m4>m4pre) m4pre = m4; else m4=y0; /* set for next-line scan */
  }
  lines->num = i;
  if (JOB->cfg.verbose)
    fprintf(stderr, " num_lines= %d", lines->num-1);
  return 0;
}

// ----- layout analyzis of dx*dy region at x0,y0 -----
// ----- detect lines via recursive division (new version) ---------------
// what about text in frames???
// ToDo: change to bottom-top analyse or/and take rotation into account
/*
 * detect_lines2 - split the region (x0,y0,dx,dy) recursively into zones
 * and run detect_lines1() on every zone that is not split any further.
 *
 * Steps, as implemented below:
 *  1. shrink the region to the bounding box of all boxes lying inside it
 *     and compute the mean width/height (mx,my) of the non-picture boxes
 *     that are higher than 4 pixels;
 *  2. while r<8: for every box search the nearest box below and to the
 *     right, remember the widest vertical (y3 at y2) and horizontal
 *     (x3 at x2) gap, and choose a split direction i (1: split at x2,
 *     2: split at y2, 0: no split); a PICTURE box spanning nearly the
 *     full width or height of the region forces a split beside it;
 *  3. on a split recurse into both halves with r+1, otherwise optionally
 *     draw the zone frame into JOB->tmp.ppo and call detect_lines1().
 *
 *   p   image; p->y is read for the dust test, p is passed on unchanged
 *   r   recursion depth, 0 for the outermost call
 *
 * Returns 0 for empty/dust regions, -1 if r>1000, otherwise the result of
 * the second recursive call or of detect_lines1().  The result of the
 * first recursive call of a split is discarded.
 *
 * NOTE(review): recursion only happens inside the r<8 branch, so the
 *   r>1000 guard looks unreachable when the initial r is small.
 * NOTE(review): x5, y5 and y6 are only zeroed, never read.
 */
int detect_lines2(pix *p,int x0,int y0,int dx,int dy,int r){
  int i,x2,y2,x3,y3,x4,y4,x5,y5,y6,mx,my,x30,x31,y30,y31;
  struct box *box2,*box3;
  // shrink box
  if(dx<=0 || dy<=0) return 0;
  if(y0+dy< p->y/128 && y0==0) return 0; /* looks like dust */
  if(y0>p->y-p->y/128 && y0+dy==p->y) return 0; /* looks like dust */

  if(r>1000){ return -1;} // something is wrong
  if(JOB->cfg.verbose)fprintf(stderr,"\n# r=%2d ",r);

  mx=my=i=0; // mean thickness
  // remove border, shrink size
  x2=x0+dx-1; // min x
  y2=y0+dy-1; // min y
  x3=x0; // max x
  y3=y0; // max y
  for_each_data(&(JOB->res.boxlist)) {
    box3 = (struct box *)list_get_current(&(JOB->res.boxlist));
    if(box3->y0>=y0 && box3->y1<y0+dy &&
       box3->x0>=x0 && box3->x1<x0+dx)
    {
      if( box3->x1 > x3 ) x3=box3->x1; // max x
      if( box3->x0 < x2 ) x2=box3->x0; // min x
      if( box3->y1 > y3 ) y3=box3->y1; // max y
      if( box3->y0 < y2 ) y2=box3->y0; // min y
      if(box3->c!=PICTURE)
      if( box3->y1 - box3->y0 > 4 )
      {
        i++;
        mx+=box3->x1-box3->x0+1; // mean x
        my+=box3->y1-box3->y0+1; // mean y
      }
    }
  } end_for_each(&(JOB->res.boxlist));
  /* from here on the region is the bounding box of its content */
  x0=x2; dx=x3-x2+1;
  y0=y2; dy=y3-y2+1;

  if(i==0 || dx<=0 || dy<=0) return 0;
  mx/=i;my/=i;
  // better look for widest h/v-gap, ToDo: vertical lines?
  if(r<8){ // max. depth

    // detect widest horizontal gap
    /* x2..y3 are reused: y3/x3 = width of the widest gap so far,
       y2/x2 = position (middle) of that gap */
    y2=y3=y4=y5=y6=0;
    x2=x3=x4=x5=y5=0;// min. 3 lines
    // position and thickness of gap, y6=num_gaps, nbox^2 ops
    for_each_data(&(JOB->res.boxlist)) { // not very efficient, sorry
      box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
      if( box2->c!=PICTURE ) /* ToDo: not sure, that this is a good idea */
      if( box2->y0>=y0 && box2->y1<y0+dy
       && box2->x0>=x0 && box2->x1<x0+dx
       && box2->y1-box2->y0>my/2 ){ // no pictures & dust???

        y4=y0+dy-1; // nearest vert. box
        x4=x0+dx-1;
        // ToDo: rotate back box2->x1,y1 to x21,y21
        // look for nearest lowest (y4) and right (x4) neighbour
        // of every box (box2)
        for_each_data(&(JOB->res.boxlist)) {
          box3 = (struct box *)list_get_current(&(JOB->res.boxlist));
          if(box3!=box2)
          if(box3->y0>=y0 && box3->y1<y0+dy)
          if(box3->x0>=x0 && box3->x1<x0+dx)
          if(box3->c!=PICTURE) /* ToDo: not sure, that this is a good idea */
          if(box3->y1-box3->y0>my/2 ){
            // ToDo: here we need the rotation around box2
            x30=box3->x0;
            x31=box3->x1;
            y30=box3->y0;
            y31=box3->y1;
            // get min. distances to lower and to right direction
            if( y31 > box2->y1 && y30 < y4 ) y4=y30-1;
            if( x31 > box2->x1 && x30 < x4 ) x4=x30-1;
          }
        } end_for_each(&(JOB->res.boxlist));
        // set the witdht and position of largest hor./vert. gap
        // largest gap: width position
        if( y4-box2->y1 > y3 ) { y3=y4-box2->y1; y2=(y4+box2->y1)/2; }
        if( x4-box2->x1 > x3 ) { x3=x4-box2->x1; x2=(x4+box2->x1)/2; }
      }
    } end_for_each(&(JOB->res.boxlist));
    // fprintf(stderr,"\n widest y-gap= %4d %4d",y2,y3);
    // fprintf(stderr,"\n widest x-gap= %4d %4d",x2,x3);

    i=0; // i=1 at x, i=2 at y
    // this is the critical point
    // is this a good decision or not???
    if(x3>0 || y3>0){
      if(x3>mx && x3>2*y3 && (dy>5*x3 || (x3>10*y3 && y3>0))) i=1; else
      if(dx>5*y3 && y3>my) i=2;
    }

    // compare with largest box???
    /* a picture box nearly as wide (high) as the region forces a split
       just below/above (right/left of) it; the first match wins */
    for_each_data(&(JOB->res.boxlist)) { // not very efficient, sorry
      box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
      if( box2->c == PICTURE )
      if( box2->y0>=y0 && box2->y1<y0+dy
       && box2->x0>=x0 && box2->x1<x0+dx )
      { // hline ???
        // largest gap: width position
        if( box2->x1-box2->x0+4 > dx && box2->y1+4<y0+dy ) { y3=1; y2=box2->y1+1; i=2; break; }
        if( box2->x1-box2->x0+4 > dx && box2->y0-4>y0 ) { y3=1; y2=box2->y0-1; i=2; break; }
        if( box2->y1-box2->y0+4 > dy && box2->x1+4<x0+dx ) { x3=1; x2=box2->x1+1; i=1; break; }
        if( box2->y1-box2->y0+4 > dy && box2->x0-4>x0 ) { x3=1; x2=box2->x0-1; i=1; break; }
      }
    } end_for_each(&(JOB->res.boxlist));
    if(JOB->cfg.verbose)fprintf(stderr," i=%d",i);

    if(JOB->cfg.verbose && i) fprintf(stderr," divide at %s x=%4d y=%4d dx=%4d dy=%4d",
      ((i)?( (i==1)?"x":"y" ):"?"),x2,y2,x3,y3);
    // divide horizontally if v-gap is thicker than h-gap
    // and length is larger 5*width
    if(i==1){ detect_lines2(p,x0,y0,x2-x0+1,dy,r+1);
      return detect_lines2(p,x2,y0,x0+dx-x2+1,dy,r+1); }
    // divide vertically
    if(i==2){ detect_lines2(p,x0,y0,dx,y2-y0+1,r+1);
      return detect_lines2(p,x0,y2,dx,y0+dy-y2+1,r+1);
    }
  }


  if(JOB->cfg.verbose) if(dx<5 || dy<7)fprintf(stderr," empty box");
  if(dx<5 || dy<7) return 0; // do not care about dust
  if(JOB->cfg.verbose)fprintf(stderr, " box detected at %4d %4d %4d %4d",x0,y0,dx,dy);
  /* draw the frame of the zone into the debug image, if one is attached */
  if(JOB->tmp.ppo.p){
    for(i=0;i<dx;i++)put(&JOB->tmp.ppo,x0+i ,y0 ,255,16);
    for(i=0;i<dx;i++)put(&JOB->tmp.ppo,x0+i ,y0+dy-1,255,16);
    for(i=0;i<dy;i++)put(&JOB->tmp.ppo,x0 ,y0+i ,255,16);
    for(i=0;i<dy;i++)put(&JOB->tmp.ppo,x0+dx-1,y0+i ,255,16);
    // writebmp("out10.bmp",p2,JOB->cfg.verbose); // colored should be better
  }
  /* the 0*n terms leave all four arguments unchanged */
  return detect_lines1(p,x0-0*1,y0-0*2,dx+0*2,dy+0*3);

/*
  struct tlines *lines = &JOB->res.lines;
  i=lines->num; lines->num++;
  lines->m1[i]=y0; lines->m2[i]=y0+5*dy/16;
  lines->m3[i]=y0+12*dy/16; lines->m4[i]=y0+dy-1;
  lines->x0[i]=x0; lines->x1[i]=x0+dx-1;
  if(JOB->cfg.verbose)fprintf(stderr," - line= %d",lines->num);
  return 0;
 */
}

/* ToDo: herons algorithm for square root x=(x+y/x)/2 is more efficient
 * than interval subdivision (?)
(germ.: Intervallschachtelung) + * without using matlib + * see http://www.math.vt.edu/people/brown/doc/sqrts.pdf + */ +int my_sqrt(int x){ + int y0=0,y1=x,ym; + for (;y0<y1-1;){ + ym=(y0+y1)/2; + if (ym*ym<x) y0=ym; else y1=ym; + } + return y0; +} + +/* +** Detect rotation angle (one for whole image) +** old: longest text-line and determining the angle of this line. + * + * search right nearest neighbour of each box and average vectors + * to get the text orientation, + * upside down decision is not made here (I dont know how to do it) + * ToDo: set job->res.lines.{dx,dy} + * pass 1: get mean vector to nearest char + * pass 2: get mean vector to nearest char without outriders to pass 1 + * extimate direction as (dx,dy,num)[pass] + * ToDo: estimate an error, boxes only work fine for zero-rotation + * for 45 degree use vectors, not boxes to get base line + */ +#define INorm 1024 /* integer unit 1.0 */ +int detect_rotation_angle(job_t *job){ + struct box *box2, *box3, + *box_nn; /* nearest neighbour box */ + int x2, y2, x3, y3, dist, mindist, pass, + rx=0, ry=0, re=0, // final result + /* to avoid 2nd run, wie store pairs in 2 different categories */ + nn[4]={0,0,0,0}, /* num_pairs used for estimation [(pass-1)%2,pass%2] */ + dx[4]={0,0,0,0}, /* x-component of rotation vector per pass */ + dy[4]={0,0,0,0}, /* y-component of rotation vector per pass */ + er[4]={INorm/4,0,0,0}; /* mean angle deviation to pass-1 (radius^2) */ + // de; /* ToDo: absolute maximum error (dx^2+dy^2) */ + // ToDo: next pass: go to bigger distances and reduce max error + // error is diff between passes? or diff of bottoms and top borders (?) + + rx=1024; ry=0; // default + for (pass=0;pass<4;pass++) { + for_each_data(&(job->res.boxlist)) { + box2 = (struct box *)list_get_current(&(job->res.boxlist)); + if (box2->c==PICTURE) continue; + /* subfunction probability of char */ + // i? 
+ // if (box2->x1 - box2->x0 < 3) continue; /* smallest font is 4x6 */ + if (box2->y1 - box2->y0 < 4) continue; + /* set maximum possible distance */ + box_nn=box2; // initial box to compare with + + // ToDo: clustering or majority + // the algorithm is far from being perfect, pitfalls are likely + // but its better than the old algorithm, ToDo: database-rotated-images + mindist = job->src.p.x * job->src.p.x + job->src.p.y * job->src.p.y; + /* get middle point of the box */ + x2 = (box2->x0 + box2->x1)/2; + y2 = (box2->y0 + box2->y1)/2; + re=0; + /* search for nearest neighbour box_nn[pass+1] of box_nn[pass] */ + for_each_data(&(job->res.boxlist)) { + box3 = (struct box *)list_get_current(&(job->res.boxlist)); + /* try to select only potential neighbouring chars */ + /* select out all senseless combinations */ + if (box3->c==PICTURE || box3==box2) continue; + x3 = (box3->x0 + box3->x1)/2; + y3 = (box3->y0 + box3->y1)/2; /* get middle point of the box */ + if (x3<x2) continue; /* simplify by going right only */ + // through-away deviation of angles if > pass-1? + // scalprod max in direction, cross prod min in direction + // a,b (vectors): <a,b>^2/(|a|*|b|)^2 = 0(90deg)..0.5(45deg).. 1(0deg) + // * 1024 ?? + if (pass>0) { // new variant = scalar product + // danger of int overflow, ToDo: use int fraction + re =(int) ((1.*(x3-x2)*dx[pass-1]+(y3-y2)*dy[pass-1]) + *(1.*(x3-x2)*dx[pass-1]+(y3-y2)*dy[pass-1])*INorm + /(1.*((x3-x2)*(x3-x2)+(y3-y2)*(y3-y2)) + *(1.*dx[pass-1]*dx[pass-1]+dy[pass-1]*dy[pass-1]))); + if (INorm-re>er[pass-1]) continue; // hits mean deviation + } + /* neighbours should have same order of size (?) */ + if (3*(box3->y1-box3->y0+4) < 2*(box2->y1-box2->y0+1)) continue; + if (2*(box3->y1-box3->y0+1) > 3*(box2->y1-box2->y0+4)) continue; + if (2*(box3->x1-box3->x0+1) > 5*(box2->x1-box2->x0+4)) continue; + if (5*(box3->x1-box3->x0+4) < 2*(box2->x1-box2->x0+1)) continue; + /* should be in right range, Idea: center3 outside box2? 
noholes */ + if ((x3<box2->x1-1) && (x3>box2->x0+1) + && (y3<box2->y1-1) && (y3>box2->y0+1)) continue; + // if chars are of different size, connect careful + if ( abs(x3-x2) > 2*(box2->x1 - box2->x0 + box3->x1 - box3 ->x0 + 2)) continue; + if ( abs(y3-y2) > (box2->x1 - box2->x0 + box3->x1 - box3 ->x0 + 2)) continue; + dist = (y3-y2)*(y3-y2) + (x3-x2)*(x3-x2); + // make distances in pass-1 directions shorter or continue if not in pass-1 range? + if (dist<9) continue; /* minimum distance^2 is 3^2 */ + if (dist<mindist) { mindist=dist; box_nn=box3;} + // fprintf(stderr,"x y %d %d %d %d dist %d min %d\n", + // x2,y2,x3,y3,dist,mindist); + } end_for_each(&(job->res.boxlist)); + + if (box_nn==box2) continue; /* has no neighbour, next box */ + + box3=box_nn; dist=mindist; + x3 = (box3->x0 + box3->x1)/2; + y3 = (box3->y0 + box3->y1)/2; /* get middle point of the box */ + // dist = my_sqrt(1024*((x3-x2)*(x3-x2)+(y3-y2)*(y3-y2))); + // compare with first box + x2 = (box2->x0 + box2->x1)/2; + y2 = (box2->y0 + box2->y1)/2; + // if the high of neighbouring boxes differ, use min diff (y0,y1) + if (pass>0 && 16*abs(dy[pass-1]) < dx[pass-1]) // dont work for strong rot. + if (abs(box2->y1-box2->y0-box3->y1+box3->y0)>(box2->y1-box2->y0)/8) { + // ad eh ck ... + if (abs(box2->y1-box3->y1)<abs(y3-y2)) { y2=box2->y1; y3=box3->y1; } + // ag ep qu ... 
+ if (abs(box2->y0-box3->y0)<abs(y3-y2)) { y2=box2->y0; y3=box3->y0; } + } + if (abs(x3-x2)<4) continue; + dx[pass]+=(x3-x2)*1024; /* normalized before averaging */ + dy[pass]+=(y3-y2)*1024; /* 1024 is for the precision */ + nn[pass]++; + if (pass>0) { // set error = mean deviation from pass -1 + re = INorm-(int)((1.*(x3-x2)*dx[pass-1]+(y3-y2)*dy[pass-1]) + *(1.*(x3-x2)*dx[pass-1]+(y3-y2)*dy[pass-1])*INorm + /((1.*(x3-x2)*(x3-x2)+(y3-y2)*(y3-y2)) + *(1.*dx[pass-1]*dx[pass-1]+dy[pass-1]*dy[pass-1])) + ); + er[pass]+=re; + } +#if 0 + if(JOB->cfg.verbose) + fprintf(stderr,"# next nb (x,y,dx,dy,re) %6d %6d %5d %5d %5d pass %d\n", + x2, y2, x3-x2, y3-y2, re, pass+1); +#endif + } end_for_each(&(job->res.boxlist)); + if (!nn[pass]) break; + if (nn[pass]) { + /* meanvalues */ + rx=dx[pass]/=nn[pass]; + ry=dy[pass]/=nn[pass]; + if (pass>0) er[pass]/=nn[pass]; + } + if(JOB->cfg.verbose) + fprintf(stderr,"# rotation angle (x,y,maxr,num)" + " %6d %6d %6d %4d pass %d\n", + rx, ry, er[pass], nn[pass], pass+1); + } + if (abs(ry*100)>abs(rx*50)) + fprintf(stderr,"<!-- gocr will fail, strong rotation angle detected -->\n"); + /* ToDo: normalize to 2^10 bit (square fits to 32 it) */ + JOB->res.lines.dx=rx; + JOB->res.lines.dy=ry; + return 0; +} + +/* ----- detect lines --------------- */ +int detect_text_lines(pix * pp, int mo) { + + if (JOB->cfg.verbose) + fprintf(stderr, "# detect.c detect_text_lines (vvv=16 for more info) "); + if (mo & 4){ + if (JOB->cfg.verbose) fprintf(stderr, "# zoning\n# ... "); + detect_lines2(pp, 0, 0, pp->x, pp->y, 0); // later replaced by better algo + } else + detect_lines1(pp, 0, 0, pp->x, pp->y); // old algo + + if(JOB->cfg.verbose) fprintf(stderr,"\n"); + return 0; +} + + +/* ----- adjust lines --------------- */ +// rotation angle? 
// JOB->res.lines.dy, .x0 removed later
// this is for cases, where m1..m4 is not very sure detected before
// chars are recognized
/*
 * adjust_text_lines - refine m1..m4 of every text line from boxes that
 * already carry a confident recognition result, then write the (skew
 * corrected) line bounds back into every box.
 *
 * Work array m: 16 ints per line,
 *   m[line*16+0..3]   sums of the observed m1..m4 positions
 *   m[line*16+4..7]   number of observations for m1..m4
 *   m[line*16+8..11]  minima, m[line*16+12..15] maxima of m1..m4
 *
 * Only boxes with line>0, at least one alternative and wac[0]>=95 are
 * used as evidence; which bounds a box contributes to depends on its
 * first alternative tac[0] (three character groups below).
 *
 *   pp, mo  not referenced in this function
 *
 * Returns the number of boxes whose character was changed between upper
 * and lower case; returns 0 if fewer than 2 lines exist or malloc fails.
 *
 * NOTE(review): all entries of m start at 0, so the "min" updates
 *   (m[..] > y) can only fire for negative y; only the max of m3
 *   (index 12+2) is read later.
 */
int adjust_text_lines(pix * pp, int mo) {
  struct box *box2;
  int *m, /* summ m1..m4, num_chars for m1..m4, min m1..m4, max. m1..m4 */
      l, i, dy, dx, diff=0, y0, y1;

  if ((l=JOB->res.lines.num)<2) return 0; // ???
  if (JOB->cfg.verbose)
    fprintf(stderr, "# adjust text lines ");
  m=(int *)malloc(l*16*sizeof(int));
  if (!m) { fprintf(stderr," malloc failed\n"); return 0;}
  for (i=0;i<16*l;i++) m[i]=0; /* initialize */
  dy=JOB->res.lines.dy; /* tan(alpha) of skewing */
  dx=JOB->res.lines.dx; /* old: width of image */
  // js: later skewing is replaced by one transformation of vectorized image

  /* pass 1: collect evidence per line (skipped completely if dx==0) */
  if (dx)
  for_each_data(&(JOB->res.boxlist)) {
    box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
    if (box2->line<=0) continue;
    if (box2->num_ac<1) continue;
    if (box2->wac[0]<95) continue;
    if (box2->m2==0 || box2->y1<box2->m2) continue; // char outside line
    /* NOTE(review): the m2 test above compares with 0, this one with 4 --
       confirm that 4 is intended */
    if (box2->m3==4 || box2->y0>box2->m3) continue; // char outside line
    y0=box2->y0-((box2->x1)*dy/dx); /* corrected by page skewing */
    y1=box2->y1-((box2->x1)*dy/dx);
    /* group 1: top counts for m2, bottom for m3 */
    if (strchr("aemnr",(char)box2->tac[0])) { // cC vV sS oO ... is unsure!
      m[box2->line*16+1]+=y0; m[box2->line*16+5]++; // num m2
      m[box2->line*16+2]+=y1; m[box2->line*16+6]++; // num m3
      if (m[box2->line*16+ 9]>y0) m[box2->line*16+ 9]=y0; /* min m2 */
      if (m[box2->line*16+13]<y0) m[box2->line*16+13]=y0; /* max m2 */
      if (m[box2->line*16+10]>y1) m[box2->line*16+10]=y1; /* min m3 */
      if (m[box2->line*16+14]<y1) m[box2->line*16+14]=y1; /* max m3 */
    }
    /* group 2: top counts for m1, bottom for m3 */
    if (strchr("bdhklABDEFGHIKLMNRT123456789",(char)box2->tac[0])) {
      m[box2->line*16+0]+=y0; m[box2->line*16+4]++; // num m1
      m[box2->line*16+2]+=y1; m[box2->line*16+6]++; // num m3
      if (m[box2->line*16+ 8]>y0) m[box2->line*16+ 8]=y0; /* min m1 */
      if (m[box2->line*16+12]<y0) m[box2->line*16+12]=y0; /* max m1 */
      if (m[box2->line*16+10]>y1) m[box2->line*16+10]=y1; /* min m3 */
      if (m[box2->line*16+14]<y1) m[box2->line*16+14]=y1; /* max m3 */
    }
    /* group 3: top counts for m2, bottom for m4 */
    if (strchr("gq",(char)box2->tac[0])) {
      m[box2->line*16+1]+=y0; m[box2->line*16+5]++; // num m2
      m[box2->line*16+3]+=y1; m[box2->line*16+7]++; // num m4
      if (m[box2->line*16+ 9]>y0) m[box2->line*16+ 9]=y0; /* min m2 */
      if (m[box2->line*16+13]<y0) m[box2->line*16+13]=y0; /* max m2 */
      if (m[box2->line*16+11]>y1) m[box2->line*16+11]=y1; /* min m4 */
      if (m[box2->line*16+15]<y1) m[box2->line*16+15]=y1; /* max m4 */
    }
  } end_for_each(&(JOB->res.boxlist));

  /* pass 2: update the line table; line 0 is left untouched */
  for (i=1;i<l;i++) {
    diff=0; // show diff per line
    if (m[i*16+4]) diff+=abs(JOB->res.lines.m1[i]-m[i*16+0]/m[i*16+4]);
    if (m[i*16+5]) diff+=abs(JOB->res.lines.m2[i]-m[i*16+1]/m[i*16+5]);
    if (m[i*16+6]) diff+=abs(JOB->res.lines.m3[i]-m[i*16+2]/m[i*16+6]);
    if (m[i*16+7]) diff+=abs(JOB->res.lines.m4[i]-m[i*16+3]/m[i*16+7]);
    /* recalculate sureness, empirically */
    if (m[i*16+4]*m[i*16+5]*m[i*16+6]*m[i*16+7] > 0)
      JOB->res.lines.wt[i]=(JOB->res.lines.wt[i]+100)/2;
    else
      JOB->res.lines.wt[i]=(JOB->res.lines.wt[i]*90)/100;
    // set mean values of sure detected bounds (rounded precisely)
    if ( m[i*16+4]) JOB->res.lines.m1[i]=(m[i*16+0]+m[i*16+4]/2)/m[i*16+4];
    if ( m[i*16+5]) JOB->res.lines.m2[i]=(m[i*16+1]+m[i*16+5]/2)/m[i*16+5];
    if ( m[i*16+6]) JOB->res.lines.m3[i]=(m[i*16+2]+m[i*16+6]/2)/m[i*16+6];
    if ( m[i*16+7]) JOB->res.lines.m4[i]=(m[i*16+3]+m[i*16+7]/2)/m[i*16+7];
    // care about very small fonts
    if (JOB->res.lines.m2[i]-JOB->res.lines.m1[i]<=1 && m[i*16+5]==0 && m[i*16+4])
      JOB->res.lines.m2[i]=JOB->res.lines.m1[i]+2;
    if (JOB->res.lines.m2[i]-JOB->res.lines.m1[i]<=1 && m[i*16+4]==0 && m[i*16+5])
      JOB->res.lines.m1[i]=JOB->res.lines.m2[i]-2;
    if (JOB->res.lines.m4[i]-JOB->res.lines.m3[i]<=1 && m[i*16+7]==0 && m[i*16+6])
      JOB->res.lines.m4[i]=JOB->res.lines.m3[i]+2;
    if (JOB->res.lines.m4[i]-JOB->res.lines.m3[i]<=1 && m[i*16+6]==0 && m[i*16+7])
      JOB->res.lines.m3[i]=JOB->res.lines.m4[i]-2;
    /* without m4 evidence keep m4 at least (m3-m2)/4 below m3 */
    if ( m[i*16+7]<1 &&
         JOB->res.lines.m4[i]
         <=JOB->res.lines.m3[i]+(JOB->res.lines.m3[i]-JOB->res.lines.m2[i])/4 )
      JOB->res.lines.m4[i]=
         JOB->res.lines.m3[i]+(JOB->res.lines.m3[i]-JOB->res.lines.m2[i])/4;
    if ( m[i*16+7]<1 && m[i*16+12+2]>0 && // m4 < max.m3+..
         JOB->res.lines.m4[i] < 2*m[i*16+12+2]-JOB->res.lines.m3[i]+2 )
      JOB->res.lines.m4[i] = 2*m[i*16+12+2]-JOB->res.lines.m3[i]+2;
    if (JOB->res.lines.m4[i]<=JOB->res.lines.m3[i])
      JOB->res.lines.m4[i]= JOB->res.lines.m3[i]+1; /* 4x6 */

    if (JOB->cfg.verbose & 17)
      fprintf(stderr, "\n# line= %3d m= %4d %+3d %+3d %+3d "
        " n= %2d %2d %2d %2d w= %3d diff= %d",
        i, JOB->res.lines.m1[i],
        JOB->res.lines.m2[i] - JOB->res.lines.m1[i],
        JOB->res.lines.m3[i] - JOB->res.lines.m1[i],
        JOB->res.lines.m4[i] - JOB->res.lines.m1[i],
        m[i*16+4],m[i*16+5],m[i*16+6],m[i*16+7],
        JOB->res.lines.wt[i], diff);
  }
  diff=0; // count adjusted chars
#if 1
  /* pass 3: drop boxes lying far outside their line, fix the case of
     ambiguous letters, copy the line bounds into the boxes */
  if (dx)
  for_each_data(&(JOB->res.boxlist)) {
    box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
    if (box2->line<=0) continue;
    /* check if box was on the wrong line, ToDo: search a better line */
    /* after box2->line=0 the following code reads the entries of line 0 */
    if (2*box2->y0<2*JOB->res.lines.m1[box2->line]
                    -JOB->res.lines.m4[box2->line]
                    +JOB->res.lines.m1[box2->line]) box2->line=0;
    if (2*box2->y1>2*JOB->res.lines.m4[box2->line]
                    +JOB->res.lines.m4[box2->line]
                    -JOB->res.lines.m1[box2->line]) box2->line=0;
    /* do adjustments */
    /* NOTE(review): "num_ac > 31" sits next to "tac[0] < 127" and a remark
       about islower(); possibly "tac[0] > 31" was intended.  As written the
       block only runs for boxes with more than 31 alternatives -- confirm. */
    if (box2->num_ac>0
      && box2->num_ac > 31 && box2->tac[0] < 127 /* islower(>256) may SIGSEGV */
      && strchr("cCoOpPsSuUvVwWxXyYzZ",(char)box2->tac[0])) { // no_wchar
      /* top above the middle of m1..m2: make it upper case */
      if (box2->y0-((box2->x1)*dy/dx)
          < (JOB->res.lines.m1[box2->line]+JOB->res.lines.m2[box2->line])/2
         && islower(box2->tac[0])
         ) { setac(box2,toupper((char)box2->tac[0]),(box2->wac[0]+101)/2); diff++; }
      /* top below the middle of m1..m2: make it lower case */
      if (box2->y0-((box2->x1)*dy/dx)
          > (JOB->res.lines.m1[box2->line]+JOB->res.lines.m2[box2->line]+1)/2
         && isupper(box2->tac[0])
         ){ setac(box2,tolower((char)box2->tac[0]),(box2->wac[0]+101)/2); diff++; }
    }
    box2->m1=JOB->res.lines.m1[box2->line]+((box2->x1)*dy/dx);
    box2->m2=JOB->res.lines.m2[box2->line]+((box2->x1)*dy/dx);
    box2->m3=JOB->res.lines.m3[box2->line]+((box2->x1)*dy/dx);
    box2->m4=JOB->res.lines.m4[box2->line]+((box2->x1)*dy/dx);
  } end_for_each(&(JOB->res.boxlist));
#endif

  free(m);
  if(JOB->cfg.verbose) fprintf(stderr,"\n# changed_chars= %d\n",diff);
  return(diff);
}

/* ---- measure mean character
 * recalculate mean width and high after changes in boxlist
 * ToDo: only within a Range?
 *
 * Resets JOB->res.numC/sumX/sumY and accumulates width and height of all
 * non-picture boxes, leaving out
 *   - (only if avX*avY > 0) boxes more than 4 times larger than the
 *     current averages in both directions, and boxes lower than avY/4
 *     or with y1-y0 < 2,
 *   - boxes that are narrower than 4 and lower than 6 pixels.
 * avX/avY are updated only if at least one box was counted.
 * Returns 0 always.
 */
int calc_average() {
  int i = 0, x0, y0, x1, y1;
  struct box *box4;

  JOB->res.numC = 0;
  JOB->res.sumY = 0;
  JOB->res.sumX = 0;
  for_each_data(&(JOB->res.boxlist)) {
    box4 = (struct box *)list_get_current(&(JOB->res.boxlist));
    if( box4->c != PICTURE ){
      x0 = box4->x0; x1 = box4->x1;
      y0 = box4->y0; y1 = box4->y1;
      i++; /* counts every non-picture box, only used for the verbose output */
      if (JOB->res.avX * JOB->res.avY > 0) {
        if (x1 - x0 + 1 > 4 * JOB->res.avX
         && y1 - y0 + 1 > 4 * JOB->res.avY) continue; /* small picture */
        if (4 * (y1 - y0 + 1) < JOB->res.avY || y1 - y0 < 2)
          continue; // dots .,-_ etc.
      }
      if (x1 - x0 + 1 < 4
       && y1 - y0 + 1 < 6 ) continue; /* dots etc */
      JOB->res.sumX += x1 - x0 + 1;
      JOB->res.sumY += y1 - y0 + 1;
      JOB->res.numC++;
    }
  } end_for_each(&(JOB->res.boxlist));
  if ( JOB->res.numC ) { /* avoid div 0 */
    JOB->res.avY = (JOB->res.sumY+JOB->res.numC/2) / JOB->res.numC;
    JOB->res.avX = (JOB->res.sumX+JOB->res.numC/2) / JOB->res.numC;
  }
  if (JOB->cfg.verbose){
    fprintf(stderr, "# averages: mXmY= %d %d nC= %d n= %d\n",
      JOB->res.avX, JOB->res.avY, JOB->res.numC, i);
  }
  return 0;
}


/* ---- analyse boxes, find pictures and mark (do this first!!!)
 *
 * Computes avX/avY from the sumX/sumY/numC already stored in job->res and
 * marks every box wider than 4*avX or higher than 4*avY as PICTURE,
 * unless more than 4 boxes of comparable height lie at about the same
 * vertical position (then it is taken for a large headline character).
 * Finally calls calc_average() to refresh the averages.
 *
 * Returns -1 if job->res.numC is 0, otherwise 0.
 *
 * NOTE(review): the comparison loop does not skip box2 itself, so the
 *   candidate is included in num_h.
 * NOTE(review): calc_average() works on the global JOB while this
 *   function uses the job parameter -- presumably the same object,
 *   confirm against the callers.
 */
int detect_pictures(job_t *job) {
  int i = 0, x0, y0, x1, y1, num_h;
  struct box *box2, *box4;

  if ( job->res.numC == 0 ) {
    if (job->cfg.verbose) fprintf(stderr,
      "# detect.C L%d Warning: numC=0\n", __LINE__);
    return -1;
  }
  /* ToDo: set Y to uppercase mean value? */
  job->res.avY = (job->res.sumY+job->res.numC/2) / job->res.numC;
  job->res.avX = (job->res.sumX+job->res.numC/2) / job->res.numC;
  /* ToDo: two highest volumes? crosses, on extreme volume + on border */
  if (job->cfg.verbose)
    fprintf(stderr, "# detect.C L%d pictures, frames, mXmY= %d %d ... ",
      __LINE__, job->res.avX, job->res.avY);
  for_each_data(&(job->res.boxlist)) {
    box2 = (struct box *)list_get_current(&(job->res.boxlist));
    if (box2->c == PICTURE) continue;
    x0 = box2->x0; x1 = box2->x1;
    y0 = box2->y0; y1 = box2->y1;

    /* pictures could be of unusual size */
    if (x1 - x0 + 1 > 4 * job->res.avX || y1 - y0 + 1 > 4 * job->res.avY) {
      /* count objects on same baseline which could be chars */
      /* else: big headlines could be misinterpreted as pictures */
      num_h=0;
      for_each_data(&(job->res.boxlist)) {
        box4 = (struct box *)list_get_current(&(job->res.boxlist));
        if (box4->c == PICTURE) continue;
        /* height must be within a factor of 2 of the candidate */
        if (box4->y1-box4->y0 > 2*(y1-y0)) continue;
        if (2*(box4->y1-box4->y0) < y1-y0) continue;
        /* top and bottom must be within half the candidate height */
        if (box4->y0 > y0 + (y1-y0+1)/2
         || box4->y0 < y0 - (y1-y0+1)/2
         || box4->y1 > y1 + (y1-y0+1)/2
         || box4->y1 < y1 - (y1-y0+1)/2) continue;
        // ToDo: continue if numcross() only 1, example: |||IIIll|||
        num_h++;
      } end_for_each(&(job->res.boxlist));
      if (num_h>4) continue;
      box2->c = PICTURE;
      i++;
    }
    /* ToDo: pictures could have low contrast=Sum((pixel(p,x,y)-160)^2) */
  } end_for_each(&(job->res.boxlist));
  // start second iteration
  if (job->cfg.verbose) {
    fprintf(stderr, " %d - boxes %d\n", i, job->res.numC-i);
  }
  calc_average();
  return 0;
}
diff --git a/lib/gocr/gocr.h b/lib/gocr/gocr.h new file mode 100644 index 00000000..a252ba60 --- /dev/null +++ b/lib/gocr/gocr.h @@ -0,0 +1,286 @@ +/* +This is a Optical-Character-Recognition program +Copyright (C) 2000-2006 Joerg Schulenburg + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. 
+ +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + see README for EMAIL-address + + sometimes I have written comments in german language, sorry for that + + - look for ??? for preliminary code +*/ + +/* General headerfile with gocr-definitions */ + +#ifndef __GOCR_H__ +#define __GOCR_H__ + +#include "pnm.h" +#include "unicode.h" +#include "list.h" +#include <stddef.h> +#ifdef HAVE_GETTIMEOFDAY +#include <sys/time.h> +#endif + +/* + * wchar_t should always exist (ANSI), but WCHAR.H is sometimes missing + * USE_UNICODE should be removed or replaced by HAVE_WCHAR_H in future + */ +#ifdef HAVE_WCHAR_H +#define USE_UNICODE 1 +#endif + +/* extern "C"{ */ +/* ------------------------ feature extraction ----------------- */ +#define AT 7 /* mark */ +#define M1 1 /* mark */ +enum direction { + UP=1, DO, RI, LE +}; +typedef enum direction DIRECTION; +#define ST 7 /* stop */ +/* ------------------------------------------------------------- */ +/* detect maximas in of line overlapps (return in %) and line koord */ +#define HOR 1 /* horizontal */ +#define VER 2 /* vertikal */ +#define RIS 3 /* rising=steigend */ +#define FAL 4 /* falling=fallend */ + +#define MAXlines 1024 + +/* ToDo: if we have a tree instead of a list, a line could be a node object */ +struct tlines { + int num; + int dx, dy; /* direction of text lines (straight/skew) */ + int m1[MAXlines], /* start of line = upper bound of 'A' */ + m2[MAXlines], /* upper bound of 'e' */ + m3[MAXlines], /* lower bound of 'e' = baseline */ + m4[MAXlines]; /* stop of line = lower bound of 'q' */ + int x0[MAXlines], + x1[MAXlines]; /* 
left and right border */ + int wt[MAXlines]; /* weight, how sure thats correct in percent, v0.41 */ + int pitch[MAXlines]; /* word pitch (later per box?), v0.41 */ + int mono[MAXlines]; /* spacing type, 0=proportional, 1=monospaced */ +}; + +#define NumAlt 10 /* maximal number of alternative chars (table length) */ +#define MaxNumFrames 8 /* maximum number of frames per char/box */ +#define MaxFrameVectors 128 /* maximum vectors per frame (*8=1KB/box) */ +/* ToDo: use only malloc_box(),free_box(),copybox() for creation, destroy etc. + * adding reference_counter to avoid pointer pointing to freed box + */ +struct box { /* this structure should contain all pixel infos of a letter */ + int x0,x1,y0,y1,x,y,dots; /* xmin,xmax,ymin,ymax,reference-pixel,i-dots */ + int num_boxes, /* 1 "abc", 2 "!i?", 3 "ä" (composed objects) 0.41 */ + num_subboxes; /* 1 for "abdegopqADOPQR", 2 for "B" (holes) 0.41 */ + wchar_t c; /* detected char (same as tac[0], obsolete?) */ + wchar_t modifier; /* default=0, see compose() in unicode.c */ + int num; /* same number = same char */ + int line; /* line number (points to struct tlines lines) */ + int m1,m2,m3,m4; /* m2 = upper boundary, m3 = baseline */ + /* planed: sizeof hole_1, hole_2, certainty (run1=100%,run2=90%,etc.) */ + pix *p; /* pointer to pixmap (v0.2.5) */ + /* tac, wac is used together with setac() to manage very similar chars */ + int num_ac; /* length of table (alternative chars), default=0 */ + wchar_t tac[NumAlt]; /* alternative chars, only used by setac(),getac() */ + int wac[NumAlt]; /* weight of alternative chars */ + char *tas[NumAlt]; /* alternative UTF8-strings or XML codes if tac[]=0 */ + /* replacing old obj */ + /* ToDo: (*obj)[NumAlt] + olen[NumAlt] ??? */ + /* ToDo: bitmap for possible Picture|Object|Char ??? */ +/* char *obj; */ /* pointer to text-object ... -> replaced by tas[] */ + /* ... (melted chars, barcode, picture coords, ...) */ + /* must be freed before box is freed! 
*/ + /* do _not_ copy only the pointer to object */ + /* -------------------------------------------------------- + * extension since v0.41 js05, Store frame vectors, + * which is a table of vectors sourrounding the char and its + * inner white holes. The advantage is the independence from + * resolution, handling of holes, overlap and rotation. + * --------------------------------------------------------- */ + int num_frames; /* number of frames: 1 for cfhklmnrstuvwxyz */ + /* 2 for abdegijopq */ + int frame_vol[MaxNumFrames]; /* volume inside frame +/- (black/white) */ + int frame_per[MaxNumFrames]; /* periphery, summed length of vectors */ + int num_frame_vectors[MaxNumFrames]; /* index to next frame */ + /* biggest frame should be stored first (outer frame) */ + /* biggest has the maximum pair distance */ + /* num vector loops */ + int frame_vector[MaxFrameVectors][2]; /* may be 16*int=fixpoint_number */ + +}; +typedef struct box Box; + +/* true if the coordination pair (a,b) is outside the image p */ +#define outbounds(p, a, b) (a < 0 || b < 0 || a >= (p)->x || b >= (p)->y) + +/* ToDo: this structure seems to be obsolete, remove it */ +typedef struct path { + int start; /* color at the beginning of the path, (0=white, 1=black) */ + int *x; /* x coordinates of transitions */ + int *y; /* y coordinates of transitions */ + int num; /* current number of entries in x or y */ + int max; /* maximum number of entries in x or y */ + /* (if more values need to be stored, the arrays are enlarged) */ +} path_t; + +/* job_t contains all information needed for an OCR task */ +typedef struct job_s { + struct { /* source data */ + char *fname; /* input filename; default value: "-" */ + pix p; /* source pixel data, pixelmap 8bit gray */ + } src; + struct { /* temporary stuff, e.g. 
buffers */ +#ifdef HAVE_GETTIMEOFDAY + struct timeval init_time; /* starting time of this job */ +#endif + pix ppo; /* pixmap for visual debugging output, obsolete */ + + /* sometimes recognition function is called again and again, if result was 0 + n_run tells the pixel function to return alternative results */ + int n_run; /* num of run, if run_2 critical pattern get other results */ + /* used for 2nd try, pixel uses slower filter function etc. */ + List dblist; /* list of boxes loaded from the character database */ + } tmp; + struct { /* results */ + List boxlist; /* store every object in a box, which contains */ + /* the characteristics of the object (see struct box) */ + List linelist; /* recognized text lines after recognition */ + + struct tlines lines; /* used to access to line-data (statistics) */ + /* here the positions (frames) of lines are */ + /* stored for further use */ + int avX,avY; /* average X,Y (avX=sumX/numC) */ + int sumX,sumY,numC; /* sum of all X,Y; num chars */ + } res; + struct { /* configuration */ + int cs; /* critical grey value (pixel<cs => black pixel) */ + /* range: 0..255, 0 means autodetection */ + int spc; /* spacewidth/dots (0 = autodetect); default value: 0 */ + int mode; /* operation modes; default value: 0 */ + /* operation mode (see --help) */ + int dust_size; /* dust size; default value: 10 */ + int only_numbers; /* numbers only; default value: 0 */ + int verbose; /* verbose mode; default value: 0 */ + /* verbose option (see --help) */ + FORMAT out_format; /* output format; default value: ISO8859_1*/ + char *lc; /* debuglist of chars (_ = not recognized chars) */ + /* default value: "_" */ + char *db_path; /* pathname for database; default value: NULL */ + char *cfilter; /* char filter; default value: NULL, ex: "A-Za-z" */ + /* limit of certainty where chars are accepted as identified */ + int certainty; /* in units of 100 (percent); 0..100; default 95 */ + } cfg; +} job_t; + +/* initialze job structure */ +void 
job_init(job_t *job); + +/* free job structure */ +void job_free(job_t *job); + +/*FIXME jb: remove JOB; */ +extern job_t *JOB; + +/* calculate the overlapp of the line (0-1) with black points + * by rekursiv bisection + * (evl. Fehlertoleranz mit pixel in Umgebung dx,dy suchen) (umschaltbar) ??? + * MidPoint Line Algorithm (Bresenham) Foley: ComputerGraphics better? + * will be replaced by vector functions + */ + +/* gerade y=dy/dx*x+b, implizit d=F(x,y)=dy*x-dx*y+b*dx=0 + * incrementell y(i+1)=m*(x(i)+1)+b, F(x+1,y+1)=f(F(x,y)) */ +int get_line(int x0, int y0, int x1, int y1, pix *p, int cs, int ret); +int get_line2(int x0, int y0, int x1, int y1, pix *p, int cs, int ret); + +/* look for white 0x02 or black 0x01 dots (0x03 = white+black) */ +char get_bw(int x0, int x1, int y0, int y1, + pix *p, int cs,int mask); + +/* look for black crossing a line x0,y0,x1,y1 + * follow line and count crossings ([white]-black-transitions) + */ +int num_cross(int x0, int x1, int y0, int y1, + pix *p, int cs); + +/* memory allocation with error checking */ +void *xrealloc(void *ptr, size_t size); + +/* follow a line x0,y0,x1,y1 recording locations of transitions, + * return count of transitions + */ +int follow_path(int x0, int x1, int y0, int y1, pix *p, int cs, path_t *path); + +/* ------------------------------------------------------------- + * mark edge-points + * - first move forward until b/w-edge + * - more than 2 pixel? + * - loop around + * - if forward pixel : go up, rotate right + * - if forward no pixel : rotate left + * - stop if found first 2 pixel in same order + * mit an rechter-Wand-entlang-gehen strategie + * -------------------------------------------------------------- + * turmite game: inp: start-x,y, regel r_black=UP,r_white=RIght until border + * out: last-position + * Zaehle dabei, Schritte,Sackgassen,xmax,ymax,ro-,ru-,lo-,lu-Ecken + * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + * + * is this the right place for declaration? 
+ */ +void turmite(pix *p, int *x, int *y, + int x0, int x1, int y0, int y1, int cs, int rw, int rb); + +/* test if points are connected via t-pixel (rekursiv!) */ +int joined(pix *p, int x0, int y0, int x1, int y1, int cs); + +/* move from x,y to direction r until pixel or l steps + * return number of steps + */ +int loop(pix *p, int x, int y, int l, int cs, int col, DIRECTION r); + +#define MAX_HOLES 3 +typedef struct list_holes { + int num; /* numbers of holes, initialize with 0 */ + struct hole_s { + int size,x,y,x0,y0,x1,y1; /* size, start point, outer rectangle */ + } hole[MAX_HOLES]; +} holes_t; + +/* look for white holes surrounded by black points + * at moment white point with black in all four directions + */ +int num_hole(int x0, int x1, int y0, int y1, pix *p, int cs, holes_t *holes); + +/* count for black nonconnected objects --- used for i,auml,ouml,etc. */ +int num_obj(int x0, int x1, int y0, int y1, pix *p, int cs); + +int distance( pix *p1, struct box *box1, /* box-frame */ + pix *p2, struct box *box2, int cs); + +/* call the OCR engine ;) */ +/* char whatletter(struct box *box1,int cs); */ + +/* declared in pixel.c */ +/* getpixel() was pixel() but it may collide with netpnm pixel declaration */ +int getpixel(pix *p, int x, int y); +int marked(pix *p, int x, int y); +void put(pix * p, int x, int y, int ia, int io); + +/* } */ /* extern C */ +#endif /* __GOCR_H__ */ diff --git a/lib/gocr/job.c b/lib/gocr/job.c new file mode 100644 index 00000000..eacb5054 --- /dev/null +++ b/lib/gocr/job.c @@ -0,0 +1,83 @@ +/* +This is a Optical-Character-Recognition program +Copyright (C) 2000-2006 Joerg Schulenburg + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. 
+ +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + see README for email address */ + +#include "pgm2asc.h" +#include "gocr.h" + +/* initialize job structure */ +void job_init(job_t *job) { + /* init source */ + job->src.fname = "-"; + /* FIXME jb: init pix */ + job->src.p.p = NULL; + + /* init results */ + list_init( &job->res.boxlist ); + list_init( &job->res.linelist ); + job->res.avX = 5; + job->res.avY = 8; + job->res.sumX = 0; + job->res.sumY = 0; + job->res.numC = 0; + job->res.lines.dy=0; + job->res.lines.num=0; + + /* init temporaries */ + list_init( &job->tmp.dblist ); + job->tmp.n_run = 0; + /* FIXME jb: init ppo */ + job->tmp.ppo.p = NULL; + job->tmp.ppo.x = 0; + job->tmp.ppo.y = 0; + + /* init cfg */ + job->cfg.cs = 0; + job->cfg.spc = 0; + job->cfg.mode = 0; + job->cfg.dust_size = -1; /* auto detect */ + job->cfg.only_numbers = 0; + job->cfg.verbose = 0; + job->cfg.out_format = UTF8; /* old: ISO8859_1; */ + job->cfg.lc = "_"; + job->cfg.db_path = (char*)NULL; + job->cfg.cfilter = (char*)NULL; + job->cfg.certainty = 95; +} + +/* free job structure */ +void job_free(job_t *job) { + + /* if tmp is just a copy of the pointer to the original image */ + if (job->tmp.ppo.p==job->src.p.p) job->tmp.ppo.p=NULL; + + /* FIMXE jb: free lists + * list_free( &job->res.linelist ); + * list_free( &job->tmp.dblist ); + */ + + list_and_data_free(&(job->res.boxlist), (void (*)(void *))free_box); + + /* FIXME jb: free pix */ + if (job->src.p.p) { free(job->src.p.p); job->src.p.p=NULL; } + + /* FIXME jb: free pix */ + if (job->tmp.ppo.p) { free(job->tmp.ppo.p); job->tmp.ppo.p=NULL; } 
+ +} diff --git a/lib/gocr/lines.c b/lib/gocr/lines.c new file mode 100644 index 00000000..396000dd --- /dev/null +++ b/lib/gocr/lines.c @@ -0,0 +1,348 @@ +/* +This is a Optical-Character-Recognition program +Copyright (C) 2000-2006 Joerg Schulenburg + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + see README for EMAIL-address +*/ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <limits.h> +#include <assert.h> +#include "pgm2asc.h" +#include "gocr.h" +#include "unicode.h" + +const char *getTextLine (int line) { + int i; + Element *elem; + + if (line < 0 || line > list_total(&(JOB->res.linelist))) + return NULL; + + for ( i = 0, elem = JOB->res.linelist.start.next; i < line && elem != NULL; i++ ) + elem = elem->next; + + if ( elem != NULL ) + return (const char *)elem->data; + + return NULL; +} + +void free_textlines(void) { + for_each_data(&(JOB->res.linelist)) { + if (list_get_current(&(JOB->res.linelist))) + free(list_get_current(&(JOB->res.linelist))); + } end_for_each(&(JOB->res.linelist)); + list_free(&(JOB->res.linelist)); +} + +/* append a string (s1) to the string buffer (buffer) of length (len) + * if buffer is to small or len==0 realloc buffer, len+=512 + */ +char *append_to_line(char *buffer, const char *s1, int *len) { + char *temp; + int slen=0, alen; + if( s1==NULL || s1[0] == 0 ){ + 
fprintf(stderr,"\n#BUG: appending 0 to a line makes no sense!"); + return buffer; + } + if ( *len>0 ) slen= strlen(buffer); // used buffer + alen = strlen(s1); + if ( slen+alen+1 >= *len ) { + *len+=512; + temp = (char *)realloc(buffer, *len); + if( !temp ) { fprintf(stderr,"realloc failed!\n"); *len-=512; return buffer; } + else buffer = temp; // buffer successfull enlarged + } + temp = buffer + slen; // end of buffered string + memcpy(temp,s1,alen+1); // copy including end sign '\0' + return buffer; +} + +int calc_median_gap(struct tlines * lines) { + int gaps[MAXlines], l; + if (lines->num<2) return 0; + for (l = 0; l < lines->num - 1; l++) + gaps[l] = lines->m2[l + 1] - lines->m3[l]; + qsort(gaps, lines->num - 1, sizeof(gaps[0]), intcompare); + return gaps[(lines->num - 1) / 2]; +} + +/* + * Return the indent in pixels of the least-indented line. + * Will be subtracted as base_indent to avoid negativ indent. + * + * This is adjusted to account for an angle on the page as + * a whole. For instance, if the page is rotated clockwise, + * lower lines may be physically closer to the left edge + * than higher lines that are logically less indented. + * We rotate around (0,0). Note that this rotation could + * rotate lines "off the left margin", leading to a negative + * indent. + * + * boxlist -- list of character boxes. + * dx, dy -- rotation angle as vector + */ +int get_least_line_indent(List * boxlist, int dx, int dy) { + int min_indent = INT_MAX; + int adjusted_indent; + struct box * box2; + if (JOB->cfg.verbose) + fprintf(stderr, "get_least_line_indent: rot.vector dxdy %d %d\n", + dx, dy); + for_each_data(boxlist) { + box2 = (struct box *)list_get_current(boxlist); + /* if num == -1, indicates this is a space or newline box, + * inserted in list_insert_spaces. 
*/ + if (box2->num != -1) { + adjusted_indent = box2->x0; + if (dx) adjusted_indent += box2->y0 * dy / dx; + if (adjusted_indent < min_indent) { + min_indent = adjusted_indent; + if (dy!=0 && JOB->cfg.verbose) + fprintf(stderr, + "# Line %2d, unadjusted xy %3d %3d, adjusted x %2d\n", + box2->line, box2->x0, box2->y0, adjusted_indent); + } + } + } end_for_each(boxlist); + if (JOB->cfg.verbose) + fprintf(stderr, "# Minimum adjusted x: %d (min_indent)\n", min_indent); + return min_indent; +} + +/* collect all the chars from the box tree and write them to a string buffer + mo is the mode: mode&8 means, use chars even if unsure recognized + ToDo: store full text(?), store decoded text+boxes+position chars (v0.4) + (HTML,UTF,ASCII,XML), not wchar incl. dexcriptions (at<95% in red) + remove decode(*c, job->cfg.out_format) from gocr.c! + XML add alternate-tags, format tags and position tags + ToDo: better output XML to stdout instead of circumstantial store to lines + not all texts/images follow the line concept? + Better use a tree of objects where leafes are chars instead of simple list. + Chars or objects are taken into account. Objects can be text strings + or XML strings. + */ +void store_boxtree_lines(int mo) { + char *buffer; /* temp buffer for text */ + int i = 0, j = 0; + int len = 1024; // initial buffer length for text line + struct box *box2; + int median_gap = 0; + int max_single_space_gap = 0; + struct tlines line_info; + int line, line_gap, oldline=-1; + int left_margin; + int i1=0, i2=0; + + buffer = (char *)malloc(len); + if ( !buffer ) { + fprintf(stderr,"malloc failed!\n"); // ToDo: index_to_error_list + return; + } + *buffer = 0; + + if ( JOB->cfg.verbose&1 ) + fprintf(stderr,"# store boxtree to lines ..."); + + /* wew: calculate the median line gap, to determine line spacing + * for the text output. The line gap used is between one line's + * m3 (baseline) and the next line's m2 (height of non-rising + * lowercase). 
We use these lines as they are the least likely + * to vary according to actual character content of lines. + */ + median_gap = calc_median_gap(&JOB->res.lines); + if (median_gap <= 0) { + fprintf(stderr, "# Warning: non-positive median line gap of %d\n", + median_gap); + median_gap = 8; + max_single_space_gap = 12; /* arbitrary */ + } else { + max_single_space_gap = median_gap * 7 / 4; + } + + // Will be subtracted as base_indent to avoid negativ indent. + left_margin = get_least_line_indent(&JOB->res.boxlist, + JOB->res.lines.dx, + JOB->res.lines.dy); + + if (JOB->cfg.out_format==XML) { /* subject of change */ + char s1[255]; /* ToDo: avoid potential buffer overflow !!! */ + /* output lot of usefull information for XML filter */ + sprintf(s1,"<page x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\">\n", + 0,0,0,0); + buffer=append_to_line(buffer,s1,&len); + sprintf(s1,"<block x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\">\n", + 0,0,0,0); + buffer=append_to_line(buffer,s1,&len); + } + + for_each_data(&(JOB->res.boxlist)) { + box2 = (struct box *)list_get_current(&(JOB->res.boxlist)); + line = box2->line; + line_info = JOB->res.lines; + /* reset the output char if certainty is below the limit v0.44 */ + if (box2->num_ac && box2->wac[0]<JOB->cfg.certainty) box2->c=UNKNOWN; + if (line!=oldline) { + if (JOB->cfg.out_format==XML && oldline>-1) { /* subject of change */ + buffer=append_to_line(buffer,"</line>\n",&len); + list_app( &(JOB->res.linelist), (void *)strdup(buffer) ); // wcsdup + memset(buffer, 0, len); + j=0; // reset counter for new line + } + if (JOB->cfg.out_format==XML) { /* subject of change */ + char s1[255]; /* ToDo: avoid potential buffer overflow !!! 
*/ + /* output lot of usefull information for XML filter */ + sprintf(s1,"<line x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" value=\"%d\">\n", + line_info.x0[line],line_info.m1[line], + line_info.x1[line]-line_info.x0[line]+1, + line_info.m4[line]-line_info.m1[line],line); + buffer=append_to_line(buffer,s1,&len); + } + oldline=line; + } + if (box2->c > ' ' && + box2->c <= 'z') i1++; /* count non-space chars */ + if (box2->c == '\n') { + if (JOB->cfg.out_format!=XML) { /* subject of change */ + line_info = JOB->res.lines; + line = box2->line; + if (line > 0) { + line_gap = line_info.m2[line] - line_info.m3[line - 1]; + for (line_gap -= max_single_space_gap; line_gap > 0; + line_gap -= median_gap) { + buffer=append_to_line(buffer,"\n",&len); + j++; /* count chars in line */ + } + } + list_app( &(JOB->res.linelist), (void *)strdup(buffer) ); // wcsdup + memset(buffer, 0, len); + j=0; // reset counter for new line + } + } + if (box2->c == ' ') // fill large gaps with spaces + { + if (JOB->res.avX) { /* avoid SIGFPE */ + if (JOB->cfg.out_format==XML) { /* subject of change */ + char s1[255]; /* ToDo: avoid potential buffer overflow !!! */ + /* output lot of usefull information for XML filter */ + sprintf(s1," <space x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" />\n", + box2->x0,box2->y0,box2->x1-box2->x0+1,box2->y1-box2->y0+1); + buffer=append_to_line(buffer,s1,&len); + } else + for (i = (box2->x1 - box2->x0) / (2 * JOB->res.avX) + 1; i > 0; i--) { + buffer=append_to_line(buffer," ",&len); + j++; + } + } + } + else if (box2->c != '\n') { + if (j==0 && JOB->res.avX) /* first char in new line? */ { + int indent = box2->x0 - JOB->res.lines.x0[box2->line]; + /* correct for angle of page as a whole. */ + if (JOB->res.lines.dx) + indent += box2->y0 * JOB->res.lines.dy / JOB->res.lines.dx; + /* subtract the base margin. */ + indent -= left_margin; + if (JOB->cfg.out_format==XML) { /* subject of change */ + char s1[255]; /* ToDo: avoid potential buffer overflow !!! 
*/ + /* output lot of usefull information for XML filter */ + sprintf(s1," <space x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" />\n", + box2->x0,box2->y0,box2->x1-box2->x0+1,box2->y1-box2->y0+1); + buffer=append_to_line(buffer,s1,&len); + } else + for (i = indent / JOB->res.avX; i > 0; i--) { + buffer=append_to_line(buffer," ",&len); j++; + } + } + if (JOB->cfg.out_format==XML) { /* subject of change */ + char s1[255]; /* ToDo: avoid potential buffer overflow !!! */ + /* output lot of usefull information for XML filter */ + sprintf(s1," <box x=\"%d\" y=\"%d\" dx=\"%d\" dy=\"%d\" value=\"", + box2->x0,box2->y0,box2->x1-box2->x0+1,box2->y1-box2->y0+1); + buffer=append_to_line(buffer,s1,&len); + if (box2->num_ac>1) { /* output alist */ + } + } + if (box2->c != UNKNOWN && box2->c!=0) { + buffer= + append_to_line(buffer,decode(box2->c,JOB->cfg.out_format),&len); + if (box2->c > ' ' && + box2->c <= 'z') i2++; /* count non-space chars */ + } else { + wchar_t cc; cc=box2->c; + if (box2->num_ac>0 && box2->tas[0] + && (JOB->cfg.out_format!=XML || box2->tas[0][0]!='<')) { + buffer=append_to_line(buffer,box2->tas[0],&len); + j+=strlen(box2->tas[0]); + } else { + buffer= + append_to_line(buffer,decode(cc,JOB->cfg.out_format),&len); + } + } + if (JOB->cfg.out_format==XML) { + if (box2->num_ac>0) { + /* output alist ToDo: separate <altbox ...> */ + int i1; char s1[256]; + sprintf(s1,"\" numac=\"%d\" weights=\"",box2->num_ac); + buffer=append_to_line(buffer,s1,&len); + for (i1=0;i1<box2->num_ac;i1++) { + sprintf(s1,"%d",box2->wac[i1]); + buffer=append_to_line(buffer,s1,&len); + if (i1+1<box2->num_ac) buffer=append_to_line(buffer,",",&len); + } + if (box2->num_ac>1) + buffer=append_to_line(buffer,"\" achars=\"",&len); + for (i1=1;i1<box2->num_ac;i1++) { + if (box2->tas[i1] && box2->tas[i1][0]!='<') + buffer=append_to_line(buffer,box2->tas[i1],&len); + else + buffer=append_to_line(buffer, + decode(box2->tac[i1],JOB->cfg.out_format),&len); + // ToDo: add tas[] (achars->avalues or 
alternate_strings? + if (i1+1<box2->num_ac) buffer=append_to_line(buffer,",",&len); + } + } + buffer=append_to_line(buffer,"\" />\n",&len); + } + if (box2->num_ac && box2->tas[0]) { + if (box2->tas[0][0]=='<') { /* output special XML object */ + buffer=append_to_line(buffer,box2->tas[0],&len); + buffer=append_to_line(buffer,"\n",&len); + j+=strlen(box2->tas[0]); + } + } + j++; + } + i++; + } end_for_each(&(JOB->res.boxlist)); + if (JOB->cfg.out_format==XML && oldline>-1) { /* subject of change */ + buffer=append_to_line(buffer,"</line>\n",&len); + } + if (JOB->cfg.out_format==XML) { /* subject of change */ + buffer=append_to_line(buffer,"</block>\n</page>\n",&len); + } + + /* do not forget last line */ + // is there no \n in the last line? If there is, delete next line. + list_app( &(JOB->res.linelist), (void *)strdup(buffer) ); + free(buffer); + if( JOB->cfg.verbose&1 ) + fprintf(stderr,"... %d lines, boxes= %d, chars= %d\n",i,i1,i2); +} diff --git a/lib/gocr/list.c b/lib/gocr/list.c new file mode 100644 index 00000000..332d2bd3 --- /dev/null +++ b/lib/gocr/list.c @@ -0,0 +1,334 @@ +/* +This is a Optical-Character-Recognition program +Copyright (C) 2000-2006 Joerg Schulenburg + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ + see README for email address + + ***********************************IMPORTANT********************************* + Notes to the developers: read the following notes before using these + functions. + * Be careful when using for_each_data() recursively and calling list_del. + It may mangle with the current[] pointers, and possibly segfault or do an + unpredictable or just undesirable behavior. We have been working on a + solution for this problem, and solved some of the biggest problems. + In a few words, the problem is this: when you delete a node, it may be + the current node of a lower level loop. The current code takes care of + access to previous/next elements of the now defunct node. So, if you do + something like: + + for_each_data(l) { + for_each_data(l) { + list_del(l, header_data); + free(header_data); + } end_for_each(l); ++ tempnode = list_cur_next(l); + } end_for_each(l); + + It will work, even though the current node in the outer loop was deleted. + However, if you replace the line marked with + with the following code: + + tempnode = list_next(l, list_get_current(l)); + + it will break, since list_get_current is likely to return NULL or garbage, + since you deleted header_data(). + Conclusion: use list_del carefully. The best way to avoid this problem is + to not use list_del inside a big stack of loops. + * If you have two elements with the same data, the functions will assume + that the first one is the wanted one. Not a bug, a feature. ;-) + * avoid calling list_prev and list_next. They are intensive and slow + functions. Keep the result in a variable or, if you need something more, + use list_get_element_from_data. 
+ + */ + +#include <stdio.h> +#include <stdlib.h> +#include "list.h" +#include "progress.h" + +void list_init( List *l ) { + if ( !l ) + return; + + l->start.next = &l->stop; + l->stop.previous = &l->start; + l->start.previous = l->stop.next = NULL; + l->start.data = l->stop.data = NULL; + l->current = NULL; + l->level = -1; + l->n = 0; +} + +/* inserts data before data_after. If data_after == NULL, appends. + Returns 1 on error, 0 if OK. */ +int list_ins( List *l, void *data_after, void *data) { + Element *e, *after_element; + + /* test arguments */ + if ( !l || !data ) + return 1; + + if ( !data_after || !l->n ) + return list_app(l, data); + + /* get data_after element */ + if ( !(after_element = list_element_from_data(l, data_after)) ) + return 1; + + /* alloc a new element */ + if( !(e = (Element *)malloc(sizeof(Element))) ) + return 1; + e->data = data; + e->next = after_element; + e->previous = after_element->previous; + after_element->previous->next = e; + after_element->previous = e; + l->n++; + + return 0; +} + +/* appends data to the list. Returns 1 on error, 0 if OK. */ +/* same as list_ins(l,NULL,data) ??? */ +int list_app( List *l, void *data ) { + Element *e; + + if ( !l || !data ) + return 1; + if ( !(e = (Element *)malloc(sizeof(Element))) ) + return 1; + + e->data = data; + e->previous = l->stop.previous; + e->next = l->stop.previous->next; + l->stop.previous->next = e; + l->stop.previous = e; + l->n++; + return 0; +} + +/* returns element associated with data. */ +Element *list_element_from_data( List *l, void *data ) { + Element *temp; + + if ( !l || !data || !l->n) + return NULL; + + temp = l->start.next; + + while ( temp->data != data ) { + if ( !temp || temp==&l->stop ) + return NULL; + temp = temp->next; + } + return temp; +} + +/* deletes (first) element with data from list. User must free data. + Returns 0 if OK, 1 on error. + This is the internal version, that shouldn't be called usually. Use the + list_del() macro instead. 
+ */ +int list_del( List *l, void *data ) { + Element *temp; + int i; + + if (!data) return 1; /* do not delete start or stop element */ + + /* find element associated with data */ + if ( !(temp = list_element_from_data(l, data)) ) + return 1; + + /* test if the deleted node is current in some nested loop, and fix it. */ + for ( i = l->level; i >= 0; i-- ) { + if ( l->current[i] == temp ) { + l->current[i] = temp->previous; + } + } + + temp->previous->next = temp->next; + temp->next->previous = temp->previous; + temp->previous = temp->next = NULL; /* mark as freed */ +/* + fprintf(stderr,"\n# list_del=%p start=%p stop=%p",temp,&l->start,&l->stop); +*/ + + /* and free stuff */ + free(temp); /* element pointing to data, fixed mem-leak 0.41 */ + l->n--; + return 0; +} + +/* frees list. See also list_and_data_free() */ +void list_free( List *l ) { + Element *temp, *temp2; + + if ( !l || !l->n ) + return; + + if ( l->current ) { + free(l->current); + } + l->current = NULL; + + temp = l->start.next; + while ( temp && temp!=&l->stop) { + temp2 = temp->next; + free(temp); + temp = temp2; + } + l->start.next = &l->stop; + l->stop.previous = &l->start; +} + +/* setup a new level of for_each */ +int list_higher_level( List *l ) { + Element **newcur; + + if ( !l ) return(1); + + /* + Security-check: NULL pointer passed to realloc. + ANSI allows this, but it may cause portability problems. + */ + newcur = (Element **)realloc(l->current, (l->level+2)*sizeof(Element *)); + if (newcur) { + l->current = newcur; + l->level++; + l->current[l->level] = l->start.next; + } + g_debug(fprintf(stderr, " level++=%d current[]=%p\n", + l->level, l->current);) + if ( !newcur ) { + fprintf(stderr, " realloc failed! 
abort\n"); return(1); + } + return 0; +} + +void list_lower_level( List *l ) { + if ( !l ) + return; + + if (!l->level) { + free(l->current); /* calm -lefence */ + l->current = NULL; /* could be important */ + } else { + l->current = (Element **)realloc(l->current, l->level*sizeof(Element *)); + } + l->level--; + g_debug(fprintf(stderr, " level--=%d current[]=%p\n", l->level, + l->current);) +} + +/* returns the next item data */ +void *list_next( List *l, void *data ) { + Element *temp; + + if ( !l || !(temp = list_element_from_data(l, data)) ) + return NULL; + if( !temp->next ) return NULL; + return (temp->next->data); +} + +/* returns the previous item data */ +void *list_prev( List *l, void *data ) { + Element *temp; + + if ( !l || !(temp = list_element_from_data(l, data)) ) + return NULL; + if( !temp->previous ) return NULL; + return (temp->previous->data); +} + +/* Similar to qsort. Sorts list, using the (*compare) function, which is + provided by the user. The comparison function must return an integer less + than, equal to, or greater than zero if the first argument is considered to + be respectively less than, equal to, or greater than the second. + Uses the bubble sort algorithm. 
+ */ +void list_sort( List *l, int (*compare)(const void *, const void *) ) { + Element *temp, *prev; + int i, sorted; + progress_counter_t *pc = NULL; + + if ( !l ) + return; + + /* start progress meter, sorting is slow for huge number of elements */ + /* l->n is the worst case, real time is less or equal estimated time */ + pc = open_progress(l->n,"list_sort"); + + for (i = 0; i < l->n; i++ ) { + sorted = 1; /* Flag for early break */ + for ( temp = l->start.next->next; + temp != NULL && temp != &l->stop; temp = temp->next ) { + if ( temp->previous == &l->start ) continue; + if ( compare((const void *)temp->previous->data, + (const void *)temp->data) > 0 ) { + + sorted = 0; /* rest flag */ + /* swap with the previous node */ + prev = temp->previous; + prev->previous->next = temp; + temp->next->previous = prev; + temp->previous = prev->previous; + prev->next = temp->next; + prev->previous = temp; + temp->next = prev; + /* and make sure the node in the for loop is correct */ + temp = prev; + +#ifdef SLOWER_BUT_KEEP_BY_NOW +/* this is a slower version, but guaranteed to work */ + void *data; + + data = temp->data; + prev = temp->previous; + list_del(l, data); + list_ins(l, prev->data, data); + temp = prev; +#endif + } + } + if (sorted) break; + progress(i,pc); /* progress meter */ + } + + close_progress(pc); + g_debug(fprintf(stderr, "list_sort()\n");) +} + +/* calls free_data() for each data in list l, + * before free list with list_free() */ +int list_and_data_free( List *l, void (*free_data)(void *data)) { + void *data; + + if ( !l ) return 0; + if ( !free_data ) return 1; + + for_each_data(l) { + if ((data = list_get_current(l))) + free_data(data); + } end_for_each(l); + + list_free(l); + + g_debug(fprintf(stderr, "list_and_data_free()\n");) + + return 0; +} + diff --git a/lib/gocr/list.h b/lib/gocr/list.h new file mode 100644 index 00000000..dd300998 --- /dev/null +++ b/lib/gocr/list.h @@ -0,0 +1,90 @@ +/* +This is a Optical-Character-Recognition program 
+Copyright (C) 2000 Joerg Schulenburg + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + see README for EMAIL-address + + */ + +#ifndef GOCR_LIST_H +#define GOCR_LIST_H + +#ifdef DEBUG +#define g_debug(a) a +#else +#define g_debug(a) +#endif + +/* + * Structures + */ + +struct element { + struct element *next, *previous; + void *data; +}; +typedef struct element Element; + +struct list { + Element start; /* simplifies for(each_element) { ... */ + Element stop; /* ... list_del() ... } v0.41 */ + Element **current; /* for(each_element) */ + int n; /* number of elements */ + int level; /* level of nested fors */ +}; +typedef struct list List; + +/* + * Functions + */ + +void list_init ( List *l ); +int list_app ( List *l, void *data ); +int list_ins ( List *l, void *data_after, void *data); +Element*list_element_from_data ( List *l, void *data ); +int list_del ( List *l, void *data ); +void list_free ( List *l ); +int list_and_data_free ( List *l, void (*free_data)(void *data)); +int list_higher_level ( List *l ); +void list_lower_level ( List *l ); +void * list_next ( List *l, void *data ); +void * list_prev ( List *l, void *data ); +void list_sort ( List *l, int (*compare)(const void *, const void *) ); + +#define list_empty(l) ((l)->start.next == &(l)->stop ? 
1 : 0) +#define list_get_header(l) ((l)->start.next->data) +#define list_get_tail(l) ((l)->stop.previous->data) +#define list_get_current(l) ((l)->current[(l)->level]->data) +#define list_get_cur_prev(l) ((l)->current[(l)->level]->previous == NULL ? \ + NULL : (l)->current[(l)->level]->previous->data ) +#define list_get_cur_next(l) ((l)->current[(l)->level]->next == NULL ? \ + NULL : (l)->current[(l)->level]->next->data ) +#define list_total(l) ((l)->n) + +#define for_each_data(l) \ + if (list_higher_level(l) == 0) { \ + for ( ; (l)->current[(l)->level] \ + && (l)->current[(l)->level]!=&(l)->stop; (l)->current[(l)->level] = \ + (l)->current[(l)->level]->next ) { + + +#define end_for_each(l) \ + } \ + list_lower_level(l); \ + } + +#endif diff --git a/lib/gocr/ocr0.c b/lib/gocr/ocr0.c new file mode 100644 index 00000000..1066b7cb --- /dev/null +++ b/lib/gocr/ocr0.c @@ -0,0 +1,6591 @@ +/* + rule based OCR engine, partly rewritten for edges (old=pixel) + */ +/* +This is a Optical-Character-Recognition program +Copyright (C) 2000-2007 Joerg Schulenburg + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + see README for email address + + >>> DO NOT EDIT THIS FILE IF YOU NOT REALLY KNOW WHAT YOU ARE DOING! <<< + + I have invested lot of time, to write this part of the program. 
+ This engine should recognize chars always right or return UNKNOWN. + If you change something, test all other example files too, + to be sure that all things work better. (JoergS) + + This engine was pixelbased until 0.40 which was not successful enough. + Also code changes always had side effects. The vectorisation of the code + starts from version 0.41 with the chars XNz and seems to be much better + to handle. Vectorization means we frame each character by a chain of + vectors and don't care about pixels anymore. Unfortunately I have to + replace all the pixel codes, which is a long process. Old code will be lost. + (JorgS) + + +ToDo: + - if box1->p and b differ, reduce probability + - probability makes life much easier here + - use only one box!?, may be bits have useful infos + - divide this file, suggestion: classify chars: + high=ABCDEFGHIJKLMNOPQRSTUVWXYZbdfhklt, low=acegijmnopqrsuvwxyz + or + often_used=etianmsurwdkgo rarely_used=hvjcflpqxyz.,: + or + every char (large overhead) + - two-pass version (first pass without tolerance) + 2nd pass with tolerance (ex: one tiny more in sdata->holes) + + general feature extraction: + - white holes at middle, upper, lower position (cost much time) + - test lines and triangles instead of rectangles + + char is removed, wchar_t is used (better code) + + making a static global variable-set x.x0,x.x1, and call test_a, + test_b ... (faster compilation, but not reentrant!) + + - adding slant-angle (if detected) to distinguish between l and / ? + - ac (alternate chars) as string add_ac(box1,"/") => box1->ac="Il/"; + for better context correction or output: "Ha[lI][lI]o!" 
+ +*/ + +#include <stdlib.h> +#include <stdio.h> +// #include "pgm2asc.h" +#include "ocr0.h" +// #include "ocr1.h" +#include "pnm.h" +#include "gocr.h" + +#define IFV if(JOB->cfg.verbose&4) +#define MM {IFV fprintf(stderr,"\nDBG %c L%04d (%d,%d): ",(char)c_ask,__LINE__,box1->x0,box1->y0);} + +// the old debug mode (0.40) was only for a special char, for another char +// code must be recompiled with C_ASK='char' +// new debug mode (0.41) explains why char is declined or accepted as ABC... +// the output can be filtered by external scripts +// ToDo: we could reduce output to filter string +#ifndef DO_DEBUG /* can be defined outside */ +#define DO_DEBUG 0 /* 0 is the default */ +#endif + +/* this macro is for debugging output: "if char is declined, why?" */ +#if DO_DEBUG /* 0=Work mode, 1=debugging mode */ +// Setac: output, that char is choosen with a probability +// Break: output, why the char is not choosen +// MSG: debugging functions for char C_ASK, mostly messages +// DBG: definitions usefull only for debugging +#define Setac(box1,ac,ad) { MM;IFV fprintf(stderr,"setac %d",ad);setac(box1,ac,ad); } +#define Break { MM;IFV fprintf(stderr,"break"); break; } +#define MSG(x) { MM;IFV x } +#define DBG(x) x +#else +#define Setac(box1,ac,ad) setac(box1,ac,ad) +#define Break break +#define MSG(x) +#define DBG(x) +#endif + +/* extern "C"{ */ + +// static inline int sq(int x) { return x*x; } /* square */ + +/* + * go from vector j1 to vector j2 and measure maximum deviation of + * the steps from the line connecting j1 and j2 + * return the squared maximum distance + * in units of the box size times 1024 + * ToDo: 1) better give back max-dx and max-dy ??? + * errors if j1 and j2 are in different frames or belong to + * more then one frame? + * 2) Better get deviation from a complete vector graphic? + * The vectorgraphic is the ideal test char adapted to the + * extrem vertices of the real char. 
+ */ +int line_deviation( struct box *box1, int j1, int j2 ) { + int r1x, r1y, r2x, r2y, r3x, r3y, i, x, y, d, dist, maxdist=0, frame, l2; + r1x=box1->frame_vector[j1][0]; + r1y=box1->frame_vector[j1][1]; + r2x=box1->frame_vector[j2][0]; + r2y=box1->frame_vector[j2][1]; + if (!box1->num_frames) return(-1); + if (j1<0 || j1>box1->num_frame_vectors[box1->num_frames-1] || + j2<0 || j2>box1->num_frame_vectors[box1->num_frames-1]) { + fprintf(stderr,"Error in "__FILE__" L%d: idx out of range",__LINE__); + return(-1); + } + /* get the frame the endvector belongs to */ + for (i=0;i<box1->num_frames;i++) + if (j2<box1->num_frame_vectors[i]) break; + frame=i; + /* frame(j1)<=frame(j2) possible */ + for (i=j1;;i++) { // do it for each vector between j1 and j2 + if (i >= box1->num_frame_vectors[frame]) + i=((frame)?box1->num_frame_vectors[frame-1]:0); /* go around */ + if (i==j2) break; + // for (i=j1;i!=j2;i=(i+1)%box1->num_frame_vectors[0]) {~} + r3x=box1->frame_vector[i][0]; + r3y=box1->frame_vector[i][1]; + // Language=german + // german: Abstand Punkt von Strecke, Laenge Lotrechte + // germ.Strecke : l1=(r1+r2)/2+d*(r2-r1)/2 for d=-1..1 + // germ.Lotrechte: l2=r3+b*[-(r2-r1).y,(r2-r1).x] + // Schnittpunkt : l1=l2, + // eq1x: (r1x+r2x)/2-r3x+d*(r2x-r1x)/2+b*(r2y-r1y)=0 + // eq1y: (r1y+r2y)/2-r3y+d*(r2y-r1y)/2-b*(r2x-r1x)=0 + // eq2x: b*(r2x-r1x)*(r2y-r1y)=-((r1x+r2x)/2-r3x+d*(r2x-r1x)/2)*(r2x-r1x) + // eq2y: b*(r2x-r1x)*(r2y-r1y)= ((r1y+r2y)/2-r3y+d*(r2y-r1y)/2)*(r2y-r1y) + // eq2y-eq2x: ... in units of 1024 (fast integer rounded correctly) + l2=sq(r2x-r1x)+sq(r2y-r1y); // square of distance r2-r1 + if (l2==0) { + // fprintf(stderr,"ocr0 L%d: r1==r2 r1= %d %d",__LINE__, r1x, r1y); // debugging + d=-1024; + } else + d=-( ((r1x+r2x)-2*r3x)*(r2x-r1x) + +((r1y+r2y)-2*r3y)*(r2y-r1y))*1024/l2; // ..-1024..+1024.. 
+ if (d<=-1024) { x=r1x; y=r1y; } // starting point + else { + if (d>=1024) { x=r2x; y=r2y; } // end point + else { + x=((r1x+r2x)+1)/2+(d*(r2x-r1x))/2048; + y=((r1y+r2y)+1)/2+(d*(r2y-r1y))/2048; + /* we have the crossing point x,y now */ + } + } + dist=sq((x-r3x)*1024/(box1->x1-box1->x0+1)) + +sq((y-r3y)*1024/(box1->y1-box1->y0+1)); // 0..2*sq(1024) + if (dist>maxdist) maxdist=dist; + // for debugging: + // fprintf(stderr,"\nDBG dev: %d-%d-%d dist=%5d max=%5d d=%d %d,%d-%d,%d" + // " vector= %d %d crosspoint= %d %d ", + // j1,i,j2,dist,maxdist,d,r1x,r1y,r2x,r2y,r3x,r3y,x,y); + } + return maxdist; +} + +/* + * search vectors between j1 and j2 for nearest point a to point r + * example: + * + * r-> $$...$$ $ - mark vectors + * @@$..@@ @ - black pixels + * @@$..@@ . - white pixels + * @@@@.$@ + * a-> @@$@$@@ + * @$.@@@@ + * @@..$@@ + * @@..$@@ + * j1 --> $$...$$ <-- j2 + * + * ToDo: vector aa[5] = {rx,ry,x,y,d^2,idx} statt rx,ry? + * j1 and j2 must be in the same frame + * return aa? + */ +int nearest_frame_vector( struct box *box1, int j1, int j2, int rx, int ry) { + int x,y,d,i,aa[4]; /* x,y,normalized_distance^2,vector_index */ + int frame=0, x0=box1->x0, y0=box1->y0, + x1=box1->x1, y1=box1->y1, + dx=box1->x1-x0+1, dy=box1->y1-y0+1; + if (!box1->num_frames) return(-1); + if (j1<0 || j1>box1->num_frame_vectors[box1->num_frames-1] || + j2<0 || j2>box1->num_frame_vectors[box1->num_frames-1]) { + fprintf(stderr,"Error in "__FILE__" L%d: idx %d-%d out of range\n",__LINE__,j1,j2); + //out_x(box1); + return(-1); + } + aa[0]=x=box1->frame_vector[j2][0]; /* x */ + aa[1]=y=box1->frame_vector[j2][1]; /* y */ + /* maximum is (distance*128)^2 if r is inside the box */ + aa[2]=d=2*sq(128)+sq((rx-(x0+x1)/2)*128/dx)+sq((ry-(y0+y1)/2)*128/dy); + aa[3]=j2; /* vector index */ + /* get the frame the endvector belongs to */ + for (i=0;i<box1->num_frames;i++) + if (j2<box1->num_frame_vectors[i]) break; + frame=i; + /* frame(j1)<=frame(j2) possible */ + for (i=j1;;i++) { + if (i >= 
box1->num_frame_vectors[frame]) + i=((frame)?box1->num_frame_vectors[frame-1]:0); /* go around */ + x=box1->frame_vector[i][0]; /* take a vector */ + y=box1->frame_vector[i][1]; + /* distance to upper left end, normalized to 128 */ + d=sq((x-rx)*128/dx)+sq((y-ry)*128/dy); + if (d<aa[2]) { aa[0]=x; aa[1]=y; aa[2]=d; aa[3]=i; } + if (i==j2) break; + } + return aa[3]; +} + +// test for umlauts, if ret>0 and m==1 box1 is changed +// m>0 modify box1->dots +// m==2 modify box1->y0 +// called by pgm2asc + ocr0(?) +int testumlaut(struct box *box1, int cs, int m, wchar_t *modifier){ + // pix p=*(box1->p); + int r,y,x,x0,x1,y0,y1,dx,dy,m1,m2,m3, + xl,xr,yu,yl; // left, right, upper and lower border of dots + wchar_t mod='\0'; /* (TeX-) modifier ~"'` for compose() */ + DBG( wchar_t c_ask='"'; ) + r=0; + x0=box1->x0; x1=box1->x1; dx=x1-x0+1; + y0=box1->y0; y1=box1->y1; dy=y1-y0+1; + m1=box1->m1; m2=box1->m2; m3=box1->m3; + xl=x0; xr=x1; yu=yl=y0; + if( dy < 5 || 4*y0 > 3*m2+m3 ) return 0; // no low chars: .,-= + /* modifier in box included? */ + if( 2*y1 > m1+m2 ){ + /* modifier in box included? */ + for(y=y0;2*y<y0+y1;y++)if( get_bw(xl,xr,y,y,box1->p,cs,1)==0 ) break; + if( 2*y<y0+y1 ){ /* yes => extract */ + yl=y; + while( get_bw(xl,xr,y,y,box1->p,cs,1)==0 && 2*y<=y0+y1) y++; + if( m&2 ) box1->y0=y; /* set new upper bond */ + } + } + if( yu>=yl ) { if(m) box1->dots=0; return 0; } /* nothing found */ + if( get_bw(xl-1,xl-1,yu,yl-1,box1->p,cs,1)==1 ) // neighbour overlap? 
+ while( get_bw(xl ,xl ,yu,yl-1,box1->p,cs,1)==1 && 2*xl<x0+x1) xl++; + for(;xl<x1;xl++)if( get_bw(xl,xl,yu,yl,box1->p,cs,1)==1 ) break; + for(;xr>xl;xr--)if( get_bw(xr,xr,yu,yl,box1->p,cs,1)==1 ) break; + + if ( yl-1>yu ) { // tall box ij"a"o"u +#if 0 + x=box1->y0; box1->y0=m1; out_x(box1); box1->y0=x; + fprintf(stderr,"\n#testumlaut x= %d %d m1=%d m2=%d",x0,y0,m1-y0,m2-y0); + fprintf(stderr," yu=%d yl=%d xl=%d xr=%d",yu-y0,yl-y0,xl-x0,xr-x0); +#define DEBUG 1 +#endif + { + + x=xl;y=yu; + if( get_bw(xl,x1+1,yu,yl-1,box1->p,cs,1)==0 ) r=0; // neighbour overlap? + else + if( get_bw(xl ,xl ,yu,yl-1,box1->p,cs,1)==0 + || get_bw(xl-1,xl-1,yu,yl-1,box1->p,cs,1)==0 ) // be sure there are gap to neighbours + if( get_bw(xr ,xr ,yu,yl-1,box1->p,cs,1)==0 + || get_bw(xr+1,xr+1,yu,yl-1,box1->p,cs,1)==0 ) + { int i,j,x; + r=1; + // ...@@@.... RING_ABOVE // ..@@@..@@. TILDE + // ..@...@... // @@.@@@@@.. + // ..@...@... // @......... + // ..@..@@... + // ...@@@.... + for (i=yu;i<yl;i++) if (get_bw(xl,xr,i,i,box1->p,cs,1)==1) break; + for ( ;i<yl;i++) if (get_bw(xl,xr,i,i,box1->p,cs,1)==0) break; + for (j=xl;j<xr;j++) if (get_bw(j,j,yu,i,box1->p,cs,1)==1) break; + for ( ;j<xr;j++) if (get_bw(j,j,yu,i,box1->p,cs,1)==0) break; + for ( x=j;x<xr;x++) if (get_bw(x,x,yu,i,box1->p,cs,1)==1) break; + // vert. gap detected + if( j<xr && x<xr && j<x && xr-xl>2 + && num_obj(xl,xr,yu,yl-1,box1->p,cs)>=2 // not best!!! + && num_cross(xl,xr,yu +(yl-yu)/4,yu+ (yl-yu)/4,box1->p,cs) == 2 + && num_cross(xl,xr,yl-1-(yl-yu)/2,yl-1-(yl-yu)/2,box1->p,cs) == 2 + ){ // may be the following lines are not quite ok + while( get_bw(xl,xr,yl,yl,box1->p,cs,1)==0 && 2*yl<y0+y1) yl++; + r=2; +// out_x(box1);printf(" x,y=%d,%d i=%d xl=%d xr=%d yu=%d yl=%d",x0,y0,i-x0,xl-x0,xr-x0,yu-y0,yl-y0); + mod = DIAERESIS; + } + if( m&2 ) box1->y0=yl; +/* if( m&2 ) box1->y0= ( (r==1) ? 
yu : yl ); */ + // out_x(box1); + } + if(r==0){ // divided fr != fi + while( get_bw(x0,x1,yu,yu,box1->p,cs,1)==0 && 2*yu<y0+y1) yu++; + if(m)box1->y0=yu; + } + if( r==1 ){ yl--; +// .@@@. ..@@. +// .@@.. .@@.. +// .@... .@@.. +// +// if( loop(box1->p,xl,yu,xr-xl,cs,0,RI) +// > loop(box1->p,xl,yl,xr-xl,cs,0,RI) // +dx/8 +// && loop(box1->p,xr,yu,xr-xl,cs,0,LE) +// < loop(box1->p,xr,yl,xr-xl,cs,0,LE)) // -dx/8 ) // é Nov03 + if( loop(box1->p,xl,yu,xr-xl,cs,0,RI) + - loop(box1->p,xr,yu,xr-xl,cs,0,LE) + > loop(box1->p,xl,yl,xr-xl,cs,0,RI) // +dx/8 + - loop(box1->p,xr,yl,xr-xl,cs,0,LE)+1) // -dx/8 ) // é Nov03 + mod = ACUTE_ACCENT; // ' + + if( xr-xl+1 > 3*(yl-yu+1) + && get_bw(xl,xr,yu,yl,box1->p,cs,2)==0 ) + mod = MACRON; // "-" above + +// .@@@. .@@.. +// ..@@. ..@@. +// ...@. ..@@. +// +// if( loop(box1->p,xl,yu,xr-xl,cs,0,RI) +// < loop(box1->p,xl,yl,xr-xl,cs,0,RI) // -dx/8 +// && loop(box1->p,xr,yu,xr-xl,cs,0,LE) +// > loop(box1->p,xr,yl,xr-xl,cs,0,LE) ) // +dx/8 ) à Nov03 + if( loop(box1->p,xl,yu,xr-xl,cs,0,RI) + - loop(box1->p,xr,yu,xr-xl,cs,0,LE) + < loop(box1->p,xl,yl,xr-xl,cs,0,RI) // -dx/8 + - loop(box1->p,xr,yl,xr-xl,cs,0,LE) -1 ) // +dx/8 ) à Nov03 + mod = GRAVE_ACCENT; // ` + +#ifdef DEBUG + fprintf(stderr,"\n#testumlaut x= %d %d m1=%d m2=%d",x0,y0,m1-y0,m2-y0); + fprintf(stderr," yu=%d yl=%d xl=%d xr=%d",yu-y0,yl-y0,xl-x0,xr-x0); +#endif + if( (xr-xl+1) < 2*(yl-yu+1)+2 + && 2*(xr-xl+1)+2 > (yl-yu+1) ) { + int i,i1,i2,i3,i4; + i1=loop(box1->p,xl ,(yu+yl)/2,xr-xl+1,cs,0,RI); + i1=loop(box1->p,xl+i1,(yu+yl)/2,xr-xl+1,cs,1,RI); + i2=loop(box1->p,(xl+xr)/2,yu ,yl-yu+1,cs,0,DO); + i2=loop(box1->p,(xl+xr)/2,yu+i2,yl-yu+1,cs,1,DO); + for (i=0;i<xr-xl+1 && i<yl-yu+1;i++) + if (getpixel(box1->p,xl+i,yu+i)< cs) break; i3=i; + for ( ;i<xr-xl+1 && i<yl-yu+1;i++) + if (getpixel(box1->p,xl+i,yu+i)>=cs) break; i3=i-i3; + for (i=0;i<xr-xl+1 && i<yl-yu+1;i++) + if (getpixel(box1->p,xr-i,yu+i)< cs) break; i4=i; + for ( ;i<xr-xl+1 && i<yl-yu+1;i++) + if 
(getpixel(box1->p,xr-i,yu+i)>=cs) break; i4=i-i4; +#ifdef DEBUG + fprintf(stderr,"\n#DEBUG DOT_ABOVE %d %d %d %d",i1,i2,i3,i4); +#endif + if ( (xr-xl<5 && yl-yu<8) /* to small */ + || (i1>=(xr-xl+1)/2+2 && i2>=(yl-yu+1)/2+2 /* symmetrical */ + && abs(i3-i4)<=i1/4+2 && abs(i1-i2)<=i1/4+2 + && abs(i3-i1)<=i1/4+4 && abs(i4-i2)<=i1/4+4) + ) + mod = DOT_ABOVE; // "." above, ToDo: improve it! + } + + if( ( loop(box1->p,xl,yu ,xr-xl,cs,0,RI) + > loop(box1->p,xl,yl ,xr-xl,cs,0,RI)-dx/8 + || loop(box1->p,xl,yu ,xr-xl,cs,0,RI) + > loop(box1->p,xl,yl-1,xr-xl,cs,0,RI)-dx/8 ) + && ( loop(box1->p,xr,yu ,xr-xl,cs,0,LE) + > loop(box1->p,xr,yl ,xr-xl,cs,0,LE)-dx/8 + || loop(box1->p,xr,yu ,xr-xl,cs,0,LE) + > loop(box1->p,xr,yl-1,xr-xl,cs,0,LE)-dx/8 ) + && num_cross(xl,xr,yu ,yu ,box1->p,cs) == 1 + && ( num_cross(xl,xr,yl ,yl ,box1->p,cs) == 2 + || num_cross(xl,xr,yl-1,yl-1,box1->p,cs) == 2 )) + mod = CIRCUMFLEX_ACCENT; // "^" + + if( ( loop(box1->p,xl,yu ,xr-xl,cs,0,RI) + < loop(box1->p,xl,yl ,xr-xl,cs,0,RI)-dx/10 + || loop(box1->p,xl,yu+1,xr-xl,cs,0,RI) + < loop(box1->p,xl,yl ,xr-xl,cs,0,RI)-dx/10 ) + && ( loop(box1->p,xr,yu ,xr-xl,cs,0,LE) + < loop(box1->p,xr,yl ,xr-xl,cs,0,LE)-dx/10 + || loop(box1->p,xr,yu+1,xr-xl,cs,0,LE) + < loop(box1->p,xr,yl ,xr-xl,cs,0,LE)-dx/10 ) + && ( num_cross(xl,xr,yu ,yu ,box1->p,cs) == 2 + || num_cross(xl,xr,yu+1,yu+1,box1->p,cs) == 2 ) + && num_cross(xl,xr,yl ,yl ,box1->p,cs) == 1 ) + mod = CARON; // "v" above + + if( /* test for bow (new0.3.6) */ + loop(box1->p,xl,yu ,xr-xl,cs,0,RI) + + loop(box1->p,xl,yl ,xr-xl,cs,0,RI) + - 2*loop(box1->p,xl,(yl+yu)/2,xr-xl,cs,0,RI) > dx/16+1 + && xr-xl>10) + if( ( loop(box1->p,xl,yu ,xr-xl,cs,0,RI) + < loop(box1->p,xl,yl ,xr-xl,cs,0,RI)-dx/10 + || loop(box1->p,xl,yu+1,xr-xl,cs,0,RI) + < loop(box1->p,xl,yl ,xr-xl,cs,0,RI)-dx/10 ) + && ( loop(box1->p,xr,yu ,xr-xl,cs,0,LE) + < loop(box1->p,xr,yl ,xr-xl,cs,0,LE)-dx/10 + || loop(box1->p,xr,yu+1,xr-xl,cs,0,LE) + < loop(box1->p,xr,yl ,xr-xl,cs,0,LE)-dx/10 ) + && ( 
num_cross(xl,xr,yu ,yu ,box1->p,cs) == 2 + || num_cross(xl,xr,yu+1,yu+1,box1->p,cs) == 2 ) + && num_cross(xl,xr,yl ,yl ,box1->p,cs) == 1 ) + mod = BREVE; // round "u" above + + if( xr-xl>3 && yl-yu>1 ) + if( loop(box1->p,xl,yu,xr-xl,cs,0,RI) + > loop(box1->p,xl,yl,xr-xl,cs,0,RI) + && loop(box1->p,xr,yu,xr-xl,cs,0,LE) + < loop(box1->p,xr,yl,xr-xl,cs,0,LE) + && num_cross(xl,xr,yu,yu,box1->p,cs) == 2 + && num_cross(xl,xr,yl,yl,box1->p,cs) == 2 ) + mod = TILDE; + + if( xr-xl>2 && yl-yu>2) + if( num_cross(xl,xr,(yu+yl)/2,(yu+yl)/2,box1->p,cs) >1 ) + if( num_cross((xl+xr)/2,(xl+xr)/2,yu,yl,box1->p,cs) >1 ) + if( num_hole(xl,xr,yu,yl,box1->p,cs,NULL) == 1 ) + mod = RING_ABOVE; + +#ifdef DEBUG + printf("\n#DEBUG umlaut mod=0x%04x x=%d..%d y=%d..%d r=%d %s", + (int)mod,yu-box1->y0,yl-box1->y0, + xl-box1->x0,xr-box1->x0,r,((mod==CARON)?"CARON": + ((mod==ACUTE_ACCENT)?"ACUTE": + ((mod==TILDE)?"TILDE":"?")))); + out_x(box1); +#endif + + } + } + if (m) box1->dots=r; // set to 0 also possible after division + if (m) box1->modifier=mod; /* should be resetted after compose ??? */ + MSG(fprintf(stderr,"umlaut mod=%s dots=%d y0o=%d",decode(mod,ASCII),r,y0);) + } +// printf(" modifier=%c",mod); + if (modifier) *modifier=mod; /* set modifier */ + return r; +} + + +static wchar_t ocr0_eE(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + int i,i1,i2,j,d,x,y,hchar=sdata->hchar,gchar=sdata->gchar,bad_e=0, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + int (*aa)[4]=sdata->aa; /* corner-points, (x,y,dist^2,vector_idx) */ + + // --- most frequent letter e first!!! 
+ // --- test e --------------------------------------------------- + for(ad=d=100;dx>2 && dy>3;){ // min 3x4 (smallest seen is 5x6) + DBG( wchar_t c_ask='e'; ) + if (sdata->holes.num > 2) Break; /* tolerant against a tiny hole */ + if (sdata->holes.num != 1) ad=97*ad/100; + /* ToDo: may be a two pass version intolerant/tolerant is better */ + if( loop(box1->p,x0,y0+dy/2,x1-x0,cs,0,RI)>dx/3 ) Break; // rough test + if( loop(box1->p,x0+dx/2,y0,y1-y0,cs,0,DO)>dy/3 ) Break; + if( loop(box1->p,x0+dx/2,y1,y1-y0,cs,0,UP)>dy/3 ) Break; + if( num_cross(x0,x1,y0+dy/4 ,y0+dy/4 ,box1->p,cs) > 2 + && num_cross(x0,x1,y0+dy/4+1,y0+dy/4+1,box1->p,cs) > 2 ) Break; // gt + x=(x0+x1)/2;i= num_cross(x,x,y0,y1,box1->p,cs); // v0.40 + if (i!=3) { x=(x0+2*x1)/3;i= num_cross(x,x,y0,y1,box1->p,cs); } + if (i!=3) { x=(x0+3*x1)/4;i= num_cross(x,x,y0,y1,box1->p,cs); } + if (i!=3) { i= num_cross((x0+2*x1)/3,(x0+x1)/2,y0,y1,box1->p,cs); } + i=loop(box1->p,x0,y0+dy/2,x1-x0,cs,0,RI); if( i>dx/2 ) Break; + j=loop(box1->p,x0,y0 ,x1-x0,cs,0,RI); if( j<i ) Break; + j=loop(box1->p,x0,y1 ,x1-x0,cs,0,RI); if( j<i ) Break; + i=loop(box1->p,x0+dx/2,y0,y1-y0,cs,0,DO); if( i>dx/2 ) Break; + j=loop(box1->p,x1-dx/3,y0,y1-y0,cs,0,DO); if( j<i ) i=j; + j=loop(box1->p,x0 ,y0,y1-y0,cs,0,DO); if( j<i ) Break; + j=loop(box1->p,x1 ,y0,y1-y0,cs,0,DO); if( j<i ) Break; + i=loop(box1->p,x0+dx/2,y1,y1-y0,cs,0,UP); if( i>dx/2 ) Break; + j=loop(box1->p,x0 ,y1,y1-y0,cs,0,UP); if( j<i ) Break; + j=loop(box1->p,x1 ,y1,y1-y0,cs,0,UP); if( j<i ) Break; + j=2*loop(box1->p,x0, (y0+y1)/2,x1-x0,cs,0,RI) + -loop(box1->p,x0,(3*y0+y1)/4,x1-x0,cs,0,RI) + -loop(box1->p,x0,(y0+3*y1)/4,x1-x0,cs,0,RI); + if (dx>3 && j>=dx/4) Break; // ~g 4x6font + for(y=1;y<dy/2;y++) if( num_cross(x0,x1,y0+y,y0+y,box1->p,cs) == 2 ) break; + if( y==dy/2 ) Break; // v0.2.5 ~ bad_t + for(i=0,j=x0+dx/4;j<=x1-dx/4 && i<=dx/4;j++) + if( num_cross(j,j,y0,y1,box1->p,cs) == 3 ) i++; + if( dx>4 && dy>5 && (i<dx/4-1 || i==0) ) Break; // ~g but 4x6-e + // look for 
horizontal white line (right gap) => set x,y + for(x=0,y=i=y0+dy/3;i<y1-dy/6;i++){ + j=loop(box1->p,x1,i,y1-y0,cs,0,LE); + if(j>=x) { x=j;y=i; } + } + if (x<dx/2){ // no gap found, fat font??? + // check smallest thickness left > 2* smallest thickness right + for(i1=dx,i=y0+dy/3;i<y1-dy/6;i++){ + j =loop(box1->p,x0 ,i,y1-y0,cs,0,RI); if (j>dx/2) break; + j =loop(box1->p,x0+j,i,y1-y0,cs,1,RI); + if (j<i1) i1=j; // smallest thickness on left bow + } + for(i2=dx,y=i=y0+dy/3;i<y1-dy/6;i++){ + j =loop(box1->p,x1 ,i,y1-y0,cs,0,LE); + j =loop(box1->p,x1-j,i,y1-y0,cs,1,LE); + if(j<i2) { i2=j;y=i; } + } if (3*i2>2*i1) Break; // not accepted, if right line is not very thinn + x =loop(box1->p,x1 ,y,y1-y0,cs,0,LE); + x+=loop(box1->p,x1-x,y,y1-y0,cs,1,LE); + x+=loop(box1->p,x1-x,y,y1-y0,cs,0,LE); + if (3*i2>i1) ad=99*ad/100; + if (2*i2>i1) ad=99*ad/100; + bad_e=60; // used later? + } + if (x<dx/2) Break; + for(i=1,j=x0+dx/6;j<x1-dx/6 && i;j++) + if( num_cross(j,j,y0,y,box1->p,cs) > 1 ) i=0; + if( i ) Break; +// ..@@@@...<- +// .@@@@@@;. +// @@,...@@. +// @@.....@, +// @@@@@@@@@ +// @@.,;.@,. <- problem (y) == bad_e>50 +// @@.....@. +// @@,...@@. +// .@@@,@@@. 
+// ..@@@@;..<- + if (dy>11 && bad_e<50) + if ( num_cross(x0,x1,y,y,box1->p,cs) != 1 ) Break; // except "geschwungenem e" + if ( num_cross(x0,x1-dx/3,y ,y ,box1->p,cs) != 1 + && num_cross(x0,x1-dx/3,y+1,y+1,box1->p,cs) != 1 ) Break; + // if( num_hole(x0, x1, y0 , y ,box1->p,cs,NULL) < 1 ){ + if( sdata->holes.num == 0 || sdata->holes.hole[0].y1 >= y-y0){ + if( sdata->hchar ) Break; // ~ \it t + // look if thinn font (may be h-line is broken) Mai00 + for(j=0,i=x0+dx/8;i<x1-1;i++) + if( get_bw(i,i,y0+dy/4,y,box1->p,cs,1) == 1 ) j++; + if(j<2*dx/4) Break; + } + if( sdata->holes.num>0 && sdata->holes.hole[0].y0 > y-y0) Break; + if( sdata->holes.num>1 && sdata->holes.hole[1].y0 > y-y0) Break; + if( sdata->holes.num==1 && sdata->holes.hole[0].x0 >= dx/2) { + ad=95*ad/100; } /* 8*10 @ (=at) is not an e */ + // look for horizontal gap + for(x=0,y=i=y0+dy/4;i<y1-dy/4;i++){ + j=loop(box1->p,x0,i,x1-x0,cs,0,RI); + if(j>=x) { x=j;y=i; } + } + if (y>y0+dy/4 && y<y1-dy/4 && x>dx/2) Break; // s + if (x>dx/4) ad=99*ad/100; + + if( num_cross(x0+dx/2,x1 ,y1-dy/4,y1 ,box1->p,cs) == 0 + && num_cross(x0+dx/2,x1-1,y1-dy/4,y1 ,box1->p,cs) == 0 + && num_cross(x0+dx/2,x1 ,y1-dy/4,y1-1,box1->p,cs) == 0 ) { + if (sdata->gchar) Break; // ~p + ad=99*ad/100; + } + /* upper case is for 5x6 box */ + if( sdata->hchar // broken B ? 
should also work when linedetection fails + && loop(box1->p,x1,y1-dy/3,dx,cs,0,LE)<=dx/8 ) { + x = loop(box1->p,x0,y0+dy/2,dx,cs,0,RI); + if( loop(box1->p,x0,y0+dy/4,dx,cs,0,RI)<=x + && loop(box1->p,x0,y0+dy/8,dx,cs,0,RI)<=x ) Break; + if( loop(box1->p,x0,y1-dy/4,dx,cs,0,RI)<=x + && loop(box1->p,x0,y1-dy/8,dx,cs,0,RI)<=x ) Break; + } + x = loop(sdata->bp,0,dy-2 ,dx,cs,0,RI); + if( loop(sdata->bp,0,dy-1-dy/8,dx,cs,0,RI)>x && dy>16) Break; // some Q + if (box1->m2) { + if (sdata->gchar) ad=99*ad/100; + if (sdata->hchar) ad=99*ad/100; + } else ad=99*ad/100; + + Setac(box1,(wchar_t)'e',ad); + if (ad>=100) return 'e'; + break; + } + // --- test E --------------------------------------------------- + for(ad=d=100;dx>2 && dy>4 ;){ // min 3x4 + // rewritten for vectors 0.43 + int i1, i2, i3, i4, i5; // line derivation + corners + DBG( wchar_t c_ask='E'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + /* half distance to the center */ + d=2*sq(128/4); + /* now we check for the upper right end of the h */ + if (aa[3][2]>d/2) Break; /* [2] = distance, ~dj... */ + if (aa[0][2]>d/2) Break; /* upper left end */ + if (aa[1][2]>d/2) Break; /* lower left end */ + if (aa[2][2]>d/2) Break; /* lowerright end */ +/* + E f near E + + OOOOOOOO OOOO + O5 O O + O4 O + OOOO3 OOOOOO + O2 O + O O + O1 O O + OOOOOOOO OOOOOO +*/ + // check the bow from below + for (i=aa[1][3];i!=aa[2][3];i=(i+1)%box1->num_frame_vectors[0]) { + if (y1-box1->frame_vector[ i][1]>dy/4) break; // fatal! 
+ } if (i!=aa[2][3]) Break; // ~AHKMNRX + // search most left+down between bottom right and top right + i1=nearest_frame_vector(box1, aa[2][3],aa[3][3], x0, y1); + i5=nearest_frame_vector(box1, i1,aa[3][3], x0, y0); + i3=nearest_frame_vector(box1, i1, i5, x1, (y0+y1)/2); + i2=nearest_frame_vector(box1, i1, i3, x0, (2*y0+y1)/3); + i4=nearest_frame_vector(box1, i3, i5, x0, (y0+2*y1)/3); + i =nearest_frame_vector(box1, aa[0][3],aa[1][3], x0-dx/4, (y0+y1)/2); + if (2*box1->frame_vector[i][0] < aa[0][0]+aa[1][0]-1-dx/16) Break; + if (2*box1->frame_vector[i][0] < aa[0][0]+aa[1][0]) ad=99*ad/100; // f + + MSG(fprintf(stderr,"i1-5 %d %d %d %d %d",i1,i2,i3,i4,i5);) + // holes right open? + for( i=1,y=y0; y<y0+dy/4 && i; y++ ) // long black line + if( get_bw(x0+dx/3,x1-dx/6,y,y,box1->p,cs,2) == 0 ) i=0; + if( i ) Break; + for( i=1,y=y1; y>y1-dy/4 && i; y-- ) // long black line + if( get_bw(x0+dx/6,x1-dx/4,y,y,box1->p,cs,2) == 0 ) i=0; + if( i ) Break; + for( i=1,y=y0+dy/3; y<y1-dy/3 && i; y++ ){ // black line + j=loop(box1->p,x0 ,y,dx,cs,0,RI); + j=loop(box1->p,x0+j,y,dx,cs,1,RI); if( j>dx/3 ) i=0; + } if( i ) Break; + x=x1-dx/3; y=y0; // von oben durchbohren! + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,DO,ST); if( y>y0+dy/4 ) Break; + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,ST,DO); if( y>y0+dy/3 ) Break; + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,RI,DO); if( x<=x1 || y>y0+dy/2 ) Break; + x=x1-dx/3; y=y1; // von unten durchbohren! + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,UP,ST); if( y<y1-dy/4 ) Break; + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,ST,UP); if( y<y0-dy/3 ) Break; + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,RI,UP); if( x<=x1 || y<y0+dy/2 ) Break; + x=x1-dx/3; y=y0; // von oben durchbohren! 
+ turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,DO,ST); if( y>y0+dy/4 ) Break; + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,ST,DO); if( y>y0+dy/3 ) Break; + y+=dy/15; + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,LE,ST); if( x<x0 ) Break; + if (dx>15 && x==x0) ad=99*ad/100; // to thin + x+=dx/15+1; + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,DO,ST); if( y>y1-dy/3 ) Break; + // if( num_hole(x0, x1, y0 , y1 ,box1->p,cs,NULL) > 0 ) Break; + if (sdata->holes.num > 0) Break; + i=loop(box1->p,x0,y0+dy/4,dx,cs,0,RI); if(i>dx/2) Break; + j=loop(box1->p,x0,y0+dy/2,dx,cs,0,RI); if(j<i-dx/4 || j>i+dx/8) Break; i=j; + j=loop(box1->p,x0,y1-dy/4,dx,cs,0,RI); if(j<i-dx/4 || j>i+dx/8) Break; + j=loop(box1->p,x1,y1-dy/4,dx,cs,0,LE); + for( x=dx,y=y0+dy/6; y<y1-dy/9; y++ ) // left border straight + { i=loop(box1->p,x0,y,dx,cs,0,RI); + if (i>j/2 && ad>98) ad=99*ad/100; + if (i>dx/4) break; + if(i<x) x=i; + } if( y<y1-dy/9 ) Break; // t + if(dy>3*dx) // ~[ + if( get_bw(x0+dx/2,x0+dx/2,y0+dy/4,y1-dy/4,box1->p,cs,1) == 0 ) Break; + + if (box1->m2) { + if (!hchar) ad=ad*99/100; + if ( gchar) ad=ad*99/100; + } + Setac(box1,(wchar_t)'E',ad); + if (ad>=100) return 'E'; + break; + } + return box1->c; +} + +static wchar_t ocr0_n(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + int i,j,d,x,y,i1,i2,i3,handwritten=0, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + + // --- test n --------------------------------------------------- + // glued rm is very similar to glued nn -> thickness of h-line should grow + // may02: tested for 8x12 font + for(ad=d=100;dx>2 && dy>3;){ // min 3x4 + DBG( wchar_t c_ask='n'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + i= num_cross( 0,dx-1,dy/4,dy/4,sdata->bp,cs); + j= num_cross( 0,dx-1,dy/2,dy/2,sdata->bp,cs); + if( (i<2 || i>3) && j!=2 ) Break; + if( loop(sdata->bp,dx/2,0,dy,cs,0,DO) > dy/8 && sdata->hchar ) Break; /* tt */ + y=5*dy/8; /* also for handwritten n, where first 
bow goes not down enough */ + if( num_cross( 0,dx/2,y ,y ,sdata->bp,cs) != 1 + && num_cross( 0,dx/2,y-1,y-1,sdata->bp,cs) != 1 + && num_cross(dx/2,dx-1,y ,y ,sdata->bp,cs) < 1 ) Break; // n rr + // ~thick_w + y=loop(sdata->bp,dx-1-dx/4,0,dy,cs,0,DO); if(y>dy/2) Break; + if(y>1)if( get_bw(dx-1-dx/4,dx-1,0,y-2,sdata->bp,cs,1) == 1 ) Break; + + y=3*dy/4; + if( num_cross(0, dx/2,y ,y ,sdata->bp,cs) == 1 + && num_cross(dx/2,dx-1,y ,y ,sdata->bp,cs) == 0 ) Break; // ~p + y=dy/2; + if( num_cross(0,dx-1,dy/2-dy/8,dy/2-dy/8,sdata->bp,cs) == 2 + && num_cross(0,dx-1,dy/2, dy/2 ,sdata->bp,cs) == 2 ) { // n rr + /* printed n */ + x =loop(sdata->bp,0,y,dx ,cs,0,RI); if(x> dx/4) Break; // search 1st v-line + x+=loop(sdata->bp,x,y,dx-x,cs,1,RI); if(x> dx/2) Break; i1=x; // 1st gap + x+=loop(sdata->bp,x,y,dx-x,cs,0,RI); if(x< dx/2) Break; i2=x; // 2nd v-line + x+=loop(sdata->bp,x,y,dx-x,cs,1,RI); if(x<3*dx/4) Break; i3=x; // 2nd gap + i=dy/4; y=13*dy/16; + if( num_cross(dx/2,dx-1,y,y,sdata->bp,cs)==2 ) i=3*dy/8; // \it n + if (i<2 && i<dy/2) i++; // correct for small fonts like 8x12 + // the same game for the lower part =>l1 l2 l3 l4 ??? + for(x=i1;x<i2;x++) if( loop(sdata->bp,x, 0,dy,cs,0,DO)>=i ) break; + if(x <i2) Break; // gap detected + for(x=i1;x<i2;x++) if( loop(sdata->bp,x,dy-1,dy,cs,0,UP) >dy/4 ) break; + if(x==i2) Break; // no gap detected (glued serifs ??? ) + // glued rm as nn ??? 
+ for(y=0,x=(i1+i2)/2;x<i2;x++){ + i=loop(sdata->bp,x,0,dy,cs,0,DO); + i=loop(sdata->bp,x,i,dy,cs,1,DO); // measure thickness + if( i>y ) y=i; if( i<y/2 ) break; + } + if(x <i2) Break; // unusual property for n + if( dy>7 ) + if( loop(sdata->bp,dx-1,dy-1-dy/8,dx,cs,0,LE) + +loop(sdata->bp, 0,dy-1-dy/8,dx,cs,0,RI)-dx/8-1 + > loop(sdata->bp,dx-1,dy-1-dy/2,dx,cs,0,LE) + +loop(sdata->bp, 0,dy-1-dy/2,dx,cs,0,RI) ) ad=90*ad/100; // broken o + if( dy>7 && dx>7 ) + if( loop(sdata->bp,dx-1, dy/2,dx,cs,0,LE)==0 + && loop(sdata->bp,dx-1,dy-1-dy/8,dx,cs,0,RI)>dx/8 ) ad=98*ad/100; // broken o + } else { /* check handwritten n */ + if( num_cross(0,dx-1,dy/2, dy/2 ,sdata->bp,cs) != 3 + && num_cross(0,dx-1,dy/2-dy/8,dy/2-dy/8,sdata->bp,cs) != 3 ) Break; + i =loop(sdata->bp,0,dy/2-dy/8,dx,cs,0,RI); if (i>dx/4) Break; + i+=loop(sdata->bp,i,dy/2-dy/8,dx,cs,1,RI); if (i>dx/2) Break; + i+=loop(sdata->bp,i,dy/2-dy/8,dx,cs,0,RI); + if( num_cross(i,i, 0,dy/2-2*dy/8,sdata->bp,cs) != 0 ) Break; + i+=loop(sdata->bp,i,dy/2-dy/8,dx,cs,1,RI); + if( num_cross(i,i,dy/2+1, dy-1,sdata->bp,cs) != 0 ) Break; + handwritten=80; + } + + i= loop(sdata->bp,dx-1 ,dy/2,dx,cs,0,LE); if(i>5) + if( get_bw(dx-1-i/2,dx-1-i/2,0,dy/2,sdata->bp,cs,1) == 1 ) Break; // ~rr + i+=loop(sdata->bp,dx-1-i,dy/2,dx,cs,1,LE); + if( get_bw(dx-1-i ,dx-1-i ,0,dy/2,sdata->bp,cs,1) == 0 ) Break; // ~rv + + if( get_bw(dx/2,dx/2,dy/4,dy/4,sdata->bp,cs,1) == 0 + && get_bw(dx/2,dx-1,dy-2,dy-2,sdata->bp,cs,1) == 0 + && get_bw(dx/2,dx/2,dy/4,dy-2,sdata->bp,cs,1) == 1 ) Break; // ~P + + // glued ri ??? 
+ if( box1->dots>0 && box1->m1 ) + if( get_bw((x1+x0)/2,x1,box1->m1,y0-1,box1->p,cs,1) == 1 ) + if( num_cross( 0,dx-1,0 ,0 ,sdata->bp,cs) >2 + || num_cross( 0,dx-1,1 ,1 ,sdata->bp,cs) >2 ) Break; + + + i=loop(sdata->bp,dx-1, dy-1,dx,cs,0,LE); if (i>dx/2) + i=loop(sdata->bp,dx-1, dy-2,dx,cs,0,LE); + x=loop(sdata->bp,dx-1,dy-1-dy/4,dx,cs,0,LE); + if (sdata->hchar && i-x>1) Break; // ß + x=loop(sdata->bp, 0,dy-1,dx,cs,0,LE); // check for serifs + i=loop(sdata->bp, 0,dy-2,dx,cs,0,LE); if (i<x) x=i; + i=loop(sdata->bp, 0, 1,dx,cs,0,LE); if (i<x) x=i; + i=loop(sdata->bp, 0, 2,dx,cs,0,LE); if (i<x) x=i; + if (sdata->hchar && x>0) Break; // fl + + if (num_cross( 0,dx-1,dy/4,dy/4,sdata->bp,cs)>=3) ad=98*ad/100; // small M + if (sdata->hchar || 2*y0<box1->m1+box1->m2) ad=96*ad/100; + if (sdata->gchar) ad=96*ad/100; // ß fl + if (dx<5) { // for small fonts no middle line is possible for m + ad=99*ad/100; // 4x6 m + if (num_cross(0,dx-1,dy/8,dy/8,sdata->bp,cs)>=2) { + ad=97*ad/100; // ~m + if (dy<=4) Setac(box1,'m',97); // only for 4x6 font! + } + } + Setac(box1,'n',ad); + break; + } + return box1->c; +} + +static wchar_t ocr0_M(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int d,x,y,i0,i1,i2,i3,t1,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + + // ------------------ test M --------------------------- + for(ad=d=100;dx>3 && dy>3;){ // dy<=dx nicht perfekt! 
besser mittleres + // min-suchen fuer m + DBG( wchar_t c_ask='M'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + if( num_cross(0,dx-1, dy/2, dy/2,bp,cs)<3 + && num_cross(0,dx-1, dy/4, dy/4,bp,cs)<3 + && num_cross(0,dx-1,5*dy/8,5*dy/8,bp,cs)<3 + && num_cross(0,dx-1,3*dy/4,3*dy/4,bp,cs)<3 + && dx>4 ) Break; + if( num_cross(0,dx-1, dy/4, dy/4,bp,cs)<2 + && num_cross(0,dx-1, dy/8, dy/8,bp,cs)<2 ) Break; /* fat M */ + if( num_cross(0,dx-1,3*dy/4,3*dy/4,bp,cs)<2 ) Break; + + x = loop(bp,dx-1 ,dy-1,dx,cs,0,LE); // ~ melted kl + x = loop(bp,dx-1-x,dy-1,dx,cs,1,LE); if( x>dx/2 ) Break; + + if( loop(bp, 0,7*dy/16,dx,cs,0,RI) + + loop(bp,dx-1,7*dy/16,dx,cs,0,LE) > dx/2 ) Break; // ~K + + if( dy>8 /* following lines should be extend to range check */ + && loop(bp, dx/4,dy-1, dy,cs,0,UP)<dy/4 + && loop(bp,3*dx/8,dy-1, dy,cs,0,UP)<dy/4 ) + if( loop(bp, 0,dy-1-dy/ 8,dx,cs,0,RI) + < loop(bp, 0,dy-1-dy/16,dx,cs,0,RI)-dx/32 ) Break; // ~it_u + if( num_cross(0,dx-1, dy/2, dy/2,bp,cs)==2 + && num_cross(0,dx-1, dy/4, dy/4,bp,cs)> 2 + && num_cross(0,dx-1,3*dy/4,3*dy/4,bp,cs)> 2 ) Break; // ~it_u + if( num_cross(0 ,dx-1,3*dy/4,3*dy/4,bp,cs)==2 + && num_cross(dx/2,dx/2,3*dy/4, dy-1,bp,cs)> 0 ) Break; // ~it_v + + if( loop(bp,3*dx/4, 0,dy,cs,0,DO) + > loop(bp,2*dx/4, 0,dy,cs,0,DO) + && loop(bp,3*dx/4,dy-1,dy,cs,0,UP) + < loop(bp,2*dx/4,dy-1,dy,cs,0,UP) ) Break; // ~N + if( loop(bp,3*dx/4, dy/8,dy,cs,0,DO) + > loop(bp,2*dx/4, dy/8,dy,cs,0,DO) + && loop(bp,3*dx/4,dy-1-dy/8,dy,cs,0,UP) + < loop(bp,2*dx/4,dy-1-dy/8,dy,cs,0,UP) ) Break; // ~serif_N + + // i0 is lower end of upper serifen (widest gap? ) + i0=0; + + if( num_cross(0,dx-1,dy/2,dy/2,bp,cs)!=4 ){ // Is it a N ? 
+ if( num_cross(0,dx-1,dy/2,dy/2,bp,cs)==3 ){ + for(y=dy/2+1;y<dy;y++){ + if( num_cross(0,dx-1,y,y,bp,cs)<3 ) break; + } + if( num_cross(0,dx-1,y,y,bp,cs)==2 ){ + x =loop(bp,dx-1 ,y-1,dx,cs,0,LE); + x+=loop(bp,dx-1-x,y-1,dx,cs,1,LE); + x+=loop(bp,dx-1-x,y-1,dx,cs,0,LE); + if( loop(bp,dx-x,y-1,dy,cs,0,UP)>y-2 ) Break; // ~N + } + } + } + // MNWK + for(i2=0,i1=x=dx/2;x<dx-dx/4;x++){ // lowest pixel + y=loop(bp,x,0,dy,cs,0,DO); if(y>i2) {i2=y;i1=x;} else break; } + i3=i2+loop(bp,i1,i2,dy-i2,cs,1,DO); + if(i2<dy/4) { + if (!sdata->hchar) Break; // rm + ad=99*ad/100; + } + if (i2==0 && dx>8 && dy>12) Break; // glued and bad splitted serifen-MN + + // if( num_hole(x0, x1, y0 , y1 ,box1->p,cs,NULL) != 0 ) Break; // small A + if (sdata->holes.num != 0) Break; + t1=loop(bp,0 ,3*dy/4,dx,cs,0,RI); + t1=loop(bp,t1,3*dy/4,dx,cs,1,RI); // thickness of line? + if( 7*(t1+1)<dx ) + if( num_cross(i1,dx-1,i2-1,i2-1,bp,cs)!=2 + || num_cross(0 ,i1 ,i2-1,i2-1,bp,cs)!=2 ) Break; // too hard ??? + + // ~u_n-pair + if( num_cross(0,dx-1,0,0,bp,cs)!=2 + && num_cross(0,dx-1,1,1,bp,cs)!=2 + && num_cross(0,dx-1,2,2,bp,cs)!=2 ) Break; + + // ~nn v0.2.4a3 + if( num_cross(0,dx-1, dy/4, dy/4,bp,cs)==4 + && num_cross(0,dx-1,3*dy/4,3*dy/4,bp,cs)==4 ){ + i1 =loop(bp, 0, dy/4,dx,cs,0,RI); + i1+=loop(bp,i1, dy/4,dx,cs,1,RI); + i1+=loop(bp,i1, dy/4,dx,cs,0,RI); + i2 =loop(bp, 0,3*dy/4,dx,cs,0,RI); + i2+=loop(bp,i2,3*dy/4,dx,cs,1,RI); + i2+=loop(bp,i2,3*dy/4,dx,cs,0,RI); + if( i1>=i2 ) Break; // no good M + i1+=loop(bp,i1, dy/4,dx,cs,1,RI); + i2+=loop(bp,i2,3*dy/4,dx,cs,1,RI); + if( i1>=i2 ) Break; // no good M + i1+=loop(bp,i1, dy/4,dx,cs,0,RI); + i2+=loop(bp,i2,3*dy/4,dx,cs,0,RI); + if( i1<=i2 ) Break; // no good M + } + if( num_cross(0,dx-1,dy/2,dy/2,bp,cs)==2 + && num_cross(0,dx-1,dy/4,dy/4,bp,cs)==2 && !hchar ) Break; // ~ \it u + + if (dy<17) + if( num_cross(0,dx-1, 0, 0,bp,cs)<2 ) ad=99*ad/100; + if (dx>5) /* 4x6 font has only 1 cross at y=1 */ + if( num_cross(0,dx-1, 1, 1,bp,cs)<2 ) ad=96*ad/100; 
// kt + if( num_cross(dx/2,dx/2, 0, dy-1,bp,cs)!=1) ad=98*ad/100; // kt + if (dx<5 && loop(bp,dx/2,0,dy,cs,0,DO)>=3*dy/8) ad=96*ad/100; // 4x6 H + + if( num_cross(0,dx-1, dy/4, dy/4,bp,cs)<=2 + && num_cross(0,dx-1,3*dy/4,3*dy/4,bp,cs)<=2 + && dx>8 && dy>12 ){ + ad=98*ad/100; + for(y=5*dy/16;y<5*dy/8;y++) // look for H-line + if( num_cross(0,dx-1,y ,y ,bp,cs)==1 ) break; + if( y<5*dy/8 ) ad=95*ad/100; + if( y<5*dy/8 ) + if( num_cross(2+dx/6,dx-3-dx/6,y-2,y-2,bp,cs)==0 + || num_cross(2+dx/6,dx-3-dx/6,y-1,y-1,bp,cs)==0 ) Break; // ~H bad! + } + + if( loop(bp,3*dx/8, 0,dy,cs,0,DO) >dy/2 + && loop(bp,5*dx/8,dy-1,dy,cs,0,UP) >dy/2 ) ad=95*ad/100; + + if(!hchar){ + ad=98*ad/100; /* not sure */ + if( loop(bp,0, dy/4,dx,cs,0,RI) + < loop(bp,0,dy-1-dy/8,dx,cs,0,RI)-dx/16 ) Break; // ~wi glued + } + if( gchar ) ad=98*ad/100; + if (ad>99 && dx<8) ad=99*ad/100; /* give 5x8 N a chance */ + Setac(box1,'M',ad); + break; + } + return box1->c; +} + +static wchar_t ocr0_N(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + int d,x,y,i1,i2,i3,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + (*aa)[4]=sdata->aa, /* corner-points, (x,y,dist^2,vector_idx) */ + dbg[9], + ad; /* tmp-vars */ + + // --- test N ------- +hchar -gchar + for(ad=d=100;dx>3 && dy>3;){ // 4x6font + DBG( wchar_t c_ask='N'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + if (sdata->holes.num > 0) ad=98*ad/100; /* # */ + if (dx<6) ad=99*ad/100; + if (dx<5) ad=99*ad/100; + /* half distance to the center */ + d=2*sq(128/4); + /* now we check for the 4 ends of the x */ + if (aa[0][2]>d) Break; + if (aa[1][2]>d) Break; + if (aa[2][2]>d) Break; + if (aa[3][2]>d) Break; + if (aa[3][0]-aa[0][0]<dx/2) Break; + if (aa[2][0]-aa[1][0]<dx/2) Break; + if (aa[1][1]-aa[0][1]<dy/2) Break; + if (aa[2][1]-aa[3][1]<dy/2) Break; + if (aa[3][0]-aa[0][0]<4-1) Break; /* to small to hold an N */ + if (aa[2][0]-aa[1][0]<4-1) Break; 
/* to small */ + if (abs(aa[3][1]-aa[0][1])>(dy+2)/5) Break; /* glued tu */ + if (abs(aa[3][1]-aa[0][1])>(dy+4)/8) ad=98*ad/100; /* glued tu */ + /* left and right vertical line */ + d=line_deviation(box1, aa[0][3], aa[1][3]); if (d>2*sq(1024/4)) Break; + ad=(100-(d-sq(1024)/2)/sq(1024)/4)*ad/100; + d=line_deviation(box1, aa[2][3], aa[3][3]); if (d>2*sq(1024/4)) Break; + + /* search uppermost left ^ (between near 0,0) */ + i1=nearest_frame_vector(box1,aa[1][3],aa[2][3], x0+dx/8, y0); + x=box1->frame_vector[i1][0]; + y=box1->frame_vector[i1][1]; + MSG( fprintf(stderr,"i1= %d (%d,%d) left ^", i1,x-x0,y-y0);) + if (y-y0 > 5*dy/8) Break; + if (x-x0 > 5*dx/8) Break; + /* search uppermost right ^ ~H */ + i3=nearest_frame_vector(box1,aa[1][3],aa[2][3], x1, y0); + MSG( fprintf(stderr,"i3= %d (%d,%d) right ^",\ + i3, box1->frame_vector[i3][0]-x0,box1->frame_vector[i3][1]-y0);) + + /* check if upper left and lower right point are joined directly */ + dbg[0]=d=line_deviation(box1,i1, aa[2][3]); + /* check if lower left and lower left point are joined directly */ + dbg[1]=d=line_deviation(box1, aa[1][3],i1); + MSG( fprintf(stderr," i1-a2 %d a1-i1 %d",dbg[0],dbg[1]); ) + if (dbg[0] > sq(1024/4)) Break; + if (dx>4 && dbg[1] > sq(1024/4)) ad=97*ad/100; // d=0..2*sq(1024) + if (dx>4 && dbg[1] > sq(1024/3)) Break; // d=0..2*sq(1024) + // serif N has d=sq(1024/3)=116508 + + /* serach lowest right v, same frame? N-tilde etc.? 
*/ + i2=nearest_frame_vector(box1,aa[3][3],aa[0][3], x1, y1-dy/8); + x=box1->frame_vector[i2][0]; + y=box1->frame_vector[i2][1]; + MSG( fprintf(stderr,"i2= %d (%d,%d) right v",\ + i2, box1->frame_vector[i2][0]-x0,box1->frame_vector[i2][1]-y0);) + if (y-y0 < 3*dy/8) Break; + if (x-x0 < 3*dx/8) Break; + // test H + if ( box1->frame_vector[i3][0]-box1->frame_vector[i1][0]> dx/4 + && box1->frame_vector[i3][1]-box1->frame_vector[i1][1]<=dy/8 + && y<=box1->frame_vector[i1][1]) Break; + /* check if upper left and lower right point are joined directly */ + dbg[2]=d=line_deviation(box1,i2, aa[0][3]); + /* check if lower right and lower right point are joined directly */ + dbg[3]=d=line_deviation(box1, aa[3][3],i2); + MSG( fprintf(stderr," i2-a0 %d a3-i2 %d",dbg[2],dbg[3]); ) + if (dbg[2] > sq(1024/4)) Break; + if (dbg[3] > sq(1024/4)) ad=97*ad/100; // serif N, ToDo: do it better + if (dbg[3] > sq(1024/3)) Break; + + if (abs((box1->frame_vector[i1][1]-y0) + -(y1-box1->frame_vector[i2][1]))>dy/8) ad=99*ad/100; /* ~ tu */ + if (abs(((y0+y1)/2-box1->frame_vector[i1][1]) + -(box1->frame_vector[i2][1]-(y0+y1)/2))>dy/8) ad=99*ad/100; /* ~ tu */ + if (box1->frame_vector[i2][0] + -box1->frame_vector[i1][0]<=dx/8) Break; /* nonsignificant distance */ + if (box1->frame_vector[i2][1] + -box1->frame_vector[i1][1]<=dy/8) ad=97*ad/100; /* too flat (ff,H) */ + if (box1->frame_vector[i2][1] + -box1->frame_vector[i1][1]<=dy/2) ad=99*ad/100; + MSG( \ + fprintf(stderr,"^v %d %d %d %d line deviation %d %d %d %d max %d %d",\ + box1->frame_vector[i1][0]-x0,box1->frame_vector[i1][1]-y0,\ + box1->frame_vector[i2][0]-x0,box1->frame_vector[i2][1]-y0,\ + dbg[0],dbg[1],dbg[2],dbg[3],sq(1024/4),sq(1024));) + ad=(100-(dbg[0]-sq(1024)/2)/sq(1024)/4)*ad/100; + ad=(100-(dbg[1]-sq(1024)/2)/sq(1024)/4)*ad/100; + ad=(100-(dbg[2]-sq(1024)/2)/sq(1024)/4)*ad/100; + ad=(100-(dbg[3]-sq(1024)/2)/sq(1024)/4)*ad/100; + + if (!hchar) ad=99*ad/100; + if ( gchar) ad=98*ad/100; // \sc N + Setac(box1,'N',ad); + break; + } 
+ return box1->c; +} + +static wchar_t ocr0_h(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,d,x,y,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + int (*aa)[4]=sdata->aa; /* corner-points, (x,y,dist^2,vector_idx) */ + + // --- test h --------------------------------------------------- + for(ad=d=100;dx>2 && dy>3;){ // min 3x4 + // rewritten for vectors 0.42 + int i1, i2, i3, i4, i5, i6, i7, i8; // line derivation + corners + DBG( wchar_t c_ask='h'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + /* half distance to the center */ + d=2*sq(128/4); + /* now we check for the upper right end of the h */ + if (aa[3][2]<d/4) Break; /* [2] = distance, ~BCDEF... */ + if (aa[0][2]>d/2) Break; /* upper left end */ + if (aa[1][2]>d/2) Break; /* lower left end */ + if (aa[2][2]>d/2) Break; /* lowerright end */ +/* + type A B=italic ??? + 18 OOO + O O O + O O + O7OOO OOOO + O4 O O O + O O O O + O O O O O + 2O3 5O6 O OOO +*/ + i1=i8=aa[0][3]; + i2=i3=aa[1][3]; + i5=i6=aa[2][3]; + // check the bow from below + for (i4=i=i2;i!=i5;i=(i+1)%box1->num_frame_vectors[0]) { + if (box1->frame_vector[ i][1] + <box1->frame_vector[i4][1]) i4=i; // get next maximum + if (box1->frame_vector[ i][1]<=y0) break; // fatal! + } + if (box1->frame_vector[i4][1]-y0<dy/4) Break; // ~MN + if (y1-box1->frame_vector[i4][1]<dy/4) Break; // ~BCDEGIJLOQSUYZ + // two steps for i7 to go around pitfalls on italic h + i7=nearest_frame_vector(box1, i6, i8, (x0+x1)/2, (y0+y1)/2); + i7=nearest_frame_vector(box1, i6, i7, x0, (y0+y1)/2); + i3=nearest_frame_vector(box1, i2, i4, (x0+x1)/2, y1); + i5=nearest_frame_vector(box1, i4, i6, (x0+x1)/2, y1); + + MSG(fprintf(stderr,"i1-7 %d %d %d %d %d %d %d",i1,i2,i3,i4,i5,i6,i7);) + /* ... new part /// old obsolete part ... 
*/ + if( get_bw(0 ,dx/2,dy/8 ,dy/8 ,bp,cs,1) != 1 ) Break; + if( get_bw(0 ,dx/2,dy/2 ,dy/2 ,bp,cs,1) != 1 ) Break; + if( get_bw(dx/2 ,dx-1,dy-1-dy/3,dy-1-dy/3,bp,cs,1) != 1 ) Break; + if( get_bw(dx/2 ,dx/2,dy/5 ,dy-1-dy/3,bp,cs,1) != 1 ) Break; + if( get_bw(dx-1-dx/3,dx-1,0 ,1 ,bp,cs,1) == 1 ) Break; + if( get_bw(dx-1-dx/3,dx-1,1 ,dy/6 ,bp,cs,1) == 1 ) Break; + if( dy>18 ) + if( get_bw(dx-1-dx/3,dx-1,dy/6 ,dy/5 ,bp,cs,1) == 1 ) Break; + if( get_bw(dx-1-dx/3,dx-1,dy-1-dy/4,dy-1 ,bp,cs,1) == 0 ) Break; // s- + for( x=x0+dx/3;x<x1-dx/3;x++) + if( get_bw(x, x,y1-dy/4, y1, box1->p,cs,1) == 0 ) break; + if( x>=x1-dx/3 ) Break; + for(i=dy/4,y=y0+dy/3;y<=y1 && i;y++){ + if( num_cross(x0,x1 ,y,y, box1->p,cs) == 2 ) i--; + } if( i ) Break; + for(i=dy/4,y=y0;y<=y0+dy/2 && i;y++){ + if( num_cross(x0,x0+dx/2,y,y, box1->p,cs) == 1 ) i--; + } if( i ) Break; + // if( num_hole(x0, x1, y0 , y1 ,box1->p,cs,NULL) > 0 ) // could happen + if (sdata->holes.num > 0) + if (sdata->holes.hole[0].y0 > dy/3 + && sdata->holes.hole[0].y1 < dy-1-dy/3) Break; + // if( num_hole(x0, x1, y0+dy/3 , y1-dy/3 ,box1->p,cs,NULL) != 1 ) Break; // mini + if( loop(bp,dx-1,dy/3,dx,cs,0,LE)+dx/8 + < loop(bp,dx-1,dy/2,dx,cs,0,LE) + && loop(bp,dx-1,dy-1-dy/8,dx,cs,0,LE)+dx/8 + < loop(bp,dx-1,dy/2,dx,cs,0,LE)) Break; // ~k Okt00 + i=loop(bp,0,dy-1-dy/4,dx,cs,0,RI); + if (i>1 && num_cross(x0,x0,y0+dy/8+2,y0+dy/2, box1->p,cs) == 1 ){ // fi fu + ad=(99-(1<<i))*ad/100; + if (num_cross(x0,x0,y0,y0+dy/8+2, box1->p,cs) == 0 ) ad=97*ad/100; + if (num_cross(x0+dx/2,x0+dx/2,y0,y0+dy/8+2, box1->p,cs) == 1 ) ad=97*ad/100; + if (ad<1) break; + } + i =loop(bp,0,dy/4,dx,cs,0,RI); + i+=loop(bp,i,dy/4,dx,cs,1,RI)+1; + for ( ; i<dx-dx/3; i++ ) + if( loop(bp,i,0,dy,cs,0,DO)>5*dy/8 ) { + ad=98*ad/100; // melted hi, li, but handwritten h + MSG(fprintf(stderr,"ad=%d",ad);) } + if( num_cross(x0,x0,y0+(dy+3)/8,y1,box1->p,cs) > 1 ) { + ad=98*ad/100; // melted fr + MSG(fprintf(stderr,"ad=%d",ad);) } + + i=loop(bp,dx-1,3*dy/4,dx,cs,0,LE); 
// melted "fr" for vertikal letters + if (i>dx/4 && loop(bp,dx-1-i,dy-1,dy,cs,1,UP)>dy/2) { + ad=94*ad/100; MSG(fprintf(stderr,"ad=%d",ad);) } + + i=loop(bp,dx-1,1+dy/16,dx,cs,0,LE); if (i<dx/4) { + ad=98*ad/100; + MSG(fprintf(stderr,"ad=%d",ad);) } + if( num_cross(dx-i+1+dx/8,dx-i+1+dx/8,0,1+dy/16,bp,cs) > 0 ) { + ad=95*ad/100; // melted fi + MSG(fprintf(stderr,"ad=%d",ad);) } + if (loop(box1->p,x1,y0+1+dy/16,dx,cs,0,LE)<dx/4) { + ad=98*ad/100; // fi + MSG(fprintf(stderr,"ad=%d",ad);) } + if (loop(box1->p,x1,y0 ,dx,cs,0,LE)<dx/4 + || loop(box1->p,x1,y0+1,dx,cs,0,LE)<dx/4) { + ad=98*ad/100; // li + MSG(fprintf(stderr,"ad=%d",ad);) } + + + if (sdata->holes.num > 0) ad=97*ad/100; + if (box1->m2) { + if ( gchar) ad=98*ad/100; + if (!hchar) ad=97*ad/100; + } else ad=99*ad/100; + Setac(box1,'h',ad); + break; + } + return box1->c; +} + +static wchar_t ocr0_H(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,j,j1,d,x,y,ya,i1,i2,i3,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + + // --- test H --------------------------------------------------- + for(ad=d=100;dx>2 && dy>3;){ // min 3x4 + DBG( wchar_t c_ask='H'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + if( num_cross(0,dx-1,dy/4 ,dy/4 ,bp,cs) != 2 + && num_cross(0,dx-1,dy/4-1,dy/4-1,bp,cs) != 2 ) Break; + if( num_cross(0,dx-1,3*dy/4 ,3*dy/4 ,bp,cs) != 2 + && num_cross(0,dx-1,3*dy/4+1,3*dy/4+1,bp,cs) != 2 ) Break; + if( loop(bp,0 ,dy/8,dx,cs,0,RI) + + loop(bp,dx-1,dy/8,dx,cs,0,LE)>dx/2 ) Break; // ~A + for( j1=0,i=1,y=y0+dy/10; y<y1-dy/10 && i; y++ ) // 2 vertikal lines + { j=loop(box1->p,x0 ,y,dx,cs,0,RI) + +loop(box1->p,x1 ,y,dx,cs,0,LE); if( j>dx/2 ) i=0; if(j>j1)j1=j; } + if( !i ) Break; + for( i=1,y=dy/4; y<dy-1-dy/4 && i; y++ ) // max - min width + { j=loop(bp,0 ,y,dx,cs,0,RI) + +loop(bp,dx-1,y,dx,cs,0,LE); if( j1-j>dx/5 ) i=0; } + if( 
!i ) Break; // ~K Jul00 + for( i=0,ya=y=y0+dy/3; y<y1-dy/3; y++ ) // horizontal line + { j=loop(box1->p,x0 ,y,dx,cs,0,RI); + j=loop(box1->p,x0+j,y,dx,cs,1,RI); if( j>i ) { i=j; ya=y; } } + if( i<=dx/2 ) Break; ya-=y0; + if( num_cross(0,dx-1,ya ,ya ,bp,cs) != 1 + && num_cross(0,dx-1,ya+1,ya+1,bp,cs) != 1 ) Break; /* Dec00 */ + for( y=ya; y<dy-dy/4; y++ ) // ~M Dec00 + if( num_cross(0,dx-1,y ,y ,bp,cs) > 2 + && num_cross(0,dx-1,y+1,y+1,bp,cs) > 2 ) break; + if ( y<dy-dy/4 ) Break; + for(i=1,x=x0+dx/4;x<=x1-dx/4 && i;x++){ + if( get_bw( x, x,y0 ,y0+dy/4,box1->p,cs,1) == 0 ) i=0; + } if( i ) Break; + for(i=1,x=x0+dx/4;x<=x1-dx/4 && i;x++){ + if( get_bw( x, x,y1-dy/4,y1 ,box1->p,cs,1) == 0 ) i=0; + } if( i ) Break; + for(i=1,x=x0+dx/4;x<=x1-dx/4 && i;x++){ + if( num_cross(x,x,y0+dy/8,y1-dy/8, box1->p,cs) == 1 ) i=0; + } if( i ) Break; + for(i=1,y=y0;y<=y0+dy/4 && i;y++){ + if( num_cross(x0,x1,y,y, box1->p,cs) == 2 ) i=0; + } if( i ) Break; + for(i=1,y=y1-dy/4;y<=y1 && i;y++){ + if( num_cross(x0,x1,y,y, box1->p,cs) == 2 ) i=0; + } if( i ) Break; + if( get_bw(x1-dx/8, x1 , y0, y0+dy/8,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0 , x0+dx/8, y1-dy/8, y1,box1->p,cs,1) != 1 ) Break; + i1=loop(bp,dx-1, dy/4,dx,cs,0,LE); if(i1>dx/2) Break; + i2=loop(bp,dx-1, dy/2,dx,cs,0,LE); if(i2<i1-dx/4 || i2>i1+dx/8) Break; + i3=loop(bp,dx-1,dy-1-dy/4,dx,cs,0,LE); if(i3<i2-dx/4 || i3>i2+dx/8) Break; + if(abs(i1+i3-2*i2)>dx/16+1) Break; + // test for thick tall N looking like a H + if( num_cross(x0,x1,y0,y1, box1->p,cs) < 2 ) Break; // sure N + i1=loop(bp, 0, dy/4,dx,cs,0,RI); + i1=loop(bp, i1, dy/4,dx,cs,1,RI); + i2=loop(bp, 0,dy-1-dy/4,dx,cs,0,RI); + i2=loop(bp, i2,dy-1-dy/4,dx,cs,1,RI); + i3=loop(bp,dx-1 ,dy-1-dy/4,dx,cs,0,LE); + i3=loop(bp,dx-1-i3,dy-1-dy/4,dx,cs,1,LE); + i =loop(bp, 0,dy/2+1+dy/8,dx,cs,0,RI); + i+=loop(bp, i,dy/2+1+dy/8,dx,cs,1,RI); + i =loop(bp, i,dy/2+1+dy/8,dx,cs,0,RI); + if (i<dx/2-1 && 5*i1>6*i2 && 5*i3>6*i2 && i1>i2 && i3>i2 ) Break; + if( dx>8 ) + if ( 
loop(bp,dx-1, 3*dy/8,dx,cs,0,LE) + -loop(bp,dx-1, dy/8,dx,cs,0,LE)>dx/4 + && loop(bp,dx-1, 3*dy/8,dx,cs,0,LE) + -loop(bp,dx-1,dy-1-dy/8,dx,cs,0,LE)>dx/4 ) Break; // ~K + // if( num_hole(x0,x1,y0,y1,box1->p,cs,NULL) != 0 ) Break; + if (sdata->holes.num != 0) Break; + if ( gchar) ad=99*ad/100; + if (!hchar) ad=98*ad/100; + Setac(box1,'H',ad); + break; + } + return box1->c; +} + +static wchar_t ocr0_k(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,j,x,y,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + int (*aa)[4]=sdata->aa; /* corner-points, (x,y,dist^2,vector_idx) */ + + // --- test k --------------------------------------------------- + for(ad=100;dx>2 && dy>3;){ // min 3x4 + // rewritten for vectors 0.43 + int d, i1, i2, i3, i4, i5, i6, i7, i8; // line derivation + corners + DBG( wchar_t c_ask='k'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + /* half distance to the center */ + d=2*sq(128/4); + /* now we check for the upper right end of the h */ + if (aa[3][2]<d/4) Break; /* [2] = distance, ~BCDEF... */ + if (aa[0][2]>d/2) Break; /* upper left end */ + if (aa[1][2]>d/2) Break; /* lower left end */ + if (aa[2][2]>d/2) Break; /* lowerright end */ +/* + type A B=italic ??? + 18 OOO + O O O + O O6 O + O7 OO O OO + O4OO OO OO + O OO O O + O OO O O O + 2O3 O5 O OOO +*/ + i1=i8=aa[0][3]; + i2=i3=aa[1][3]; + i5= aa[2][3]; + // check the bow from below + for (i4=i=i2;i!=i5;i=(i+1)%box1->num_frame_vectors[0]) { + if (box1->frame_vector[ i][1] + <box1->frame_vector[i4][1]) i4=i; // get next maximum + if (box1->frame_vector[ i][1]<=y0) break; // fatal! 
+ } + if (box1->frame_vector[i4][1]-y0<dy/4) Break; // ~MN + if (y1-box1->frame_vector[i4][1]<dy/4) Break; // ~BCDEGIJLOQSUYZ + i6=nearest_frame_vector(box1, i5, i8, x1, (2*y0+y1)/3); + // two steps for i7 to go around pitfalls on italic h + i7=nearest_frame_vector(box1, i6, i8, x0, y1); + i3=nearest_frame_vector(box1, i2, i4, (x0+x1)/2, y1); + i =nearest_frame_vector(box1, i5, i6, x0, (y0+2*y1)/3); + if (x1-box1->frame_vector[i][0]<dy/4) Break; // h + if (x1-box1->frame_vector[i][0]<dy/2) ad=98*ad/100; + + MSG(fprintf(stderr,"i1-7 %d %d %d %d %d %d %d",i1,i2,i3,i4,i5,i6,i7);) + if( num_cross(0, dx-1,0,0,bp,cs) != 1 + && num_cross(0, dx-1,1,1,bp,cs) != 1 ) Break; + if( num_cross(0,3*dx/4, dy/8 , dy/8 ,bp,cs) != 1 + || num_cross(0,3*dx/4,3*dy/16,3*dy/16,bp,cs) != 1 ) Break; + if( num_cross(0,dx-1,dy-1,dy-1,bp,cs) != 2 + && num_cross(0,dx-1,dy-2,dy-2,bp,cs) != 2 ) Break; + if( dx<8 + && num_cross(dx-1,dx-1,dy/4,dy-1,bp,cs) != 2 + && num_cross(dx-2,dx-2,dy/4,dy-1,bp,cs) != 2 ) Break; + i1=loop(bp,0,dy/2-dy/4,dx,cs,0,RI); + i2=loop(bp,0,dy/2 ,dx,cs,0,RI);if(i2>dx/2) Break; + i3=loop(bp,0,dy/2+dy/4,dx,cs,0,RI); + if(abs(i1+i3-2*i2)>dx/16+1 || i1<i3-1) Break; // v-line on left side? + if( get_bw(x0 ,x0+dx/2,y0 ,y0+dy/4,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2,x1, y1-dy/3,y1 ,box1->p,cs,1) != 1 ) Break; + if( get_bw(x1-dx/4,x1, y0 ,y0+3*dy/16,box1->p,cs,1) == 1 ) Break; + if( get_bw(x1-dx/4,x1, y0+dy/4,y1-dy/4,box1->p,cs,1) != 1 ) Break; //~1 + if( get_bw(x1-dx/4,x1, y1-dy/8,y1 ,box1->p,cs,1) != 1 ) Break; + if (sdata->holes.num > 0) + if (sdata->holes.hole[0].y0 > dy/4) Break; + // if( num_hole(x0,x1,y0+dy/4,y1,box1->p,cs,NULL) != 0 ) Break; + for(y=y0+1;y<y0+dy/2;y++) // luecke ??? + if( get_bw(x0,x1,y,y,box1->p,cs,1) == 0 ) break; + if( y<y0+dy/2 ) Break; + for(i=1,x=x0;x<=x0+dx/2 && i;x++) + if(get_line(x,y0 ,x ,y1,box1->p,cs,100)>50) i=0; + if( i ) Break; // no vertikal line! 
/*
 * ocr0_K - heuristic shape test for the capital letter 'K'.
 *
 * Reads the character box (sdata->box1), the pixel map sdata->bp (addressed
 * below with box-relative coordinates 0..dx-1 / 0..dy-1), the value cs that
 * is handed to every pixel helper, and the hchar/gchar flags.
 *
 * The for(;;) is used as a single-pass block: every failed test leaves it
 * through Break (macro defined outside this chunk), soft mismatches only
 * scale the weight ad (starts at 100).  When the end of the block is
 * reached the candidate is registered with Setac(box1,'K',ad).
 *
 * Returns box1->c in every case.
 *
 * NOTE(review): loop(), get_bw(), num_cross() and get_line2() are defined
 * elsewhere; the comments below describe them only as far as their use
 * here suggests (loop: run length from a start pixel in a direction,
 * num_cross: number of line crossings, get_line2: line quality in percent)
 * -- confirm against their definitions.
 */
static wchar_t ocr0_K(ocr0_shared_t *sdata){
  struct box *box1=sdata->box1;
  pix *bp=sdata->bp;
  /* NOTE(review): d and i1 are only written in this function, never read */
  int i,j,i1,i2,d,x,y,hchar=sdata->hchar,gchar=sdata->gchar,
      x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs;
  int dx=x1-x0+1,dy=y1-y0+1, /* size */
      ad,ya,xa,yb,xb,yc,xc,yd,xd,ye,xe,yf,xf; /* tmp-vars */

  // --- test K ---------------------------------------------------
  for(ad=d=100;dx>2 && dy>3;){ // updated 29 Mar 2000 perfect???
    DBG( wchar_t c_ask='K'; )
    if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */
    // rows dy/8 .. dy-dy/8-1: the left half must not fail get_bw in any row
    for(y=dy/8;y<dy-dy/8;y++)
      if( !get_bw(0,dx/2,y,y,bp,cs,1) ) break;
    if( y<dy-dy/8 ) Break;
    // upper gap: some column in the middle half needs a top run >= dy/4
    for(j=0,i=1,x=x0+dx/4;x<=x1-dx/4 && i;x++){
      y= loop(box1->p,x,y0,y1-y0,cs,0,DO); if (y>3*dy/4) { i=1;break; }
      // NOTE(review): j starts at 0 and is only changed inside this branch,
      // so with dy>15 the condition j>dy/8 can never hold -- the branch is
      // unreachable as written (maybe y>dy/8 was intended; confirm).
      if (dy>15 && j>dy/8){
        j =loop(box1->p,x-1,y0+y-1,x1-x0,cs,0,LE)/2;
        y+=loop(box1->p,x-j,y0+y-1,y1-y0,cs,0,DO)-1;
      }
      if(y>=dy/4) i=0; /* ok, found gap */
    } if( i ) Break;
    for(y=0,x=x0+dx/4;x<=x1-dx/4;x++){ // lower h-gap
      i=loop(box1->p,x,y1,dy,cs,0,UP);
      /* on small chars bypass possible low left serifs */
      if (i>0) { i2=loop(box1->p,x-1,y1-i-1,dy,cs,0,UP);
                 if (i2>1) i+=i2-1; }
      if (i>y) { y=i; i1=x; }
    } if( y<=dy/8 ) Break; if (y<dy/4) ad=80*ad/100;
    // some column in x0+dx/3 .. x1-dx/8 must have exactly 2 crossings
    for(i=1,x=x0+dx/3;x<=x1-dx/8 && i;x++){
      if( num_cross(x,x,y0,y1, box1->p,cs) == 2 ) i=0;
    } if( i ) Break;
    // some row in the top quarter must have exactly 2 crossings
    for(i=1,y=y0;y<=y0+dy/4 && i;y++){
      if( num_cross(x0,x1,y,y, box1->p,cs) == 2 ) i=0;
    } if( i ) Break;
    // narrow boxes only: some row in the middle third with 1 crossing
    if( dx<10 ){
      for(i=1,y=y0+dy/3;y<=y1-dy/3 && i;y++){
        if( num_cross(x0,x1,y,y, box1->p,cs) == 1 ) i=0;
      } if( i ) Break;
    }
    // some row in the bottom quarter must have exactly 2 crossings
    for(i=1,y=y1-dy/4;y<=y1 && i;y++){
      if( num_cross(x0,x1,y,y, box1->p,cs) == 2 ) i=0;
    } if( i ) Break;
    if( get_bw(x1-dx/3,x1,y0,y0+dy/8,box1->p,cs,1) != 1 ) Break; // ~k
    if( dy>16
     && loop(bp,0,  dy/4,dx,cs,0,RI)
       +loop(bp,0,3*dy/4,dx,cs,0,RI)
       <2*loop(bp,0,  dy/2,dx,cs,0,RI)-2-dx/32 ) Break; // ~X

    // right-hand distances at 1/4, 3/8..1/2 and 3/4 height: the middle
    // value must be strictly larger than both outer ones
    i=loop(box1->p,x1,y0+  dy/4,x1-x0+1,cs,0,LE); if(i>dx/2) Break;
    j=loop(box1->p,x1,y0+  dy/2,x1-x0+1,cs,0,LE);
    x=loop(box1->p,x1,y0+3*dy/8,x1-x0+1,cs,0,LE); if(x>j) j=x;
    if(j<=i ) Break; i=j;
    j=loop(box1->p,x1,y1-dy/4,x1-x0+1,cs,0,LE); if(j>=i ) Break;
    // out_x(box1); // detailed analysis
    //
    //   a   d   <= that are main points of K
    //   |  /
    //   b/e
    //   |  \ .
    //   c   f
    ya=   dy/4;xa=loop(bp,0,ya,dx,cs,0,RI);xa+=loop(bp,xa,ya,dx,cs,1,RI)/2;
    yc=dy-dy/4;xc=loop(bp,0,yc,dx,cs,0,RI);xc+=loop(bp,xc,yc,dx,cs,1,RI)/2;
    yb=dy/2;   xb=dx-1-loop(bp,dx-1,dy/2,dx,cs,0,LE);
    // scan dy/4 rows from the top, from the bottom and around the middle to
    // locate d, f (smallest right distance) and e, b (largest right distance)
    for(yd=ye=yf=xe=y=i=0,xf=xd=dx;y<dy/4;y++){ // range 0..1/4
      x =loop(bp,dx-1,     y,dx,cs,0,LE); if(x<xd){ xd=x;yd=     y; }
      x =loop(bp,dx-1,dy-1-y,dx,cs,0,LE); if(x<xf){ xf=x;yf=dy-1-y; }
      x =loop(bp,dx-1,dy/2+y,dx,cs,0,LE); if(x>xe){ xe=x;ye=dy/2+y; }
      x =loop(bp,dx-1,dy/2-y,dx,cs,0,LE); if(x>xe){ xe=x;ye=dy/2-y; }
#if 0 // removed v0.2.4a2
      x =loop(bp,0   ,dy/2+y,dx,cs,0,RI); // middle left border
      x+=loop(bp,x   ,dy/2+y,dx,cs,1,RI); // test 2nd cross
      x+=loop(bp,x   ,dy/2+y,dx,cs,0,RI); if(x<xb){ xb=x;yb=dy/2+y; }
#endif
      x =loop(bp,0   ,dy/2-y,dx,cs,0,RI);
      x+=loop(bp,x   ,dy/2-y,dx,cs,1,RI); // test 2nd cross
      x+=loop(bp,x   ,dy/2-y,dx,cs,0,RI); if(x<xb){ xb=x;yb=dy/2-y; }
      x =dx-1-loop(bp,dx-1,dy/2-y,dx,cs,0,LE); if(x<xb){ xb=x;yb=dy/2-y; }
    }
    // convert the right-hand distances into x coordinates
    xd=dx-1-xd;xe=dx-1-xe;xf=dx-1-xf;
    xb+=loop(bp,xb,yb,dx,cs,1,RI)/4; // detect center of line
    xe-=loop(bp,xe,ye,dx,cs,1,LE)/4;
    xd-=loop(bp,xd,yd,dx,cs,1,LE)/4;
    xf-=loop(bp,xf,yf,dx,cs,1,LE)/4;
#if 0
    MSG( \
    printf("a=%d %d b=%d %d c=%d %d d=%d %d e=%d %d f=%d %d dxdy %d %d",\
           xa,ya,xb,yb,xc,yc,xd,yd,xe,ye,xf,yf,dx,dy);\
    )
#endif
    // left stem a-c must be a line (get_line2 result >= 95)
    if( get_line2(xa,ya,xc,yc,bp,cs,100)<95 ) Break;
    if( dx>8 ){ // example szaka0103
      if( xe>5*dx/8 || xb>5*dx/8 ) Break; // ~{\it n}
      i=loop(bp,xb,yb,xb,cs,1,LE); // thick center? see font22
      if( get_line2(xb,yb,xd,yd,bp,cs,100)<95 ) // right up
      if( get_line2(xb-i/2,yb,xd,yd,bp,cs,100)<95 ) Break;
      if( get_line2(xe,ye,xf,yf,bp,cs,100)<95 ) Break; // right down
      xe+=loop(bp,xe,ye,dx,cs,1,RI); if( xe>=xf ) Break; // ~{\it n}
    } else {
      if( dy<16 && !hchar ) Break;
      if(   loop(bp,0,1,dy,cs,1,DO)<=3*dx/4
         && loop(bp,1,1,dy,cs,1,DO)<=3*dx/4
         && loop(bp,2,1,dy,cs,1,DO)<=3*dx/4 ) Break; // ~x
    }
    if (loop(bp,dx-1,dy-1-dy/4,dx,cs,0,LE)<=dx/8){
      ad=99*ad/100; /* broken B ? */
      if (sdata->holes.num > 0)
      if (sdata->holes.hole[0].y1 < dy-1-dy/3) Break;
      // if( num_hole(x0,x1,y0,(y0+2*y1)/3,box1->p,cs,NULL)>0) Break; // broken B
    }
    if(box1->m3 && !hchar) ad=99*ad/100;
    if(box1->m3 &&  gchar) ad=99*ad/100;
    // printf(" ok xe=%d",xe);
    Setac(box1,'K',ad);
    break;
  }
  return box1->c;
}
/*
 * ocr0_f - heuristic shape test for the lowercase letter 'f'
 * (structured like the 't' test, see the sketch below).
 *
 * Reads the character box (sdata->box1), the pixel map sdata->bp (addressed
 * with box-relative coordinates), the value cs passed to all pixel helpers,
 * the hchar/gchar flags and sdata->aa (four entries of
 * (x,y,dist^2,vector_idx), see the declaration comment).
 *
 * The for(;;) is used as a single-pass block: every failed test leaves it
 * through Break (macro defined outside this chunk), soft mismatches only
 * scale the weight ad (starts at 100).  When the end of the block is
 * reached the candidate is registered with Setac(box1,'f',ad).
 *
 * Returns box1->c in every case.
 *
 * NOTE(review): loop(), get_bw(), num_cross(), getpixel(), sq() and
 * nearest_frame_vector() are defined elsewhere; comments below describe
 * them only as far as their use here suggests -- confirm.
 */
static wchar_t ocr0_f(ocr0_shared_t *sdata){
  struct box *box1=sdata->box1;
  pix *bp=sdata->bp;
  int i,j,d,x,y,hchar=sdata->hchar,gchar=sdata->gchar,
      x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs;
  int dx=x1-x0+1,dy=y1-y0+1, /* size */
      (*aa)[4]=sdata->aa, /* the for line ends, (x,y,dist^2,vector_idx) */
      ab[8][4],           /* special points (x,y,dist^2,vector_idx) */
      ad; /* tmp-vars */
  /*   x=mindist_to_a  y=0     "t"
      0>..$$.   0>..$$   0>..$$   end right bow    a--..$$  a--.$7.  y>0 "f"
      1>.$..$   1>.$..   1>.$$$   start right bow     .$7.     .$..
        .@...     .@..   2>.@@.   start upper end     .@..     .@..
      2>.$...   2>.$..   3>$$$$   crossing bar        .$..     $$$.
      3>$@$$.   3>$@$.     $@@$                       $@$.     .@..
      4>.$...   4>.$..   4>.$$.   lower end           .$..     .$..
        .@...     .@..     .@@.                       .@..     .@..
        .@...     .@..     .@@.                       .@..     .@..
      5>.$...   5>.$..   5>.$$.   lower start         .$..     .$..
      6>.....   6>$...   6>....   optional left bow
   */
  // --- test f like t ---------------------------------------------------
  for(ad=d=100;dx>2 && dy>5;){ // sometimes no hchar!
    // rewritten for vectors 0.43
    // NOTE(review): this inner d shadows the function-level d set in the
    // for-initialiser above; only the inner one is read below.
    int d, i1, i2, i3, i4, i5, i6, i7, i8, i9; // line derivation + corners
    DBG( wchar_t c_ask='f'; )
    if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */
    /* half distance to the center */
    d=2*sq(128/4);
    /* now we check for the upper right end of the h */
    if (aa[3][2]>d/2) Break; /* [2] = distance, ~BCDEF... */
    if (aa[0][2]>d  ) Break; /* upper left end */
/*
         9
        OOO
       O 7 O8
       O6
     1OOOO5
       O4
       O
      2O3
     OOOOO
*/
    // i1..i9: frame-vector indices of the outline points numbered in the
    // sketch above; i4, i5, i6 and i8 are only used in the MSG output below
    i1=nearest_frame_vector(box1,aa[0][3],aa[1][3],x0-dx/2,(5*y0+3*y1)/8);
    /* we need i for 4x6 font, where left side of h-bar is near (x0,y1) */
    i =aa[1][3]; if (box1->frame_vector[i][1]<y1-dy/8)
    i =nearest_frame_vector(box1,aa[1][3],aa[2][3], x0, y1+dy/4);
    i2=nearest_frame_vector(box1,      i1,       i, x1, y1);
    i =nearest_frame_vector(box1,aa[1][3],aa[2][3], x1, y1+dy/4);
    i3=nearest_frame_vector(box1,       i,aa[3][3], x0, y1);
    i7=nearest_frame_vector(box1,      i3,aa[3][3],(x0+x1)/2, y0);
    i8=nearest_frame_vector(box1,      i7,aa[0][3], x1, (3*y0+y1)/4);
    i9=nearest_frame_vector(box1,aa[3][3],aa[0][3],(x0+2*x1)/3,y0-dy/4);
    i5=nearest_frame_vector(box1,      i3,      i7, x1+dx/4, (5*y0+3*y1)/8);
    i4=nearest_frame_vector(box1,      i3,      i5, x0, (3*y0+y1)/4);
    i6=nearest_frame_vector(box1,      i5,      i7, x0, (y0+3*y1)/4);

    MSG(fprintf(stderr,"i1-9 %d %d %d %d %d %d %d %d %d",i1,i2,i3,i4,i5,i6,i7,i8,i9);)

    // check if vertical line is near to the left side
    if (box1->frame_vector[i2][0]-x0>dx/2) Break; // ~3
    i =nearest_frame_vector(box1, aa[0][3], i2, x1+2*dx, (y0+y1)/2);
    // MSG(fprintf(stderr,"i %d",i);)
    if (box1->frame_vector[i ][0]
       -box1->frame_vector[i9][0]>dx/8) Break; // ~3

    if( (box1->dots) ) Break; // Bold-face is gchar
    if (dy<=box1->m3-box1->m2+1) Break;
    // find the row j with the widest black run x (the cross bar); the loop
    // must run to its end (y>5*dy/8), any early break rejects the box
    for(x=0,j=y=2+(3*dy+4)/32;y<=5*dy/8;y++){ // upper cross line min=2
      i=loop(bp,0,y,dx,cs,0,RI); if( y>dy/4 && i>5*dx/8 ) break;
      i=loop(bp,i,y,dx,cs,1,RI); if( i>x ) { x=i;j=y; }
      if( y<3*dy/4 && y>dy/4
       && num_cross(0,dx-1,y  ,y  ,bp,cs) != 1
       && num_cross(0,dx-1,y+1,y+1,bp,cs) != 1 // against noise
        ) break;
    } if( y<=5*dy/8 ) Break; y=j;// if( y>dy/2 || y<dy/8 ) Break;
    // x is thickest width of vertical line here
    i=loop(bp,(dx+1)/2,0,dy,cs,0,DO)/2;
    if( i>dy/8
     && num_cross(       0, (dx+1)/2,i,i,bp,cs) > 0
     && num_cross((dx+1)/2,dx-1,i,i,bp,cs) > 0 ) Break; // ~Y

    if (loop(bp,3*dx/4,  0,dy,cs,0,DO)>dy/8
     && loop(bp,3*dx/4-1,0,dy,cs,0,DO)>dy/8) Break; // upper bow
    i=3*dy/4; if (box1->m3 && i>=box1->m3) i=box1->m3-1;
    if (num_cross(0,dx-1,i,i,bp,cs)!=1) Break;

    // the middle bar appear in a wide vertical range, get part below
    for (i1=dx,i2=y,j=y+1;j<dy-dy/4;j++){
      i=loop(bp,0,j,dx,cs,0,RI);
      i=loop(bp,i,j,dx,cs,1,RI); // thickness vert. line
      if (i<i1) { i1=i; i2=j; if (2*i<=x) break; }
    } i=i1; j=i2; /* i=dx, j=y below horiz-bar */
    MSG(fprintf(stderr,"j=%d i=%d y=%d x=%d",j,i,y,x);)
    // bar should have twice of the thickness of v-line
    if (x<2*i && x<dx) Break;
    if (x<i+2+dx/8) ad=97*ad/100; // fat f

    // check for the upper bow to the right top side
    i3=nearest_frame_vector(box1,aa[2][3],aa[3][3], x0, y0);
    MSG(fprintf(stderr,"xy= %d %d %d %d",x0,y0,\
     box1->frame_vector[i3][0]-x0,box1->frame_vector[i3][1]-y0);)
    // NOTE(review): of ab[][] only ab[7][1] is read again in this function
    ab[7][0]=box1->frame_vector[i3][0];
    ab[7][1]=box1->frame_vector[i3][1];
    ab[7][3]=i3;
    if (ab[7][1]-y0<=dy/16) ad=95*ad/100; // ~t
    // because of the dx,dy scaling the horiz. bar could be nearer to (x1,y0)
    // as the upper right end of the "t"
    if (aa[3][0]-x0>3*dx/4 && aa[3][1]-y0>3*dy/16) ad=99*ad/100; // ~t


    j=loop(bp,0,dy/8,dx,cs,0,RI); // if j>dx/2 we have italic f
    if ((2*x<dx && j<=dx/2) || 3*x<dx) Break; // bar should be not to small
    for(i=dy/8;i<dy;i++)
      if (loop(bp,0,i,dx,cs,0,RI)>(j+dx/4)) break;
    if (i<dy) Break; // check for v-line

    if( loop(bp,dx-1,dy/2,dx,cs,0,LE)<dx/2 )
    if( loop(bp,dx-1,dy/2,dx,cs,0,LE)-1
      <=loop(bp,dx-1,  y ,dx,cs,0,LE) )
    if( loop(bp,dx-1,  y-1,dx,cs,0,LE)
      <=loop(bp,dx-1,  y ,dx,cs,0,LE) ) Break; // ~1

    if( loop(bp,0,dy/2,dx,cs,0,RI)-1
       >loop(bp,0,  1,dx,cs,0,RI) ) Break; // ~X

    i=y;j=1; // j used as flag
    if( num_cross(0,dx-1,0,0,bp,cs)==1 && hchar) //~r
    if( num_cross(0,dx-1,dy-1,dy-1,bp,cs)!=1
     && num_cross(0,dx-1,dy-2,dy-2,bp,cs)!=1 ) Break; // ~* etc.
    // check for upper bow to right
    for(y=1;j && y<i; y++) // no @@ pattern
      if( num_cross(0,dx-1,y  ,y  ,bp,cs) ==2 ) j=0;
    if (j==0) { ad=(ad+101)/2; }
    for(y=1;j && y<i; y++) // no @@ pattern, try to detect it
    for(x=0;j && x<dx  ;x++){ //    ..
      if(  (getpixel(bp,x  ,y  )>=cs || dx<7) && getpixel(bp,x+1,y  )>=cs
        && getpixel(bp,x  ,y-1)< cs && getpixel(bp,x+1,y-1)< cs )
        { j=0;break; }
    } if(j) ad=98*ad/100; // not detected

    // if( num_hole (x0  , x1  , y0,  y1,box1->p,cs,NULL) != 0 ) Break; // ~e
    if (sdata->holes.num != 0) Break; // ~e
    // smallest left (i1) and right (i2) distances over the lowest eighth
    for(i1=i2=dx,y=7*dy/8;y<dy;y++){
      x=loop(bp,0   ,y,dx,cs,0,RI);if(x<i1)i1=x;
      x=loop(bp,dx-1,y,dx,cs,0,LE);if(x<i2)i2=x;
    }
    if(i1>i2+dx/4) Break;     // ~t ~e
    if(i1>i2+1) ad=96*ad/100; // ~t ~e
    if( loop(bp,0,3*dy/4,dx,cs,0,RI)<i1-dx/4 ) Break;
    if( dx>5 && !hchar)
    if( loop(bp,dx-1,dy/2,dx,cs,0,LE)>3*dx/4 )
    if( loop(bp,dx-1,dy-1,dy,cs,0,UP)<dx/2 ) Break; // ~c
    if( dx>8 )
    if( loop(bp,   0,2*dy/3  ,dx,cs,0,RI)>2*dx/3
     || loop(bp,   0,2*dy/3-1,dx,cs,0,RI)>2*dx/3 )
    if( loop(bp,dx-1,  dy/4  ,dx,cs,0,LE)>2*dx/3 ) Break; // ~5 ~S

    if (!hchar)
    if ( get_bw(x0+dx/8,x0+dx/8,y0+dy/4,y1-dy/16,box1->p,cs,2) == 0
      && num_cross(x1-dx/4,x1-dx/4,y0,y1,box1->p,cs)!=2
      && num_cross(x1-dx/8,x1-dx/8,y0,y1,box1->p,cs)!=2 ) Break; // ~r

    if (dy>15)
    if( num_cross(x0,x1,y1-dy/4,y1-dy/4,box1->p,cs)>1
     && num_cross(x0,x1,y0+dy/4,y0+dy/4,box1->p,cs)>1 ) Break; // ~H

    if( dx>4 )
    if(  loop(bp,dx-1     ,3*dy/4,dx,cs,0,LE)-
         loop(bp,0        ,3*dy/4,dx,cs,0,RI)>dx/5+1
      && loop(bp,dx-1-dx/8,dy-1  ,dy,cs,0,UP)<dy/4 ) {
      if( loop(bp,dx-1     ,5*dy/16,dx,cs,0,LE)-
          loop(bp,0        ,5*dy/16,dx,cs,0,RI)>=dx/5+1) ad=98*ad/100; // ~E
      i=loop(bp,dx/8,0,dy,cs,0,DO);
      if (i<dy/8 || i>dy/2) {
        ad=98*ad/100; // ~E, could also be a "f" with big serifs
        MSG(fprintf(stderr,"ad=%d",ad);) }
      if (!gchar) { ad=98*ad/100;
        MSG(fprintf(stderr,"ad=%d",ad);) }
    }
    i = loop(bp,dx-1  ,3*dy/4,dx  ,cs,0,LE)/2;
    if (loop(bp,dx-1-i,  dy-1,dy/2,cs,0,UP)<dy/4)
    if (loop(bp,0     ,3*dy/4,dx  ,cs,0,RI)<dx/4) {
      ad=98*ad/100; // ~E but serif-f
      MSG(fprintf(stderr,"ad=%d",ad);) }

    if( loop(bp,0,dy/4,dx  ,cs,0,RI)>1
     && loop(bp,0,   0,dy/4,cs,0,DO)<dy/4 ) {
      ad=95*ad/100; // ~I
      MSG(fprintf(stderr,"ad=%d",ad);) }

    if (get_bw(x0+dx/16,x1-dx/16,y0,y0,box1->p,cs,2) == 0) { // white pixels?
      ad=98*ad/100; // F
      MSG(fprintf(stderr,"ad=%d",ad);) }

    if (!hchar) ad=ad*98/100; // d*=100;d/=128 // not 100% !
    // small bonus (ad+1) for a box reaching down towards m4
    if (box1->m4>0 && gchar && ad<99 &&
      8*box1->y1 >= box1->m4*7+box1->m3) ad++;
    Setac(box1,'f',ad);
    break;
  }
  return box1->c;
}
/*
 * ocr0_bB - heuristic shape tests for 'B' and 'b'.
 *
 * Reads the character box (sdata->box1), the pixel map sdata->bp (addressed
 * with box-relative coordinates 0..dx-1 / 0..dy-1), the hole list
 * sdata->holes, the value cs passed to all pixel helpers and the
 * hchar/gchar flags.
 *
 * Each for(;;) is used as a single-pass block: a failed test leaves it
 * through Break (macro defined outside this chunk), soft mismatches only
 * scale the weight ad (starts at 100).  A block that runs to its end
 * registers its letter with Setac().  Both blocks are always tried.
 *
 * Returns 'b' directly when the 'b' test ends with ad>=100, otherwise
 * box1->c.
 *
 * NOTE(review): loop(), get_bw(), num_cross() and num_hole() are defined
 * elsewhere; comments below describe them only as far as their use here
 * suggests -- confirm.
 */
static wchar_t ocr0_bB(ocr0_shared_t *sdata){
  struct box *box1=sdata->box1;
  pix *bp=sdata->bp;
  int i,j,d,x,y,i1,i2,i3,hchar=sdata->hchar,gchar=sdata->gchar,
      x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs;
  int dx=x1-x0+1,dy=y1-y0+1, /* size */
      ad; /* tmp-vars */

  // --- test B ---------------------------------------------------
  for(ad=d=100;dx>2 && dy>4;){ // min 3x4
    DBG( wchar_t c_ask='B'; )
    /* fewer than 2 holes rejects here; exactly 2 is required further down */
    if (sdata->holes.num < 2) Break; /* tolerant against a tiny hole */
    // upper half: every row must pass get_bw on the left half of the box
    for(i=1,y=y0;y<y1-dy/2 && i;y++)
      if( get_bw(x0,x0+dx/2, y  , y  ,box1->p,cs,1) != 1 ) i=0;
    if( !i ) Break;
    // lower half: every row must pass get_bw on the left third of the box
    for(i=1,y=y1-dy/2;y<y1 && i;y++)
      if( get_bw(x0,x0+dx/3, y  , y  ,box1->p,cs,1) != 1 ) i=0;
    if( !i ) Break;
    if( get_bw(x1,x1     , y0 , y0 ,box1->p,cs,1) == 1 ) Break;
    if( num_cross(x0+dx/2, x0+dx/2,y0,y1  ,box1->p,cs) != 3 )
    if( num_cross(x1-dx/3, x1-dx/3,y0,y1  ,box1->p,cs) != 3 ) Break;
    /* --- detect center of lower hole --- */
    y = loop(box1->p,x0+dx/2,y1  ,dy,cs,0,UP); if (y>1+dy/8) Break;
    y+= loop(box1->p,x0+dx/2,y1-y,dy,cs,1,UP); if (y>dy/3) Break;
    y=y1-y-loop(box1->p,x0+dx/2,y1-y,dy,cs,0,UP)/2; if (y<y0+3*dy/8) Break;
    if (y<y0+dy/2) ad=96*ad/100;
    // y is an absolute row here; bp is addressed with y-y0
    if( num_cross(0,dx-1,y-y0  ,y-y0  ,bp,cs) != 2 )
    if( num_cross(0,dx-1,y-y0+1,y-y0+1,bp,cs) != 2 ) Break;
    if( num_cross(0,dx-1,  dy/4  ,  dy/4  ,bp,cs) != 2 )
    if( num_cross(0,dx-1,  dy/4+1,  dy/4+1,bp,cs) != 2 )
    if( num_cross(0,dx-1,  dy/4-1,  dy/4-1,bp,cs) != 2 ) Break;
    // from here on y is box-relative: first row in dy/4..3*dy/4 with one crossing
    for( y=dy/4;y<3*dy/4;y++ ) if( num_cross(0,dx-1,y,y,bp,cs)==1 ) break;
    if( y==3*dy/4 ) Break;

    if(  loop(box1->p,x0,y0+ y  ,dx,cs,0,RI)
       > loop(box1->p,x0,y0+dy/4,dx,cs,0,RI)+dx/32 )
    if(  get_bw(x0,x0,y0,y0,box1->p,cs,1) == 0 )
    if(  get_bw(x0,x0,y1,y1,box1->p,cs,1) == 0 ) Break; // ~8
    // left-border distances at 1/4 (i1), around 1/2 (i2, largest of three
    // rows) and 3/4 (i3) height
    i1=loop(box1->p,x0,y0+dy/4,dx,cs,0,RI);
    i2=loop(box1->p,x0,y0+dy/2,dx,cs,0,RI);
    i =loop(box1->p,x0,y0+dy/2-dy/ 8,dx,cs,0,RI); if(i>i2) i2=i;
    i =loop(box1->p,x0,y0+dy/2-dy/16,dx,cs,0,RI); if(i>i2) i2=i;
    i3=loop(box1->p,x0,y1-dy/4,dx,cs,0,RI);
    if(dy>16 && i3<i2 && i1+i3<2*i2){
      if (i3+i1<2*i2-dx/16) ad=98*ad/100; // ~8
      if (i3+i1<2*i2-dx/8 ) ad=96*ad/100;
      if( loop(box1->p,x0,y0+ 1 ,dx,cs,0,RI)
       >= loop(box1->p,x0,y0+ 3 ,dx,cs,0,RI)+dx/32 )
      if( loop(box1->p,x0,y0+ 0 ,dx,cs,0,RI)
       >  loop(box1->p,x0,y0+ 3 ,dx,cs,0,RI)+dx/32 )
      if( loop(box1->p,x0,y1- 0 ,dx,cs,0,RI)
       >  loop(box1->p,x0,y1- 3 ,dx,cs,0,RI)+dx/32 )
      if( loop(box1->p,x0,y1- 1 ,dx,cs,0,RI)
       >  loop(box1->p,x0,y1- 3 ,dx,cs,0,RI)+dx/32 ) Break; // ~8 Aug00
    }

    if (sdata->holes.num != 2) Break;
    // reject if both holes start above row y-1, or both end below row y+1
    if (sdata->holes.hole[0].y0 < y-1
     && sdata->holes.hole[1].y0 < y-1 ) Break;
    if (sdata->holes.hole[0].y1 > y+1
     && sdata->holes.hole[1].y1 > y+1 ) Break;
    // if( num_hole(0,dx-1,0  ,y+1 ,bp,cs,NULL) != 1 ) Break;
    // if( num_hole(0,dx-1,y-1,dy-1,bp,cs,NULL) != 1 ) Break;
    // out_x(box1);

    for( x=dx,y=dy/6; y<dy-dy/8; y++ ) // left border straight
    { i=loop(box1->p,x0,y0+y,dx,cs,0,RI); if( i>x+dx/9 ) break;
      if(i<x) x=i;
    } if( y<dy-dy/8 ) Break; // ~8 bad_a

    for( x=dx,y=1;y<dy/4;y++ ) // right border straight
    { i=loop(bp,dx-1,dy-y,dx,cs,0,LE);
      if( i<x ) x=i; else if( i>x )break;
    } if( y<dy/4 ) Break; // ~ff (serifs?)

    x=loop(bp,0,dy/2  ,dx,cs,0,RI);
    i=loop(bp,0,dy/2-1,dx,cs,0,RI); if (i>x) x=i; // allow dust
    i=loop(bp,0,dy/2+1,dx,cs,0,RI); if (i>x) x=i;
    if ( loop(bp,0,  dy/8,dx,cs,0,RI)
        +loop(bp,0,7*dy/8,dx,cs,0,RI) > 2*x+1 ) Break; // not convex!

    if(!hchar){ // ~ fat_a
      ad=99*ad/100;
      x =loop(bp,0,dy/4,dx,cs,0,RI);
      if(loop(bp,0,dy/2,dx,cs,0,RI)>x+dx/8) ad=97*ad/100;
    }

    if ( (!hchar) && (dx<=10 || dy<=10) ) ad=97*ad/100; // hchar or good_quality
    if (gchar) ad=99*ad/100;
    Setac(box1,'B',ad);
    break;
  }
  // --- test b ---------------------------------------------------
  for(ad=d=100;dx>3 && dy>4;){ // min 3x4
    DBG( wchar_t c_ask='b'; )
    /* no hole rejects here; exactly 1 is required further down */
    if (sdata->holes.num < 1) Break; /* tolerant against a tiny hole */
    // Break is used inside the inner for here; the test after the loop
    // decides whether the stop row y rejects the box
    for(y=y0;y<y1;y++)
      if( get_bw(x0     , x0+dx/2, y      , y      ,box1->p,cs,1) != 1 ) Break;
    if(y<y1-dy/32-1) Break;
    if( get_bw(x0+  dx/2, x0+dx/2, y1-dy/3, y1     ,box1->p,cs,1) != 1 ) Break;
    if( get_bw(x1-  dx/2, x1     , y1-dy/3, y1-dy/3,box1->p,cs,1) != 1 ) Break;
    if( get_bw(x1-  dx/3, x1     , y0     , y0+dy/5,box1->p,cs,1) == 1 ) Break;
    if( get_bw(x1-4*dx/9, x1     , y0+dy/5, y0+dy/5,box1->p,cs,1) == 1 ) Break;
    if( num_cross(x0,x1,y0+dy/4  ,y0+dy/4  ,box1->p,cs) > 1 ) // &
    if( num_cross(x0,x1,y0+dy/4-1,y0+dy/4-1,box1->p,cs) > 1 )
    if( dy<16 ||
        num_cross(x0,x1,y0+dy/5  ,y0+dy/5  ,box1->p,cs) > 1 ) Break; // fat b
    // rows dy/2 .. dy-dy/8-1: i counts rows with 2 crossings, j the others
    for(i=j=0,y=dy/2;y<dy-dy/8;y++)
      if( num_cross(0,dx-1,y,y,bp,cs) == 2 ) i++; else j++;
    if( i<2*j ) Break; // v024a4
    if (sdata->holes.num != 1) Break;
    if (sdata->holes.hole[0].y0 < dy/4) Break;
    // hole bounding box smaller than 1/16 of the character box
    if ((sdata->holes.hole[0].y1-sdata->holes.hole[0].y0+1)
       *(sdata->holes.hole[0].x1-sdata->holes.hole[0].x0+1)*16
        < dx*dy) ad=90*ad/100; // hole to small
    if( num_hole( x0, x1      , y0+dy/4, y1,box1->p,cs,NULL) != 1 ) Break;
    i=loop(bp,dx-1,dy-1     ,dx,cs,0,LE);
    j=loop(bp,dx-1,dy-1-dy/8,dx,cs,0,LE); if(j>i) Break;
    if (!hchar) ad=99*ad/100;
    if ( gchar) ad=99*ad/100;
    Setac(box1,'b',ad);
    if (ad>=100) return 'b';
    break;
  }
  return box1->c;
}
/*
 * ocr0_dD - heuristic shape tests for 'D' and 'd'.
 *
 * Reads the character box (sdata->box1), the pixel map sdata->bp (addressed
 * with box-relative coordinates; bp->x / bp->y are used alongside dx / dy
 * below), the hole list sdata->holes, the value cs passed to all pixel
 * helpers and the hchar/gchar flags.
 *
 * Each for(;;) is used as a single-pass block: a failed test leaves it
 * through Break (macro defined outside this chunk), soft mismatches only
 * scale the weight ad (starts at 100).  A block that runs to its end
 * registers its letter with Setac().  Both blocks are always tried.
 *
 * Returns box1->c in every case.
 *
 * NOTE(review): loop(), get_bw() and num_cross() are defined elsewhere;
 * comments below describe them only as far as their use here suggests.
 */
static wchar_t ocr0_dD(ocr0_shared_t *sdata){
  struct box *box1=sdata->box1;
  pix *bp=sdata->bp;
  int i,d,x,y,ya,yb,hchar=sdata->hchar,gchar=sdata->gchar,
      x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs;
  int dx=x1-x0+1,dy=y1-y0+1, /* size */
      ad; /* tmp-vars */

  // --- test D ---------------------------------------------------
  for(ad=d=100;dx>2 && dy>3;){ // min 3x4
    DBG( wchar_t c_ask='D'; )
    /* no hole rejects here; exactly 1 is required further down */
    if (sdata->holes.num < 1) Break; /* tolerant against a tiny hole */
    if( get_bw(x0     ,x0+dx/3,y0+dy/2,y0+dy/2,box1->p,cs,1) != 1 ) Break;
    if( get_bw(x1-dx/3,x1     ,y0+dy/2,y0+dy/2,box1->p,cs,1) != 1 ) Break;
    if( get_bw(x1     ,x1     ,y0     ,y0+dy/16,box1->p,cs,1) == 1 ) Break;
    if( get_bw(x1-dx/2,x1     ,y0+dy/4,y0+dy/4 ,box1->p,cs,1) != 1 ) Break;
    if( num_cross(x0+dx/2,x0+dx/2,y0     ,y1     ,box1->p,cs) != 2 )
    if( num_cross(x1-dx/3,x1-dx/3,y0     ,y1     ,box1->p,cs) != 2 ) Break;
    if( num_cross(x0     ,x1     ,y0+dy/3,y0+dy/3,box1->p,cs) != 2 ) Break;
    if( num_cross(x0     ,x1     ,y1-dy/3,y1-dy/3,box1->p,cs) != 2 ) Break;
    if (sdata->holes.num != 1) Break;
    // the hole must span at least the middle third of the height
    if (sdata->holes.hole[0].y0 >      dy/3) Break;
    if (sdata->holes.hole[0].y1 < dy-1-dy/3) Break;
    // if( num_hole (x0     ,x1     ,y0     ,y1     ,box1->p,cs,NULL) != 1 ) Break;
    // test if left edge is straight
    for(x=0,y=bp->y-1-dy/8;y>=dy/5;y--){
      i=loop(bp,0,y,x1-x0,cs,0,RI);
      if( i+2+dx/16<=x ) break;
      if( i>x ) x=i;
    }
    if (y>=dy/5 ) Break;
    /* test if right edge is falling */
    for(x=dx,y=0;y<dy/3;y++){
      i=loop(bp,bp->x-1,y,x1-x0,cs,0,LE);
      if( i>x+dx/16 ) break;
      if( i<x ) x=i;
    }
    if (y<dy/3 ) Break;
    /* test if right edge is raising */
    for(x=dx,y=bp->y-1;y>2*dy/3;y--){
      i=loop(bp,bp->x-1,y,x1-x0,cs,0,LE);
      if( i>x+dx/16 ) break;
      if( i<x ) x=i;
    }
    if (y>2*dy/3 ) Break;
    if( loop(bp,dx-1,dy-1     ,dx,cs,0,LE) <=
        loop(bp,dx-1,dy-2-dy/16,dx,cs,0,LE)    ) Break; // P

    y=loop(bp,dx/2,dy-1,dy,cs,0,UP)-1; if (dy>16) y/=2;
    if ( y>=dy/16 ) { y-=dy/16;
      if (get_bw(dx/2,dx-1,dy-1-y,dy-1-y,bp,cs,1)==1) Break; // ~A
    }

    ya=loop(bp,      0,dy-1,dy,cs,0,UP);
    yb=loop(bp,dx/16+1,dy-1,dy,cs,0,UP);
    if( ya<dy/2 && ya>dy/16 && ya>yb ) Break; // ~O

    if ( loop(bp, dx/2,   0,dy,cs,0,DO)
        -loop(bp, dx/2,dy-1,dy,cs,0,UP) > dy/8 ) ad=97*ad/100; // ~b



    if (loop(bp,   0,   0,dx,cs,0,RI)>=dx/2
     && loop(bp,dx-1,dy-1,dx,cs,0,LE)>=dx/2
     && loop(bp,   0,dy/2,dx,cs,0,RI)<  2   ) ad=96*ad/100; // thin O

    if(box1->dots) ad=ad*94/100;
    if ( gchar) ad=99*ad/100;
    if (!hchar) ad=99*ad/100;
    Setac(box1,'D',ad);
    break;
  }
  // --- test d ---------------------------------------------------
  for(d=100;dx>2 && dy>3;){ // min 3x4
    DBG( wchar_t c_ask='d'; )
    ad=100;
    /* no hole rejects here */
    if (sdata->holes.num < 1) Break; /* tolerant against a tiny hole */
    if( get_bw(x0     , x0+dx/2, y1-dy/6, y1-dy/9,box1->p,cs,1) != 1 ) Break;
    if( get_bw(x0     , x0+dx/2, y1-dy/3, y1-dy/3,box1->p,cs,1) != 1 ) Break;
    if( get_bw(x0+dx/2, x1     , y1-dy/3, y1-dy/3,box1->p,cs,1) != 1 ) Break;
    if( get_bw(x1-dx/4, x1     , y0+dy/8, y0+dy/8,box1->p,cs,1) != 1 ) Break;
    if( get_bw(x0+dx/2, x0+dx/2, y1-dy/4, y1     ,box1->p,cs,1) != 1 ) Break;
    // only the first of the next two tests is guarded by dy>19; the dy/6
    // test below it always runs
    if(dy>19)
    if( get_bw(x0     , x0+dx/3, y0     , y0+dy/5,box1->p,cs,1) == 1 ) Break;
    if( get_bw(x0     , x0+dx/3, y0     , y0+dy/6,box1->p,cs,1) == 1 ) Break;
    if( get_bw(x0     , x0+dx/4, y1-dy/8, y1     ,box1->p,cs,1) != 1 ) Break;
    if( get_bw(x0+dx/2-1,x0+dx/2,y1-dy/8, y1     ,box1->p,cs,1) != 1 ) Break; // ~"A
    if( loop(bp,bp->x-1,  bp->y/4,x1-x0,cs,0,LE) >
        loop(bp,bp->x-1,3*bp->y/4,x1-x0,cs,0,LE)+1 ) Break;
    // need dx/8+1 columns with exactly 2 crossings
    for(i=dx/8+1,x=0;x<dx && i;x++){
      if( num_cross(x ,x   ,0  ,dy-1, bp,cs) == 2 ) i--;
    } if( i ) Break;
    // want dy/6+1 rows (from dy/4 down) with 2 crossings; rows with more
    // than 3 crossings raise the requirement again; a miss is only soft
    for(i=dy/6+1,y=dy/4;y<dy && i;y++){
      if( num_cross(0 ,dx-1,y  ,y   , bp,cs) == 2 ) i--;
      if( num_cross(0 ,dx-1,y  ,y   , bp,cs)  > 3 ) i++; // ~al
    } if( i ) ad=98*ad/100;
    // need dy/8+1 rows in the upper half with a single crossing that lies
    // in the right half
    for(i=dy/8+1,y=0;y<dy/2 && i;y++){
      if( num_cross(0 ,dx-1,y  ,y   , bp,cs) == 1 )
      if( num_cross(dx/2,dx-1,y  ,y   , bp,cs) == 1 ) i--;
    } if( i ) Break;
    if (sdata->holes.num<1) Break;
    // NOTE: the weight reduction below is NOT part of the if(dx<6); it runs
    // for every box with more than one hole that was not rejected
    if (sdata->holes.num>1) {
      if (dx<6) Break; ad=95*ad/100; } // glued j above 8 (4x6 sample)
    MSG(fprintf(stderr,"hole[0].y0,y1= %d %d",sdata->holes.hole[0].y0,sdata->holes.hole[0].y1););
    if (   sdata->holes.hole[0].y0 <  dy/4  ) Break;
    if (dy-sdata->holes.hole[0].y1 >  dy/4+1) Break; // glued et
    // if( num_hole(x0     , x1     , y0+dy/4 , y1     ,box1->p,cs,NULL) !=1 ) Break;
    if( num_cross(0  ,dx-1,dy-1-dy/4,dy-1-dy/4,bp,cs) != 2 ) { // glued al
      if (dy>15) { Break; } else ad=96*ad/100;
    }
    if (!hchar) ad=98*ad/100;
    if ( gchar) ad=99*ad/100;
    Setac(box1,'d',ad);
    break;
  }
  return box1->c;
}
if( num_hole(x0 , x1 , y0+dy/4 , y1 ,box1->p,cs,NULL) !=1 ) Break; + if( num_cross(0 ,dx-1,dy-1-dy/4,dy-1-dy/4,bp,cs) != 2 ) { // glued al + if (dy>15) { Break; } else ad=96*ad/100; + } + if (!hchar) ad=98*ad/100; + if ( gchar) ad=99*ad/100; + Setac(box1,'d',ad); + break; + } + return box1->c; +} + +static wchar_t ocr0_F(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,j,d,x,y,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + + // --- test F --------------------------------------------------- + for(ad=d=100;dx>2 && dy>4;){ // dx>1 dy>2*dx + DBG( wchar_t c_ask='F'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + if( get_bw(x0+dx/2,x0+dx/2,y0,y0+dy/8,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0,x0+dx/4,y1-dy/4,y1-dy/4,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0,x0+dx/2,y0+dy/4,y0+dy/4,box1->p,cs,1) != 1 ) Break; + + for (x=0,y=0;y<dy/4;y++) { + j=loop(bp,dx-1,dy-1-y,dx,cs,0,LE); if(j<3 || 3*j<dx) break; // ~f Jun00 + if (j>x) x=j; + } if (y<dy/4 || x<dx/2) Break; + + for( i=1,y=0; y<dy/4 && i; y++ ){ // long black line + j=loop(bp,0,y,dx,cs,0,RI); + j=loop(bp,j,y,dx,cs,1,RI); if( j>dx/2 ) i=0; } + if( i ) Break; + + x=loop(bp,0,dy-1-dy/4,dx,cs,0,RI); + x=loop(bp,x,dy-1-dy/4,dx,cs,1,RI); // strichdicke + for( i=1,y=dy/3; y<dy-1-dy/3 && i; y++ ) // black line + { j=loop(bp,0,y,dx,cs,0,RI); + j=loop(bp,j,y,dx,cs,1,RI); if( j>dx/3 && ((j>2*x && dx>8) || j>x+1)) i=0; } + if( i ) Break; + + y=dy/8; if (y<1) y=1; + for( i=1; y<dy-1-dy/2; y++ ){ // search horizontal white gap + x =loop(bp,dx-1,y,dx,cs,0,LE); if(x<2) continue; // skip serifs + j =loop(bp,dx-x,y,dy/4,cs,0,UP); + x+=loop(bp,dx-x,y-j+1,dx,cs,0,LE); if (x>=dx/3) { i=0; break; } + } + if( i ) Break; + + // check for vertical line on left side + for(i=1,y=1;y<=dy/2 && i;y++) + if( get_bw(0,dx/2,y,y,bp,cs,1) != 1 ) i=0; + if( !i ) Break; + + 
for(i=1,y=dy/2;y<dy && i;y++) + if( get_bw(0,dx/3,y,y,bp,cs,1) != 1 ) i=0; + if( !i ) Break; + + i=loop(bp,dx-1,dy-1,dx,cs,0,LE); // serif or E ? + if (i<=dx/3) { + if (loop(bp,dx-1,(dy+4)/8,dx,cs,0,LE)>dx/8 // no serif + || loop(bp, 0, dy-3,dx,cs,0,RI)<1) break; + ad=99*ad/100; + } + if( get_bw(dx-1-dx/4,dx-1,dy-1-dy/4,dy-1,bp,cs,1) == 1 ) Break; // ~E + if( get_bw(dx-1 ,dx-1,0 ,dy/3,bp,cs,1) != 1 ) Break; + + if( loop(bp,0, bp->y/4,dx,cs,0,RI) < + loop(bp,0,3*bp->y/4,dx,cs,0,RI)-1 ) Break; + // if( num_hole(x0 , x1 , y0 , y1 ,box1->p,cs,NULL) >0 ) Break; + if (sdata->holes.num > 0) Break; + for(i=0,x=dx/4;x<dx-1;x++) + if( num_cross(x,x,0,dy-2,bp,cs) == 2 ) i++; + if ( i<1 ) Break; // 0.2.4a4 + + if(dy<20) /* special case of small fi, not very elegant */ + if( get_bw( 1, 1,1,1,bp,cs,1) == 1 + && get_bw( 0, 0,2,2,bp,cs,1) == 1 + && get_bw(dx-2,dx-1,0,0,bp,cs,1) == 0 + && get_bw( 0, 1,0,0,bp,cs,1) == 0 + && get_bw( 0, 0,0,1,bp,cs,1) == 0 ) Break; + + // check for screen font f + i= loop(bp,0,3*bp->y/4,dx,cs,0,RI)-1; + if (i>=0 && loop(bp,dy-1,i,dy,cs,0,UP)<=3*dy/4 ) ad=ad*98/100; + + // check for screen font P + i= loop(bp,bp->x-1,bp->y/4,dx,cs,0,LE); + if (i<1) { + j=i+loop(bp,bp->x-1-i,bp->y/4, dx ,cs,1,LE); + j= loop(bp,bp->x-1-j,bp->y/4,3*dy/4,cs,0,DO); + if (j<=dy/2) { + i=loop(bp,bp->x-1,0,dx,cs,0,LE); + ad=ad*98/100; + if (i>dx/8) Break; + if (i) ad=98*ad/100; + } + } + + if (!hchar) if ((box1->m2-box1->y0)*8>=dy) { // ignore bad m1..4 + if ( num_cross(2*dx/3,2*dx/3,0,dy-1,bp,cs) < 2 ) ad=90*ad/100; // ~r + } + if (gchar) ad=99*ad/100; + Setac(box1,'F',ad); + break; + } + return box1->c; +} + +static wchar_t ocr0_uU(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,j,d,x,y,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + wchar_t bc=UNKNOWN; + + // --- test uU --------------------------------------------------- 
+ // in Mitte so breit wie oben (bei V kontinuierlich schmaler) + for(ad=d=100;dx>2 && dy>3;){ // min 3x4 + DBG( wchar_t c_ask='u'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + for(y=y0+dy/4;y<y1-dy/4;y++) /* also handwritten u */ + if( num_cross(x0,x1,y,y,box1->p,cs) < 2 ) break; + if( y<y1-dy/4 ) Break; + if( get_bw(dx/2,dx/2,dy/2,dy-1,bp,cs,1)==0 ) Break; + if( get_bw(dx/2,dx-1,dy/2,dy/2,bp,cs,1)==0 ) Break; + for(i=0,x=3*dx/8;x<dx-dx/4;x++){ + y=loop(bp,x,0,dy,cs,0,DO); if(y>i)i=y; if(y<i && i>1) break; + } if( i<dy/4 ) Break; x--; + if( get_bw(0,x ,i-1,i-1,bp,cs,1)==0 ) Break; + if( get_bw(x,dx-1,i-1,i-1,bp,cs,1)==0 ) Break; + + for(i=dy/8+2,y=dy/8;y<dy-(dy+2)/4 && i;y++){ // 12%+1 Fehler + j=num_cross(0,dx/2-((y>dy/2)?dx/8:0),y,y,bp,cs); + if( y<dy/2 && num_cross(dx/2,dx-1,y,y,bp,cs)>1 ) i--; // ~{\it v} + if( y<dy/2 && (j<1 && j>2) ) { i--; ad=90*ad/100; } + if( y>dy/2 && j!=1 ) { i--; ad=95*ad/100; } + } if( !i ) Break; + for(i=dy/16+1,y=dy/8;y<dy-dy/4 && i;y++){ // 12%+1 Fehler + j=num_cross(dx-dx/2,dx-1,y,y,bp,cs); + if( y>dy/2 && (j<1 && j>2) ) i--; + if( y<dy/2 && j!=1 ) i--; + } if( !i ) Break; + for(i=1,x=x0+dx/3;x<=x1-dx/3 && i;x++){ + if( get_bw( x, x, y0, y0+dy/3,box1->p,cs,1) != 1 ) i=0; + } if( i ) Break; + for(i=dx/4+1,x=x0+dx/3;x<=x1-dx/3 && i;x++){ + if( get_bw( x, x,y0+dy/3,y1-dy/3,box1->p,cs,3) != 2 ) i--; + } if( !i ) Break; + for(i=1,x=x0+dx/3;x<=x1-dx/3 && i;x++){ + if( get_bw( x, x,y1-dy/2,y1,box1->p,cs,3) == 2 ) i=0; + if( get_bw( x, x,y1-dy/3,y1,box1->p,cs,3) == 2 ) ad=98*ad/100; + } if( !i ) Break; + if( num_cross(0 ,dx/2, dy/4, dy/4,bp,cs)==2 + && num_cross(dx-dx/2,dx-1,dy-dy/4,dy-dy/4,bp,cs)==1 ) Break; // ~{\it v} + + i=loop(bp,0,dy-1-dy/16,dx,cs,0,RI); + j=loop(bp,0,dy-1-dy/8 ,dx,cs,0,RI); + if( i<j ) Break; // ~ll v0.2.4a3 + if(dy>15) + if( loop(bp,dx-1,dy/16,dx,cs,0,LE) + > loop(bp,dx-1,dy/8 ,dx,cs,0,LE)+1+dx/32 ) Break; // ~bad 0 (thinn) + if( hchar && dy>7) + if( loop(bp, 0, dy-1,dx,cs,1,RI)==dx + 
&& loop(bp,dx-1,3*dy/4,dx,cs,0,LE)>dx/16 + && loop(bp, 0,3*dy/4,dx,cs,0,RI)>dx/16 + && loop(bp,dx-1, dy/2,dx,cs,0,LE)>dx/16 + && loop(bp, 0, dy/2,dx,cs,0,RI)>dx/16 + ) Break; // melted ll + + i=loop(bp, 0,dy-2-dy/8,dx,cs,0,RI); + j=loop(bp,dx-1,dy-2-dy/8,dx,cs,0,LE); + if ( i>dx/4 && j>dx/4 && i+j>=dx/2) Break; // v + if (i+j>=dx/2) ad=97*ad/100; + + if ( num_cross(0,dx-1,dy/2,dy/2,bp,cs)!=2 ) ad=96*ad/100; // w + if ( loop(bp,dx/2,dy-1,dy,cs,0,UP)>0 ) ad=98*ad/100; // w + + if (ad==100) ad=99; // ToDo: only if lines.wt<100 + bc='u'; + if (gchar) ad=98*ad/100; + if (hchar) bc='U'; + if (box1->dots>0) ad=99*ad/100; + Setac(box1,bc,ad); + break; + } + return box1->c; +} + +static wchar_t ocr0_micro(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,j,d,x,y,i2,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + + // --- test \mu µ MICRO_SIGN -------------------------------------- + // in Mitte so breit wie oben (bei V kontinuierlich schmaler) + if( gchar && !hchar ) + for(ad=d=100;dx>2 && dy>4;){ // min 3x4 + DBG( wchar_t c_ask='u'; ) + if (sdata->holes.num > 1) break; /* tolerant against a tiny hole */ + for(y=y0+dy/8;y<box1->m3-dy/4;y++) + if( num_cross(x0,x1,y,y,box1->p,cs) < 2 ) break; + if( y<box1->m3-dy/4 ) break; + if( get_bw(dx/2,dx/2,3*dy/8,7*dy/8,bp,cs,1)==0 ) break; + if( get_bw(dx/2,dx-1,3*dy/8,7*dy/8,bp,cs,1)==0 ) break; + for(y=dy/2;y<dy;y++){ + x=loop(bp,dx-1,y,dx,cs,0,LE); if(8*x>5*dx) break; + } if( y>=dy || 2*y>box1->m3+box1->m4) break; i2=y; + for(i=0,x=2*dx/8;x<dx-1-dx/4;x++){ + y=loop(bp,x,0,dy,cs,0,DO); if(y>i)i=y; if(y<i && i>1) break; + } if( i<dy/4 ) break; x--; + if( get_bw(0,x ,i-1,i-1,bp,cs,1)==0 ) break; + if( get_bw(x,dx-1,i-1,i-1,bp,cs,1)==0 ) break; + for(i=dy/16+1,y=dy/8;y<dy-(box1->m4-box1->m3)-dy/4 && i;y++){ // 12%+1 Fehler + j=num_cross(0,dx/2,y,y,bp,cs); + if( y<dy/2 && 
num_cross(dx/2,dx-1,y,y,bp,cs)>1 ) i--; // ~{\it v} + if( y<dy/2 && (j<1 && j>2) ) i--; + if( y>dy/2 && j!=1 ) i--; + } if( !i ) break; + for(i=dy/16+1,y=dy/8;y<dy-(box1->m4-box1->m3)-dy/4 && i;y++){ // 12%+1 Fehler + j=num_cross(dx-dx/2,dx-1,y,y,bp,cs); + if( y>dy/2 && (j<1 && j>2) ) i--; + if( y<dy/2 && j!=1 ) i--; + } if( !i ) break; + for(i=1,x=x0+dx/3;x<=x1-dx/3 && i;x++){ + if( get_bw( x, x, y0, y0+dy/4,box1->p,cs,1) != 1 ) i=0; + } if( i ) break; + for(i=dx/4+1,x=x0+dx/3;x<=x1-dx/3 && i;x++){ + if( get_bw( x, x,y0+dy/4,y1-dy/2,box1->p,cs,3) != 2 ) i--; + } if( !i ) break; + if( num_cross(0 ,dx/2, dy/4, dy/4,bp,cs)!=1 ) break; + if( num_cross(dx-dx/2,dx-1,dy-dy/2,dy-dy/2,bp,cs)!=1 ) break; + if( get_bw( (dx+2)/4,dx-1,dy-2-3*dy/16,dy-1,bp,cs,1) == 1 ) break; + if( num_cross(0,dx/4,dy-1,dy-1,bp,cs)!=1 ) break; + + Setac(box1,MICRO_SIGN,ad); + break; + } + return box1->c; +} + +static wchar_t ocr0_vV(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,j,d,x,y,i1,i2,i3,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + wchar_t bc=UNKNOWN; + + // --- test v ------------------------------------------------- + for(ad=d=100;dx>2 && dy>3;){ // min 3x4 + DBG( wchar_t c_ask='v'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + x=loop(bp,dx/2,0,dx,cs,1,RI)+dx/2; // be sure in the upper gap + y=loop(bp, x,0,(dy+1)/2,cs,0,DO)-1; // (x,y) should be in the gap + if (x>3*dx/4 || y<dy/4) Break; + if( get_bw(x0,x0+x,y0+y,y0+y,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+x,x1,y0+y,y0+y,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+x,x0+x,y1-dy/2,y1, box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+x, x0+x ,y0, y0+dy/3,box1->p,cs,1) == 1 ) // it v? 
+ if( get_bw(x0+x+1,x0+x+1,y0, y0+dy/3,box1->p,cs,1) == 1 ) Break; + + // UVW + if(((num_cross( 0,dx/2+1,dy/ 8,dy/ 8,bp,cs)!=1) + && (num_cross( 0,dx/2+1,dy/16,dy/16,bp,cs)!=1) // it v + && (num_cross(dx/2+1,dx -1,dy/ 8,dy/ 8,bp,cs)!=1)) /* () added on Sep00 */ + || ((num_cross( 0,dx-1,dy-1-dy/8,dy-1-dy/8,bp,cs)> 1) + && (num_cross( 0,dx-1,dy-1 ,dy-1 ,bp,cs)> 1)) ) Break; + // UV + if( get_bw(0 ,dx/8,dy-1-dy/6,dy-1,bp,cs,1)==1 ) Break; + if( get_bw(dx-1-dx/8,dx-1,dy-1-dy/6,dy-1,bp,cs,1)==1 ) Break; + if( loop(bp,0 ,dy/6 ,dx,cs,0,RI) + >=loop(bp,0 ,dy-1-dy/3,dx,cs,0,RI) && dy>6 ) Break; + if( loop(bp,0 ,dy-1-dy/3,dx,cs,0,RI) + >loop(bp,0 ,dy-1-dy/8,dx,cs,0,RI) + && loop(bp,dx-1,dy-1-dy/3,dx,cs,0,LE) + >loop(bp,dx-1,dy-1-dy/8,dx,cs,0,LE) ) Break; // better OR ? + if( loop(bp,0 ,dy-1-dy/3,dx,cs,0,RI) + >=loop(bp,0 ,dy-1-dy/8,dx,cs,0,RI) + && loop(bp,dx-1,dy-1-dy/3,dx,cs,0,LE) + >=loop(bp,dx-1,dy-1-dy/8,dx,cs,0,LE) ) ad=99*ad/100; // font21 + if( loop(bp,dx-1,dy/6 ,dx,cs,0,LE) + >=loop(bp,dx-1,dy-1-dy/3,dx,cs,0,LE) && dy>6 ) Break; + x=loop(bp,0,dy-1,dx,cs,0,RI); // 3*x>dx changed to 2*x>dx May2001 JS + x=loop(bp,x,dy-1,dx,cs,1,RI); if ( dx>14 && 2*x>dx ) Break; // U + if( num_cross(0 ,dx/2, dy/4, dy/4,bp,cs)==2 + && num_cross(dx-dx/2,dx-1,dy-dy/4,dy-dy/4,bp,cs)==2 ) Break; // ~{\it u} + +#if 0 + // measure thickness of lower v + i=loop(bp, 0,dy-1-dy/16,dx,cs,0,RI) + +loop(bp,dx-1,dy-1-dy/16,dx,cs,0,LE); + j=loop(bp, 0,dy-1-dy/4 ,dx,cs,0,RI) + +loop(bp,dx-1,dy-1-dy/4 ,dx,cs,0,LE); + if( box1->m1 && hchar && dy>15 && j>=i-dx/32 ) Break; // ~Y +#endif + /* V has serifs only on upper site! Y also on bottom, check it. 
Okt00 */ + i=loop(bp, 0, 0,dx,cs,0,RI); + i=loop(bp, i, 0,dx,cs,1,RI); i1=i; // thickness + i=loop(bp, 0, 1,dx,cs,0,RI); + i=loop(bp, i, 1,dx,cs,1,RI); if(i>i1) i1=i; // thiggest + i=loop(bp, 0,dy/4,dx,cs,0,RI); + i=loop(bp, i,dy/4,dx,cs,1,RI); i2=i; + i=loop(bp, 0,dy ,dx,cs,0,RI); + i=loop(bp, i,dy ,dx,cs,1,RI); i3=i; // thickness + i=loop(bp, 0,dy-1,dx,cs,0,RI); + i=loop(bp, i,dy-1,dx,cs,1,RI); if(i>i3) i3=i; // thiggest + if( y0 < box1->m2 ) + if( i1-i2 > dx/32+2 + && i3-i2 > dx/32+2 ) Break; // ~serif_Y + + if( y0 < box1->m2 ) // uppercase V ? + if( i1-i2 < dx/32+2 ) /* no serif detected */ + if( num_cross(0,dx-1,dy-1-dy/4,dy-1-dy/4,bp,cs)==1 ){ + j=loop(bp, 0,dy-1-dy/4 ,dx,cs,0,RI); + j=loop(bp, j,dy-1-dy/4 ,dx,cs,1,RI); + if (j<i2+1) Break; // ~Y + if (j<=i2+1) ad=99*ad/100; // ~Y + } + + ad=99*ad/100; // be carefull (remove later) + + if( loop(bp,0 ,dy-1-dy/4,dx,cs,0,RI) + >loop(bp,0 ,dy-1 ,dx,cs,0,RI) ) ad=96*ad/100; + + if (gchar) ad=99*ad/100; + bc='v'; + if( hchar ) bc='V'; + Setac(box1, bc, ad); + break; + } + return box1->c; +} + +static wchar_t ocr0_rR(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,j,d,x,y,i1,i2,i3,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + + // --- test r ------- + for(ad=d=100;dy>3 && dx>1;){ // dy>dx, 4x6 font, dx=2 smallest prop-font + DBG( wchar_t c_ask='r'; ) + if (sdata->holes.num > 0 + && ( sdata->holes.hole[0].y1 > dy/2 // tiny hole in upper left + || sdata->holes.hole[0].x1 > dx/2 ) // is tolerated, ~Pp + ) Break; /* tolerant against a tiny hole */ + if( 2*dy<box1->m3-box1->m1) Break; + + if( loop(bp,dx-1,dy/2,dx,cs,0,LE)<=dx/8 ) Break; + x= loop(bp,dx-1,dy/2,dx,cs,0,LE); if (x<=dx/2) ad=99*ad/100; // ~t + if (loop(bp,dx-1-x/2,0,dy,cs,0,DO)>dy/8) ad=99*ad/100; // ~t + if( dx>4 ) + if( loop(bp,dx-1,dy/2,dx,cs,0,LE)<=dx/8+2 ) Break; // ~v Jun00 + + i=dy-(dy+20)/32; // 
ignore dust on the ground + + for( y=4*dy/8; y<i; y++ ){ // center down v-line + if( y<dy-2*dy/8 && num_cross(0,dx-1,y,y,bp,cs) !=1 ) break; + i1= loop(bp,0 ,y,dx,cs,0,RI); if(i1>3*dx/8) break; + i2= loop(bp,dx-1,y,dx,cs,0,LE); if(i1>i2) break; + if( (i1+(dx-i2 + -1))/2 >= 4*dx/8 ) break; // mass middle should be left + } + if (y<i) Break; + + for( x=4*dx/8; x<dx-dx/8; x++ ){ // right upper h-line + if( get_bw(x,x,0,(dy+2)/4,bp,cs,1) !=1 ) break; } + if (x<dx-dx/8) Break; + + if( loop(bp,dx-1,dy-1-dy/4,dx,cs,0,LE)>5*dx/8 // not a C + && get_bw(dx-1-dx/8,dx-1,dy-1-dy/4,dy-1,bp,cs,1) ==1 ) Break; + + if( loop(bp, 0,5*dy/8,dx,cs,0,RI)<=dx/8 + && loop(bp,dx-1,5*dy/8,dx,cs,0,LE)>=5*dy/8 + && loop(bp,dx/2, dy-1,dy,cs,0,UP)<=dy/8 ) Break; // ~c + + if( loop(bp, 0,3*dy/8,dx,cs,0,RI) + > loop(bp,dx-1,3*dy/8,dx,cs,0,LE)+dx/8 ) { + if( loop(bp, 0, dy/8,dx,cs,0,RI)<dx/8 ) Break; // ~z (broken) + ad=98*ad/100; + } + + if( loop(bp,0,dy/3,dx,cs,0,RI)>3*dx/4 ) Break; // ~i + if( loop(bp,0,dy/4,dx,cs,0,RI)>3*dx/8 // ~I + && get_bw(0,dx/8,0,dy/4,bp,cs,1) ==1 ) Break; + if( num_cross(0,dx-1,dy/2, dy/2 ,bp,cs)!=1 + && num_cross(0,dx-1,dy/2+1,dy/2+1,bp,cs)!=1 ) Break; // ~n 024a3 + + // itallic t is sometimes not high enough, look for v-like shape + for(y=3*dy/4;y<dy-1;y++) + if( num_cross(0,dx-1,y, y ,bp,cs)==2 + && num_cross(0,dx-1,y+1+dy/32,y+1+dy/32,bp,cs)==2 ) break; // ~t + if(y<dy-1) Break; + if (loop(bp,dx-1-dx/4,dy-1,dx,cs,0,UP)<dy/4) ad=98*ad/100; // ~f (serif) + if( num_cross(dx-1,dx-1,0,3*dy/4,bp,cs)>1 ) ad=95*ad/100; // ~f + if( num_cross(dx/2 ,dx/2 ,0,dy-1,bp,cs)>2 + && num_cross(dx/2+1,dx/2+1,0,dy-1,bp,cs)>2 ) Break; // ~f + + if (box1->dots) ad=98*ad/100; /* could be modified latin2-r */ + if (hchar) ad=96*ad/100; + if (gchar) ad=97*ad/100; + Setac(box1,'r',ad); + break; // not 100% sure! 
+ } + // --- test R --------------------------------------------------- + for(ad=d=100;dx>2 && dy>3;){ // min 3x4 + DBG( wchar_t c_ask='R'; ) + if (sdata->holes.num > 2) Break; /* tolerant against a tiny hole */ + if( num_cross(x0,x1,y1-dy/8,y1-dy/8, box1->p,cs) < 2 ) Break; // ~P + if (loop(bp, dx/2, dy/4,dy,cs,0,DO)>dy/2) Break; // ~C + if (loop(bp, dx/2, 0,dy,cs,0,DO)>dy/8 + && loop(bp, dx/2,dy/16,dx,cs,0,RI)<dx/2 + && dy>=16 ) Break; + for(i=1,y=y0+dy/8;y<=y1-dy/8 && i;y++){ // left v-line + if( get_bw(x0 , x0+dx/2,y, y,box1->p,cs,1) != 1 ) i=0; + } if( !i ) Break; + for(i=1,x=x0+3*dx/8;x<=x1-dx/4 && i;x++){ // upper h-line + if( get_bw( x, x, y0, y0+dy/4,box1->p,cs,1) != 1 ) i=0; + } if( !i ) Break; + for(y=0,x=x0+dx/4;x<=x1-dx/4;x++){ // lower h-gap + i=loop(box1->p,x,y1,dy,cs,0,UP); + /* on small chars bypass possible low left serifs */ + if (i>0) { i2=loop(box1->p,x-1,y1-i-1,dy,cs,0,UP); + if (i2>1) i+=i2-1; } + if (i>y) { y=i; i1=x; } + } if( y<=dy/8 ) Break; if (y<dy/4) ad=80*ad/100; + for(i=1,x=x0+dx/3;x<=x1-dx/8 && i;x++){ // vert crossed 2 ??? 
+ if( num_cross(x,x,y0,y1, box1->p,cs) == 2 ) i=0; + } if( i ) Break; + for(i=1,y=y0;y<=y0+3*dy/8 && i;y++){ // upper 2 vert lines + if( num_cross(x0,x1,y,y, box1->p,cs) == 2 ) i=0; + } if( i ) Break; + for(i=1,y=y0+dy/3;y<=y1-dy/3 && i;y++){ // midle h line + if( num_cross(x0,x1,y,y, box1->p,cs) == 1 ) i=0; + } if( i ) ad=95*ad/100; /* sometimes there is a small gap */ + for(i=1,y=y1-dy/4;y<=y1 && i;y++){ // lower 2 vert lies + if( num_cross(x0,x1,y,y, box1->p,cs) == 2 ) i=0; + } if( i ) Break; + if( get_bw(x1-dx/3,x1,y0,y0+dy/4,box1->p,cs,1) != 1 ) Break; // pixel ru + x=loop(bp,dx-1, dy/4,dx,cs,0,LE); if(x>dx/2) Break; i=x; // ru + x=loop(bp,dx-1, dy/2,dx,cs,0,LE); if(x<=i ) Break; i=x; // rc + x=loop(bp,dx-1, 5*dy/8,dx,cs,0,LE); if(x>i ) i=x; + x=loop(bp,dx-1, 6*dy/8,dx,cs,0,LE); if(x>i ) i=x; + x=loop(bp,dx-1,dy-1-dy/8,dx,cs,0,LE); if(x>=i ) Break; // rd + + i1=loop(bp,0, dy/4,dx,cs,0,RI); // straight + i2=loop(bp,0, dy/2,dx,cs,0,RI); + i3=loop(bp,0,dy-1-dy/4,dx,cs,0,RI); if( abs(i1+i3-2*i2)>1+dx/16 ) Break; + if (dy>15) + if (loop(bp,dx-1, dy/2,dx,cs,0,LE)>=loop(bp,dx-1, dy-1,dx,cs,0,LE) + && loop(bp,dx-1,3*dy/16,dx,cs,0,LE)>=loop(bp,dx-1,dy/16,dx,cs,0,LE)+dx/8 ) Break; // ~ff + if (dy>7) + if (loop(bp,dx-1,dy-2 ,dx,cs,0,LE) + >loop(bp,dx-1,dy-2-dy/8,dx,cs,0,LE)) { + ad=98*ad/100; + if (loop(bp,dx-1,dy-1-dy/4,dx,cs,0,LE)==0 + && loop(bp,dx-1,dy-2-dy/8,dx,cs,0,LE)>0 ) Break; // broken B ?? 
+ } + j=sdata->holes.num; + if (j != 1) { + i=num_hole (x0,x1,y0,y1-dy/3,box1->p,cs,NULL); + // j=num_hole (x0,x1,y0,y1 ,box1->p,cs,NULL); + if (i==0) ad=90*ad/100; /* some times there is a small gap */ + if (j>1 || j>i) Break; + } + if (sdata->holes.num < 1) ad=90*ad/100; + if (sdata->holes.num==1) + if (sdata->holes.hole[0].y1 > 3*dy/4) ad=95*ad/100; // alpha + + if (!hchar) ad=98*ad/100; + if ( gchar) ad=98*ad/100; + Setac(box1,'R',ad); + break; + } + return box1->c; +} + +static wchar_t ocr0_m(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,d,x,y,i1,i2,i3,i4,i5,hchar=sdata->hchar,gchar=sdata->gchar, + handwritten=0, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + + // --- test m ------- + for(ad=d=100;dx>4 && dy>3;){ + DBG( wchar_t c_ask='m'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + if (sdata->holes.num > 0) ad=96*ad/100; + x =loop(bp,dx-1,dy/2,dx,cs,0,LE); if(3*x>dx) Break; // ~K + y=dy/2; + i=num_cross(0,dx-1,y ,y ,bp,cs); if (i!=3) + i=num_cross(0,dx-1,y+1,y+1,bp,cs); + if (i<3 && i>5) Break; // m ru rn, handwritten m + // im or glued.mm cut to nm + if (i>3) { ad=99*ad/100; MSG(fprintf(stderr,"ad=%d",ad);) } + for (i=0,y=dy-1-dy/8;y>dy/2;y--) { + i=num_cross(0,dx-1,y,y,bp,cs); if (i>2) break; + } if (i>3) Break; + for ( ;y>dy/2;y--) { + i=num_cross(0,dx-1,y,y,bp,cs); if (i!=3) break; + } if (i>5) Break; y++; i5=y; + if (y> dy/2) handwritten=10; + if (y>3*dy/4) handwritten=60; + /* @@............... + @@......,........ + @@,...@@@....@@@. 
+ @@,,.@@@@..@@@@@, + @@@.@@@@@.@@@@@@, + @@;@@@@@@@@@;,@@, + @@@@@,.@@@@,,,@@@ <- i5 + ,@@@...;@@....@@@ + .@;...........,@@ + ...............@@ + i1 i2 i3 i4 + */ + x =loop(bp,0,y,dx ,cs,0,RI); if(x> dx/4) Break; // search 1st v-line + x+=loop(bp,x,y,dx-x,cs,1,RI); if(x> dx/2) Break; i1=x; // first gap + x+=loop(bp,x,y,dx-x,cs,0,RI); if(x>3*dx/4) Break; i2=x; // 2nd v-line + x+=loop(bp,x,y,dx-x,cs,1,RI); if(x>6*dx/8) Break; i3=x; // 2nd gap + x+=loop(bp,x,y,dx-x,cs,0,RI); if(x<5*dx/8) Break; i4=x; // 3th v-line + if (x>=dx) Break; // missing 3th v-line, ~W + MSG(fprintf(stderr,"y=%d x=%d %d %d %d",y,i1,i2,i3,i4);) + if( abs((i2-i1)-(i4-i3)) > 2+((i2-i1)+(i4-i3))/4 ) Break; // same gap width? rn + if( abs((i2-i1)-(i4-i3)) > 2+((i2-i1)+(i4-i3))/8 ) ad=98*ad/100; // same gap width? rn + // the same game for the lower part =>l1 l2 l3 l4 ??? + i =loop(bp,0,5*dy/8,dx,cs,0,RI); + i =loop(bp,i,5*dy/8,dx,cs,1,RI); + x =loop(bp,0,dy-dy/32-1,dx,cs,0,RI); + x =loop(bp,x,dy-dy/32-1,dx,cs,1,RI); + if( x > i+1 ) i=1; else i=0; /* looks like serif m, Okt00 */ + for(y=0,x=i1;x<i2;x++) { + i=loop(bp,x,dy-1,dy,cs,0,UP); if (i>y) y=i; + } + if(y<dy/4 || y<y1-y0-i5-1-dy/16) Break; // no gap detected + for(y=0,x=i3;x<i4;x++) { + i=loop(bp,x,dy-1,dy,cs,0,UP); if (i>y) y=i; + } + if(y<dy/4) Break; // no gap detected + for(x=i1;x<i4;x++) if( loop(bp,x,0,dy,cs,0,DO)>=dy/2 ) break; + if(x<i4 && handwritten<10) Break; // gap detected + // glued rn as m ??? 
hmm seems a ballance act + if(i2-i1>i4-i3+dx/16){ + for(y=0,x=(i1+i2)/2;x<i2;x++){ + i=loop(bp,x,0,dy,cs,0,DO); + i=loop(bp,x,i,dy,cs,1,DO); // measure thickness + if( i>y ) y=i; if( 2*i<y ) Break; + } + if(x <i2) Break; // unusual property for m (see n) + } + if(gchar) ad=99*ad/100; + if(hchar) ad=99*ad/100; + + if( loop(bp,dx-1,dy/16,dx,cs,0,LE)<2 + && loop(bp,dx-1,dy/4 ,dx,cs,0,LE)>3 ) Break; // melted WT + + x=loop(bp,dx-1,dy/2,dx,cs,0,LE); + if (x>2 && loop(bp,dx-1-x/2,0,dy,cs,0,DO)<dy/2) Break; // melt toc + if (loop(bp,(i3+i4)/2,0,dy,cs,0,DO)>dy/2) Break; // N + + // {\it m} + if( loop(bp,1, dy/4,dx,cs,0,RI) + >loop(bp,0,7*dy/8,dx,cs,0,RI) ) + Setac(box1,'m',98*ad/100); + + if (handwritten<10){ + x =loop(bp,0,dy/4,dx,cs,0,RI); + x+=loop(bp,x,dy/4,dx,cs,1,RI); + for( ;x<i4;x++){ // x=i1 ? + i=loop(bp,x,0,dy,cs,0,DO); + if (i>=dy/4) ad=99*ad/100; + if (i>(dy+2)/4) ad=95*ad/100; + if (3*i>dy) Break; + } + if(x<i4) Break; // gap detected + } + + if (box1->dots) ad=99*ad/100; + Setac(box1,'m',ad); + if (ad>=100) return 'm'; + break; + + } + return box1->c; +} + +static wchar_t ocr0_tT(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,i1,i2,i3,i4,j,d,x,y,yb,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + + // --- test T --------------------------------------------------- + for(ad=d=100;dx>2 && dy>3;){ // dx>1 dy>2*dx + DBG( wchar_t c_ask='T'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + // upper horizontal line + i1= loop (bp, dx/8,0,dy,cs,0,DO); // left side + i2= loop (bp,dx-1-dx/8,0,dy,cs,0,DO); // right side + i3= loop (bp, dx/8,i1,dy,cs,1,DO); // left side + i4= loop (bp,dx-1-dx/8,i2,dy,cs,1,DO); // right side + if (i1>dy/4 || i2>dy/4) Break; + for (x=dx/8;x<dx-1-dx/8;x++) { + i= loop (bp,x,0,dy,cs,0,DO); + if (i>i1+dy/8 && i>i2+dy/8) break; + if (i<i1-dy/8 && i<i2-dy/8) break; + } 
if (x<dx-1-dx/8) Break; + if( get_bw( 0,dx-1, dy/2, dy/2,bp,cs,1) != 1 ) Break; + if( get_bw( 0,(dx-1)/8, dy/2,dy-1-dy/8,bp,cs,1) == 1 ) Break; + if( get_bw( 0,3*dx/16, dy/2,dy-1-dy/4,bp,cs,1) == 1 ) Break; + if( get_bw(dx-1-dx/4,dx-1, dy/2,dy-1-dy/4,bp,cs,1) == 1 ) Break; + // center width + for( y=dy/4;y<3*dy/4;y++){ // oberer Balken? + i=dx/4+loop(bp,dx/4,y,dx,cs,0,RI); // left side of vertical line + j= loop(bp, i,y,dx,cs,1,RI); // width of vertical line + if (3*j>dx+1 || i+j>=dx || i+j/2<dx/2-1) break; // ~r?7 + } if (y<3*dy/4) Break; // Jan07 + // down width + for( y=3*dy/4;y<dy;y++){ + i= loop(bp,dx/4,y,dx,cs,0,RI); + i= loop(bp, i,y,dx,cs,1,RI);if(4*i>3*x) break; //~I + } if( y<dy ) Break; + + i =dx/4+loop(bp,dx/4,dy/4,dx,cs,0,RI);if(i>3*dx/4) Break; // ~7 + i+= loop(bp,i ,dy/4,dx,cs,1,RI);if(i>3*dx/4) Break; + + if( num_cross(0,dx-1, dy-1, dy-1,bp,cs) != 1 + && num_cross(0,dx-1, dy-2, dy-2,bp,cs) != 1 ) Break; + if( num_cross(0,dx-1,2*dy/3,2*dy/3,bp,cs) != 1 + && num_cross(0,dx-1,2*dy/3,2*dy/3,bp,cs) != 1 ) Break; + if (box1->m3 && 2*y1>box1->m3+box1->m4 + && loop(bp,0, 0,dy/2,cs,0,DO)>=dy/4 + && loop(bp,0,dy-1,dy ,cs,0,UP)<=dy/2) ad=96*ad/100; // ~J + if (gchar) ad=98*ad/100; + if( loop(bp,0,dy-1,dx,cs,0,RI)<=dx/8) ad=99*ad/100; // ~J + i = loop(bp,0,dy/2,dx,cs,0,RI); + j = loop(bp,i,dy/2,dx,cs,1,RI); + if( 2*i>=dx || 2*(dx-j-i)<i) ad=95*ad/100; // ~J + + Setac(box1,'T',ad); + if (ad>=100) return 'T'; + break; + } + // --- test t --------------------------------------------------- + // written t can look like a + or even with missing right side + // smallest t found in win-screenshot (prop-font) dx=2 + for(ad=d=100;dx>1 && dy>=box1->m3-box1->m2-1;){ // sometimes no hchar! + DBG( wchar_t c_ask='t'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + if (dy<=box1->m3-box1->m2+1) ad=96*ad/100; // bad line detection? 
+ for(x=0,yb=j=y=dy/32+3*dy/16;y<5*dy/8;y++)if(y>0){ // upper cross line + i=loop(bp,0,y,dx,cs,0,RI); + i=loop(bp,i,y,dx,cs,1,RI); if( i>x ) { x=i;yb=j=y; } // hor. line + i=num_cross(0,dx-1,y ,y ,bp,cs); + j=num_cross(0,dx-1,y+1,y+1,bp,cs); if (i>2 && j>2) break; + if( y<11*dy/16 + && num_cross(0,dx-1,y ,y ,bp,cs) != 1 + && ( num_cross(0,dx-1,y+dy/8,y+dy/8,bp,cs) != 1 || dy<13) // against noise + ) break; + } if( y<4*dy/8 ) Break; + if (dy>12 && x>4 && x>dx/2 && yb<=(dy+4)/8) + if ( loop(bp,dx-1-3*x/4,yb,dy,cs,1,UP) + <=loop(bp,dx-1-1*x/4,yb,dy,cs,1,UP)+1 ) + if ( loop(bp,0 ,dy/2,dy,cs,1,UP)>dx/8 ) Break; // ~C + + if (x<dx/2) ad=95*ad/100; // unusual small ? + if (x>=dx && 9*dx>=8*dy) { ad=99*ad/100; } // + + + i=loop(bp,dx-1,0,dx,cs,0,LE); + for(y=0;y<dy/4;y++){ + if( num_cross(0,dx-1,y ,y ,bp,cs) == 2 + && num_cross(0,dx-1,y+1,y+1,bp,cs) == 2 ) break; + j=loop(bp,dx-1,y,dx,cs,0,LE); if(j-i>1) break; i=j; + } + if( y<dy/4 ) Break; // ~f + + i=loop(bp,dx-1,yb,dx,cs,0,LE); + for(y=dy/8;y<yb;y++) + if( loop(bp,dx-1,y,dx,cs,0,LE)>i ) break; + if( y==yb ) break; + + j=loop(bp,0, dy/2,dx,cs,0,RI); + j=loop(bp,j, dy/2,dx,cs,1,RI); i=j; // thickness + j=loop(bp,0, dy/4,dx,cs,0,RI); + j=loop(bp,j, dy/4,dx,cs,1,RI); if (j<i) i=j; // thickness + j=loop(bp,0,3*dy/4,dx,cs,0,RI); + j=loop(bp,j,3*dy/4,dx,cs,1,RI); if (j<i) i=j; // thickness + if( 2*x<3*i ) Break; + + if( loop(bp,dx-1,dy/2,dx,cs,0,LE)-dx/8 + <=loop(bp,dx-1, yb ,dx,cs,0,LE) ) + if( loop(bp,dx-1, yb ,dx,cs,0,LE)-dx/8 + >=loop(bp,dx-1,yb/2,dx,cs,0,LE) ) Break; // ~1 ??? + + j=1; + for(y=1;j && y<yb; y++) // no @@ pattern + for(x=0;j && x<dx-2;x++){ // .. 
+ if( getpixel(bp,x ,y )>=cs && getpixel(bp,x+1,y )>=cs + && getpixel(bp,x ,y-1)< cs && getpixel(bp,x+1,y-1)< cs ) { j=0;break; } + } if(!j) Break; + + if( num_cross(0,dx-1,dy-2,dy-2,bp,cs) == 2 + && num_cross(0,dx-1,dy-1,dy-1,bp,cs) == 2 ) Break; // ~* (5er) + + if( dy>= 16 + && loop(bp, 0, 3*dy/4,dx,cs,0,RI) + >=loop(bp, 0, dy-2,dx,cs,0,RI) + && loop(bp,dx-1, 3*dy/4,dx,cs,0,LE) + <=loop(bp,dx-1, dy-2,dx,cs,0,LE) + && loop(bp,dx-1, 1,dx,cs,0,LE)+dx/16 + <loop(bp,dx-1,3*dy/16,dx,cs,0,LE) + && ( loop(bp, 0, 1,dx,cs,0,RI) + >loop(bp, 0,3*dy/16,dx,cs,0,RI)+dx/16 + || loop(bp,dx-1, 0,dx,cs,0,LE)==0 + || loop(bp,dx-1, 1,dx,cs,0,LE)==0) ) ad=96*ad/100; // ~f Jan02 + if(dx<8 && dy>12){ // thin f's could easily confound with t + x=loop(bp,dx-1,3*dy/16,dx,cs,0,LE); + if (x) + if (loop(bp,dx-x,0,dy,cs,0,DO)<3*dy/16 + && loop(bp, 0, 3*dy/4,dx,cs,0,RI)+1 + >=loop(bp, 0, dy-2,dx,cs,0,RI) + && loop(bp,dx-1, 3*dy/4,dx,cs,0,LE) + <=loop(bp,dx-1, dy-2,dx,cs,0,LE) ) Break; + } + if (dx>7) + if( num_cross( 0,dx-1,2*dy/3,2*dy/3,bp,cs) > 1 + && num_cross( 0,dx/2,2*dy/3,2*dy/3,bp,cs) > 0 + && num_cross(dx/2,dx-1,2*dy/3,2*dy/3,bp,cs) > 0 ) + if (sdata->holes.num > 0) + if (sdata->holes.hole[0].y0 > dy/4) Break; // ~6 + // if ( num_hole( x0, x1, y0+dy/4, y1, box1->p,cs,NULL) > 0 ) Break; // ~6 + + if( num_cross(0,dx-1,3*dy/4, 3*dy/4, bp,cs) >= 2 + && num_cross(0,dx-1,3*dy/4-1,3*dy/4-1,bp,cs) >= 2 ){ + ad=99*ad/100; /* italic t ? 
*/ + if (loop(bp,dx/2 ,dy-1,dy,cs,0,UP)>dy/4) Break; // ~h + if (loop(bp,dx/2+1,dy-1,dy,cs,0,UP)>dy/4) Break; // ~h + } + + x= loop(bp,dx-1,dy/2,dx,cs,0,LE); + i= loop(bp,dx-1,dy/8,dx,cs,0,LE); + if (i>x && loop(bp,dx-x,0,dy,cs,0,DO)>=dy/2) ad=90*ad/100; /* ~\ */ + + x= loop(bp,0, 0,dx,cs,0,RI); + i= loop(bp,0, 1,dx,cs,0,RI); if (i<x) x=i; + i= loop(bp,0,dy/4,dx,cs,0,RI); + if (i-x>1) Break; // l + + // this happens quite often, do not be to strong + if (!box1->m2) ad=99*ad/100; + if (box1->m2) { + if (!hchar) ad=99*ad/100; /* some times t is not long enough */ + if( y0>=box1->m2-(box1->m2-box1->m1)/4 ) ad=99*ad/100; /* to short */ + if( y0>=box1->m2 ) ad=99*ad/100; /* to short */ + } + + if (sdata->holes.num > 0) ad=95*ad/100; + if (gchar) ad=99*ad/100; + if (box1->dots) ad=90*ad/100; + Setac(box1,'t',ad); + break; + } + return box1->c; +} + +static wchar_t ocr0_sS(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int d,x,y,i1,i2,i3,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + wchar_t ac; + + // --- test sS near 5 --------------------------------------------------- + for(ad=d=100;dx>2 && dy>3;){ // min 3x4 (4x6 font) + DBG( wchar_t c_ask='s'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + if( num_cross( dx/2, dx/2,0,dy-1,bp,cs)!=3 + && num_cross(5*dx/8,3*dx/8,0,dy-1,bp,cs)!=3 + && dy>4 ) Break; + if( num_cross(0,dx-1,dy/2 ,dy/2 ,bp,cs)!=1 + && num_cross(0,dx-1,dy/2-1,dy/2-1,bp,cs)!=1 ) Break; + // get the upper and lower hole koords + y=dy/4; + x =loop(bp,0,y,dx,cs,0,RI); if(x>3*dx/8) Break; /* slanted too */ + x +=loop(bp,x,y,dx,cs,1,RI); if(x>5*dx/8) Break; /* fat too */ + i1 =loop(bp,x,y,dx,cs,0,RI); i1=(i1+2*x)/2; // upper center x + y=11*dy/16; + x =loop(bp,dx-1 ,y,dx,cs,0,LE); if(x>dx/4) Break; + x +=loop(bp,dx-1-x,y,dx,cs,1,LE); if(dx>5 && dy>7 && x>dx/2) Break; + if (x>3*dx/4) Break; 
if(x>dx/2) { ad=98*ad/100; MSG({})} + i2 =loop(bp,dx-1-x,y,dx,cs,0,LE); i2=dx-1-(i2+2*x)/2; // upper center x + for( y=dy/4;y<dy/2;y++ ) // Mai00 ~3 + if( get_bw(0,i1,y,y,bp,cs,1) != 1 ) break; + if( y<dy/2 ) Break; + y=dy/2-loop(bp,dx-1,dy/2,dy/2,cs,1,UP); +// if( !joined(bp,i1,dy/4,dx-1,y,cs) ){ + // break; // sometimes thick small fonts have no gap +// } + for(y=dy/4;y<dy/2;y++){ + x=loop(bp,dx-1,y,dx,cs,0,LE);if(x>dx/8) break; + } + if(y==dy/2) Break; // Mai00 + + y=dy/2+loop(bp,0,dy/2,dy/2,cs,1,DO); + if( !joined(bp,0,y,i2,11*dy/16,cs) ) Break; + + if (sdata->holes.num > 0) + if (sdata->holes.hole[0].y0 > dy/4) Break; // ??? + // if( num_hole( x0, x1, y0+dy/4, y1, box1->p,cs,NULL) > 0 ) Break; + + i1=loop(bp,dx-1,dy-1,dx,cs,0,LE); + i2=loop(bp,dx-1,dy-2,dx,cs,0,LE); + if (i2-i1 >= dx/4) Break; // ~{ 5x7font + + i1=loop(bp, 0, 0,dx,cs,0,RI); + i2=loop(bp, 0, 1,dx,cs,0,RI); + if (i2-i1 >= dx/4) Break; // ~} 5x7font + + // sS5 \sl z left upper v-bow ? + + i1=loop(bp, 0,dy/2,dx,cs,0,RI); + i1=loop(bp, i1,dy/2,dx,cs,1,RI); + if (4*i1>=3*dx) ad=97*ad/100; // ~5 7-segment + + i1=loop(bp,0, dy/16,dx,cs,0,RI); + i2=loop(bp,0,4*dy/16,dx,cs,0,RI); + i3=loop(bp,0,7*dy/16,dx,cs,0,RI); + if( 2*i2+dx/32 >= i1+i3 ){ + if( 2*i2+dx/32 > i1+i3 || dx>9 ) Break; + // very small s? + i1+=loop(bp,i1, dy/16,dx,cs,1,RI); + i2+=loop(bp,i2,4*dy/16,dx,cs,1,RI); + i3+=loop(bp,i3,7*dy/16,dx,cs,1,RI); + if( 2*i2+dx/32 >= i1+i3 ) Break; + } + + for(y=7*dy/16;y<5*dy/8;y++){ + if( num_cross( 0,dx-1,y ,y ,bp,cs)==2 ) + if( num_cross( 0,dx-1,y+1,y+1,bp,cs)==1 ) + if( num_cross( 0,dx/4,y,y,bp,cs)==1 ) break; // ~5 + } if(y<5*dy/8) Break; // v0.2.4a5 + if ( loop(bp, dx-1,dy-2-dy/32,dx,cs,0,LE) + > loop(bp, 0, 1+dy/32,dx,cs,0,RI) + dx/4 ) Break; // ~5 Dec00 + ac='s'; + if (gchar) { ad=98*ad/100; MSG({}) } + if( hchar ){ // S but 5 is very similar! 
check it + ac='S'; + if ( loop(bp, dx-1,dy-1-dy/32,dx,cs,0,LE) + > loop(bp, 0, 0+dy/32,dx,cs,0,RI) ) ad=99*ad/100; // ~5 + if ( loop(bp, 0,dy-1-dy/32,dx,cs,0,RI) + > loop(bp, dx-1, 0+dy/32,dx,cs,0,LE) ) ad=99*ad/100; // ~5 + } + Setac(box1,ac,ad); + break; + } + return box1->c; +} + +static wchar_t ocr0_gG(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,j,d,x,y,i1,i2,i3,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + + // --- test g --------------------------------------------------- + /* some g's have crotchet at upper right end, so hchar can be set */ + // ~italic g + for(ad=d=100;dx>2 && dy>4;){ // min 3x5 + DBG( wchar_t c_ask='g'; ) + if (sdata->holes.num > 3) Break; /* tolerant against a tiny hole */ + if( get_bw(x0+dx/2, x0+dx/2, y1-dy/2, y1,box1->p,cs,1) != 1 ) Break; + if( get_bw(x1-dx/4, x1 , y1-dy/4, y1,box1->p,cs,1) != 1 ) Break; // ~p + if( get_bw(x0+dx/2, x0+dx/2, y0, y0+dy/2,box1->p,cs,1) != 1 ) Break; + + if( num_cross(x0+dx/2, x0+dx/2, y0, y1, box1->p,cs) < 3 ) + if( num_cross(x1-dx/2, x1-dx/2, y0, y1, box1->p,cs) < 3 ) Break; + if (sdata->holes.num < 1) Break; + for (i=0;i<sdata->holes.num;i++){ + if (sdata->holes.hole[i].y1 < 5*dy/8+1) break; + } if (i==sdata->holes.num) Break; // no upper hole found + // if( num_hole ( x0, x1, y0, y0+5*dy/8, box1->p,cs,NULL) != 1 ) Break; + for(y=dy/4;y<dy;y++) if( num_cross(0,dx-1,y,y,bp,cs)==2 ) break; + if( y==dy ) Break; // ~q + if( get_bw(0,dx/2,7*dy/8,7*dy/8,bp,cs,1) != 1 ) Break; // ~q + y =loop(bp,dx/16,0,dy,cs,0,DO); if(y<=dy/8) + y+=loop(bp,dx/16,y,dy,cs,1,DO); if(16*y>=15*dy) Break; // ~B + + if (num_cross(x1, x1, (y0+y1)/2, y1, box1->p,cs)>1) { + ad=98*ad/100; // ~& + if (num_cross(x1 , x1 , y0, (y0+y1)/2, box1->p,cs)<1 ) ad=96*ad/100; + if (num_cross(x1-1, x1-1, y0, (y0+y1)/2, box1->p,cs)<1 ) ad=95*ad/100; + } + // looking for a gap + for 
(x=0,y=dy/4;y<dy-dy/4;y++){ + i=loop(bp,dx-1,y,dy,cs,0,LE); if (i>x) x=i; + } // in a good font x is greater dx/2 + + if (x<dx/2) { // bad font? or % + if( num_cross(x0,x1 ,y0+dy/4,y0+dy/4,box1->p,cs) > 2 + || num_cross(x0,x1 ,y0+dy/8,y0+dy/8,box1->p,cs) > 2) ad=90*ad/100; + if( num_cross(x0,x1+dx/4,y1-dy/4,y1-dy/4,box1->p,cs) > 2 + || num_cross(x0,x1+dx/4,y1-dy/8,y1-dy/8,box1->p,cs) > 2) ad=90*ad/100; + } + if( num_cross(0,dx-1,dy/2,dy/2,bp,cs) >2 ) ad=99*ad/100; // ~/o + + /* test for horizontal symmetry ~8 */ + for (y=0;y<dy;y++) for (x=0;x<dx/2;x++) + if ((getpixel(bp,x,y)<cs)!=(getpixel(bp,dx-1-x,y)<cs)) { y=dy+1; break; } + if (y==dy) Break; /* ~8 */ + + if (box1->m4==0) ad=98*ad/100; + if ( hchar) ad=96*ad/100; + if (!gchar) ad=96*ad/100; + ad=98*ad/100; + Setac(box1,'g',ad); + break; + } + // --- test rundes G --------------------------------------------- + for(ad=d=100;dx>3 && dy>4;){ // min 3x4 + DBG( wchar_t c_ask='G'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + if( get_bw(x0 ,x0+dx/2,y0+dy/3,y0+dy/3,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2,x1-dx/4,y0 ,y0+dy/4,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2,x0+dx/2,y1-dy/4,y1 ,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0 ,x0+dx/2,y1-dy/3,y1-dy/3,box1->p,cs,1) != 1 ) Break; // ~S + for( y=y0+dy/4;y<y1-dy/3;y++ ) + if( get_bw(x1-dx/2,x1,y,y,box1->p,cs,1) == 0 ) break; + if( y==y1-dy/3 ) Break; // no gap + + if( num_cross(x0+dx/2 , x0+dx/2 , y0, y, box1->p,cs) != 1 + || num_cross(x0+dx/2+1, x0+dx/2+1, y0, y, box1->p,cs) != 1 ) Break; // ~e + + x=x0; y=y1; + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,UP,ST); // left bow? 
+ if( y<y0+dy/4 ) Break; // filter W + + x=x1; y=y1-dy/3; // upper right offen bow + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,LE,ST); + if( x<x1-3*dx/8 ) Break; + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,ST,LE); + if( x<x0+dx/2 ){ // not sure, try again (not best) + x=x1; y=y1-dy/4; + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,LE,ST); + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,ST,LE); + if( x<x0+dx/2 ) Break; + } + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,RI,UP); // upper end right midle + if( x<=x1 ) Break; + if( y<y0+3*dy/8 ) Break; + if( y>y1-dy/4 ) Break; + + x=x1-dx/3;y=y1; // follow left C-bow, filter S + turmite(box1->p,&x,&y,x0,x1,y0+dy/4,y1,cs,LE,UP); // w=LE b=UP + if( y>y0+dy/4+1 ) Break; /* leave box below for S or on top for CG */ + MSG(fprintf(stderr,"xy= %d %d",x-x0,y-y0);) + /* if (y<y0) y++; else x++; */ /* enter the box again */ + turmite(box1->p,&x,&y,x0,x1,y0 ,y1,cs,RI,UP); + MSG(fprintf(stderr,"xy= %d %d",x-x0,y-y0);) + if( y>y0 ) Break; + if (sdata->holes.num > 0) Break; + // if( num_hole(x0,x1,y0,y1,box1->p,cs,NULL) > 0 ) Break; + if( dx>4 && dy>6){ // no (<[ + for(i=1,y=0;i && y<dy/3;y++) + if( num_cross(0,dx-1,y,y,bp,cs) == 2 ) i=0; + if( i ) ad=98*ad/100; + for(i=1,y=0;i && y<dy/3;y++) + if( num_cross(0,dx-1,dy-1-y,dy-1-y,bp,cs) == 2 ) i=0; + if( i ) Break; + } + for(i=1,y=dy/2;i && y<dy;y++) + if( num_cross(0,dx-1,y,y,bp,cs) == 2 ) i=0; + if( i ) Break; + for(i=0,y=3*dy/4;y<dy;y++){ + x=loop(bp,0,y,dx,cs,0,RI); // Kante abfallend <=> Z + if( x<i-dx/20 ) break; + if( x>i ) i=x; + } if( y<dy ) Break; + + // only check the middle! + for(i=0,i1=y=dy/4;y<dy-dy/4;y++){ // look for horizontal line + x=loop(bp,dx-1 ,y,dx/4,cs,0,LE); + x=loop(bp,dx-1-x,y,dx/2,cs,1,LE); if(x>i){ i=x;i1=y; } + } if( i1<=dy/4 || i1>=dy-dy/4 ) Break; // around the middle ? 
+ // check from above for gap and left vertical line (~S) + x =loop(bp,0,i1,dx ,cs,0,RI); + x+=loop(bp,x,i1,dx-x,cs,1,RI); // left vertical bow + x+=loop(bp,x,i1,dx-x,cs,0,RI); if (x>=dx) ad=90*ad/100; + MSG(fprintf(stderr,"h-bar y dx %d %d ad= %d",i1,i,ad);) + + i=1; // Mar06: adapted to 4x6 font + for(x=dx/2;x<dx-1 && i;x++) // look for @@ (instead +1 use +delta?) + for(y=dy/2;y<dy-1 && i;y++){ // .@ + if( getpixel(bp,x ,y )>=cs + && getpixel(bp,x+1,y )< cs + && getpixel(bp,x+1,y-1)< cs + && getpixel(bp,x ,y-1)< cs ) { i=0;break; } + } + if(i) ad=95*ad/100; // ~C + if(!hchar) ad=98*ad/100; + if( gchar) ad=98*ad/100; + + Setac(box1,'G',ad); + break; + } + // --- test \it g like 9 ---------------------------------------------- + for(ad=d=100;dx>2 && dy>4;){ // dx>1 dy>2*dx + DBG( wchar_t c_ask='g'; ) + if (sdata->holes.num > 2) Break; /* tolerant against a tiny hole */ + if( num_cross(x0+dx/2,x0+dx/2,y0,y1,box1->p,cs) != 3 // pre select + && num_cross(x0+dx/4,x1-dx/4,y0,y1,box1->p,cs) != 3 ) Break; + for( x=0,i=y=y0+dy/2;y<=y1-3*dy/16;y++){ // suche kerbe + j=loop(box1->p,x0,y,dx,cs,0,RI); + if( j>2 && j>dx/4 && y<y1-3 && j<dx/2 ) // long bow + j+=loop(box1->p,x0+j-2,y+1,dx,cs,0,RI)-2; + if( j>x ) { x=j; i=y; } + } + if( x<4*dx/8 ) Break; + if( num_cross(x0+dx/2,x1,i ,y1,box1->p,cs) != 1 + && num_cross(x0+dx/2,x1,i+1,y1,box1->p,cs) != 1 ) Break; + if( num_hole(x0,x1,y0,i+1,box1->p,cs,NULL)!=1 ) Break; + if( num_hole(x0,x1,i-1,y1,box1->p,cs,NULL)!=0 ) Break; + if( loop(box1->p,x0,y1 ,dy,cs,0,RI)>dx/3 && + loop(box1->p,x0,y1-1,dy,cs,0,RI)>dx/3) Break; // no q + for( x=0,i=y=y0+dy/3;y<=y1-dy/3;y++){ // suche kerbe + j=loop(box1->p,x1,y,dx,cs,0,LE); + if( j>x ) { x=j; i=y; } + } if( x>dx/2 ) Break; // no g + i1=loop(bp,dx-1,dy/8 ,dx,cs,0,LE); if(i1>dx/2) Break; + i3=loop(bp,dx-1,dy-1-dy/8,dx,cs,0,LE); + i2=loop(bp,dx-1,dy/2 ,dx,cs,0,LE); if(i1+i3<2*i2-dx/8) Break; // konvex + i1=loop(bp,dx-1,dy/4 ,dx,cs,0,LE); if(i1>dx/2) Break; + i3=loop(bp,dx-1,dy-1-dy/8,dx,cs,0,LE); 
+ for(y=dy/4;y<dy-1-dy/4;y++){ + i2=loop(bp,dx-1,y,dx,cs,0,LE); + if(i1+i3-2*i2<-1-dx/16) break; // konvex from right ~g ~3 + } if(y<dy-1-dy/4) Break; + x=loop(bp,dx -1,6*dy/8,dx,cs,0,LE); if(x>0){ x--; // robust + y=loop(bp,dx-x-1, dy-1,dy,cs,0,UP); + if(y<dy/8) Break; // ~q (serif!) + } + // % + if( num_cross(x0,x1 ,y0+dy/4,y0+dy/4,box1->p,cs) > 2) ad=90*ad/100; + if( num_cross(x0,x1+dx/4,y1-dy/4,y1-dy/4,box1->p,cs) > 2 + || num_cross(x0,x1+dx/4,y1-dy/8,y1-dy/8,box1->p,cs) > 2) ad=90*ad/100; + + if (box1->m4==0) ad=98*ad/100; + if ( hchar) ad=96*ad/100; + if (!gchar) ad=96*ad/100; + if (ad>99) ad=99; // never be sure to have a 9 + Setac(box1,'g',ad); + break; + } + return box1->c; +} + +// rewritten for vector usage v0.41 +static wchar_t ocr0_xX(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + // pix *bp=sdata->bp; // obsolete + int i,j,d,x,y,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0, x1=box1->x1, y0=box1->y0, y1=box1->y1; // ,cs=sdata->cs; + int dx=x1-x0+1, dy=y1-y0+1, /* size */ + (*aa)[4]=sdata->aa, /* the for line ends, (x,y,dist^2,vector_idx) */ + ad; /* tmp-vars */ + wchar_t bc=UNKNOWN; + + // --- test xX --------------------------------------------------- + // rewritten for vectors 0.41 + for(ad=d=100;dx>2 && dy>3;){ // min 3x4 + int ld, i1, i2, i3, i4; // lien derivation, 4 inner edges + DBG( wchar_t c_ask='x'; ) + if (sdata->holes.num > 0) Break; /* # */ + /* half distance to the center */ + d=2*sq(128/4); + /* now we check for the 4 ends of the x */ + if (aa[0][2]>d) Break; + if (aa[1][2]>d) Break; + if (aa[2][2]>d) Break; + if (aa[3][2]>d) Break; + if (aa[3][0]-aa[0][0]<dx/2) Break; + if (aa[2][0]-aa[1][0]<dx/2) Break; + if (aa[1][1]-aa[0][1]<dy/2) Break; + if (aa[2][1]-aa[3][1]<dy/2) Break; + /* searching for 4 notches between neighbouring ends */ + + /* only left side */ + for (j=i=aa[0][3];i!=aa[1][3];i=(i+1)%box1->num_frame_vectors[0]) { + if (box1->frame_vector[i][0] + >=box1->frame_vector[j][0]) j=i; /* notice most right vector 
*/ + } if (j==i) Break; + /* calculate the distance to the center */ + x=box1->frame_vector[j][0]; + y=box1->frame_vector[j][1]; i1=j; + if (abs(aa[0][0]+aa[1][0]+aa[2][0]+aa[3][0]-4*x)>(dx+2)) Break; + if (abs(aa[0][1]+aa[1][1]+aa[2][1]+aa[3][1]-4*y)>(dy+2)) Break; + if ( aa[0][0]+aa[1][0]-2*x>=0) Break; + if ( aa[1][0] >= x ) Break; + if ( aa[0][0] > x ) Break; + if ( aa[0][0] >= x ) ad=99*ad/100; + if (x-x0<dx/8) Break; + if (x-x0<dx/4) ad=99*ad/100; + /* check if upper left and center point are joined directly */ + ld=line_deviation(box1, aa[0][3], j); + MSG(fprintf(stderr," 0-X %d %d dist= %d/%d",x-x0,y-y0,ld,2*sq(1024/4));) + if (ld >2*sq(1024/4)) Break; + /* check if lower left and center point are joined directly */ + ld=line_deviation(box1, j, aa[1][3]); + MSG(fprintf(stderr," X-1 %d %d dist= %d/%d",x-x0,y-y0,ld,2*sq(1024/4));) + if (ld >2*sq(1024/4)) Break; + + /* only lower side */ + for (j=i=aa[1][3];i!=aa[2][3];i=(i+1)%box1->num_frame_vectors[0]) { + if (box1->frame_vector[i][1] + <=box1->frame_vector[j][1]) j=i; /* notice most upper vector */ + } if (j==i) Break; + /* calculate the distance to the center */ + x=box1->frame_vector[j][0]; + y=box1->frame_vector[j][1]; i2=j; + if (abs(aa[0][0]+aa[1][0]+aa[2][0]+aa[3][0]-4*x)>(dx+2)) Break; + if (abs(aa[0][1]+aa[1][1]+aa[2][1]+aa[3][1]-4*y)>(dy+2)) Break; + if ( aa[1][1]+aa[2][1]-2*y<=0) Break; + /* check if lower left and center point are joined directly */ + ld=line_deviation(box1, aa[1][3], j); + MSG(fprintf(stderr," 1-X %d %d dist= %d/%d",x-x0,y-y0,ld,2*sq(1024/4));) + if (ld >2*sq(1024/4)) Break; + /* check if lower right and center point are joined directly */ + ld=line_deviation(box1, j, aa[2][3]); + MSG(fprintf(stderr," X-2 %d %d dist= %d/%d",x-x0,y-y0,ld,2*sq(1024/4));) + if (ld >2*sq(1024/4)) Break; + + /* only right side */ + for (j=i=aa[2][3];i!=aa[3][3];i=(i+1)%box1->num_frame_vectors[0]) { + if (box1->frame_vector[i][0] + <=box1->frame_vector[j][0]) j=i; /* notice most left vector */ + } if 
(j==i) Break; + /* calculate the distance to the center */ + x=box1->frame_vector[j][0]; + y=box1->frame_vector[j][1]; i3=j; + if (abs(aa[0][0]+aa[1][0]+aa[2][0]+aa[3][0]-4*x)>(dx+2)) Break; + if (abs(aa[0][1]+aa[1][1]+aa[2][1]+aa[3][1]-4*y)>(dy+2)) Break; + if ( aa[2][0]+aa[3][0]-2*x<=0) Break; + if ( aa[3][0] <= x ) Break; + if ( aa[2][0] < x ) Break; + if ( aa[2][0] <= x ) ad=99*ad/100; + if (dx-(x-x0)<dx/8) Break; + if (dx-(x-x0)<dx/4) ad=99*ad/100; + /* check if lower right and center point are joined directly */ + ld=line_deviation(box1, aa[2][3], j); + MSG(fprintf(stderr," 2-X %d %d dist= %d/%d",x-x0,y-y0,ld,2*sq(1024/4));) + if (ld >2*sq(1024/4)) Break; + /* check if upper right and center point are joined directly */ + ld=line_deviation(box1, j, aa[3][3]); + MSG(fprintf(stderr," X-3 %d %d dist= %d/%d",x-x0,y-y0,ld,2*sq(1024/4));) + if (ld >2*sq(1024/4)) Break; + + /* only upper side */ + for (j=i=aa[3][3];i!=aa[0][3];i=(i+1)%box1->num_frame_vectors[0]) { + if (box1->frame_vector[i][1] + >=box1->frame_vector[j][1]) j=i; /* notice lowest vector */ + } if (j==i) Break; + /* calculate the distance to the center */ + x=box1->frame_vector[j][0]; + y=box1->frame_vector[j][1]; i4=j; + if (abs(aa[0][0]+aa[1][0]+aa[2][0]+aa[3][0]-4*x)>(dx+2)) Break; + if (abs(aa[0][1]+aa[1][1]+aa[2][1]+aa[3][1]-4*y)>(dy+2)) Break; + if ( aa[3][1]+aa[0][1]-2*y>=0) Break; + /* check if upper left and center point are joined directly */ + ld=line_deviation(box1, aa[3][3], j); + MSG(fprintf(stderr," 3-X %d %d dist= %d/%d",x-x0,y-y0,ld,2*sq(1024/4));) + if (ld >2*sq(1024/4)) Break; + /* check if lower left and center point are joined directly */ + ld=line_deviation(box1, j, aa[0][3]); + MSG(fprintf(stderr," X-0 %d %d dist= %d/%d",x-x0,y-y0,ld,2*sq(1024/4));) + if (ld >2*sq(1024/4)) Break; + + // center crossing of diagonal lines is small? 
+ if (box1->frame_vector[i3][0] - box1->frame_vector[i1][0] > dx/2) Break; + + if (gchar) ad=99*ad/100; + bc='x'; if(hchar) bc='X'; + Setac(box1,bc,ad); + break; + } + // --- test \it x --------------------------------------------------- +#if 0 + for(ad=d=99;dx>4 && dy>4;){ // min 3x4 + DBG( wchar_t c_ask='x'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + if( get_bw(x0,x0+dx/4,y0+dy/2,y0+dy/2,box1->p,cs,1) != 0 ) Break; + if( get_bw(x1-dx/4,x1,y0+dy/2,y0+dy/2,box1->p,cs,1) != 0 ) Break; + if( num_cross(x0+dx/4,x1-dx/4,y0+dy/2,y0+dy/2, box1->p,cs) != 1 ) Break; + if( num_cross(x0,x1,y0+dy/4,y0+dy/4, box1->p,cs) != 3 + && num_cross(x0,x1,y0+dy/8,y0+dy/8, box1->p,cs) < 3 ) Break; + if( num_cross(x0,x1,y1-dy/4,y1-dy/4, box1->p,cs) != 3 + && num_cross(x0,x1,y1-dy/8,y1-dy/8, box1->p,cs) < 3 ) Break; + if( gchar ) ad=97*ad/100; + if( hchar ) ad=96*ad/100; + bc='x'; + Setac(box1,bc,ad); + break; + } +#endif + return box1->c; +} +
/*
 * ocr0_yY - try to classify the box in sdata as 'y' or 'Y'.
 *
 * Two independent tests are run: an italic form and an upright form.
 * Each test is a for-statement without increment that ends in `break`,
 * i.e. a block that runs at most once and can be left early.
 * `Break` (macro defined elsewhere; presumably logs and breaks - TODO
 * confirm) abandons the current test.  `ad` starts at 100 and is scaled
 * down by every soft penalty; a test that survives hands the candidate
 * char and `ad` to Setac().
 *
 * x0,x1,y0,y1 are the box edges and dx,dy its size; calls taking `bp`
 * use box-relative coordinates (0..dx-1, 0..dy-1), calls taking box1->p
 * use x0/y0-based coordinates.
 * NOTE(review): loop(), num_cross(), get_bw() and get_line2() are defined
 * elsewhere; from their use here they look like run-length, crossing
 * count, colour test and line-match-percentage helpers - confirm before
 * relying on the comments below.
 *
 * Returns box1->c as it stands after the tests.
 */
static wchar_t ocr0_yY(ocr0_shared_t *sdata){
  struct box *box1=sdata->box1;
  pix *bp=sdata->bp;
  int i,j,d,x,y,hchar=sdata->hchar,gchar=sdata->gchar,
      x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs;
  int dx=x1-x0+1,dy=y1-y0+1, /* size */
      ad,xa,ya,xb,yb,xc,yc,xd,yd; /* tmp-vars */
  wchar_t bc=UNKNOWN;

  // --- test italic yY --------------------------------------------
  for(ad=d=100;dx>2 && dy>3;){ // min 3x4
    DBG( wchar_t c_ask='y'; )
    if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */
    if (sdata->holes.num > 0) ad=97*ad/100;
    /* crossing counts: >=2 near the top, 1 at the bottom, 1 at the right */
    if( num_cross(0,dx-1,dy/8,dy/8,bp,cs) < 2
     && num_cross(0,dx-1,   1,   1,bp,cs) < 2 ) Break;
    if( num_cross(0,dx-1,dy-1,dy-1,bp,cs) != 1
     && num_cross(0,dx-1,dy-2,dy-2,bp,cs) != 1 ) Break;
    if( num_cross(dx-1,dx-1,0,dy-1,bp,cs) != 1
     && num_cross(dx-2,dx-2,0,dy-1,bp,cs) != 1 ) Break;
    if( num_cross(dx/3,dx/3,dy/4,dy-1,bp,cs) != 2
     && num_cross(dx/2,dx/2,dy/4,dy-1,bp,cs) != 2 ) Break;
    for(yc=y=0,xc=x=dx/4;x<dx-dx/4;x++){ // search deepest point
      i=loop(bp,x,0,dy,cs,0,DO); if(i>y){ yc=y=i;xc=x; }
    } if( y>12*dy/16 || y<3*dy/8 ) Break;
    /* a,b: points left/right of the notch column xc at height dy/8 */
    ya=dy/8; xa=xc-loop(bp,xc,ya,dx,cs,0,LE); if(xa< 0) Break;
    yb=dy/8; xb=xc+loop(bp,xc,yb,dx,cs,0,RI); if(xb>=dx) Break;
    for(y=dy/8;y<yc-dy/8;y++){
      if( num_cross(xc,dx-1,y,y,bp,cs) != 1 ) break;
      if( num_cross(0 ,xc  ,y,y,bp,cs) <  1 ) break;
    } if(y<yc-dy/8) Break;
    yd=dy-1-dy/8;xd=dx-1-loop(bp,dx-1,yd,dx,cs,0,LE);
    /* fix: the last row used "\e", which is not an ISO C escape (GCC
       emits ESC); a literal backslash is written as "\\" */
    g_debug(fprintf(stderr," debug_yY: \n"
                           " /a b \n"
                           " | | \n"
                           " -c/ \n"
                           " \\e-d \n");)
    g_debug(fprintf(stderr,"a-e: %d %d %d %d %d %d %d %d",
                    xa,ya,xb,yb,xc,yc,xd,yd);)
    if(xd>6*dx/8) ad=99*ad/100; // why this???
    if (loop(bp,dx-1,dy-1,dx,cs,0,LE)<1) Break;
    // printf(" abcd=%d %d %d %d %d %d %d %d -",xa,ya,xb,yb,xc,yc,xd,yd);
    if( get_line2(xb,yb,xd,yd,bp,cs,100)<95 ) Break;
    // if( get_line2(xc,yc,xd,yd,bp,cs,100)<95 ) Break;
    // printf("ok");
    bc='y';
    if(gchar && !hchar) bc='y'; else
    if(hchar && (!gchar || dy<14)) bc='Y'; else ad=98*ad/100; // SMALL-CAPS ???
    Setac(box1,bc,ad);
    break;
  }
  // --- test yY ---------------------------------------------------
  for(ad=d=100;dx>2 && dy>3;){ // min 3x4
    DBG( wchar_t c_ask='y'; )
    if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */
    /* how far the upper-left probe reaches depends on the lower-left column */
    if( get_bw(x0,x0,y1-dy/8,y1,box1->p,cs,1) == 1 ) {
      if( get_bw(x0,x0+4*dx/8,y0+dy/8,y0+dy/8,box1->p,cs,1) != 1 ) Break;
    } else {
      if( get_bw(x0,x0+3*dx/8,y0+dy/8,y0+dy/8,box1->p,cs,1) != 1 ) Break;
    }
    if( num_cross(0,dx-1,dy/8,dy/8,bp,cs) != 2
     && num_cross(0,dx-1,   1,   1,bp,cs) != 2 ) Break;
    if( num_cross(dx/2,dx/2,0,   1,bp,cs) != 0 ) Break;
    if( num_cross(0,dx-1,dy-1,dy-1,bp,cs) != 1
     && num_cross(0,dx-1,dy-2,dy-2,bp,cs) != 1 ) Break;
    if( num_cross(dx-1,dx-1,0,dy-1,bp,cs) != 1
     && num_cross(dx-2,dx-2,0,dy-1,bp,cs) != 1
     && num_cross(dx-dx/8-1,dx-dx/8-1,0,dy-1,bp,cs) != 1 ) Break;
    if( loop(bp,dx-1,dy-1-dy/8,dx,cs,0,LE)+dx/8+1 // Jul00
      < loop(bp,   0,dy-1-dy/8,dx,cs,0,RI) ) Break;
    for(y=0,x=dx/4;x<dx-dx/4;x++){ // search lowest point
      i=loop(bp,x,0,dy,cs,0,DO); if(i>y) y=i;
    } if( y>10*dy/16 || y<2*dy/8 ) Break;
    /* a,b: rows in the top quarter with the smallest left / right margin */
    for(xc=xb=xa=dx,yc=yb=ya=y=0;y<dy/4;y++){
      x =loop(bp,   0 ,  y,dx,cs,0,RI); if(x<xa){ xa=x;ya=y; }
      x =loop(bp,dx-1 ,  y,dx,cs,0,LE); if(x<xb){ xb=x;yb=y; }
    }
    if(yb>dy/8) Break;
    /* c: row where the gap left of the right arm is narrowest (ties: lowest) */
    for(i=dx,yc=y=dy/4;y<3*dy/4;y++){
      if( num_cross(0,dx-1,y,y,bp,cs) < 2 ) break;
      x =loop(bp,dx-1  ,y,dx,cs,0,LE);
      x+=loop(bp,dx-1-x,y,dx,cs,1,LE);
      j =loop(bp,dx-1-x,y,dx,cs,0,LE); if(j<=i){ i=j;yc=y;xc=dx-1-x-j/2; }
    } yc+=dy/16+1;
    yc+=loop(bp,xc,yc,i,cs,1,DO)/2;
    xa+=     loop(bp,xa  ,ya,dx,cs,1,RI)/2;
    xb=dx-1-loop(bp,dx-1,yb,dx,cs,1,LE)/2;
    yd=dy-1-dy/8;xd=dx-1-loop(bp,dx-1,yd,dx,cs,0,LE); if(xd>6*dx/8) Break;
    /* check for serif at lower end */
    for (i=0,x=dx-1;i<dy/4;i++) {
      j=loop(bp,dx-1,dy-1-i,dx,cs,0,LE);
      if (j>x+dx/16+1) break; /* detect serif */
      if (j<x) x=j;
    } if (i<dy/4) xd-=loop(bp,xd,yd,dx,cs,1,LE)/2;
    MSG( fprintf(stderr," debug_yY: \n"
                        " a b \n"
                        " \\ / \n"
                        " c \n"
                        " ed ");)
    MSG(fprintf(stderr,"a-e: %d %d %d %d %d %d %d %d",
                xa,ya,xb,yb,xc,yc,xd,yd);)
    // check upper left line
    if( get_line2(xa,ya,xc  ,yc,bp,cs,100)<95
     && get_line2(xa,ya,xc-1,yc,bp,cs,100)<95 ) Break;
    // check upper right line
    if( get_line2(xb,yb,xc  ,yc,bp,cs,100)<95
     && get_line2(xb,yb,xc-1,yc,bp,cs,100)<95 ) {
      // Times-Italic y ???
      xb+=loop(bp,xb,yb,dx/4,cs,1,RI)-1;
      yb+=loop(bp,xb,yb,dy/8,cs,1,DO)-1;
      if( get_line2(xb,yb,xc  ,yc,bp,cs,100)<95 ) Break;
    }
    if( get_line2(xc,yc,xd,yd,bp,cs,100)<95 ) Break;

    // decision between V and Y is sometimes very difficult
    // hope that the following code is the ultimate solution
    if( yc>=5*dy/8 && !gchar)
      if( get_line2(xa,ya,xd  ,yd,bp,cs,100)>95 )
        if( get_line2(xb,yb,xd  ,yd,bp,cs,100)>95 )
          { if (dx>4) { Break; } else ad=ad*98/100; } // ~V
    /* xa,xb,xc are reused here: left margin near the top, middle, bottom */
    xa=loop(bp,0,dy/8,dx,cs,0,RI);
    xb=loop(bp,0,dy/2,dx,cs,0,RI);
    xc=loop(bp,0,dy-1,dx,cs,0,RI);
    /* the three penalties are cumulative */
    if( 2*xb< xa+xc   ) ad=98*ad/100; // ~V
    if( 2*xb<=xa+xc   ) ad=98*ad/100;
    if( 2*xb<=xa+xc+1 ) ad=98*ad/100;

    bc='y';
    if ((!gchar) && (!hchar)) ad=98*ad/100;
    /* box top clearly above m2 (by a quarter of m2-m1) => capital */
    if(y0<box1->m2-(box1->m2-box1->m1)/4)
      { bc='Y'; if(gchar) ad=98*ad/100; }
    // SMALL-CAPS ???
    Setac(box1,bc,ad);
    break;
  }
  return box1->c;
}

+static wchar_t ocr0_zZ(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + int i1,i2,i3,i4,i5,dbg[9], + d,x,y,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + (*aa)[4]=sdata->aa, /* the for line ends, (x,y,dist^2,vector_idx) */ + ad; /* tmp-vars */ + wchar_t bc=UNKNOWN; + + // --- test zZ ------- + for(ad=d=100;dx>3 && dy>3;){ // dy>dx + DBG( wchar_t c_ask='z'; ) /* for debugging purpose */ + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + if (sdata->holes.num > 0) ad=98*ad/100; /* # */ + /* half distance to the center */ + d=2*sq(128/4); + /* now we check for the 4 edges of the z */ + if (aa[0][2]>d) Break; + if (aa[1][2]>d) Break; + if (aa[2][2]>d) Break; + if (aa[3][2]>d) Break; + if (aa[3][0]-aa[0][0]<dx/2) Break; + if (aa[2][0]-aa[1][0]<dx/2) Break; + if (aa[1][1]-aa[0][1]<dy/2) Break; + if (aa[2][1]-aa[3][1]<dy/2) Break; + if (aa[3][0]-aa[0][0]<4-1) Break; /* to small to hold a z */ + if (aa[2][0]-aa[1][0]<4-1) Break; /* to small */ + if (aa[3][1]-y0>dy/8) ad=99*ad/100; + if (aa[0][1]-y0>dy/8) ad=99*ad/100; + if (2*dx<dy) ad=99*ad/100; + MSG( \ + fprintf(stderr,"xy= %d %d aa %d %d %d %d %d %d %d %d", \ + x0,y0,aa[0][0]-x0,aa[0][1]-y0,aa[1][0]-x0,aa[1][1]-y0,\ + aa[2][0]-x0,aa[2][1]-y0,aa[3][0]-x0,aa[3][1]-y0);) + /* upper and lower horizontal line */ + d=line_deviation(box1, aa[3][3], aa[0][3]); if (d>2*sq(1024/4)) Break; + ad=(100-(d-sq(1024)/2)/sq(1024)/4)*ad/100; + d=line_deviation(box1, aa[1][3], aa[2][3]); if (d>2*sq(1024/4)) Break; + + /* search uppermost right > */ + i1=nearest_frame_vector(box1,aa[0][3],aa[1][3], x1, y0); + x=box1->frame_vector[i1][0]; + y=box1->frame_vector[i1][1]; + if (y-y0 > 5*dy/8) Break; + if (x-x0 < 3*dx/8) Break; + if (x-aa[0][0]<=dx/4) Break; // ~lI + if (x-aa[0][0]<=dx/3) ad=98*ad/100; // ~lI + if (x-aa[0][0]<=dx/2) ad=99*ad/100; // ~lI + /* search most right
> ~2 */ + i3=nearest_frame_vector(box1,aa[0][3],aa[1][3], x1+2*dx, (y0+y1)/2); + MSG(fprintf(stderr,"xy= %d %d %d %d %d %d",x0,y0,x-x0,y-y0,box1->frame_vector[i3][0]-x0,box1->frame_vector[i3][1]-y0);) + if ( box1->frame_vector[i3][1]-y0> dy/4 + && box1->frame_vector[i3][0]-x>=0) Break; + if ( box1->frame_vector[i3][1]-y> dy/8 + && box1->frame_vector[i3][0]-x>=-dx/8) ad=98*ad/100; + if ( box1->frame_vector[i3][1]-y> dy/8 + && box1->frame_vector[i3][0]-x>= 0) ad=97*ad/100; + if (box1->frame_vector[i3][0]-aa[0][0] + < aa[3][0]-box1->frame_vector[i3][0]) break; // ~lI + if (box1->frame_vector[i3][0]-aa[0][0] + <(aa[3][0]-box1->frame_vector[i3][0])*2) ad=98*ad/100; // ~lI + /* better test for a bow or peaked angle */ + /* upper part of a 2, on a Z a and b should be at c + .....$@@@@@@a...c. o1 (o1-a)=(dx+5)^2 =dx^2+10*dx+25 + ...$$@@@@@@@@@.... (o1-b)=(dx+1)^2+4^2=dx^2+ 2*dx+18 + ..$@@$@@@$@@@@@... + ..@@@.....$$@@@@.. + ..@@.......@$@@@b. + ..$.........$@@@@. + .$$..........$@@@. + .$...........@@@@. + .............@@@@.< + .............$@@$. + ............$@@@.. + ............@@$... + ............$@$... 
+ --- snip ---- + */ + i4=nearest_frame_vector(box1,aa[2][3],aa[0][3], x1+dx, y0); + i5=nearest_frame_vector(box1,aa[2][3],aa[0][3], x1, y0-dx); + d=sq(box1->frame_vector[i5][0]-box1->frame_vector[i4][0]) + +sq(box1->frame_vector[i5][1]-box1->frame_vector[i4][1]); + if (d>2*sq(dx/8+1)) break; + + /* check if upper left and upper right point are joined directly */ + dbg[0]=d=line_deviation(box1, aa[0][3], i1); if (d >2*sq(1024/4)) Break; + /* check if lower right and upper left point are joined directly */ + dbg[1]=d=line_deviation(box1, i1, aa[1][3]); if (d >2*sq(1024/4)) Break; + + /* search lowest left < */ + i2=nearest_frame_vector(box1,aa[2][3],aa[3][3], x0, y1); + x=box1->frame_vector[i2][0]; + y=box1->frame_vector[i2][1]; + if (y-y0 < 3*dy/8) Break; + if (x-x0 > 5*dx/8) Break; + if (aa[2][0]-x<=dx/4) Break; // ~lI + if (aa[2][0]-x<=dx/3) ad=98*ad/100; // ~lI + if (aa[2][0]-x<=dx/2) ad=99*ad/100; // ~lI + /* check if upper right and lower left point are joined directly */ + dbg[2]=d=line_deviation(box1,i2, aa[3][3]); if (d >2*sq(1024/4)) Break; + /* check if lower left and lower right point are joined directly */ + dbg[3]=d=line_deviation(box1, aa[2][3],i2); if (d >2*sq(1024/4)) Break; + + if (box1->frame_vector[i1][0] + -box1->frame_vector[i2][0]<=dx/8) Break; /* nonsignificant distance */ + MSG( \ + fprintf(stderr,"^v %d %d %d %d line deviation %d %d %d %d max %d %d",\ + box1->frame_vector[i1][0]-x0,box1->frame_vector[i1][1]-y0,\ + box1->frame_vector[i2][0]-x0,box1->frame_vector[i2][1]-y0,\ + dbg[0],dbg[1],dbg[2],dbg[3],2*sq(1024/4),2*sq(1024));) + ad=(100-(dbg[0]-sq(1024)/2)/sq(1024)/4)*ad/100; + ad=(100-(dbg[1]-sq(1024)/2)/sq(1024)/4)*ad/100; + ad=(100-(dbg[2]-sq(1024)/2)/sq(1024)/4)*ad/100; + ad=(100-(dbg[3]-sq(1024)/2)/sq(1024)/4)*ad/100; + + if ( gchar) ad=98*ad/100; + bc='z'; + if( hchar ) bc='Z'; + Setac(box1,bc,ad); + break; + } + return box1->c; +} +
/*
 * ocr0_wW - try to classify the box in sdata as 'w' or 'W'.
 *
 * Two tests are run: the regular five-point shape xa-xb-xc-xd-xe and an
 * italic/handwritten variant.  Each test is a for-statement without
 * increment that ends by leaving the loop, i.e. a block that runs at most
 * once.  `Break` (macro defined elsewhere; presumably logs and breaks -
 * TODO confirm) abandons the current test.  `ad` starts at 100 and is
 * scaled down by soft penalties; a surviving test calls Setac() with
 * 'W' when hchar is set, otherwise 'w'.
 *
 * All coordinates passed with `bp` are box-relative (0..dx-1, 0..dy-1).
 * NOTE(review): loop(), num_cross() and get_line2() are defined elsewhere;
 * from their use here they look like run-length, crossing-count and
 * line-match-percentage helpers - confirm.
 * NOTE(review): `handwritten` is assigned in the second test but never
 * read in this function.
 *
 * Returns box1->c as it stands after the tests.
 */
static wchar_t ocr0_wW(ocr0_shared_t *sdata){
  struct box *box1=sdata->box1;
  pix *bp=sdata->bp;
  int i,j,d,x,y,hchar=sdata->hchar,gchar=sdata->gchar,handwritten=0,
      x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs;
  int dx=x1-x0+1,dy=y1-y0+1, /* size */
      ad,ya,yb,xa,xb,xc,xd,xe,t1; /* tmp-vars */
  wchar_t ac;

  // ------- test w ~{\it w} ---------------
  for(ad=d=100;dx>3 && dy>3;){ // dy<=dx
    DBG( wchar_t c_ask='w'; )
    if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */
    //  xa            xe
    //   \    xc    /    <=ya   connected xa-xb-xc-xd-xe
    //    xb      xd     <=yb
    // get two lowest points i3,i4,ya
    // out_x(box1);
    // ~ul ~uf
    // out_x(box1);
    /* every row from dy/8 up to dy/2 must be crossed at least twice */
    for(y=dy/8;y< dy/2;y++) if( num_cross(0,dx-1,y,y,bp,cs)< 2 ) break;
    if(y<dy/2) Break;
    yb=dy-1;
    if (dx>4) { /* 4x6 is too small */
      /* scan upwards from near the bottom for a row with exactly 2 crossings */
      for(y=dy-1-dy/16;y>3*dy/4;y--)
        if( num_cross(0,dx-1,y,y,bp,cs)==2 ) break;
      if(y==3*dy/4) Break;
    }
    /* NOTE(review): this overwrites the yb=dy-1 above unconditionally; for
       dx<=4 y still holds the value left by the first loop - confirm intent */
    yb=y;
    t1=loop(bp,0 ,dy/4,dx,cs,0,RI);
    t1=loop(bp,t1,dy/4,dx,cs,1,RI); // thickness of line?
    /* i: rows with exactly 4 crossings, j: rows with 3 or more (but not 4) */
    for(i=j=0 ;y> dy/4;y--) if( num_cross(0,dx-1,y,y,bp,cs)==4 ) i++;
    else if( num_cross(0,dx-1,y,y,bp,cs)>=3 ) j++;
    if(i+5<dy/4 && 7*t1<dy) Break; // only for large letters
    if(i+j==0 && (dy>6 || dx>4)) Break;
    if(i+j==0 && dx<=4){
      if (abs(loop(bp,   1,dy-1,dy,cs,0,UP)
             -loop(bp,dx-2,dy-1,dy,cs,0,UP))>dy/8+1) Break; // 4x6 N
      if ( ( loop(bp,   1,   0,dy,cs,0,DO)>=dy-2
          && loop(bp,   0,dy-1,dy,cs,0,UP)>0)
        || ( loop(bp,dx-2,   0,dy,cs,0,DO)>=dy-2
          && loop(bp,dx-1,dy-1,dy,cs,0,UP)>0)) Break; // 4x6 UV
      ad=ad*99/100; // 4x6 font
      MSG(fprintf(stderr,"ad=%d",ad);)
    }
    if( num_cross(0,dx-1,    1,    1,bp,cs)< 2
     && num_cross(0,dx-1,dy/16,dy/16,bp,cs)< 2 ) Break;
    /* xb, xd: centres of the leftmost / rightmost stroke on row yb */
    x =loop(bp,0     ,yb,dx,cs,0,RI);
    xb=loop(bp,x     ,yb,dx,cs,1,RI);xb=x+xb/2; if(xb>dx/2) Break;
    x =loop(bp,dx-1  ,yb,dx,cs,0,LE);
    xd=loop(bp,dx-1-x,yb,dx,cs,1,LE);xd=dx-1-x-xd/2;if(xd<3*dx/8) Break;
    /* xc: column between xb and xd with the longest upward run from the bottom */
    for(y=0,xc=x=xb+1;x<xd;x++)
      if((i=loop(bp,x,dy-1,dy,cs,0,UP))>y){xc=x;y=i;}
    if(dx>4 && !y) Break;
    ya=dy-1-y; // flat
    y=loop(bp,xc,ya,dy,cs,1,UP);if(y)y--;
    if (dy>6 || dx>4) { // ~4x6 font
      if( num_cross(0 ,xc  ,ya-y  ,ya-y  ,bp,cs)!= 2
       && num_cross(0 ,xc  ,ya-y/2,ya-y/2,bp,cs)!= 2 ) Break;
      if( num_cross(xc,dx-1,ya-y  ,ya-y  ,bp,cs)!= 2
       && num_cross(xc,dx-1,ya-y/2,ya-y/2,bp,cs)!= 2 ) Break;
    }
    ya-=y/2;
    /* xa, xe: centres of the leftmost / rightmost stroke on row 1 */
    x =loop(bp,0     ,1 ,dx,cs,0,RI);
    xa=loop(bp,x     ,1 ,dx,cs,1,RI);
    if( x+xa>xb ){ // may be, here is a small but thick letter
      // later add some proofs
      xa=x+xa/4;
    } else {
      xa=x+xa/2;
    }
    x =loop(bp,dx-1  ,1 ,dx,cs,0,LE);
    xe=loop(bp,dx-1-x,1 ,dx,cs,1,LE);xe=dx-1-x-xe/2;
    MSG( fprintf(stderr,"a-e: %d %d %d %d %d %d %d %d %d %d",
         xa,1,xb,yb,xc,ya,xd,yb,xe,1);)
    if (ya<dy/2 && xc<dx/2) ad=95*ad/100; /* ~N */
    /* try every start column within the top-left stroke for the line a-b */
    i= loop(bp,xa    ,1 ,dx,cs,1,RI);
    for (x=xa;x<xa+i;x++)
      if( get_line2(x,1,xb,yb,bp,cs,100)>94 ) break;
    if (x==xa+i) Break; // no vert. line found
    if( get_line2(xb,yb-1,xc,ya      ,bp,cs,100)<95
     && get_line2(xb,yb-1,xc,ya+dy/32,bp,cs,100)<95
     && get_line2(xb,yb-1,xc,ya+dy/16,bp,cs,100)<95 ) Break;
    if( get_line2(xc,  ya,xd,  yb,bp,cs,100)<95
     && get_line2(xc+1,ya,xd,  yb,bp,cs,100)<95 ) Break;
    if( get_line2(xd,yb,xe      ,1+dy/16,bp,cs,100)<95
     && get_line2(xd,yb,dx-1    ,1+dy/8 ,bp,cs,100)<95 // round w
     && get_line2(xd,yb,xe+dx/20,1+dy/16,bp,cs,100)<95 ) Break;
    // if( num_hole(0,dx-1,0,dy-1,bp,cs,NULL) != 0 ) Break;
    // ~ur
    MSG(fprintf(stderr,"ad=%d",ad);)
    /* lower right edge: the right margin may shrink by at most 2 below its
       running maximum; every row below that maximum costs 2% */
    for(i=0,y=5*dy/8;y<dy;y++){
      x=loop(bp,dx-1,y,dx,cs,0,LE); if( x>i ) i=x; if( x<i-2 ) break;
      if (x<i) ad=98*ad/100;
    } if( y<dy ) Break;
    MSG(fprintf(stderr,"ad=%d",ad);)
    ac=((hchar)?'W':'w');
    if (gchar) ad=98*ad/100;
    Setac(box1,ac,ad);
    break;
  }
  // --- test ~w {\it w} ohmega? also handwritten -------
  // italic
  for(ad=d=100;dx>3 && dy>3;){ // dy<=dx 4x6font (like a H with fat bar)
    DBG( wchar_t c_ask='w'; )
    if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */
    // ~ul ~uf
    if( num_cross(0,dx-1,dy/2,dy/2,bp,cs)<2 ) Break;
    if( num_cross(0,dx-1,dy/8,dy/8,bp,cs)<2 ) handwritten=40;
    if( num_cross(0,dx-1,dy/4,dy/4,bp,cs)<2 ) handwritten=80;
    /* i: number of rows (excluding the last) with exactly 3 crossings */
    for(i=0,y=0;y<dy-1;y++)
      if( num_cross(0,dx-1,y,y,bp,cs)==3 ) i++;
    if(i<=dy/8) Break; // three legs
    //  xa            xe
    //   \    xc    /    <=yb   connected xa-xb-xc-xd-xe
    //    xb      xd
    /* yb: first row from dy/2 downwards with exactly 3 crossings */
    for(y=dy/2;y<dy-1-dy/8;y++)
      if( num_cross(0,dx-1,y,y,bp,cs)==3 ) break;
    yb=y;
    /* xb, xd: centres of the first gap right of the left stroke and
       left of the right stroke on row yb */
    x =loop(bp,0     ,yb,dx,cs,0,RI);
    x+=loop(bp,x     ,yb,dx,cs,1,RI);              if(x>dx/2) Break;
    xb=loop(bp,x     ,yb,dx,cs,0,RI);xb=x+xb/2;    if(xb>dx/2) Break;
    x =loop(bp,dx-1  ,yb,dx,cs,0,LE);
    x+=loop(bp,dx-1-x,yb,dx,cs,1,LE);
    xd=loop(bp,dx-1-x,yb,dx,cs,0,LE);xd=dx-1-x-xd/2;if(xd<3*dx/8) Break;
    if( num_cross(xb,xd,yb,yb  ,bp,cs)!= 1 ) Break;
    if( num_cross(xb,xb,yb,dy-1,bp,cs)!= 1 ) Break;
    if( num_cross(xd,xd,yb,dy-1,bp,cs)!= 1 ) Break;
    if( num_cross(xb,xb, 0,yb  ,bp,cs)!= 0 ) Break;
    if( num_cross(xd,xd, 0,yb  ,bp,cs)!= 0 ) Break;
    // if( num_hole(0,dx-1,0,dy-1,bp,cs,NULL) != 0 ) Break;
    if (sdata->holes.num != 0) Break;
    // ~ur
    for(i=0,y=3*dy/4;y<dy;y++){
      x=loop(bp,dx-1,y,dx,cs,0,LE); if( x>i ) i=x; if( x<i-2 ) break;
    } if( y<dy ) Break; // fail for overlapping neighbouring slanted chars?
    ac=((hchar)?'W':'w');
    if (gchar) ad=98*ad/100;
    Setac(box1,ac,ad);
    /* NOTE(review): the first test leaves with plain `break` after Setac();
       here the `Break` macro is used on the success path - confirm that
       this is intended */
    Break;
  }
  return box1->c;
}

+static wchar_t ocr0_aA(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,d,x,y,i1,i2,i3,i4,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad,ya; /* tmp-vars */ + + // --- test A --------------------------------------------------- + for(ad=d=100;dx>2 && dy>3;){ // min 3x4 + DBG( wchar_t c_ask='A'; ) + if (sdata->holes.num > 2) Break; /* tolerant against a tiny hole */ + // first selection (rough sieve) + if( get_bw(dx/2 ,dx/2 ,dy-1-dy/8,dy-1,bp,cs,1) == 1 + && get_bw(dx/2-1,dx/2-1,dy-1-dy/8,dy-1,bp,cs,1) == 1 ) Break; // ~B + ya=0; /* upper end, not 0 for modified A etc. */ + if (box1->modifier) + for (ya=0;ya<dy/2;ya++) + if (num_cross(0,dx-1,ya,ya,bp,cs)==0) break; + if (ya>=dy/2) ya=0; // already subtracted? + if( num_cross(0,dx-1,ya+ 1 ,ya+ 1 ,bp,cs)!=1 // 600dpi + && num_cross(0,dx-1,ya+ dy/8 ,ya+ dy/8 ,bp,cs)!=1 + && num_cross(0,dx-1,ya+ dy/16 ,ya+ dy/16 ,bp,cs)!=1 + && num_cross(0,dx-1,ya+ dy/8+1,ya+ dy/8+1,bp,cs)!=1 ) Break; + if( num_cross(0,dx-1, 7*dy/8 , 7*dy/8 ,bp,cs)!=2 + && num_cross(0,dx-1, 7*dy/8-1, 7*dy/8-1,bp,cs)!=2 ) Break; + if ( num_cross( 0,dx/8,ya+dy/8,ya+0,bp,cs)>0 ) Break; // ~R + for(y=ya+dy/8;y<ya+dy/2;y++) if( num_cross(0,dx-1,y,y,bp,cs) > 1 ) break; + if( y==ya+dy/2 ) Break; i1=y; + if (dy>20) i1++; /* get arround some noise fat font */ + + x =loop(bp,0,i1,dx,cs,0,RI); if(x>3*dx/4) Break; + x+=loop(bp,x,i1,dx,cs,1,RI); if(x>3*dx/4) Break; i2=x; + x+=loop(bp,x,i1,dx,cs,0,RI); if(x<3*dx/8) Break; i2=(x+i2)/2; + // hole (i2,i1) + y+=loop(bp,i2,y,dy,cs,1,DO); + y+=loop(bp,i2,y,dy,cs,0,DO); if(y>3*dy/4) ad=ad*99/100; + if (y>5*dy/6) { MSG(fprintf(stderr,"x,y,i1,i2= %d %d %d %d",x,y,i1,i2);) } + if (y>5*dy/6) Break; + + if( sdata->holes.num != ((box1->modifier==RING_ABOVE)?2:1) + ||
sdata->holes.hole[0].y1-ya >= dy-1-dy/4) Break; + // if( num_hole ( x0, x1, y0, y1-dy/4 ,box1->p,cs,NULL) != 1 ) Break; + // out_x(box1); + i3=0;i4=0; + for(x=dx/3;x<2*dx/3;x++){ + i4=num_cross(i2,x,y ,dy-1,bp,cs);if(i4<1 || i4>2) + i4=num_cross(i2,x,y+dy/16,dy-1,bp,cs);if(i4<1 || i4>2) break; + if(i4==1) i3=x; + } if(i4<1 || i4>2 || i3==0){ +// ToDo: MSG(fprintf(stderr,"x,y,i4,i3= %d %d %d %d",x,y,i4,i3);) + Break; + } + if( get_bw(dx-1-dx/4, dx-1, dy-1-dy/4, dy-1, bp,cs,1) != 1 ) Break; + + i1=loop(bp,dx-1,ya+ (dy-ya)/4,dx,cs,0,LE); + i2=loop(bp,dx-1,ya+ (dy-ya)/2,dx,cs,0,LE); + i3=loop(bp,dx-1,dy-1-(dy-ya)/4,dx,cs,0,LE); + if( 2*i2-dx/8>i1+i3 ) ad=99*ad/100; /* 6*8 font */ + if( 2*i2+dx/4<i1+i3 || 2*i2-dx/4>i1+i3 ) Break; + + i1=loop(bp,0 ,ya+ (dy-ya)/4,dx,cs,0,RI); // linke senkr. linie + i2=loop(bp,0 ,ya+ (dy-ya)/2,dx,cs,0,RI); + i3=loop(bp,0 ,dy-1-(dy-ya)/4,dx,cs,0,RI); + if( 2*i2-dx/8>i1+i3 ) ad=98*ad/100; /* 6*8 font */ + if( 2*i2+dx/4<i1+i3 || 2*i2-dx/4>i1+i3 || i1<i3) Break; + + // lower ends could be round on thick fonts + for(i3=dx,y=ya+(dy-ya)/4;y<7*dy/8;y++){ // increasing width + i1=loop(bp, 0, y,dx,cs,0,RI); + i2=loop(bp,dx-1, y,dx,cs,0,LE); + if(i1+i2>i3+dx/16) break; if( i1+12<i3 ) i3=i1+i2; + } if(y<7*dy/8) Break; + if ( loop(bp, 0,dy-1-dy/8,dx,cs,0,RI) + -loop(bp, 0,dy/2 ,dx,cs,0,RI)>0) ad=97*ad/100; // italic-a + + if (!hchar) ad=99*ad/100; // italic-a + Setac(box1,'A',ad); + break; + } + // --- test a ------------------------------------------- + // with a open bow above the circle starting + // on the right side of the circle + for(ad=d=100;dx>2 && dy>3;){ // min 3x4 + DBG( wchar_t c_ask='a'; ) + if (sdata->holes.num > 2) Break; /* tolerant against a tiny hole */ + if( get_bw(x0 , x0+dx/2, y1-dy/3, y1-dy/3,box1->p,cs,1) != 1 ) Break; + if( get_bw(x1-dx/3, x1 , y0+dy/3, y0+dy/3,box1->p,cs,1) != 1 ) Break; + if( get_bw(x1-dx/3, x1 , y0+dy/4, y0+dy/4,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2, x0+dx/2, y1-dy/3, y1, box1->p,cs,1) != 1 ) 
Break; + if( get_bw(x0+dx/2, x0+dx/2, y0 , y0+dy/3,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/3, x1-dx/3, y0 , y0 ,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/4, x1-dx/2, y1 , y1 ,box1->p,cs,1) != 1 ) + if( get_bw(x0+dx/4, x1-dx/3, y1-1 , y1-1 ,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0 , x0 , y0+dy/2, y1 ,box1->p,cs,1) != 1 ) + if( get_bw(x0+dx/8, x0+dx/8, y0+dy/2, y1 ,box1->p,cs,1) != 1 ) Break; + if( loop(bp,3*dx/8,0,dy,cs,0,DO) > 3*dy/16 ) Break; // ~d + if( num_cross(0,dx-1,dy/4 ,dy/4 , bp,cs) >2 // ~glued am != an + && num_cross(0,dx-1,dy/4+1,dy/4+1, bp,cs) >2 ) Break; + + for( x=dx/4;x<dx-dx/4;x++ ){ // ar + i=loop(bp,x, 0,y1-y0,cs,0,DO); if (i>dy/2) break; + i=loop(bp,x,dy-1,y1-y0,cs,0,UP); if (i>dy/2) break; + } if( x<dx-dx/4 ) Break; + + for(i=dx/8+1,x=dx/4;x<=dx-1-dx/4 && i;x++){ + if( num_cross(x,x,0,bp->y-1, bp,cs) == 3 ) i--; + } if( i ) Break; + + i1=loop(bp,0, dy/8,dx,cs,0,RI); + i3=loop(bp,0,3*dy/4,dx,cs,0,RI); + for(y=dy/8+1;y<3*dy/4;y++){ + i2=loop(bp,0,y,dx,cs,0,RI);if(2*i2>i1+i3+1) break; + } if(y==3*dy/4) Break; // ~6 + // ~ s (small thick s), look for vertikal line piece + for(x=3*dx/4;x<dx;x++) + if( loop(bp,x,dy/4,dy/2,cs,1,DO)>dy/4 ) break; + if( x==dx ) Break; + + if (sdata->holes.num != 1) ad=96*ad/100; else + if (sdata->holes.num == 1) + if( num_hole ( x0, x1, y0+dy/3, y1 ,box1->p,cs,NULL) != 1 ) Break; + // if( num_hole ( x0, x1, y0, y1, box1->p,cs,NULL) != 1 ) Break; + if( num_hole ( x0, x1, y0, y1-dy/3 ,box1->p,cs,NULL) != 0 ){ + i =loop(bp,0,dy/4,dx,cs,0,RI); + i =loop(bp,i,dy/4,dx,cs,1,RI); + if(i<dx/4+1) Break; // fat a + i =loop(bp,0,dy/4,dx,cs,0,RI); + i+=loop(bp,i,dy/4,dx,cs,1,RI); + for(y=dy/4;y<dy/2;y++) + if( num_cross(0,dx-1,y,y, bp,cs) !=2 ) break; + x =loop(bp,0,y-1,dx,cs,0,RI); + x+=loop(bp,x,y-1,dx,cs,1,RI); + if(x>i) Break; // ~ 8 + } + /* test for horizontal symmetry ~8 */ + for (y=0;y<dy;y++) for (x=0;x<dx/2;x++) + if ((getpixel(bp,x,y)<cs)!=(getpixel(bp,dx-1-x,y)<cs)) { y=dy+1; break; } + if (y==dy) Break; 
/* ~8 */ + + if (hchar) ad=96*ad/100; + if (gchar) ad=96*ad/100; + Setac(box1,'a',ad); + break; + } + // --- test hand written a --------------------------------------------------- + // rarely char, without bow above the circle + for(ad=d=100;dx>3 && dy>3;){ // min 4x4 + DBG( wchar_t c_ask='a'; ) + if (sdata->holes.num > 2) Break; /* tolerant against a tiny hole */ + if( get_bw(x0 , x0+dx/2,y0+dy/2 , y0+dy/2,box1->p,cs,1) != 1 ) Break; + if( get_bw(x1-dx/2 , x1 ,y0+dy/2 , y0+dy/2,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2 , x0+dx/2,y1-dy/2 , y1, box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2 , x0+dx/2,y0 , y0+dy/2,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/3 , x0+dx/3,y0 , y0+dy/2,box1->p,cs,1) != 1 ) Break; + i = loop(bp,dx/2, 0 ,dy,cs,0,DO); if (i>dy/4) Break; + i+= loop(bp,dx/2, i ,dy,cs,1,DO); if (i>dy/2) Break; + i = loop(bp,dx/2, i ,dy,cs,0,DO); if (i<dy/4) Break; + if( get_bw(x0 , x0 ,y1 , y1 ,box1->p,cs,1) == 1 ) Break; + + if( num_cross(x0+dx/2,x0+dx/2,y0 , y1 ,box1->p,cs) != 2 ) Break; + if( num_cross(x0+dx/3,x1-dx/3,y0 , y0 ,box1->p,cs) != 1 ) // AND + if( num_cross(x0+dx/3,x1-dx/3,y0+1 , y0+1 ,box1->p,cs) != 1 ) Break; + i = loop(bp,dx/2,dy-1 ,dy,cs,0,UP); if (i>dy/3) Break; + y = i+loop(bp,dx/2,dy-1-i,dy,cs,1,UP); if (i>dy/2) Break; + // normal 'a' has a well separated vertical line right from the circle + // but fat 'a' is like a 'o', only bigger on the right side + if( num_cross(x0+dx/2-1,x1,y1 ,y1 ,box1->p,cs) < 2 /* 4x6font */ + && num_cross(x0+dx/2-1,x1,y1-i,y1-i ,box1->p,cs) < 2 /* 2 or 3 */ + && num_cross(x0+dx/2-1,x1,y1-y,y1-y ,box1->p,cs) < 2 ) + { if (loop(bp, 0,dy-1-dy/16,dx,cs,0,RI) + <4*loop(bp,dx-1,dy-1-dy/16,dx,cs,0,LE)) { Break;} + else ad=98*ad/100; + } + if( num_cross(x0,x1,y0+dy/2 , y0+dy/2,box1->p,cs) < 2 + || num_cross(x0,x1,y0+dy/3 , y0+dy/3,box1->p,cs) < 2 ) Break; // Jun00 + + if( num_cross(x0 ,x0 ,y0+dy/3 , y1-dy/4,box1->p,cs) != 1 ) + if( num_cross(x0+1 ,x0+1 ,y0+dy/3 , y1-dy/4,box1->p,cs) != 1 ) Break; + if 
(sdata->holes.num != 1) + if( num_hole(x0,x1-2,y0 ,y1 ,box1->p,cs,NULL) != 1 ) + // if( num_hole(x0,x1 ,y0 ,y1 ,box1->p,cs,NULL) != 1 ) + Break; + if( num_hole(x0,x1 ,y0+dy/3,y1-1 ,box1->p,cs,NULL) != 0 ) Break; + + if( loop(bp,0 ,0 ,x1-x0,cs,0,RI)<= + loop(bp,0 ,2 ,x1-x0,cs,0,RI) ) Break; + + if( loop(bp,dx-1,dy-1,x1-x0,cs,0,LE)> dx/4 + && loop(bp,dx-1,dy-2,x1-x0,cs,0,LE)> (dx+4)/8 ) ad=97*ad/100; + + x=loop(bp,dx-1,dy-1-dy/4,dx,cs,0,LE); + i=loop(bp,dx-1, dy/4,dx,cs,0,LE); if (abs(x-i)>dx/4) Break; + + for( x=dx/4;x<dx-dx/4;x++ ){ // ar + i=loop(bp,x, 0,y1-y0,cs,0,DO); if (i>dy/2) break; + i=loop(bp,x,dy-1,y1-y0,cs,0,UP); if (i>dy/2) break; + } if( x<dx-dx/4 ) Break; + + if( num_cross(x0 , x1, y1, y1,box1->p,cs) == 1 ) + if( num_cross(x0 , x1, y0, y0,box1->p,cs) == 1 ) + if( loop(bp,dx-1, 0,y1-y0,cs,0,DO)> dy/4 + && loop(bp,dx-1,dy-1,y1-y0,cs,0,UP)> dy/4 ) Break; // ~o + if( loop(bp,dx/2,dy-1,y1-y0,cs,0,UP)> dy/4 ) Break; // ~q + + if (hchar) ad=98*ad/100; + if (gchar) ad=98*ad/100; + // handwritten-a (alpha) + Setac(box1,'a',ad); + break; + } + // --- test A_A_WITH_OGONEK 0x0104 Centr.Eur.Font ------------------------- + /* not sure if we should move this to a get_CentralEuropean-function */ + for(ad=d=100;dx>2 && dy>4;){ // min 3x4 + DBG( wchar_t c_ask='A'; ) + if (sdata->holes.num > 2) break; /* tolerant against a tiny hole */ + // first selection (grobes Sieb) + if( get_bw(dx/2,dx/2,dy-1-dy/8,dy-1,bp,cs,1) == 1 ) break; // ~B + if( num_cross(0,dx-1, 1 , 1 ,bp,cs)!=1 // 600dpi + && num_cross(0,dx-1, dy/8 , dy/8 ,bp,cs)!=1 + && num_cross(0,dx-1, dy/16 , dy/16 ,bp,cs)!=1 + && num_cross(0,dx-1, dy/8+1, dy/8+1,bp,cs)!=1 ) break; + if( num_cross(0,dx-1, dy-1 , dy-1 ,bp,cs)!=1 ) break; + if( num_cross(0,dx-1, dy/4 , dy/4 ,bp,cs)!=2 + && num_cross(0,dx-1, dy/3 , dy/3 ,bp,cs)!=2 ) break; + if ( num_cross( 0,dx/8,dy/8, 0,bp,cs)>0 ) break; // ~R + for(y=dy/8;y<dy/2;y++) if( num_cross(0,dx-1,y,y,bp,cs) > 1 ) break; + if( y==dy/2 ) break; i1=y; + if (dy>20) i1++; /* get 
arround some noise fat font */ + + x =loop(bp,0,i1,dx,cs,0,RI); if(x>3*dx/4) break; + x+=loop(bp,x,i1,dx,cs,1,RI); if(x>3*dx/4) break; i2=x; + x+=loop(bp,x,i1,dx,cs,0,RI); if(x<3*dx/8) break; i2=(x+i2)/2; + // hole (i2,i1) + y+=loop(bp,i2,y,dy,cs,1,DO); + y+=loop(bp,i2,y,dy,cs,0,DO); if(y>3*dy/4) ad=ad*99/100; + if (y>5*dy/6) break; + + if( sdata->holes.num != 1 || sdata->holes.hole[0].y1 >= dy-1-dy/4) break; + // if( num_hole ( x0, x1, y0, y1-dy/4 ,box1->p,cs,NULL) != 1 ) break; + // out_x(box1); + i3=0;i4=0; + for(x=dx/3;x<2*dx/3;x++){ + i4=num_cross(i2,x,y ,dy-1,bp,cs);if(i4<1 || i4>2) + i4=num_cross(i2,x,y+dy/16,dy-1,bp,cs);if(i4<1 || i4>2) break; + if(i4==1) i3=x; + } if(i4<1 || i4>2 || i3==0){ +// ToDo: g_debug_A(printf(" A: x,y,i4,i3= %d %d %d %d\n",x,y,i4,i3);) + break; + } + if( get_bw(dx-1-dx/4, dx-1, dy-1-dy/4, dy-1, bp,cs,1) != 1 ) break; + /* dy/4 changed to dy/6 because of screenfonts */ + /* there are strange fonts, one has a serif on the upper end of A */ + if ( num_cross( 0,dx/8,dy/6, 0,bp,cs)>0 ) break; + if ( num_cross(dx-1-dx/4,dx-1, 0,dy/6,bp,cs)>0 ) break; + + i1=loop(bp,dx-1, dy/4,dx,cs,0,LE); + i2=loop(bp,dx-1, dy/2,dx,cs,0,LE); + i3=loop(bp,dx-1,dy-1-dy/4,dx,cs,0,LE); + if( 2*i2+dx/4<i1+i3 || 2*i2-dx/8>i1+i3 ) break; + + i1=loop(bp,0 , dy/4,dx,cs,0,RI); // linke senkr. 
linie + i2=loop(bp,0 , dy/2,dx,cs,0,RI); + i3=loop(bp,0 ,dy-1-dy/4,dx,cs,0,RI); + if( 2*i2+dx/4<i1+i3 || 2*i2-dx/8>i1+i3 || i1<i3) break; + + // lower ends could be round on thick fonts + for(i3=dx,y=dy/4;y<6*dy/8;y++){ // increasing width + i1=loop(bp, 0, y,dx,cs,0,RI); + i2=loop(bp,dx-1, y,dx,cs,0,LE); + if(i1+i2>i3+dx/16) break; if( i1+12<i3 ) i3=i1+i2; + } if(y<6*dy/8) break; + + if (!hchar) ad=96*ad/100; + if (!gchar) ad=98*ad/100; + Setac(box1,(wchar_t)LATIN_CAPITAL_LETTER_A_WITH_OGONEK,ad); + break; + } + return box1->c; +} +
/*
 * ocr0_cC - try to classify the box in sdata as 'c' or 'C'.
 *
 * A single test, written as a for-statement without increment that ends
 * in `break`, i.e. a block that runs at most once.  `Break` (macro defined
 * elsewhere; presumably logs and breaks - TODO confirm) abandons the test.
 * `ad` starts at 100 and is scaled down by soft penalties for shapes that
 * resemble G, '(', '<', etc.  A surviving test calls Setac() with 'C' when
 * hchar is set, otherwise 'c'.  One early exit registers 'G' with a fixed
 * weight of 90 instead.
 *
 * Calls taking `bp` use box-relative coordinates (0..dx-1, 0..dy-1);
 * calls taking box1->p use x0/y0-based coordinates, so i1, i2 and i3 are
 * absolute rows.
 * NOTE(review): loop(), num_cross(), get_bw() and getpixel() are defined
 * elsewhere; from their use here they look like run-length, crossing
 * count, colour test and pixel-read helpers - confirm.
 *
 * Returns box1->c as it stands after the test.
 */
static wchar_t ocr0_cC(ocr0_shared_t *sdata){
  struct box *box1=sdata->box1;
  pix *bp=sdata->bp;
  int i,j,d,x,y,i1,i2,i3,i4,i5,hchar=sdata->hchar,gchar=sdata->gchar,
      x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs;
  int dx=x1-x0+1,dy=y1-y0+1, /* size */
      ad,t1; /* tmp-vars */
  wchar_t bc=UNKNOWN;

  // --- test c,C ---------------------------------------------------
  for(ad=d=100;dx>2 && dy>2;){ // min 3x4
    DBG( wchar_t c_ask='c'; )
    if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */
    if( get_bw(x0     , x0+dx/3,y0+dy/2, y0+dy/2,box1->p,cs,1) != 1 ) Break;
    if( get_bw(x0+dx/2, x0+dx/2,y1-dy/3, y1,     box1->p,cs,1) != 1 ) Break;
    if( get_bw(x0+dx/2, x0+dx/2,y0     , y0+dy/3,box1->p,cs,1) != 1 ) Break;
    if( num_cross(x0,(x0+x1)/2,(y0+y1)/2,(y0+y1)/2,box1->p,cs) > 1 ) Break; // ~ocr-a-[

    for(y=y0+dy/4;y<y0+3*dy/4;y++)
      if( get_bw(x0+dx/2,x1,y,y,box1->p,cs,1) == 0 ) break;
    if( y==y0+3*dy/4 ) Break; i1=y; // i1: upper end of right gap

    // measure thickness of line!
    t1=loop(bp, 0,dy/2,dx,cs,0,RI);
    t1=loop(bp,t1,dy/2,dx,cs,1,RI);
    if (t1>dx/2) Break;

    /* NOTE(review): the loop() call below does not depend on x, so every
       iteration yields the same value - possibly x was meant; unchanged */
    for(y=i1,i2=0,x=x0+dx/2;x<x0+6*dx/8;x++){
      i=y-1+loop(box1->p,x0+dx/2,i1,dy,cs,0,DO);
      if( i>i2 ) { i2=i; }
    } if(i2<y0+5*dy/8-t1/2) Break; // i2: lowest white point above lower bow

    i3=y+1-loop(box1->p,x0+5*dx/8,i1,dy,cs,0,UP);
    i =y+1-loop(box1->p,x0+4*dx/8,i1,dy,cs,0,UP); if(i<i3) i3=i;
    if(i3>y0+  dy/4+t1/2) Break; // highest

    for(y=i1;y<y1-dy/8;y++)
      if( get_bw(x0+dx/2,x1,y,y,box1->p,cs,1) == 1 ) break;
    if( y-i1<dy/6 ) Break; i2=y-1; // lower end of right gap
    // pixel-based num_cross for straight lines could fail on small fonts
    if( num_cross(x1-dx/4,x1-dx/4,i2,y0,box1->p,cs) < 1 ) Break; // ~L
    if (loop(box1->p,x0,y0+3*dy/4,dx,cs,0,RI)>dx/16)
      if( num_cross(x0+dx/2,x1,i3     ,y1,box1->p,cs) < 1
       && num_cross(x0+dx/2,x1,y1-dy/4,y1,box1->p,cs) < 1 ) Break; // ~r

    /* look in the lower right part for the 2x2 pattern   @@
                                                           .@
       (i is cleared when it is found) */
    i=1;
    for(x=dx/2;x<dx-1 && i;x++) // look for @@ (instead +1 use +delta?)
      for(y=dy/2;y<dy-1-dy/8 && i;y++){ // .@
        if( getpixel(bp,x  ,y  )>=cs
         && getpixel(bp,x+1,y  )< cs
         && getpixel(bp,x+1,y-1)< cs
         && getpixel(bp,x  ,y-1)< cs ) { i=0;break; }
      }
    if(!i) ad=95*ad/100; // ~G

    /* no row may start clearly further left than the middle row */
    i=loop(bp,0,dy/2,dx,cs,0,RI);
    for(y=0;y<dy;y++)if( loop(bp,0,y,dx,cs,0,RI)<i-1-dx/32 ) break;
    if( y<dy ) Break; // ~r
    // out_x(box1);
    for(i5=0,i4=dx,y=dy/2;y>=dy/4;y--){
      x =loop(bp,0,y,dx,cs,0,RI);
      x+=loop(bp,x,y,dx,cs,1,RI); if(x>i5) i5=x;
      i =loop(bp,x,y,dx,cs,0,RI); if(i<i4) i4=i;
      if( i5<x-dx/32 && i>i4+dx/32 ) break; // unusual for c, more a bad e?
    } if( y>=dy/4 ) Break;

    if( !hchar ){ // test for e where the middle line is partly removed
      x=  loop(bp,0,dy/2,dx,cs,0,RI);
      x=x +loop(bp,x,dy/2,dx,cs,1,RI);
      y=dy/2-loop(bp,x,dy/2,dy,cs,0,UP)-1;
      i=x +loop(bp,x,y,dx,cs,1,RI);
      i=i +loop(bp,i,y,dx,cs,0,RI);
      if( num_cross(x  ,x  ,1,dy/2,bp,cs) > 1
       || num_cross(x+1,x+1,1,dy/2,bp,cs) > 1 )
        if( num_cross(i-1,i-1,1,dy/2,bp,cs) > 1
         || num_cross(i  ,i  ,1,dy/2,bp,cs) > 1 ) Break; // ~bad e
    }
    if( dy>16 && dy>3*dx && hchar ){ // ~[
      /* compare twice the left margin at mid height (i) with the sum of the
         left margins near the top and near the bottom (x).
         fix: the second line read `x=+loop(...)`, a unary plus that threw
         the top margin away; `+=` accumulates both as intended */
      x = loop(bp,0,     dy/16,dx,cs,0,RI);
      x+= loop(bp,0,dy-1-dy/16,dx,cs,0,RI);
      i = loop(bp,0,     dy/2 ,dx,cs,0,RI)*2;
      if( i>=x )
        if( num_cross(0,dx-1,dy/4,dy/4,bp,cs) < 2 ) Break;

    }
    if( get_bw(x0,x0,y0  ,y1  ,box1->p,cs,2) != 2
     && get_bw(x0,x1,y0  ,y0  ,box1->p,cs,2) != 2
     && get_bw(x0,x1,y1  ,y1  ,box1->p,cs,2) != 2
     && get_bw(x1,x1,y0+1,y1-1,box1->p,cs,1) != 1 ) Break; /* ~[ */

    x =loop(bp,   0,dy/2,dx,cs,0,RI);
    i =loop(bp,dx-1,dy/2,dx,cs,0,LE);
    if( (i<dx/2 || i<3) && hchar && dy>7 )
      if( loop(bp,   0,7*dy/8,dx,cs,0,RI) > x+dx/8
       && loop(bp,   0,  dy/8,dx,cs,0,RI) > x+dx/8
       && loop(bp,dx-1,dy-1-dy/ 8,dx,cs,0,LE)
        > loop(bp,dx-1,dy-1-dy/16,dx,cs,0,LE)
       && loop(bp,dx-1,     dy/ 8,dx,cs,0,LE)
        > loop(bp,dx-1,     dy/16,dx,cs,0,LE) ) Break; // ~(

//  printf(" hchar=%d i1=%d i2=%d %d\n",hchar,i1-y0,i2-y0,9*dy/16);
    // ~G without characteristic crotchet
    if (hchar && dy>15 && dx>7 && i2-y0<9*dy/16 && i1-y0<=dy/4)
      if ( loop(bp,5*dx/8,i2-y0,dy,cs,0,DO) > 2*dy/8 ){
        Setac(box1,'G',90);
        Break;
      }

    if (hchar){
      /* same 2x2 pattern search as above, now in the upper right part */
      i=1;
      for(x=dx/2;x<dx-1 && i;x++) // look for @@ (instead +1 use +delta?)
        for(y=   1;y<dy/4 && i;y++){ // .@
          if( getpixel(bp,x  ,y  )>=cs
           && getpixel(bp,x+1,y  )< cs
           && getpixel(bp,x+1,y-1)< cs
           && getpixel(bp,x  ,y-1)< cs ) { i=0;break; }
        }
      if (i) ad=98*ad/100; // ~(
      if (dy>2*dx) ad=99*ad/100;
    }
    if( loop(bp,dx-1,dy/2,dx,cs,0,LE) < 6*dx/8 ) ad=98*ad/100;

    i= loop(bp,dx-1,dy/16,dx,cs,0,LE);
    j= loop(bp,dx/2,0    ,dy,cs,0,DO);
    if (i>=dx/2 && j>dy/8 && j>2 && j<dy/2) Break; // t

    if (dy>=3*dx && dy>12) ad=99*ad/100; // (
    i= loop(bp,dx-1,dy-1,dy,cs,0,UP);
    j= loop(bp,dx/2,dy-1,dy,cs,0,UP);
    if (i==0 && j>dy/8) ad=95*ad/100; // <
    i= loop(bp,dx-1,   0,dy,cs,0,DO);
    j= loop(bp,dx/2,   0,dy,cs,0,DO);
    if (i==0 && j>dy/8) ad=95*ad/100; // <
    if (loop(bp,0,dy-1-dy/8,dx,cs,0,RI)>= 3*dx/4)   ad=98*ad/100; // <
    if (loop(bp,0,dy-1-dy/8,dx,cs,0,RI)>=(dx+1)/2)  ad=98*ad/100; // <
    if (loop(bp,0,     dy/8,dx,cs,0,RI)>=dx/2)      ad=98*ad/100; // <

    if (gchar) ad=98*ad/100; // could happen for 5x7 font
    bc=((hchar)?'C':'c');
    Setac(box1,bc,ad);
    break;
  }
  return box1->c;
}

+static wchar_t ocr0_lL(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,j,d,x,y,i0,i1,i2,i3,i4,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + + // --- test L --------------------------------------------------- + for(ad=d=100;dx>2 && dy>4;){ // min 3x4 + DBG( wchar_t c_ask='L'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + i=loop(bp,dx-1,dy/2,dx,cs,0,LE); + if (i<3 && dy>8) {Break;} + if (i<dx/2) ad=98*ad/100; // ~G + + if (dx<8 && 3*loop(bp,dx-1,0,dy,cs,0,DO)<=dy) break; // ~G + for( i=i1=0,y=y1-dy/4;y<=y1;y++){ // check bottom line (i1) + j=loop(box1->p,x0 ,y,dx,cs,0,RI); + j=loop(box1->p,x0+j,y,dx,cs,1,RI); if( j>i ){ i=j;i1=y; } + } if( i<3*dx/4 ) Break; i1=i; // length of horizontal line + // line thickness (i2) + i=loop(box1->p,x0 ,y0+dy/2,dx,cs,0,RI); if( i>dx/2 ) Break; +
j=loop(box1->p,x0+i,y0+dy/2,dx,cs,1,RI); if( i+j>dx/2 ) Break; i2=j; + if (loop(bp,dx-1, 0,dx,cs,0,LE)<dx/8 + && loop(bp,dx-1, dy/4,dx,cs,0,LE)>dx/2 + && loop(bp, 0,5*dy/8,dx,cs,0,RI)<dx/4 + && loop(bp,dx-1,3*dy/4,dx,cs,0,LE)<dx/4) Break; // ~G + for( i=1,y=y0;y<=y1-dy/4 && i;y++){ // check vertical line + j=loop(box1->p,x0 ,y,dx,cs,0,RI); + if ( j>(dx+2)/4+(y1-dy/4-y)*dx/2/dy ) { i=0; break; } + x=loop(box1->p,x0+j,y,dx,cs,1,RI); + if( ((x>i2+1 || 4*x<3*i2) && y>y0+dy/8) || 4*x>3*i1 ) i=0; + } if( !i ) Break; + if( num_cross(0, dx-1-dx/8, dy-1-dy/2, dy-1-dy/2,bp,cs) != 1 ) Break; + if( num_cross(0, dx-1 , dy/3 , dy/3,bp,cs) != 1 ) Break; + if( num_cross(0, dx-1 , dy/8 , dy/8,bp,cs) != 1 ) Break; + if (loop(bp,0,dy-1,dx,cs,0,RI) + -loop(bp,0,dy-3,dx,cs,0,RI)>1+dx/16) ad=96*ad/100; // ~c + if (loop(box1->p,x0+dx/4,y1,dy,cs,0,UP)>1+dy/16) ad=99*ad/100; // ~4 + + if ( gchar) ad=98*ad/100; + if (!hchar) ad=99*ad/100; + if (5*dx<2*dy && loop(box1->p,x0,y1,dx,cs,0,RI)>dx/4) ad=99*ad/100; // ~l + Setac(box1,'L',ad); + break; + } + // --- test l --------------------------------------------------- + // recognize a "l" is a never ending problem, because there are lots of + // variants and the char is not very unique (under construction) + // --- test italic l --------------------------------------------------- + // --- test l ~italic (set flag-italic) -------------------------------- + // if unsure d should be multiplied by 80..90% + for(ad=d=100; dy>dx && dy>5;){ // min 3x4 + DBG( wchar_t c_ask='l'; ) + if( box1->dots>0 ) Break; + if( num_cross(0, dx-1,dy/2,dy/2,bp,cs) != 1 + || num_cross(0, dx-1,dy/4,dy/4,bp,cs) != 1 ) Break; + // mesure thickness + for(i1=0,i2=dx,y=dy/4;y<dy-dy/4;y++){ + j = loop(bp,0,y,dx,cs,0,RI); + j = loop(bp,j,y,dx,cs,1,RI); + if( j>i1 ) { i1=j; } // thickest + if( j<i2 ) { i2=j; } // thinnest + } + if ( i1>2*i2 ) Break; + if(box1->m3 && dy<=box1->m3-box1->m2) ad=94*ad/100; + if( box1->m2-box1->m1>1 && y0>=box1->m2 ) ad=94*ad/100; + 
for(i0=0,i3=0,y=0;y<dy/4;y++){ + j = loop(bp,0,y,dx,cs,0,RI); + if( j>i3 ) { i3=j; } // widest space + j = loop(bp,j,y,dx,cs,1,RI); + if( j>i0 ) { i0=j;i3=0; } // thickest + } + if ( i0>4*i2 || 3*i3>2*dx) + if ( loop(bp,dx-1,dy-1,dx,cs,0,LE)>3*dx/8 + || loop(bp, 0,dy-1,dx,cs,0,RI)>3*dx/8) Break; // ~7 + + // detect serifs + x =loop(bp,0, 0,dx,cs,0,RI); + i3=loop(bp,x, 0,dx,cs,0,RI); + x =loop(bp,0, 1,dx,cs,0,RI); + x =loop(bp,x, 1,dx,cs,0,RI); if(x>i3) i3=x; + x =loop(bp,0,dy-1,dx,cs,0,RI); + i4=loop(bp,x,dy-1,dx,cs,0,RI); + x =loop(bp,0,dy-2,dx,cs,0,RI); + x =loop(bp,x,dy-2,dx,cs,0,RI); if(x>i4) i4=x; + if( i3>i1+dx/8+1 && i4>i1+dx/8+1 ) Break; // ~I + + for(i=dx,j=0,y=1;y<dy/4;y++){ + x=loop(bp,dx-1,y,dx,cs,0,LE); if(x>i+1) break; i=x; + if( num_cross(0,dx-1,y ,y ,bp,cs)==2 + && num_cross(0,dx-1,y+1+dy/32,y+1+dy/32,bp,cs)==2 ) j=1; + } if ( y<dy/4 ) Break; + if(j){ // if loop at the upper end, look also on bottom + for(y=3*dy/4;y<dy;y++){ + if( num_cross(0,dx-1,y ,y ,bp,cs)==2 + && num_cross(0,dx-1,y-1-dy/32,y-1-dy/32,bp,cs)==2 ) break; + } if ( y==dy ) Break; + } + + // if( get_bw(x0,x1,y0,y1,p,cs,2) == 0 ) Break; // unsure !I| + + if(dx>3) + if( get_bw(dx-1-dx/8,dx-1,0,dy/6,bp,cs,1) != 1 ) + if( get_bw(dx-1-dx/8,dx-1,0,dy/2,bp,cs,1) == 1 ) Break; + + if( get_bw(dx-1-dx/8,dx-1,dy/4,dy/3,bp,cs,1) != 1 ) // large I ??? + if( get_bw(0 ,dx/8,dy/4,dy/3,bp,cs,1) != 1 ) + if( get_bw(dx-1-dx/8,dx-1,0 ,dy/8,bp,cs,1) == 1 ) + if( get_bw(0 ,dx/8,0 ,dy/8,bp,cs,1) == 1 ) ad=ad*97/100; + if( get_bw(dx-1-dx/8,dx-1,dy/2,dy-1,bp,cs,1) != 1 ) // r ??? + if( get_bw(0 ,dx/8,dy/2,dy-1,bp,cs,1) == 1 ) + if( get_bw(dx-1-dx/8,dx-1,0 ,dy/3,bp,cs,1) == 1 ) + if( get_bw(0 ,dx/8,0 ,dy/3,bp,cs,1) == 1 ) Break; + + for( y=1;y<12*dy/16;y++ ) + if( num_cross(0, dx-1, y , y ,bp,cs) != 1 // sure ? 
+ && num_cross(0, dx-1, y-1, y-1,bp,cs) != 1 ) break; + if( y<12*dy/16 ) Break; + + if(dx>3){ + for( y=dy/2;y<dy-1;y++ ) + if( get_bw(dx/4,dx-1-dx/4,y,y,bp,cs,1) != 1 ) break; + if( y<dy-1 ) Break; + } + // test ob rechte Kante gerade + for(x=dx,y=bp->y-1-5*dy/16;y>=dy/5;y--){ // rechts abfallende Kante/Knick? + i=loop(bp,bp->x-1,y,x1-x0,cs,0,LE); + if( i-2-dx/16>=x ) break; + if( i<x ) x=i; + } + if (y>=dy/5 ) Break; + + // test ob linke Kante gerade + for(x=0,y=bp->y-1-dy/5;y>=dy/5;y--){ // rechts abfallende Kante/Knick? + i=loop(bp,0,y,x1-x0,cs,0,RI); + if( i+2+dx/16<x ) break; + if( i>x ) x=i; + } + if (y>=dy/5 ) Break; + if (box1->m4 && y1<box1->m4) + if ( get_bw(x0,x1,y1+1,box1->m4+dy/8,box1->p,cs,1) == 1 ) + ad=ad*97/100; // unsure !l| + i=loop(bp,dx-1,dy/16,dx,cs,0,LE); + j=loop(bp,dx-1,dy/2 ,dx,cs,0,LE); + if( i>3 && j>3 ) + if( get_bw(dx-1-i/2,dx-1-i/2,0,dy/2,bp,cs,1) == 1 ) Break; // ~t + + for(y=5*dy/8;y<dy;y++) + if( num_cross(0,dx-1,y,y,bp,cs) == 2 ) break; + if( y<dy ){ + i =loop(bp,0,y,dx,cs,0,RI); + i+=loop(bp,i,y,dx,cs,1,RI); + i+=loop(bp,i,y,dx,cs,0,RI)/2; // middle of v-gap + if( num_cross(0,i,5*dy/8,5*dy/8,bp,cs)==0 + && num_cross(i,i,5*dy/8, y,bp,cs)==0 ) Break; // ~J + } + if ( dx>8 + && loop(bp, 0,3*dy/4,dx,cs,0,RI)>=dx/4 + && loop(bp, 0,7*dy/8,dx,cs,0,RI)<=dx/8 + && loop(bp,dx-1,3*dy/4,dx,cs,0,LE)<=dx/8 + && loop(bp,dx-1,7*dy/8,dx,cs,0,LE)<=dx/8 ) Break; // ~J + + if ( 2*i3>5*i1 ) // hmm \tt l can look very similar to 7 + if ( loop(bp,0,dy/4,dx,cs,0,RI)>dx/2 + && get_bw(0,dx/8,0,dy/4,bp,cs,1) == 1 ) Break; // ~7 + + if ( loop(bp,dx-1,dy/2,dx,cs,0,LE)>dx/2 + && get_bw(3*dx/4,dx-1,3*dy/4,dy-1,bp,cs,1) == 1) { + if (loop(bp,0,dy-1,dx,cs,0,RI)<dx/8) ad=99*ad/100; // ~L + if(5*dx>2*dy) ad=99*ad/100; // ~L + if(5*dx>3*dy) ad=99*ad/100; // ~L + } + if(!hchar){ // right part (bow) of h is never a l + if( get_bw(dx/4,dx/4, 0,dy/4,bp,cs,1) == 1 + && get_bw(dx/4,dx/4,dy/2,dy-1,bp,cs,1) == 0 ) Break; + } + if( dx>3 && dy>3*dx ) + if( loop(bp,dx/4,dy-1 
,dy,cs,0,UP)< dy/4 + && loop(bp, 0,dy-1-dy/8,dx,cs,0,RI)>=dx/2 + && loop(bp,dx-1,dy-1-dy/8,dx,cs,0,LE)<=dx/4 ){ + ad=98*ad/100; // ~] + if ( loop(bp,dx-1,dy/2,dx,cs,0,LE)==0 ) Break; + } + + for(x=0;x<dx/2;x++) + if( get_bw( x, x, 0,dy/4 ,bp,cs,1) == 1 ) break; + // works only for perpenticular char + if( get_bw( x,x+dx/16, 0,dy/16,bp,cs,1) == 0 + && get_bw( x,x+dx/16,dy/4 ,dy/2 ,bp,cs,1) == 0 + && get_bw( x,x+dx/16,dy/16,dy/4 ,bp,cs,1) == 1 ){ + for(i=dx,y=0;y<dy/4;y++){ + x=loop(bp,0,y,dx,cs,0,RI); + if( x>i ) break; + } + if( x>=loop(bp,0,y+1,dx,cs,0,RI) ) + if( loop(bp,0 ,0,dy,cs,0,DO)>1 ) + if( loop(bp,0 ,0,dy,cs,0,DO) + - loop(bp,dx/16+1,0,dy,cs,0,DO) < dx/16+1 ) Break; // ~1 Jul00,Nov00 + if( num_cross(0,dx/2,y-1,y-1,bp,cs)==2 ) Break; // ~1 + } + if(dx<8 && dy<12){ // screen font + i= loop(bp,0,0,dy,cs,0,DO); + if( loop(bp,dx/2,1,dy,cs,1,DO)>=dy-2 + && loop(bp,0,dy/2,dx,cs,0,RI)>=2 + && i>1 && i<dy/2 ) Break; // ~1 + } + if( get_bw(x1,x1,y0 ,y1 ,box1->p,cs,2) != 2 + && get_bw(x0,x1,y0 ,y0 ,box1->p,cs,2) != 2 + && get_bw(x0,x1,y1 ,y1 ,box1->p,cs,2) != 2 + && get_bw(x0,x0+dx/4,y0+1+dy/16,y1-1-dy/16,box1->p,cs,1) != 1 ) Break; /* ~] */ + i=loop(bp,dx-1,dy/2,dx,cs,0,LE); + if( loop(bp, 0,dy/2,dx,cs,0,RI)>=dx/2 + && (i<dx/2 || i==0) ) ad=98*ad/100; // ~] + if( get_bw(x0,x0,y0 ,y1 ,box1->p,cs,2) != 2 + && get_bw(x0,x1,y0 ,y0 ,box1->p,cs,2) != 2 + && get_bw(x0,x1,y1 ,y1 ,box1->p,cs,2) != 2 + && get_bw(x1-dx/4,x1,y0+1+dy/16,y1-1-dy/16,box1->p,cs,1) != 1 ) Break; /* ~[ */ + + x =loop(bp, 0,dy/2,dx,cs,0,RI); // konvex/konkav? 
~() + i =loop(bp,dx-1,dy/2,dx,cs,0,LE); + if( loop(bp, 0,7*dy/8,dx,cs,0,RI) > x+dx/8 + && loop(bp, 0, dy/8,dx,cs,0,RI) > x+dx/8 + && loop(bp,dx-1,7*dy/8,dx,cs,0,LE) < i-dx/8 + && loop(bp,dx-1, dy/8,dx,cs,0,LE) < i-dx/8 ) Break; // ~( + if( loop(bp, 0,7*dy/8,dx,cs,0,RI) < x-dx/8 + && loop(bp, 0, dy/8,dx,cs,0,RI) < x-dx/8 + && loop(bp,dx-1,7*dy/8,dx,cs,0,LE) > i+dx/8 + && loop(bp,dx-1, dy/8,dx,cs,0,LE) > i+dx/8 ) Break; // ~) + + i= loop(bp, 0, 0,dy,cs,0,DO); // horizontal line? + if(dy>=12 && i>dy/8 && i<dy/2){ + if( loop(bp,dx-1,3*dy/16,dx,cs,0,LE)-dx/8 + >loop(bp,dx-1, i,dx,cs,0,LE) + || loop(bp,dx-1,3*dy/16,dx,cs,0,LE)-dx/8 + >loop(bp,dx-1, i+1,dx,cs,0,LE) ) + if( loop(bp,dx-1,8*dy/16,dx,cs,0,LE)-dx/8 + >loop(bp,dx-1, i,dx,cs,0,LE) + || loop(bp,dx-1,8*dy/16,dx,cs,0,LE)-dx/8 + >loop(bp,dx-1, i+1,dx,cs,0,LE) ) + if( loop(bp, 0,3*dy/16,dx,cs,0,RI)-dx/8 + >loop(bp, 0, i,dx,cs,0,RI) + || loop(bp, 0,3*dy/16,dx,cs,0,RI)-dx/8 + >loop(bp, 0, i+1,dx,cs,0,RI) ) + if( loop(bp, 0,8*dy/16,dx,cs,0,RI)-dx/8 + >loop(bp, 0, i,dx,cs,0,RI) + || loop(bp, 0,8*dy/16,dx,cs,0,RI)-dx/8 + >loop(bp, 0, i+1,dx,cs,0,RI) ) Break; // ~t + if( loop(bp, 0,i-1,dx,cs,0,RI)>1 && dx<6 ) Break; // ~t + if( loop(bp, 0,8*dy/16,dx,cs,0,RI)>dx/8 + && loop(bp, 0, i,dx,cs,1,RI)>=dx-1 + && loop(bp,dx-1,8*dy/16,dx,cs,0,LE)>dx/8 + && loop(bp,dx-1, i-1,dx,cs,0,LE)>dx/8 ) Break; // ~t + } +// if( vertical_detected && dx>5 ) + if( loop(bp,0, 1,dx,cs,0,RI)>=dx/2 + && ( loop(bp,0,dy-2,dx,cs,0,RI)<=dx/8 + || loop(bp,0,dy-1,dx,cs,0,RI)<=dx/8 ) ) + if( ( loop(bp,dx-1, 0,dx,cs,0,LE)<=dx/8 + || loop(bp,dx-1, 1,dx,cs,0,LE)<=dx/8 ) + && loop(bp,dx-1,dy-2,dx,cs,0,LE)>=dx/2 ) ad=98*ad/100; // ~/ + + if( get_bw(x0,x1,y0,y1,box1->p,cs,2) == 0 ) ad=99*ad/100; + + if (!hchar || loop(bp,0,dy/4,dx,cs,0,RI)>dx/2){ // ~z + i=loop(bp,0,dy/16 ,dx,cs,0,RI); + i=loop(bp,i,dy/16 ,dx,cs,1,RI); j=i; + i=loop(bp,0,dy/16+1,dx,cs,0,RI); + i=loop(bp,i,dy/16+1,dx,cs,1,RI); if (i>j) j=i; + i=loop(bp,0,dy/16+2,dx,cs,0,RI); + 
i=loop(bp,i,dy/16+2,dx,cs,1,RI); if (i>j) j=i; + if (j*4>=dx*3) ad=98*ad/100; // ~z + if (j*8>=dx*7) ad=96*ad/100; // ~z + } + + if( get_bw(x0,x0,y1,y1,box1->p,cs,2) == 0 ) ad=99*ad/100; + if( get_bw(x1,x1,y1,y1,box1->p,cs,2) == 0 ) ad=99*ad/100; + if (ad==100) ad--; /* I have to fix that: + .@@@@.<- + @@..@@ + ....@@ + ....@@< + ...@@. + ..@@@. + ..@@.. + .@@... + @@.... + @@@@@@<- + */ + if(!hchar) ad=ad*99/100; + if( gchar) ad=ad*99/100; + Setac(box1,'l',ad); +// if( i<100 ) Break; ???? +// if( loop(bp,0, 1,dx,cs,0,RI)<=dx/8 +// && loop(bp,0,dy/2,dx,cs,0,RI)<=dx/8 +// && loop(bp,0,dy-2,dx,cs,0,RI)<=dx/8 ) vertical_detected=1; + break; + } + return box1->c; +} + +static wchar_t ocr0_oO(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,j,d,x,y,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + wchar_t bc=UNKNOWN; + + // --- test o,O --------------------------------------------------- + for(ad=d=100;dx>2 && dy>3;){ // min 3x4 + DBG( wchar_t c_ask='o'; ) + if (sdata->holes.num !=1 ) Break; + if( get_bw(x0 , x0+dx/2,y0+dy/2 , y0+dy/2,box1->p,cs,1) != 1 ) Break; + if( get_bw(x1-dx/2 , x1 ,y0+dy/2 , y0+dy/2,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2 , x0+dx/2,y1-dy/2 , y1, box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2 , x0+dx/2,y0 , y0+dy/2,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2 , x0+dx/2,y0+dy/2 , y1-dy/3,box1->p,cs,1) != 0 ) Break; + if (sdata->holes.hole[0].y0 > dy/3 + || sdata->holes.hole[0].y1 < dy-1-dy/3) Break; + + if( num_cross(x0+dx/2 ,x0+dx/2 ,y0, y1 ,box1->p,cs) != 2 + && num_cross(x0+dx/2+1,x0+dx/2+1,y0, y1 ,box1->p,cs) != 2 ) Break; + if( num_cross(x0+dx/3,x1-dx/4,y0 , y0 ,box1->p,cs) != 1 ) // AND + if( num_cross(x0+dx/3,x1-dx/4,y0+1 , y0+1,box1->p,cs) != 1 ) Break; + if( num_cross(x0+dx/4,x1-dx/3,y1 , y1 ,box1->p,cs) != 1 ) // against "rauschen" + if( num_cross(x0+dx/4,x1-dx/3,y1-1 , 
y1-1,box1->p,cs) != 1 ) Break; + if( num_cross(x0 ,x0 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) + if( num_cross(x0+1 ,x0+1 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) Break; + if( num_cross(x1 ,x1 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) + if( num_cross(x1-1 ,x1-1 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) Break; + + if( loop(bp,0 ,0 ,x1-x0,cs,0,RI)<= + loop(bp,0 ,2 ,x1-x0,cs,0,RI) ) Break; + + x=loop(bp,dx-1,dy-1-dy/3,x1-x0,cs,0,LE); // should be minimum + for( y=dy-1-dy/3;y<dy;y++ ){ + i=loop(bp,dx-1,y,x1-x0,cs,0,LE); + if( i<x ) break; x=i; + } + if( y<dy ) Break; + + // ~D + if( loop(bp,0, dy/16,dx,cs,0,RI) + + loop(bp,0,dy-1-dy/16,dx,cs,0,RI) + <= 2*loop(bp,0, dy/2 ,dx,cs,0,RI)+dx/8 ) Break; // not konvex + if( loop(bp,0 , 1+dy/16,dx,cs,0,RI) + dx/4 + <= loop(bp,dx-1, 1+dy/16,dx,cs,0,LE) ) Break; // Dec00 + + if( loop(bp,dx-1, dy/16,dx,cs,0,LE)>dx/8 ) + if( loop(bp,0 , dy/16,dx,cs,0,RI)<dx/16 ) Break; + if( loop(bp,dx-1,dy-1-dy/16,dx,cs,0,LE)>dx/8 ) + if( loop(bp,0 ,dy-1-dy/16,dx,cs,0,RI)<dx/16 ) Break; + if( get_bw(x1-dx/32,x1,y0,y0+dy/32,box1->p,cs,1) == 0 + && get_bw(x1-dx/32,x1,y1-dy/32,y1,box1->p,cs,1) == 0 +// && ( get_bw(x0,x0+dx/32,y0,y0+dy/32,box1->p,cs,1) == 1 + && ( get_bw(0,dx/32,0,dy/32,bp,cs,1) == 1 + || get_bw(x0,x0+dx/32,y1-dy/32,y1,box1->p,cs,1) == 1 ) ) Break; // ~D + + // search lowest inner white point + for(y=dy,j=x=0;x<dx;x++) { + i =loop(bp,x,dy-1 ,y1-y0,cs,0,UP); + i+=loop(bp,x,dy-1-i,y1-y0,cs,1,UP); + if (i<=y) { y=i; j=x; } + } i=y; + // italic a + for(y=dy-1-i;y<dy-1;y++) + if( num_cross(j,dx-1,y,y,bp,cs) > 1 ) ad=99*ad/100; // ~a \it a + for(y=0;y<dy-1-i;y++) + if( num_cross(0,dx-1,y,y,bp,cs) > 2 ) ad=98*ad/100; // ~a \it a + if (loop(bp,dx-1,dy-1,x1-x0,cs,0,LE)<dx/8) ad=98*ad/100; // \it a + if (loop(bp,dx-1, 0,x1-x0,cs,0,LE)<dx/8) ad=98*ad/100; // \it a + if (loop(bp,dx-1,dy-1-dy/8,x1-x0,cs,0,LE)+1+dx/16 + <loop(bp, 0,dy-1-dy/8,x1-x0,cs,0,RI)) + { ad=99*ad/100; MSG(fprintf(stderr,"ad=%d",ad);) } // \it a + if 
(loop(bp,dx-1,dy-1,y1-y0,cs,0,UP)+1+(dy+3)/8 + <loop(bp, 0,dy-1,y1-y0,cs,0,UP)) + { ad=98*ad/100; MSG(fprintf(stderr,"ad=%d",ad);) } // \it a + + if (abs(loop(bp,dx/2, 0,dy,cs,0,DO) + -loop(bp,dx/2,dy-1,dy,cs,0,UP))>dy/8 + || num_cross(0,dx-1, 0, 0,bp,cs) > 1 + || num_cross(0,dx-1,dy-1,dy-1,bp,cs) > 1 + ) ad=98*ad/100; // ~bq + + if( hchar && 2*y0<box1->m1+box1->m2 ) i=1; else i=0; + if (gchar) ad=99*ad/100; + bc='o'; + if( i ){ bc='O'; } + if ( bc=='O' && ad>99) ad=99; /* we can never 100% sure, 0O */ + Setac(box1,bc,ad); + if (bc=='O') Setac(box1,'0',ad); + if (bc=='o') Setac(box1,'0',98*ad/100); + break; + } + return box1->c; +} + +static wchar_t ocr0_pP(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,j,d,x,y,i1,i2,i3,i4,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + wchar_t bc=UNKNOWN; + + // --- test pP --------------------------------------------------- + for(ad=d=100;dx>2 && dy>3;){ // min 3x4 + DBG( wchar_t c_ask='p'; ) + if (sdata->holes.num > 2) Break; /* tolerant against a tiny hole */ + if( get_bw(0 , dx/2,3*dy/4,3*dy/4,bp,cs,1) != 1 ) Break; + if( get_bw(0 , dx/2, dy/2, dy/2,bp,cs,1) < 1 ) Break; + if( get_bw(dx/4, dx-1, dy/4, dy/4,bp,cs,1) != 1 ) Break; + i= loop(bp,dx-1,3*dy/4,dx,cs,0,LE); if (i<dx/4) Break; + if( num_cross(x1-3*i/4,x1-3*i/4, y0, y1-3*dy/16,box1->p,cs) != 2 ) + if( num_cross(x0+dx/2 ,x0+dx/2 , y0, y1-3*dy/16,box1->p,cs) != 2 ) + if( num_cross(x0+dx/2+1,x0+dx/2+1, y0, y1-3*dy/16,box1->p,cs) != 2 ) Break; + if( num_cross(0,dx-1,7*dy/8 ,7*dy/8 ,bp,cs) != 1 ) + if( num_cross(0,dx-1,7*dy/8-1,7*dy/8-1,bp,cs) != 1 ) Break; + if( num_cross(0,dx-1, dy/4 , dy/4 ,bp,cs) != 2 ) + if( num_cross(0,dx-1, dy/4-1, dy/4-1,bp,cs) != 3 ) // \it p with nice kurve + if( num_cross(0,dx-1, dy/4 , dy/4 ,bp,cs) != 2 ) + if( num_cross(0,dx-1, dy/4+1, dy/4+1,bp,cs) != 2 ) Break; + + i= 
loop(bp,0,dy/2,dx,cs,0,RI); if(i<1) i++; + if( num_cross(i-1,dx-1, dy/4 , dy/4 ,bp,cs) != 2 ) + if( num_cross(i-1,dx-1, dy/4+1, dy/4+1,bp,cs) != 2 ) Break; + + i1= loop(bp, 0,3*dy/8,dx,cs,0,RI); if (i1>=dx/2) ad=90*ad/100; + i2=i1+loop(bp,i1,3*dy/8,dx,cs,1,RI); // upper x-position of v line + i3= loop(bp, 0,7*dy/8,dx,cs,0,RI); + i4=i3+loop(bp,i3,7*dy/8,dx,cs,1,RI); // lower x-position of v line + // out_x(box1);printf(" p:"); + for ( y=dy/8; y<7*dy/8; y++ ){ + x=i2+ (8*y-3*dy)*(i4-i2)/(4*dy); // right limit of line + i= loop(bp,0,y,dx,cs,0,RI); if(i>x+dx/16) break; + } if ( y<7*dy/8 ) Break; + for ( x=0,j=y=dy/3; y<dy-dy/8; y++ ){ // suche unterkante (also 4x6) + i=loop(bp,dx-1,y,dx,cs,0,LE); + if ( i>x ) { x=i; j=y; } if(x>dx/2) break; + } if ( x<dx/2 || x>=dx) Break; + if( get_bw(3*dx/4,dx-1, y , dy-1,bp,cs,1) == 1 ) Break; + + i=num_hole (x0,x1,y0,y1-dy/5,box1->p,cs,NULL); + // j=num_hole (x0,x1,y0,y1 ,box1->p,cs,NULL); + j=sdata->holes.num; + + if (j!=1 && dx< 8) ad=96*ad/100; + if (j!=1 && dx>=8) ad=98*ad/100; + if (i==0 && j==0) ad=90*ad/100; /* some times there is a small gap */ + if (i>1 || j>1 || j>i) Break; + + // check for serif F + i= loop(bp,bp->x-1, bp->y/4, dx ,cs,0,LE); + i=i+loop(bp,bp->x-1-i,bp->y/4, dx ,cs,1,LE); + j= loop(bp,bp->x-1-i,bp->y/4,3*dy/4,cs,0,DO); + if (j>dy/2) ad=80*ad/100; // its an serif-F + + if( ((!hchar) && (!gchar)) || (hchar && gchar)) ad=95*ad/100; + bc='p'; + if( hchar && ((!gchar) || dy<14)) bc='P'; + if ( hchar && gchar) ad=98*ad/100; // \ss sz + if ((!hchar) && !gchar) ad=98*ad/100; + + Setac(box1,bc,ad); + break; + } + return box1->c; +} + +static wchar_t ocr0_qQ(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,j,d,x,y,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad; /* tmp-vars */ + + // --- test Q --------------------------------------------------- + for(ad=d=100;dx>2 && dy>4;){ // min 
3x4 + DBG( wchar_t c_ask='Q'; ) + if (sdata->holes.num > 2) Break; /* tolerant against a tiny hole */ + if( get_bw(x0 ,x0+dx/3,y0+dy/3,y0+dy/3,box1->p,cs,1) != 1 ) Break; + if( get_bw(x1-dx/3,x1 ,y0+dy/3,y0+dy/3,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2,x0+dx/2,y1-dy/3,y1, box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2,x0+dx/2,y0 ,y0+dy/4,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2,x0+dx/2,y0+dy/3,y1-dy/2,box1->p,cs,1) == 1 ) Break; + if( get_bw(x1 ,x1 ,y0 ,y0 ,box1->p,cs,1) == 1 ) Break; //alpha + if( num_cross(x0+dx/2,x0+dx/2,y0 , y1 ,box1->p,cs) < 2 ) Break; + if( num_cross(x0+dx/5,x1-dx/5,y0 , y0 ,box1->p,cs) != 1 ) // AND + if( num_cross(x0+dx/5,x1-dx/5,y0+1 , y0+1 ,box1->p,cs) != 1 ) Break; + if( num_cross(x0 ,x0 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) + if( num_cross(x0+1 ,x0+1 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) Break; + if( get_bw(x1 ,x1 ,y1-dy/8 , y1 ,box1->p,cs,1) == 0 ) + if( num_cross(x1 ,x1 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) + if( num_cross(x1-1 ,x1-1 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) Break; + // i=num_hole(x0,x1,y0,y1,box1->p,cs,NULL); + i=sdata->holes.num; + if(!i) Break; + if( i!=1 && (i!=2 || num_hole(x0,x1,y0+dy/2,y1,box1->p,cs,NULL)!=1) ) Break; + x=x1;y=y1; + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,LE,ST); if( x<x1-dx/2 ) Break; + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,ST,LE); + if( x<x1-dx/2 ) { if (gchar) ad=98*ad/100; else ad=90*ad/100; } + if( loop(bp,0 ,0 ,dx,cs,0,RI) + < loop(bp,0 ,2 ,dx,cs,0,RI) ) Break; + if( loop(bp,0 ,dy/8+2,dx,cs,0,RI) + +loop(bp,dx-1,dy/8+2,dx,cs,0,LE) > 5*dx/8 ) Break; // ~4 Okt00 + + x= loop(bp,dx-1,3*dy/8,dy,cs,0,LE); if( x>dx/4 ) Break; + if( loop(bp,dx-1-x,0 ,dy,cs,0,DO) + <= loop(bp,dx-2-x,0 ,dy,cs,0,DO) ) Break; // 4 + + if( loop(bp,dx-1,dy-2,dx,cs,0,LE) + <= loop(bp,dx-1,dy/2,dx,cs,0,LE) ) + if( loop(bp, 1,dy-1,dy,cs,0,UP) + <= loop(bp,dx/2,dy-1,dy,cs,0,UP) ) + if( loop(bp, 0,dy-2,dx,cs,0,RI)>dx/2 ) + if( loop(bp, 0, 0,dx,cs,0,RI)>dx/2 ) Break; // 4 + + if( loop(bp,dx-1,3*dy/4,dx,cs,0,LE) + + 
loop(bp, 0,3*dy/4,dx,cs,0,RI) + < loop(bp,dx-1,2*dy/4,dx,cs,0,LE) + + loop(bp, 0,2*dy/4,dx,cs,0,RI) ) ad=94*ad/100; // 4 + if( loop(bp,0 ,3*dy/4,dx,cs,1,RI) >= dx ) ad=94*ad/100; // 4 + + + if( loop(bp,dx-1,dy/3,dx,cs,0,LE)> dx/4 ) Break; + j=loop(bp,dx/2,dy-1,dy,cs,0,UP); + if (j>1 && j>dy/8) { + if( get_bw(0,dx/2,dy-1-j/2,dy-1-j/2,bp,cs,1) == 1 ) { // ~RA + if (j<5) ad=95*ad/100; + else Break; + } + } + + // italic a + for(i=0,y=0;y<dy/2;y++) + if( num_cross(0,dx-1,y,y,bp,cs) > 2 ) i++; if(i>dy/8) Break; // ~a \it a + if (i>0) ad=99*ad/100; + + // ~o look at the lower right side for falling line + for(j=x=0,y=dy/2;y<dy;y++){ + i=loop(bp,dx-1,y,dx,cs,0,LE);if(i>x){ x=i; } + if (x-i>j) j=x-i; + if( j>dx/16 ) Break; // falling line detected + } + if (j==0) Break; // no falling line => no Q + if (j<=dx/16) ad=98*ad/100; + if(y1<=box1->m3) ad=98*ad/100; // ~q no underlength! rare + if(!hchar) ad=96*ad/100; + Setac(box1,'Q',ad); + break; + } + // --- test q --------------------------------------------------- + for(ad=d=100;dx>2 && dy>3;){ // min 3x4 + DBG( wchar_t c_ask='q'; ) + if (sdata->holes.num > 2) Break; /* tolerant against a tiny hole */ + for ( y=y0; 2*y<=y0+y1; y++ ){ // detect ring + if( num_cross(x0,x1, y, y,box1->p,cs) == 2 ) Break; + } if (2*y>y0+y1) Break; /* < */ + for ( y=(y0+y1)/2; y<=y1; y++ ){ // detect vert line + if( num_cross(x0, x1, y, y,box1->p,cs) == 1 + && num_cross(x0,x0+dx/2, y, y,box1->p,cs) == 0 ) Break; + } if (y>y1) Break; /* O (y==y1 for 4x6font-q) */ + for ( x=0,j=y=y0+dy/3; y<=y1-dy/8; y++ ){ // detect baseline + i=loop(box1->p,x0,y,dx,cs,0,RI); + if ( i>x ) { x=i; j=y; } + if ( x>dx/2 ) break; + } if ( x<dx/2 || x>=dx) Break; + if (y1-j+1<dy/4) ad=96*ad/100; // ~\it{a} + if( num_cross(x0+x/2,x0+x/2, j, y1,box1->p,cs) != 0 ) ad=96*ad/100; // ~g + if( loop(box1->p,x0+dx/16,j,dy,cs,0,UP)<1+dy/16 ){ + ad=97*ad/100; + if (hchar || !gchar) Break; // 4 + } + if( loop(box1->p,x0+dx/16,j-dy/32-1,dy,cs,1,RI)>=dx-dx/8 + || 
loop(box1->p,x0+dx/16,j-dy/16-1,dy,cs,1,RI)>=dx-dx/8 ){ + ad=96*ad/100; // 4 + } + if( get_bw(x1-dx/3, x1, y0+dy/3, y0+dy/3,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0, x0+dx/3, y0+dy/3, y0+dy/3,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0, x0+dx/4, y1-dy/8, y1-dy/9,box1->p,cs,1) == 1 ) Break; + if( get_bw(x0, x0+dx/4, y1-dy/5, y1-dy/9,box1->p,cs,1) == 1 ) ad=99*ad/100; + if( num_cross(x0+dx/2,x0+dx/2, y0, j ,box1->p,cs) != 2 ) Break; + // if( num_hole (x0 ,x1 , y0, y1 ,box1->p,cs,NULL) != 1 ) + if (sdata->holes.num != 1) + { if (dx<16) ad=98*ad/100; else Break; } + if( num_hole (x0 ,x1 , y0, j ,box1->p,cs,NULL) != 1 ) + { if (dx<16) ad=98*ad/100; else Break; } + // ~\it g + if( loop(bp,0,dy-1-dy/4,dx,cs,0,RI)>5*dx/8 + && get_bw(dx/4,dx/4,dy-1-dy/4,dy-1,bp,cs,1)==1 ) Break; // ~\it g + // what about unsure m1-m4? + if(!gchar){ ad=ad*99/100; } // ~4 + if( hchar){ ad=ad*99/100; } // ~49 + Setac(box1,'q',ad); + break; + } + return box1->c; +} + +static wchar_t ocr0_iIjJ(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,j,d,x,y,i1,i2,i3,i4,i5,hchar=sdata->hchar,gchar=sdata->gchar, + ax,ay,bx,by,cx,cy,ex,ey, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ad,ya,yb,yc,yd,ye,yf,xa,xb, /* tmp-vars */ + (*aa)[4]=sdata->aa; /* the for line ends, (x,y,dist^2,vector_idx) */ + + // --- test i --------------------------------------------------- + // if(box1->dots==1) // what about \it neighbouring ij + for(ad=d=100;dy>3 && dx>0;){ // min 3x4 without dot + DBG( wchar_t c_ask='i'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + // ToDo: ':' check that high of dot is smaller than the vert. line! 
+ /* + * o <== ya + * o + * + * ooo <== yb + * o + * o + * o + * ooo + */ + ya=y0; + if (box1->dots!=1) ad=98*ad/100; + while(dy>3*dx && box1->m2){ // test for vertical i without detected dot + i= loop(bp,dx/2,dy-1 ,dy,cs,0,UP); + if (dy-1-i<box1->m3-2) break; + i+=loop(bp,dx/2,dy-1-i,dy,cs,1,UP); + // distance upper end to m2 > (m2-m1)/3 + if (3*abs(dy-1-i-box1->m2)>box1->m2-box1->m1) break; + if( get_bw(x0,x1,y0,(box1->m1+box1->m2)/2,box1->p,cs,1) == 1 ) + if( get_bw(x0,x1,y1-i ,y1-i ,box1->p,cs,1) == 0 + || get_bw(x0,x1,y1-i-1,y1-i-1,box1->p,cs,1) == 0 + || get_bw(x0,x1,y1-i-2,y1-i-2,box1->p,cs,1) == 0 ) + { + Setac(box1,'i',ad); + return 'i'; /* beleave me, thats an "i"! */ + } break; + } +// if( box1->dots!=1 ) Break; + if( box1->m2 && 2*y0>=box1->m2+box1->m1 ) ya=box1->m1; + +// out_x(box1); + for (y=ya;2*y<ya+y1;y++) if( get_bw(x0,x1,y,y,box1->p,cs,1) == 1 ) break; + if (2*y>=ya+y1) Break; // hmm, gap only, no dot? + ya=y; + if (box1->m2 && ya>box1->m2+2) Break; + for ( ;2*y<y1+ya;y++) if( get_bw(x0,x1,y,y,box1->p,cs,1) != 1 ) break; + if (2*y>=ya+y1) Break; // hmm no gap + for ( ;2*y<y1+ya;y++) if( get_bw(x0,x1,y,y,box1->p,cs,1) == 1 ) break; + yb=y; + if (5*yb>=3*ya+2*y1) ad=99*ad/100; // large gap + if (2*yb>= ya+ y1) ad=97*ad/100; // very large gap, ~: + if (5*yb>=2*ya+3*y1) Break; // huge gap, ~: + if (loop(bp,dx-1,y+(y1-ya+1)/32,dx,cs,0,LE)>dx/2) // unusual (right part of ouml) + ad=95*ad/100; + + // printf(" num_cross dy/2=%d %d\n",dy/2, num_cross(0,dx-1,dy/2,dy/2,bp,cs)); + // printf(" dots=%d\n",box1->dots); out_x(box1); + // \sl ~f. ! 
+ for (y=y1;y>ya;y--) if( get_bw(x0,x1,y,y,box1->p,cs,1) != 1 ) break; + if (y>(ya+3*y1)/4) Break; + if (y>(ya+2*y1)/3) ad=96*ad/100; + + y=(y1-yb+1)/2+yb-y0; /* only one vertical line, italic i is more an tall S */ + if( num_cross(0,dx-1,y,y,bp,cs) != 1 ) Break; + for(;y<=y1-y0;y++){ if( num_cross(0,dx-1,y,y,bp,cs) != 1 ) break; } yc=y; + for(;y<=y1-y0;y++){ if( num_cross(0,dx-1,y,y,bp,cs) != 2 ) break; } yd=y; + if( yd<3*(y1-yb+1)/4+yb-y0 ) Break; + y=(y1-yb+1)/2+yb-y0; + for(;y>0;y--){ if( num_cross(0,dx-1,y,y,bp,cs) != 1 ) break; } ye=y; + for(;y>0;y--){ if( num_cross(0,dx-1,y,y,bp,cs) != 2 ) break; } yf=y; + if( yf>(y1-yb+1)/4+yb-y0 ) Break; + if(yd>yc+2){ + xa=loop(bp, 0,yc-1,dx,cs,0,RI); + xb=loop(bp,dx-1,yc-1,dx,cs,0,LE); + if( + xb-loop(bp,dx-1,yc,dx,cs,0,LE) /* Dec00 */ + > xa-loop(bp, 0,yc,dx,cs,0,RI) ){ + y= loop(bp,dx-xb,yc-1,dy,cs,0,DO); + if(y>0){ + i=loop(bp,dx-xb-1,yc-1+y-1,dy,cs,0,DO); + if( i>0 ) y+=i-1; + } + if( yc-1+y < yd-1 ) Break; + } else { + y= loop(bp,11*xa/16,yc-1,dy,cs,0,DO); + if( yc-1+y < yd-2 ) Break; + } + } + if(yf<ye-2){ + x=loop(bp,0 ,ye+1,dx,cs,0,RI); + y=loop(bp,x-1,ye+1,dy,cs,0,UP); + i=loop(bp,x ,ye+2-y,dy,cs,0,UP); + if( i>0 ) y+=i-1; + if( ye+1-y > yf+1 ) Break; + } + if( 2*y0 <= box1->m1+box1->m2 + && loop(bp,0, 0,dx,cs,0,RI)+1 + < loop(bp,0,dx/2,dx,cs,0,RI) ) ad=97*ad/100; + + if( gchar ) // i is more often than j, be sure that realy correct Mai00 + if( loop(bp, 0,2*dy/4,dx,cs,0,RI) + -loop(bp,dx-1,2*dy/4,dx,cs,0,LE)>dx/8 ) Break; + + // could be a broken + or similar thing? 
+ if( 3 * ya > box1->m1 + 2*box1->m2 ) ad=90*ad/100; + + if( loop(bp,dx-1,3*dy/4,dx,cs,0,LE)>dx/2 + && loop(bp,dx-1, dy-1,dx,cs,0,LE)<dx/4 ) Break; // ~d=cl + + // test for é + if( dx>5 && num_cross(x0+dx/2,x0+dx/2, ya, y1 ,box1->p,cs) >= 3 ) + ad=95*ad/100; + + Setac(box1,'i',ad); + break; + } + // --- test j --------------------------------------------------- + // if(box1->dots==1) // what about \it neighbouring ij + for(ad=d=100;dy>4 && dx>0;){ // min 3x4 + DBG( wchar_t c_ask='j'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + ya=y0; + if( box1->m2 && 2*y0>=box1->m2+box1->m1 ) ya=box1->m1; + + for(y=ya;2*y<ya+y1;y++) if( get_bw(x0,x1,y,y,box1->p,cs,1) == 1 ) break; + if(2*y>=ya+y1) Break; // hmm only gap + ya=y; + if( box1->m2 && ya>box1->m2+2 ) Break; + for( ;2*y<y1+ya;y++) if( get_bw(x0,x1,y,y,box1->p,cs,1) != 1 ) break; + if(2*y>=ya+y1) Break; // hmm no gap + for( ;2*y<y1+ya;y++) if( get_bw(x0,x1,y,y,box1->p,cs,1) == 1 ) break; + if(2*y>=ya+y1) Break; // hmm very large gap + yb=y; + if( loop(bp,dx-1,y+(y1-ya+1)/32,dx,cs,0,LE)>dx/2 ) Break; // unusual (right part of ouml) + + // printf(" num_cross dy/2=%d %d\n",dy/2, num_cross(0,dx-1,dy/2,dy/2,bp,cs)); + // printf(" dots=%d\n",box1->dots); out_x(box1); + // \sl ~f. ! 
+ for(y=(ya+y1)/2;y<=y1;y++) if( get_bw(x0,x1,y,y,box1->p,cs,1) != 1 ) break; + if(y<=y1) Break; + + y=(y1-yb+1)/2+yb-y0; /* only one vertical line, italic i is more an tall S */ + if( num_cross(0,dx-1,y,y,bp,cs) >2 ) Break; + for(;y<=y1-y0;y++){ if( num_cross(0,dx-1,y,y,bp,cs) != 1 ) break; } yc=y; + for(;y<=y1-y0;y++){ if( num_cross(0,dx-1,y,y,bp,cs) != 2 ) break; } yd=y; + if( yd<3*(y1-yb+1)/4+yb-y0 ) Break; + y=(y1-yb+1)/2+yb-y0; + for(;y>0;y--){ if( num_cross(0,dx-1,y,y,bp,cs) != 1 ) break; } ye=y; + for(;y>0;y--){ if( num_cross(0,dx-1,y,y,bp,cs) != 2 ) break; } yf=y; + if( yf>(y1-yb+1)/4+yb-y0 ) Break; + if(yd>yc+2){ + xa=loop(bp, 0,yc-1,dx,cs,0,RI); + xb=loop(bp,dx-1,yc-1,dx,cs,0,LE); + if( + xb-loop(bp,dx-1,yc,dx,cs,0,LE) /* Dec00 */ + > xa-loop(bp, 0,yc,dx,cs,0,RI) ){ + y= loop(bp,dx-xb,yc-1,dy,cs,0,DO); + if(y>0){ + i=loop(bp,dx-xb-1,yc-1+y-1,dy,cs,0,DO); + if( i>0 ) y+=i-1; + } + if( yc-1+y < yd-1 ) Break; + } else { + y= loop(bp,11*xa/16,yc-1,dy,cs,0,DO); + if( yc-1+y < yd-2 ) Break; + } + } + if(yf<ye-2){ + x=loop(bp,0 ,ye+1,dx,cs,0,RI); + y=loop(bp,x-1,ye+1,dy,cs,0,UP); + i=loop(bp,x ,ye+2-y,dy,cs,0,UP); + if( i>0 ) y+=i-1; + if( ye+1-y > yf+1 ) Break; + } + if( 2*y0 <= box1->m1+box1->m2 + && loop(bp,0, 0,dx,cs,0,RI)+1 + < loop(bp,0,dx/2,dx,cs,0,RI) ) ad=97*ad/100; + if (loop(bp,0,dy-1,dx,cs,0,RI) + -loop(bp,0,dy-3,dx,cs,0,RI)>1+dx/16) ad=96*ad/100; // ~c + + if( gchar ) // i is more often than j, be sure that realy correct Mai00 + if( loop(bp, 0,2*dy/4,dx,cs,0,RI) + -loop(bp,dx-1,2*dy/4,dx,cs,0,LE)<=dx/8 ) Break; + // could be a broken + or similar thing? 
+ if( 3 * ya > box1->m1 + 2*box1->m2 ) ad=80*ad/100; + if (!gchar) ad=96*ad/100; + if( box1->dots!=1 ) ad=98*ad/100; + + Setac(box1,'j',ad); + + break; + } + // --- test I --------------------------------------------------- + for(ad=d=100;dy>4 && dy>dx && 5*dy>4*(box1->m3-box1->m2);){ // min 3x4 + DBG( wchar_t c_ask='I'; ) + if( box1->dots==1 ) Break; + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + + x =loop(bp,0, dy/2,dx,cs,0,RI); // konvex? divided Q + if(loop(bp,0,7*dy/8,dx,cs,0,RI) > x+dx/8) Break; + for( y=dy/16;y<dy-1-dy/16;y++ ) + if( num_cross(0, dx-1, y , y ,bp,cs) != 1 ) + if( num_cross(0, dx-1, y+dy/16 , y+dy/16 ,bp,cs) != 1 ) break; + if( y<dy-1-dy/16 ) Break; + x =loop(bp,0, dy/2,dx,cs,0,RI); + i5=loop(bp,x, dy/2,dx,cs,1,RI); // center width + for(y=dy/4;y<3*dy/4;y++ ){ // same width ? + x =loop(bp,0, y,dx,cs,0,RI); + x =loop(bp,x, y,dx,cs,1,RI); // width + if( abs(x-i5)>1+dx/8 ) break; + } if( y<3*dy/4 ) Break; + // out_x(box1); + + // upper max width + for(i2=i1=0,y=0;y<dy/4;y++ ){ + x =loop(bp,0, y,dx,cs,0,RI); + x =loop(bp,x, y,dx,cs,1,RI); if(x>i1){ i1=x;i2=y; } + } + for(i4=i3=0,y=3*dy/4;y<dy;y++ ){ + x =loop(bp,0, y,dx,cs,0,RI); + x =loop(bp,x, y,dx,cs,1,RI); if(x>i3){ i3=x;i4=y; } + } + if( abs(i3-i1)>1+dx/8 ) Break; // if i3>>i5 more sure! + if( i1>i5 ){ // look for edges else *80% + } + if(i1+1<i5 && !hchar) Break; // Jun00 + + // calculate upper and lower mass center + x =loop(bp,0, dy/8,dx,cs,0,RI); i1=x; + x+=loop(bp,x, dy/8,dx,cs,1,RI); i1=(i1+x-1)/2; + + x =loop(bp,0,dy-1-dy/8,dx,cs,0,RI); i2=x; + x+=loop(bp,x,dy-1-dy/8,dx,cs,1,RI); i2=(i2+x-1)/2; + x =loop(bp,0,dy-2-dy/8,dx,cs,0,RI); i=x; + x+=loop(bp,x,dy-2-dy/8,dx,cs,1,RI); i=(i+x-1)/2; if( i>i2 ) i2=i; + + // printf(" get_line(%d,%d) %d\n",i1,i2, + // get_line2(i1,dy/8,i2,dy-1-dy/8,bp,cs,100)); + if( get_line2(i1,dy/8,i2,dy-1-dy/8,bp,cs,100)<95 ) Break; + x =(i1-i2+4)/8; i1+=x; i2-=x; + + // upper and lower width (what about serifs?) 
+ y=dy/8; + x =loop(bp,i1, y+0,dx,cs,1,LE); i=x; + x =loop(bp,i1, y+1,dx,cs,1,LE); if(x>i)i=x; + x =loop(bp,i1, y+0,dx,cs,1,RI); j=x; + x =loop(bp,i1, y+1,dx,cs,1,RI); if(x>j)j=x; if(abs(i-j)>1+dx/8)Break; + x =loop(bp,i2,dy-y-1,dx,cs,1,LE); j=x; + x =loop(bp,i2,dy-y-2,dx,cs,1,LE); if(x>j)j=x; if(abs(i-j)>1+dx/8)Break; + x =loop(bp,i2,dy-y-1,dx,cs,1,RI); j=x; + x =loop(bp,i2,dy-y-2,dx,cs,1,RI); if(x>j)j=x; if(abs(i-j)>1+dx/8)Break; + + if(dy>15) // v024a4 + if( loop(bp,dx-1,dy/16 ,dx,cs,0,LE) + > loop(bp,dx-1,dy/4 ,dx,cs,0,LE)+1+dx/32 ) Break; // ~bad ) (thinn) + + for(i=0,y=dy/16;y<15*dy/16 && i<2;y++) + if( num_cross(0,dx-1,y,y,bp,cs) != 1 ) i++; + if( i>1 ) Break; + + if(!hchar){ // right part (bow) of h is never a l + if( get_bw(dx/4,dx/4, 0,dy/4,bp,cs,1) == 1 + && get_bw(dx/4,dx/4,dy/2,dy-1,bp,cs,1) == 0 ) Break; + if( loop(bp, 0,dy/4,dx,cs,0,RI)> dx/4 + && loop(bp,dx-1,dy/4,dx,cs,0,LE)<=dx/4 + && loop(bp, 1, 0,dy,cs,0,DO)<=dy/4 ) Break; // ~z + } + + if( get_bw(x1,x1,y0 ,y1 ,box1->p,cs,2) != 2 + && get_bw(x0,x1,y0 ,y0 ,box1->p,cs,2) != 2 + && get_bw(x0,x1,y1 ,y1 ,box1->p,cs,2) != 2 + && get_bw(x0,x0,y0+1,y1-1,box1->p,cs,1) != 1 ) Break; /* ~] */ + + if ( loop(bp,dx-1, dy/4,dx,cs,0,LE) > dx/2 + && loop(bp,dx-1,3*dy/4,dx,cs,0,LE) > dx/2 + && loop(bp, 0, dy/2,dx,cs,0,RI) < dx/4 ) Break; /* ~[ */ + + x =loop(bp, 0,dy/2,dx,cs,0,RI); // konvex/konkav? 
~() + i =loop(bp,dx-1,dy/2,dx,cs,0,LE); + if( loop(bp, 0,7*dy/8,dx,cs,0,RI) > x+dx/8 + && loop(bp, 0, dy/8,dx,cs,0,RI) > x+dx/8 + && loop(bp,dx-1,7*dy/8,dx,cs,0,LE) < i-dx/8 + && loop(bp,dx-1, dy/8,dx,cs,0,LE) < i-dx/8 ) Break; // ~( + if( loop(bp, 0,7*dy/8,dx,cs,0,RI) < x-dx/8 + && loop(bp, 0, dy/8,dx,cs,0,RI) < x-dx/8 + && loop(bp,dx-1,7*dy/8,dx,cs,0,LE) > i+dx/8 + && loop(bp,dx-1, dy/8,dx,cs,0,LE) > i+dx/8 ) Break; // ~) + if( loop(bp, 0, dy/8,dx,cs,0,RI) + -(dx-loop(bp,dx-1,7*dy/8,dx,cs,0,LE)) > dx/4 ) Break; // ~/ + if( loop(bp, 0, 0,dx,cs,0,RI) > dx/2 // ToDo: check for serifs + && loop(bp, 0, dy/8,dx,cs,0,RI) > dx/2 + && loop(bp,dx-1,dy-1 ,dx,cs,0,LE) > dx/2 + && loop(bp,dx-1,dy-1-dy/8,dx,cs,0,LE) > dx/2 ) ad=99*ad/100; // ~/ + + if (box1->m2 && 3*y0>box1->m1+2*box1->m2) + if( get_bw(x0+dx/8,x1-dx/8,box1->m1,(box1->m1+box1->m2)/2,box1->p,cs,1) == 1 ) + Break; // ~i + + if(i1+1<i5 && !hchar){ ad=65*ad/100; MSG({}) } // ~ slanted I + + // be sure only for serif + i3=loop(bp,dx-1, dy/4,dx,cs,0,LE); + i4=loop(bp, 0,dy-1-dy/4,dx,cs,0,RI); + if (i3<2 || i4<2 + || get_bw(x1-i3/4,x1-i3/4,y0,y0+dy/4,box1->p,cs,1) != 1 + || get_bw(x0+i4/4,x0+i4/4,y1-dy/4,y1,box1->p,cs,1) != 1 ) + { ad=99*ad/100; MSG(fprintf(stderr,"ad=%d",ad);) } // ToDo: improve it + if(!hchar){ ad=96*ad/100; MSG({}) } // ~bad_small_r + if (box1->m4 && y1<box1->m4) { // probably lower dot? + if ((dx>2 && get_bw(x0+1,x1-1,y1+1,box1->m4,box1->p,cs,1) == 1) + || (dx<3 && get_bw(x0 ,x1 ,y1+1,box1->m4,box1->p,cs,1) == 1)) { + ad=96*ad/100; + } + } // ~! 
+ // a---b + // I + // I + // c---e + // check against Z + for(bx=0,ax=dx,ay=by=y=0;y<dy/4;y++){ + i =loop(bp,dx-1 ,y,dx,cs,0,LE); if (dx-i-1>bx) { bx=dx-1-i; by=y; } + i+=loop(bp,dx-1-i,y,dx,cs,1,LE); if (dx-i-1<ax) { ax=dx-i; ay=y; } + } + for(cx=dx,ex=0,ey=cy=y=dy-1;y>dy-1-dy/4;y--){ + i =loop(bp,0,y,dx,cs,0,RI); if (i<cx) { cx=i; cy=y; } + i+=loop(bp,i,y,dx,cs,1,RI); if (i>ex) { ex=i; ey=y; } + } + x=(3*ax+cx)/4; y=(3*ay+cy)/4; i= loop(bp,x,y,dx,cs,0,RI); + x=(3*bx+ex)/4; y=(3*by+ey)/4; j= loop(bp,x,y,dx,cs,0,LE); + if (j>0 && (2*i>3*j || 3*i<2*j )) ad=99*ad/100; + if (j>0 && ( i>2*j || 2*i< j )) ad=97*ad/100; + i=loop(bp,0,0,dy,cs,0,DO); + if (i>dy/8 && i<dy/2) ad=99*ad/100; // ~1 + if (loop(bp,dx-1,0,dx,cs,0,LE) + -loop(bp, 0,0,dx,cs,0,RI)>dx/4) ad=96*ad/100; // ~l 5x7 + + if( get_bw(x0,x1,y0,y1,box1->p,cs,2) == 0 ) ad=99*ad/100; + if (gchar) ad=98*ad/100; // J + if (box1->m3 && 2*y1<=box1->m2+box1->m3) ad=96*ad/100; // ' + + Setac(box1,'I',ad); + break; + } + // --- test J --------------------------------------------------- 22Nov06 + for(ad=d=100;dy>4 && dy>=dx && dx>2;){ // min 3x4 ~Y)]d', + // rewritten for vectors 0.42 + int ld, i1, i2, i3, i4, i5, i6, i7; // line derivation + corners + DBG( wchar_t c_ask='J'; ) + if (sdata->holes.num > 0) Break; /* no hole */ + /* half distance to the center */ + d=2*sq(128/4); + /* now we check for the upper right end of the J */ + if (aa[3][2]>d) Break; /* [2] = distance */ + /* searching for 4 notches between neighbouring ends */ + +/* + type A B + + 6OOOO 6O5 + 7O5 7O + O O + O O + 2O 1O4 1O4 + OO 2OO + 3 3 +*/ + + /* Warning: aa0 can be left upper or left lower point for type B */ + /* get a point on the inner low left side of the J */ + i =nearest_frame_vector(box1,aa[3][3],aa[1][3],(x0+x1)/2,y0); + i1=nearest_frame_vector(box1,i ,aa[1][3], x1+dx,(y0+3*y1)/4); + /* get the most left point on the lower part of the J */ + i2=nearest_frame_vector(box1,i1,aa[3][3], x0-2*dx, y1-dy/8); + /* get a point on the middle of 
the bottom of the J */ + i3=nearest_frame_vector(box1,aa[1][3],aa[2][3], (x0+x1)/2, y1); + /* get a point on the outer low right side of the J */ + i4=nearest_frame_vector(box1,aa[1][3],aa[3][3], x1, (y0+2*y1)/3); + /* get a point on the outer right side below top serif */ + i5=nearest_frame_vector(box1,aa[2][3],aa[3][3], (x0+2*x1)/3,y0); + /* get a point on the left side of upper serif */ + i6=nearest_frame_vector(box1,aa[3][3],i1, x0, y0); + /* get a point on the most right left side of upper serif */ + i7=nearest_frame_vector(box1,i6,i1, x1, y0); + MSG(fprintf(stderr," i1-i7 %d %d %d %d %d %d %d",i1,i2,i3,i4,i5,i6,i7);) + + /* check the highest point on lower left area */ + i =nearest_frame_vector(box1,i1,i3,x0,y0); + if (box1->frame_vector[i ][1]-y0<dy/4) Break; // U + if (box1->frame_vector[i ][1]-y0<=dy/2) ad=97*ad/100; // imperfect a + /* check the lowest point on upper left area, serife? */ + j =nearest_frame_vector(box1,i6,i7,x0,y1); + if (box1->frame_vector[i ][1] + -box1->frame_vector[j ][1]<=dy/4) Break; // imperfect a + if (box1->frame_vector[i7][1]>y0+dy/4) Break; // not to low + if (box1->frame_vector[i1][1] + -box1->frame_vector[i7][1]<dy/2) Break; + if (box1->frame_vector[i4][1] + -box1->frame_vector[i5][1]<dy/2) Break; + if (box1->frame_vector[i7][0]<x0+dx/2) Break; + if (box1->frame_vector[i1][0] + -box1->frame_vector[i2][0]<=dx/8) Break; // ~1 + if (box1->frame_vector[i1][0] + -box1->frame_vector[i2][0]<=dx/4) ad=ad*99/100; // ~1 + if (box1->frame_vector[i6][1]>y0+dy/8) ad=99*ad/100; // ~1 + if (aa[0][2]==0) { // ]? + ad=99*ad/100; + if (aa[1][2]==0) ad=98*ad/100; + if (aa[2][2]<=aa[3][2]) ad=97*ad/100; + } + + /* check for left bow */ + for (j=i=i2;i!=i4;i=(i+1)%box1->num_frame_vectors[0]) { + if (box1->frame_vector[ i][0] /* [0]=x */ + <box1->frame_vector[i1][0]) break; /* curve? 
*/ + } if (i==i4) Break; // ~I + /* check for no right bow */ + for (j=i=i2;i!=i4;i=(i+1)%box1->num_frame_vectors[0]) { + if (box1->frame_vector[ i][0] /* [0]=x */ + >box1->frame_vector[i4][0]) break; + } if (i!=i4) Break; // ~I + /* check for no right bow */ + for (j=i=i5;i!=i6;i=(i+1)%box1->num_frame_vectors[0]) { + if (box1->frame_vector[ i][1] > y0+dy/4) break; + } if (i!=i6) Break; // ~Y + /* check if upper left and lower left points are joined directly */ + ld=line_deviation(box1, i7, i1); + MSG(fprintf(stderr," i7,i1 %d %d linedist= %d/%d",i7,i1,ld,2*sq(1024/4));) + if (ld >2*sq(1024/4)) Break; + if (5*ld >4*2*sq(1024/4)) ad=99*ad/100; // ~3 + if (6*ld >4*2*sq(1024/4)) ad=99*ad/100; // ~3 + if (7*ld >4*2*sq(1024/4)) ad=99*ad/100; // ~3 + if (8*ld >4*2*sq(1024/4)) ad=99*ad/100; // ~3 + /* check if lower right and upper right points are joined directly */ + ld=line_deviation(box1, i4, i5); + MSG(fprintf(stderr," i4,i5 %d %d linedist= %d/%d",i4,i5,ld,2*sq(1024/4));) + if (ld >2*sq(1024/4)) Break; + if (5*ld >4*2*sq(1024/4)) ad=99*ad/100; + + // J exists as gchar and ~gchar + if(!hchar){ ad=99*ad/100; } + Setac(box1,'J',ad); + break; + } + return box1->c; +} + +static wchar_t ocr0_brackets(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,j,d,x,y,i1,i2,i3,i4,i5,i6,hchar=sdata->hchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + (*aa)[4]=sdata->aa, /* corner-points, (x,y,dist^2,vector_idx) */ + ad,r1,r2; /* tmp-vars */ + wchar_t bc=UNKNOWN; + + // --- test > derived from xX --------------------------------------------------- + // rewritten for vectors v0.41 + for(ad=d=100;dx>1 && dy>2;){ // min 3x2 + // 0 - indizes 0,1,i1,i2 pointing to edges of the char + // \ . + // \ . 
+ // i1,i2 + // / + // / + // 1 + DBG( wchar_t c_ask='>'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + if (sdata->holes.num > 0 && (dx<6 || dy<6)) Break; /* # */ + /* calculate the half distance to the center */ + d=2*sq(128/4); + /* now we check for the 2 left ends of the > */ + if (aa[0][2]>d) Break; /* upper left end */ + if (aa[1][2]>d) Break; /* lower left end */ + if (aa[1][1]-aa[0][1]<dy/2) Break; + /* searching for 4 notches between neighbouring ends */ + + /* run along left side from top to bottom */ + for (j=i=aa[0][3];i!=aa[1][3];i=(i+1)%box1->num_frame_vectors[0]) { + if (box1->frame_vector[i][0] + >=box1->frame_vector[j][0]) j=i; /* notice most right vector */ + } if (j==i || j==aa[0][3]) Break; + /* calculate the distance to the center */ + x=box1->frame_vector[j][0]; + y=box1->frame_vector[j][1]; + if (2*x-aa[0][0]-aa[1][0]<dx) ad=99*ad/100; + if (abs(aa[0][1]+aa[1][1]-2*y)>(dy+2)) Break; + if ( aa[0][0]+aa[1][0]-2*x>=0) Break; + i1=j; + /* upper left end -> tip: d must hold the raw deviation (not a 0/1 comparison result), because the MSG, the limit check and the ad penalty below all use it as a distance */ + d=line_deviation(box1, aa[0][3], j); + /* check if upper left and center point are joined directly */ + MSG(fprintf(stderr,"x %d %d dist= %d/%d",x-x0,y-y0,d,sq(1024/4));) + if (d >sq(1024/4)) Break; ad=ad-d*100/sq(1024); + MSG(fprintf(stderr,"ad=%d", ad);) + d=line_deviation(box1, j, aa[1][3]); + /* check if lower left and center point are joined directly */ + MSG(fprintf(stderr,"x %d %d dist= %d/%d",x-x0,y-y0,d,sq(1024/4));) + if (d >sq(1024/4)) Break; ad=ad-d*100/sq(1024); + MSG(fprintf(stderr,"ad=%d", ad);) + + /* run along right side from bottom to top */ + for (j=i=aa[1][3];i!=aa[0][3];i=(i+1)%box1->num_frame_vectors[0]) { + if (box1->frame_vector[i][0] + >=box1->frame_vector[j][0]) j=i; /* notice most right vector */ + // MSG(fprintf(stderr,"search right: %d %d %d %d",i,j,aa[1][3],aa[0][3]);) + } if (j==i || j==aa[1][3]) Break; + /* calculate the distance to the center */ + x=box1->frame_vector[j][0]; + y=box1->frame_vector[j][1]; + if ( (aa[0][0]+aa[1][0]-2*x)>= 0 )
Break; + if (abs(aa[0][1]+aa[1][1]-2*y)>(dy+2)/4) Break; + if (aa[0][0]>=x || aa[1][0]>=x) Break; + i2=j; + d=line_deviation(box1, j, aa[0][3]); + /* check if upper left and center point are directly joined directly */ + MSG(fprintf(stderr,"x %d %d dist= %d/%d",x-x0,y-y0,d,sq(1024/4));) + if (d >sq(1024/4)) Break; ad=ad-d*100/sq(1024); + MSG(fprintf(stderr,"ad=%d", ad);) + d=line_deviation(box1, aa[1][3], j); + /* check if lower left and center point are directly joined */ + MSG(fprintf(stderr,"x %d %d dist= %d/%d",x-x0,y-y0,d,sq(1024/4));) + if (d >sq(1024/4)) Break; ad=ad-d*100/sq(1024); + MSG(fprintf(stderr,"ad=%d", ad);) + + /* + ToDo: calculate momentums or max derivations + along lines to distinguish )]}> + i1,i2 + */ + + if (sdata->gchar) ad=98*ad/100; + if (sdata->hchar) ad=99*ad/100; + bc='>'; + Setac(box1,bc,ad); + break; + } + // --- test /\\ ------------------------------------------------ +// if(bc==UNKNOWN) +// if(!box1->dots) + for(ad=d=100;dx>3 && dy>3;){ // min 4x4 for 4x6 font + DBG( wchar_t c_ask='/'; ) + if (sdata->holes.num > 0) Break; /* tolerant against a tiny hole */ +#if 1 + for(i=y=0;y<dy;y++){ + if( num_cross(0,dx-1,y,y,bp,cs) != 1 ) i++; + if( loop(bp, 0,y,dx,cs,0,RI) + + loop(bp,dx-1,y,dx,cs,0,LE)<3*dx/8 ) break; + } + if( y<dy ) Break; + if ( i>2 || (i>0 && dy<16)) Break; +#endif + /* get the center as exact as possible */ + i2=dx-1-loop(bp,dx-1,dy/2 ,dx,cs,0,LE) // be exact for small fonts + +dx-1-loop(bp,dx-1,dy/2+dy%2-1,dx,cs,0,LE) + + loop(bp, 0,dy/2 ,dx,cs,0,RI) + + loop(bp, 0,dy/2+dy%2-1,dx,cs,0,RI); + if (abs(i2-2*dx)>1+dx/2) Break; + if (abs(i2-2*dx)> dx/2) ad=99*ad/100; + + i1=loop(bp,dx-1,dy/16,dx,cs,0,LE); // right side + i3=loop(bp,dx-1,dy-1 ,dx,cs,0,LE); + i4=loop(bp, 0,0 ,dx,cs,0,RI); // left side + i6=loop(bp, 0,dy-1 ,dx,cs,0,RI); + i=(box1->m4+box1->m3)/2-box1->m2; + // + // out_x(box1);printf("() %d %d %d %d %d %d %d\n",i,i1,i2,i3,i4,i5,i6); + + // ~lI + for(i=i4,y=0;y<dy;y++){ + x=loop(bp,0 
,y,dx,cs,0,RI);if(abs(x-i)>dx/6+1 ) break; i=x; + } if( y<dy ) Break; + for(i=i1,y=0;y<dy;y++){ + x=loop(bp,dx-1,y,dx,cs,0,LE);if(abs(x-i)>dx/6+1 ) break; i=x; + } if( y<dy ) Break; + if(i1<=dx/8 && i6<=dx/8 && i4-(dx-i3)>dx/4 ) { Setac(box1,(bc='/'),ad);break; } + if(i4<=dx/8 && i3<=dx/8 && i6-(dx-i1)>dx/4 ) { Setac(box1,(bc='\\'),ad);break; } + Break; + } + // --- test ()<> ------------------------------------------------ +// if(bc==UNKNOWN) +// if(!box1->dots) + for(ad=d=100;dx>1 && dy>4;){ // min 3x4 + DBG( wchar_t c_ask='('; ) + if (sdata->holes.num > 1) {Break;}; /* tolerant against a tiny hole */ +#if 1 + for(i=y=0;y<dy;y++){ + if( num_cross(0,dx-1,y,y,bp,cs) != 1 ) i++; + if( loop(bp, 0,y,dx,cs,0,RI) + + loop(bp,dx-1,y,dx,cs,0,LE)<3*dx/8 ) break; + } + if( y<dy ) {Break;}; + if ( i>2 || (i>0 && dy<16)) {Break;}; +#endif + /* look for the extrema => r1..r2 */ + for(i=dx,r1=r2=y=dy/2-dy/8;y<=dy/2+dy/8;y++){ + j=loop(bp, 0,y,dx,cs,0,RI); if(j==i) r2=y; if(j<i){ r2=r1=y; i=j; } + j=loop(bp,dx-1,y,dx,cs,0,LE); if(j==i) r2=y; if(j<i){ r2=r1=y; i=j; } + } y=(r1+r2)/2; + i1=loop(bp,dx-1, dy/16,dx,cs,0,LE); + i2=loop(bp,dx-1,y ,dx,cs,0,LE); + i3=loop(bp,dx-1,dy-1-dy/16,dx,cs,0,LE); + i4=loop(bp, 0,dy/16 ,dx,cs,0,RI); + i5=loop(bp, 0,y ,dx,cs,0,RI); + i6=loop(bp, 0,dy-1-dy/16,dx,cs,0,RI); + if(dx>dy){ +// from Aug06 vector-version of greater is used +// if(i2==0 && 3*i5>dx && i4<=dx/8 && i6<=dx/8) { Setac(box1,(bc='>'),98);{Break;}; } + if(i5==0 && 3*i2>dx && i1<=dx/8 && i3<=dx/8) { Setac(box1,(bc='<'),98);{Break;}; } + } + if( dx > 2 && 9*dx>=5*dy ){ // 4x6 screen-font (3*5) + ad=98; + if (dx<8) ad=99*ad/100; + if (dx<6) ad=96*ad/100; + if( 2*dx > JOB->res.avX && 4*dx>dy ) ad=98; +// printf(" %d %d %d %d %d %d\n",i5,i1,i3,i2,i4,i6); + if( i5==0 && i1<=dx/8+1 && i3<=dx/8+1 && i1+i3<=dx/8+1 + && i2>=dx/2 && i4>=3*dx/4 && i6>=3*dx/4 ) { + if (2*loop(bp, 0, y/2,dx,cs,0,RI)+1+dx/16<i4+i5) ad=95*ad/100; + if (2*loop(bp, 0,dy-1-y/2,dx,cs,0,RI)+1+dx/16<i6+i5) ad=95*ad/100; 
+ Setac(box1,(bc='<'),ad);{Break;}; + } +/* obsolete code Aug06, will be removed if new code is stable + if( i2==0 && i4<=dx/8 && i6<=dx/8 + && i5>=dx/2 && i1>=3*dx/4 && i3>=3*dx/4 ) { + if (2*loop(bp,dx-1, y/2,dx,cs,0,LE)+1+dx/16<i1+i2) ad=95*ad/100; + if (2*loop(bp,dx-1,dy-1-y/2,dx,cs,0,LE)+1+dx/16<i3+i2) ad=95*ad/100; + Setac(box1,(bc='>'),ad);{Break;}; + } +*/ + } + + i1=loop(bp,dx-1,dy/16,dx,cs,0,LE); + i2=loop(bp,dx-1,dy/2 ,dx,cs,0,LE); + i3=loop(bp,dx-1,dy-1 ,dx,cs,0,LE); + i4=loop(bp, 0,0 ,dx,cs,0,RI); + i5=loop(bp, 0,dy/2,dx,cs,0,RI); + i6=loop(bp, 0,dy-1,dx,cs,0,RI); + i=(box1->m4+box1->m3)/2-box1->m2; + // + // out_x(box1);printf("() %d %d %d %d %d %d %d\n",i,i1,i2,i3,i4,i5,i6); + if(2*i2<i1+i3 && 2*i5>i4+i6 && 2*dx<dy && dy>=i){ + Setac(box1,(bc=')'),98);break; } + if(2*i2>i1+i3 && 2*i5<i4+i6 && 2*dx<dy && dy>=i){ + if(2*i2<=i1+i3+1 || 2*i5>=i4+i6-1) ad=98*ad/100; + if(2*i2<=i1+i3+2 || 2*i5>=i4+i6-2) ad=98*ad/100; + for(x=y=0;y<dy/4;y++){ + i=loop(bp,0,y,dx,cs,0,RI);if( i>x ) x=i; + } + for(y=0;y<(dy+2)/4;y++){ + i=loop(bp,0,y+dy/8,dx,cs,0,RI);if( i<x ) break; + } + if( y==(dy+2)/4 ) {Break;}; // ~l (left upper side must be convex) Jul00 + Setac(box1,(bc='('),ad); break; + } + Break; + } + // --------- test [] -------------------------------- + for(ad=d=98;dx>2 && dy>4 && dy>=2*dx;){ // (3,6) on 4x6 font + DBG( wchar_t c_ask=']'; ) + if (sdata->holes.num > 1) { Break;} /* tolerant against a tiny hole */ + if (!hchar) ad=97*ad/100; + for(y=0;y<dy;y++){ + if( num_cross(0,dx-1,y,y,bp,cs) != 1 ) break; + } if (y<dy) {Break;}; + if( get_bw(x0,x1,y0 ,y0 ,box1->p,cs,2) == 2 + && get_bw(x0,x1,y0+1,y0+1,box1->p,cs,2) == 2 ) {Break;}; + if( get_bw(x0,x1,y1 ,y1 ,box1->p,cs,2) == 2 + && get_bw(x0,x1,y1-1,y1-1,box1->p,cs,2) == 2 ) {Break;}; + if( get_bw(x0 ,x0,y0 ,y1 ,box1->p,cs,2) == 0 + || get_bw(x0+1 ,x0+1,y0 ,y1 ,box1->p,cs,2) == 0 ) + if( get_bw(x0+dx/2,x1,y0+dy/4,y1-dy/4,box1->p,cs,1) == 0 ) + { Setac(box1,(bc='['),ad);break; } + if( get_bw(x1 ,x1,y0 ,y1 
,box1->p,cs,2) == 0 + || get_bw(x1-1 ,x1-1,y0 ,y1 ,box1->p,cs,2) == 0 ) + if( get_bw(x0,x1-dx/2,y0+dy/4,y1-dy/4,box1->p,cs,1) == 0 ) + { Setac(box1,(bc=']'),ad);break; } + break; + } + +#if CODE_NOT_COMPLETED + // --- test ] ------- + for(ad=d=100;dx>2 && dy>3;){ + DBG( wchar_t c_ask=']'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + if (sdata->holes.num > 0) ad=98*ad/100; /* # */ + /* 1/8 distance to the center */ + d=2*sq(128/16); + /* now we check for the 4 ends of the x */ + if (aa[0][2]>d) Break; + if (aa[1][2]>d) Break; + if (aa[2][2]>d) Break; + if (aa[3][2]>d) Break; + if (aa[3][0]-aa[0][0]<7*dx/8) Break; + if (aa[2][0]-aa[1][0]<7*dx/8) Break; + if (aa[1][1]-aa[0][1]<7*dy/8) Break; + if (aa[2][1]-aa[3][1]<7*dy/8) Break; + if (aa[3][0]-aa[0][0]<2) Break; /* to small */ + if (aa[2][0]-aa[1][0]<2) Break; /* to small */ + MSG( fprintf(stderr," aa %d %d %d %d %d %d %d %d d %d %d %d %d",\ + aa[0][0]-x0,aa[0][1]-y0,aa[1][0]-x0,aa[1][1]-y0,\ + aa[2][0]-x0,aa[2][1]-y0,aa[3][0]-x0,aa[3][1]-y0,\ + aa[0][2],aa[1][2],aa[2][2],aa[3][2]);) + /* left and right vertical line */ + d=line_deviation(box1, aa[0][3], aa[1][3]); if (d>2*sq(1024/4)) Break; + ad=(100-(d-sq(1024)/2)/sq(1024)/4)*ad/100; + d=line_deviation(box1, aa[2][3], aa[3][3]); if (d>2*sq(1024/4)) Break; + + /* search uppermost left ^ */ + i1=nearest_frame_vector(box1,aa[1][3],aa[2][3], x0, y0); + x=box1->frame_vector[i1][0]; + y=box1->frame_vector[i1][1]; + if (y-y0 > 5*dy/8) Break; + if (x-x0 > 5*dx/8) Break; + /* search uppermost right ^ ~H */ + i3=nearest_frame_vector(box1,aa[1][3],aa[2][3], x1, y0); + if ( box1->frame_vector[i3][0]-x> dx/4 + && box1->frame_vector[i3][1]-y<=dy/8) Break; + + /* check if upper left and lower right point are joined directly */ + dbg[0]=d=line_deviation(box1,i1, aa[2][3]); if (d >2*sq(1024/4)) Break; + /* check if lower left and lower left point are joined directly */ + dbg[1]=d=line_deviation(box1, aa[1][3],i1); if (d >2*sq(1024/4)) Break; + + if 
(!hchar) ad=99*ad/100; + if ( gchar) ad=98*ad/100; // \sc N + ac=(wchar_t) ']'; + Setac(box1,ac,ad); + if (ad>=100) return ac; + break; + } +#endif + // --------- test ocr-a-[] -------------------------------- + if(bc==UNKNOWN) + for(ad=d=98;dx>5 && dy>7 && 2*dy>3*dx;){ // only for accurate font at the moment + DBG( wchar_t c_ask='['; ) + if (sdata->holes.num > 2) break; /* tolerant against a tiny hole */ + if (!hchar) ad=97*ad/100; + if( num_cross(0,dx-1, 0, 0,bp,cs) != 1 ) break; + if( num_cross(0,dx-1,dy-1,dy-1,bp,cs) != 1 ) break; + if ( loop(bp,dx-1,dy/2,dx,cs,0,LE) + +loop(bp, 0,dy/2,dx,cs,0,RI) <= dx/4 ) break; // O + for(y=dy/8;y<dy-dy/8;y++){ + if( num_cross(0,dx,y,y,bp,cs) != 2 ) break; + } if (y<dy-dy/8) break; + if( get_bw((3*x0+5*x1)/8,x1,y0+3*dy/16,y1-3*dy/16,box1->p,cs,1) == 0) + { Setac(box1,(bc='['),ad);break; } + if( get_bw(x0,(5*x0+3*x1)/8,y0+3*dy/16,y1-3*dy/16,box1->p,cs,1) == 0) + { Setac(box1,(bc=']'),ad);break; } + break; + } + // --------- test {} -------------------------------- + for(ad=d=99;dx>2 && dy>5 && 2*dy>3*dx;){ + DBG( wchar_t c_ask='{'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + if (!hchar) ad=97*ad/100; + for(y=0;y<dy;y++){ + if( num_cross(0,dx-1,y,y,bp,cs) != 1 ) break; + } if (y<dy) Break; + for(x=0;x<dx/2;x++){ + if( num_cross(dx-1-x,dx-1-x,0,dy-1,bp,cs) != 2 ) break; + } if (y<dx/2) Break; + if ( num_cross(dx-1,dx-1,dy/4,dy-1-dy/4,bp,cs) != 0 ) Break; + if ( num_cross( 0, 0,dy/4,dy-1-dy/4,bp,cs) != 1 ) Break; + if ( loop(bp,0,dy-1,dx,cs,0,RI)>3*dx/4 ) ad=99*ad/100; + if ( loop(bp,0, 0,dx,cs,0,RI)>3*dx/4 ) ad=99*ad/100; // < + if ( loop(bp,0, 0,dy,cs,0,DO)<dy/2-1 ) ad=98*ad/100; + if ( loop(bp,0,dy-1,dy,cs,0,UP)<dy/2-2 ) ad=98*ad/100; // ( + if ( loop(bp,dx-1,0,dx,cs,0,LE) + + loop(bp,dx-1,2,dx,cs,0,LE) + - 2*loop(bp,dx-1,1,dx,cs,0,LE) >=dx/8 ) ad=98*ad/100; // < + if ( loop(bp,dx-2,dy-1,dy,cs,0,UP)>dy/4 ) Break; // f + if ( get_bw(x0,x0,y0,y0+dy/4,box1->p,cs,1) == 1 + || 
get_bw(x0,x0,y1-dy/4,y1,box1->p,cs,1) == 1 ) Break; + Setac(box1,(bc='{'),ad);Break; + } + for(ad=d=99;dx>2 && dy>5 && 2*dy>3*dx;){ + DBG( wchar_t c_ask='}'; ) + if (!hchar) ad=97*ad/100; + for(y=0;y<dy;y++){ + if( num_cross(0,dx-1,y,y,bp,cs) != 1 ) break; + } if (y<dy) Break; + for(x=0;x<dx/2;x++){ + if( num_cross(x,x,0,dy-1,bp,cs) != 2 ) break; + } if (y<dx/2) Break; + if ( num_cross( 0, 0,dy/4,dy-1-dy/4,bp,cs) != 0 ) Break; + if ( num_cross(dx-1,dx-1,dy/4,dy-1-dy/4,bp,cs) != 1 ) Break; + if ( loop(bp,dx-1,dy-1,dx,cs,0,LE)>3*dx/4 ) {ad=99*ad/100;} + if ( loop(bp,dx-1, 0,dx,cs,0,LE)>3*dx/4 ) {ad=99*ad/100;} // > + if ( loop(bp,dx-1, 0,dy,cs,0,DO)<dy/2-1 ) {ad=98*ad/100;} + if ( loop(bp,dx-1,dy-1,dy,cs,0,UP)<dy/2-2 ) {ad=98*ad/100;} // ) + if ( loop(bp,0,0,dx,cs,0,RI) + + loop(bp,0,2,dx,cs,0,RI) + - 2*loop(bp,0,1,dx,cs,0,RI) >=dx/8 ) ad=98*ad/100; // < + if ( loop(bp,1,dy-1,dy,cs,0,UP)>dy/4 ) Break; // ??? + if ( get_bw(x1,x1,y0,y0+dy/4,box1->p,cs,1) == 1 + || get_bw(x1,x1,y1-dy/4,y1,box1->p,cs,1) == 1 ) Break; + Setac(box1,(bc='}'),ad);Break; + } + return box1->c; +} + +#if 0 +/* ---------- empty prototype function for copy and expand ---------- */ +static wchar_t ocr0_XXX(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,j,d,x,y,i0,i1,i2,i3,hchar=sdata->hchar,gchar=sdata->gchar, + x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1,cs=sdata->cs; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + ac,ad; /* tmp-vars */ + + // --- test XXX --------------------------------------------------- + return box1->c; +} +#endif + + +/* ----------------------- part9 -------------------------------- */ +static wchar_t ocr0p9(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,j,d,x,y,x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + i1,i2,i3,i4; /* tmp-vars */ + int xa,xb, /* used for store significant points of char */ + dbg[9]={0,0,0,0,0,0,0,0,0}, /* debugging space */ + 
ya,ad,cs=sdata->cs; + wchar_t ac,bc=UNKNOWN; // bestletter + int hchar; // char is higher than e + int gchar; // char has ink lower than m3 + // --- hchar --- gchar ------------------------- + hchar=0;if( 2*y0<=2*box1->m2-(box1->m2-box1->m1) ) hchar=1; + gchar=0;if( 2*y1>=2*box1->m3+(box1->m4-box1->m3) ) gchar=1; + // if the char is slightly moved down correction can be done + if ( y0<box1->m2 && y1>box1->m3 && 2*y1<box1->m3+box1->m4) // moved + if( 2*(y0-(y1-box1->m3))<=2*box1->m2-(box1->m2-box1->m1) ) hchar=1; + + /* reserved for the future */ + // --- test beta,\3,sz,"s --------------------------------------------- + if(bc==UNKNOWN && hchar) + for(ad=d=100;dx>3 && dy>6;){ // min 4x7 + DBG( wchar_t c_ask='S'; ) + if (sdata->holes.num > 2) break; /* tolerant against a tiny hole */ + /* this part is provisorium, should be changed! + a-\ + | d + b| / + | \ + -c / + */ + if( num_cross(x0 ,x1 ,y0+dy/4 ,y0+dy/4 ,box1->p,cs) != 2 + && num_cross(x0 ,x1 ,y0+dy/4+1,y0+dy/4+1,box1->p,cs) != 2 ) break; + for(i=1+dy/16,y=y0+dy/8;y<y1-dy/4 && i>0;y++){ + if( y<y1-6*dy/16 ){ if( num_cross(x0 ,x1 ,y,y,box1->p,cs) != 2 ) i--;} + else { if( num_cross(x0 ,x1 ,y,y,box1->p,cs) < 2 ) i--;} + if( get_bw(x0,x0+dx/2,y,y,box1->p,cs,1) == 0 ) i--; + if( y<y1-5*dy/16 ) + if( get_bw(x1-dx/2,x1,y,y,box1->p,cs,1) == 0 ) i--; + } if( i<=0 ) break; + // out_x(box1); + + for(y=y0+dy/3;y<y1-dy/3;y++){ + i =loop(box1->p,x1,y,dx,cs,0,LE); + if( i>=dx/8 ) break; + i+=loop(box1->p,x1-i,y,dx,cs,1,LE); + if( i>=dx/2 ) break; + } if( y>=y1-dy/3 ) break; + + for(y=y0+dy/5;y<y0+dy/3;y++) + if( get_bw(x1-dx/6,x1,y,y,box1->p,cs,1) == 1 ) break; + if( y>=y0+dy/3 ) break; + + for(y=y0+dy/2;y<y1;y++) + if( get_bw(x1-dx/6,x1,y,y,box1->p,cs,1) == 1 ) break; + if( y>=y1 ) break; + + for(y=y1-dy/3;y<y1-dy/8;y++){ + i=loop(box1->p,x1,y,dx,cs,0,LE); + if( i>dx/4 + && get_bw(x1-dx/8,x1-dx/8,y,y1,box1->p,cs,1) == 1 ) break; + } if( y<y1-dy/8 ) break; // ~Q + + if( box1->m3==0 || 2*y1<box1->m3+box1->m4 ) + if( 
loop(box1->p,x1,y1, dx,cs,0,LE)==0 + && loop(box1->p,x1,y1-dy/4,dx,cs,0,LE)>dx/8 ) break; // ~R + + + for(x=x0+dx/4;x<x1-dx/4;x++) + if( num_cross(x,x,y0,y1,box1->p,cs) == 3 ) break; + if( x>=x1-dx/4 ) break; + + i=loop(bp,dx/2,dy-1,dy,cs,0,UP)+dy/64; // Jul00 + for(x=dx/5;x<dx/2;x++) + if( loop(bp,x,dy-1,dy,cs,0,UP) > i ) break; + if( x==dx/2 ) break; + + x=x0+loop(bp,0,dy/4,dx,cs,0,RI); + for(;x<x1-dx/3;x++) + if( get_bw(x,x,y0,y0+dy/4,box1->p,cs,1) == 0 ) break; + if( x<x1-dx/3 ) break; + + if( !gchar ) + // if( num_hole( x0, x1, y0, y1,box1->p,cs,NULL) != 0 ) break; + if (sdata->holes.num != 0) break; + + bc=LATIN_SMALL_LETTER_SHARP_S; + Setac(box1,(wchar_t)bc,98); + break; + } + // --- test + ------------------------------------------------ + for(ad=d=100;dx>2 && dy>2;){ // min 3x3 + DBG( wchar_t c_ask='+'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + xa=(dx+1)/3-1; ya=(dy+1)/3-1; + xb=(dx+1)/4; + if( get_bw(x0,x0+xa,y0,y0+ya,box1->p,cs,1) == 1 ) Break; + if( get_bw(x0,x0+xa,y1-ya,y1,box1->p,cs,1) == 1 ) Break; + if( get_bw(x1-xb,x1,y0,y0+ya,box1->p,cs,1) == 1 ) Break; + if( get_bw(x1-xa,x1,y1-ya,y1,box1->p,cs,1) == 1 ) Break; + for(i=0,y=y0+ya;y<=y1-ya;y++){ // horizontal line + if( get_bw(x0+dx/9,x1-dx/9,y,y,box1->p,cs,2) == 0 ) { i=y; break; } + } + if (3*dx<2*dy) ad=99*ad/100; // ~t + if( !i ) Break; + ac=(wchar_t) '+'; + Setac(box1,ac,ad); + if (ad>=100) return ac; + break; + } + // --- test $ ------------------------------------------------ + for(ad=d=99;dx>3 && dy>5;){ // min 3x4 + DBG( wchar_t c_ask='$'; ) + if (sdata->holes.num != 2) Break; + + if( get_bw(x0,x0+dx/5,y0 ,y0+dy/18,box1->p,cs,1) == 1 ) Break; + if( get_bw(x0,x0+dx/9,y1-dy/23,y1 ,box1->p,cs,1) == 1 ) Break; + if( get_bw(x1-dx/9,x1,y0 ,y0+dy/18,box1->p,cs,1) == 1 ) Break; + if( get_bw(x1-dx/5,x1,y1-dy/23,y1 ,box1->p,cs,1) == 1 ) Break; + if( get_bw(x0,x0+dx/3,y0+dy/3 ,y0+dy/2 ,box1->p,cs,1) != 1 ) Break; + if( get_bw(x1-dx/3,x1,y1-dy/2 ,y1-dy/3 ,box1->p,cs,1) 
!= 1 ) Break; + i1=x0+loop(box1->p,x0,y0,dx,cs,0,RI); if( i1<x0+dx/3 || i1>x1-dx/5 ) Break; + i2=x0+loop(box1->p,x0,y1,dx,cs,0,RI); if( i2<x0+dx/5 || i2>i1 ) Break; + ad= get_line2(i1,y0,i2,y1,box1->p,cs,100)*ad/100; + // check upper left and lower right half circle, $ + for (x=0,i3=y=0;y<dy/3;y++) + if( num_cross(x0,x1,y0+dy/2-y,y0+dy/2-y,box1->p,cs) == 2 ) { + i = loop(box1->p,x0,y0+dy/2-y,dx,cs,0,RI); + if (i>x) { x=i; i3=y0+dy/2-y; } + } if (x<=dx/4) Break; + for (x=0,i4=y=0;y<dy/3;y++) + if( num_cross(x0,x1,y0+dy/2+y,y0+dy/2+y,box1->p,cs) == 2 ) { + i = loop(box1->p,x0,y0+dy/2+y,dx,cs,0,RI); + if (i>x) { x=i; i4=y0+dy/2+y; } + } if (x<=dx/4) Break; + if (ad<95) Break; + ac=(wchar_t) '$'; + Setac(box1,ac,ad); + if (ad>=100) return ac; + break; + } + // --- test & ------------------------------------------------ + for(ad=d=99;dx>3 && dy>4;){ /* 4x6 font */ + DBG( wchar_t c_ask='&'; ) + if (sdata->holes.num != 2) Break; + if( get_bw(x1-dx/9,x1,y0,y0+dy/4,box1->p,cs,1) == 1 ) Break; // g + if( loop(bp,dx/2,0,dy,cs,0,DO)>dy/2) Break; + i1=loop(bp,0,dy/8 ,dx,cs,0,RI); if (i1>dx/2) Break; + i =loop(bp,0,dy/4 ,dx,cs,0,RI); if (i1>dx/2) Break; if (i<i1) i1=i; + i3=loop(bp,0,dy-dy/4 ,dx,cs,0,RI); if (i3>dx/2) Break; + i =loop(bp,0,dy-dy/4-1,dx,cs,0,RI); if (i3>dx/2) Break; if (i<i3) i3=i; + if (i3>i1) Break; + for( i2=0, y=dy/4; y<=dy/2+1; y++ ){ + i =loop(bp,0,y,dx,cs,0,RI); if( i>i2 ) i2=i; + } + if(2*i2-i1-i3<1) Break; + // if( num_hole(x0,x1 ,y0,y1,box1->p,cs,NULL)!=2 ) Break; + if( num_hole(x0,x1-dx/4,y0,y1,box1->p,cs,NULL)!=2 ) Break; + if( num_cross(dx-1,dx-1,dy/4,dy-1,bp,cs) < 1 ) Break; + for( x=dx-1; x>=dx/2; x-- ){ + if( num_cross(x,x,dy/4,dy-1,bp,cs) > 1 ) break; + } if( x<=3*dx/4 && x<dx-2) Break; + if( num_cross(0,dx-1,dy-1-dy/4,dy-1-dy/4,bp,cs) > 3 ) { // glued ah + if (dy>15) { Break; } else ad=96*ad/100; + } + if (!hchar) ad=98*ad/100; + bc=(wchar_t) '&'; + Setac(box1,bc,ad); + if (ad>=100) return bc; + break; + } + // --- test \it & like \epsilon\tau 
------------------------------ + if(bc==UNKNOWN) + for(ad=d=100;dx>7 && dy>7;){ + DBG( wchar_t c_ask='&'; ) + if (sdata->holes.num > 2) break; /* tolerant against a tiny hole */ + if( num_cross(0,dx-1, dy/4, dy/4,bp,cs) != 3 ) break; + if( num_cross(0,dx-1, dy/2, dy/2,bp,cs) != 4 ) break; + if( num_cross(dx/2,dx-1,dy/2, dy/2,bp,cs) != 2 ) break; + if( num_cross(0,dx-1,3*dy/4,3*dy/4,bp,cs) != 2 ) break; + if( num_cross(0,dx-1, dy-1, dy-1,bp,cs) != 1 ) break; + if( num_cross( 0, 0,0,dy-1,bp,cs) != 1 ) break; + if( num_cross( dx/3, dx/3,0,dy-1,bp,cs) != 4 ) break; + if( num_cross(13*dx/16,13*dx/16,0,dy/8,bp,cs) != 0 ) break; + if( num_cross(4*dx/8,4*dx/8,dy-dy/4,dy-1,bp,cs) != 1 ) break; + if( num_cross(3*dx/8,3*dx/8,dy-dy/4,dy-1,bp,cs) != 1 ) break; + if( num_cross(5*dx/8,5*dx/8,dy-dy/4,dy-1,bp,cs) != 1 ) break; + if( num_hole(x0 ,(x0+x1)/2,y0, y1,box1->p,cs,NULL) != 1 ) break; + if( num_hole(x0+dx/8,x1-dx/4,y0,y1-dy/4,box1->p,cs,NULL) != 1 ) break; + ac=(wchar_t) '&'; + Setac(box1,ac,ad); + if (ad>=100) return ac; + break; + } + // --- test ? --------------------------------------------------- + for(ad=d=98;dx>2 && dy>5;){ // min 3x(4+2) + DBG( wchar_t c_ask='?'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + if ( num_cross(x0, x1, y0, y0, box1->p, cs) !=1 ) Break; // ~? + if ( num_cross(x0, x1, y1, y1, box1->p, cs) > 1 ) Break; // ~? + for(y=y0;y<y1;y++) // new y1 + if( get_bw(x0, x1, y, y,box1->p,cs,1) != 1 ) break; // lower end + if (2*y<y0+y1) Break; + i1=y1; + if (y==y1 && box1->m4) { // probably lower dot not catched in box? 
+ if (get_bw(x0+1,x1-1,y1+1,box1->m4,box1->p,cs,1) != 1 ) Break; + i1=box1->m4; + for(;i1>y1;i1--) // new y1 + if( get_bw(x0, x1,i1,i1,box1->p,cs,1) == 1 ) break; // lower dot + } + y--; i=y-y0+1; // new dy + for (y=0;y<dy/2;y++) + if( num_cross(x0, x1, y0+y, y0+y, box1->p, cs) == 2 ) break; + if (y==dy/2) Break; + // if( num_hole( x0, x1, y0, y1, box1->p,cs,NULL) > 0 ) Break; + if (sdata->holes.num > 0) Break; + for(y=y0+dy/2;y<=i1;y++) + if( get_bw(x0,x1,y,y,box1->p,cs,1) == 0 ) break; + if( y==i1 ) Break; + for( ;y<=i1;y++) + if( get_bw(x0,x1,y,y,box1->p,cs,1) == 1 ) break; + if( get_bw(x0,x1,y,y,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+7*dx/8,x1,y,i1,box1->p,cs,1) == 1 ) Break; // broken thin 2 + bc='?'; + Setac(box1,(wchar_t)bc,98); + return bc; + } + // --- test !| --------------------------------------------------- + for(ad=d=99; dy>4 && dy>2*dx;){ // min 3x4 + DBG( wchar_t c_ask='!'; ) + if (sdata->holes.num > 1) Break; /* tolerant against a tiny hole */ + // measure thickness + if (num_cross(x0,x1,y0 ,y0 ,box1->p,cs)!=1) Break; + if (num_cross(x0,x1,y0+dy/2,y0+dy/2,box1->p,cs)!=1) Break; + for(y=y0;y<y1;y++) // new y1 + if( get_bw(x0, x1, y, y,box1->p,cs,1) != 1 ) break; // lower end + if (2*y<y0+y1) Break; + if (y==y1 && y>box1->m3-dy/8) ad=ad*97/100; /* missing dot? */ + i1=y1; + if (y==y1 && box1->m4) { // probably lower dot not catched in box? + if ((dx>2 && get_bw(x0+1,x1-1,y1+1,box1->m4,box1->p,cs,1) == 1) + || (dx<3 && get_bw(x0 ,x1 ,y1+1,box1->m4,box1->p,cs,1) == 1 )) { + i1=box1->m4; + for(;i1>y1;i1--) // new y1 + if( get_bw(x0, x1,i1,i1,box1->p,cs,1) == 1 ) break; // lower dot + } + } i2=i1; + for( i1=0,y=y0;y<=i2;y++){ + i=num_cross(x0,x1,y,y,box1->p,cs); if(i>1) break; + if(i==0 && i1==0) i1=y; + } if(y<=i2 || i1==0 || i1<y0+dy/2) Break; + + if( loop(bp,dx-1,dy/8,dx,cs,0,LE) + -loop(bp,dx-1, 0,dx,cs,0,LE)>dx/4+1 ) Break; // f + + if (!hchar) ad=96*ad/100; + Setac(box1,(wchar_t)'!',ad); + break; + } + // --- test * five egdes (jagges? 
beames?) what is the right english word? ---- + for(ad=d=99;dx>2 && dy>4;){ + DBG( wchar_t c_ask='*'; ) + if (sdata->holes.num > 0) Break; /* tolerant against a tiny hole */ + if( num_cross(0,dx-1, 0,dy-1,bp,cs) != 1 + && num_cross(0,dx-1, 1,dy-2,bp,cs) != 1 ) Break; + if( num_cross(0,dx-1,dy-1,dy-1,bp,cs) != 2 + && num_cross(0,dx-1,dy-2,dy-2,bp,cs) != 2 ) Break; + x=dx/2;y=(6*dy+8)/16; // center point 6/8=6/2^3 rounded + /* upwarts from center */ + dbg[0]=i=get_line2(x,y,x ,0,bp,cs,100); if(i<95) Break; + if (dx<8) /* be exact on small fonts, where get_line2 returns 100 (ToDo change) */ + if (get_bw(x,x,0,y,bp,cs,2)==2) Break; + /* horizontal */ + dbg[1]=i=get_line2(0,y,dx-1,y,bp,cs,100); if(i<95) Break; + if (dy<8) + if (get_bw(0,dx-1,y ,y ,bp,cs,2)==2 + && get_bw(0,dx-1,y+1,y+1,bp,cs,2)==2) Break; + /* down (right) */ + i=get_line2(x,y,(5*dx+4)/8,dy-1,bp,cs,100); + j=get_line2(x,y,(6*dx+4)/8,dy-1,bp,cs,100); if(j>i) dbg[2]=i=j; + if(i<95) Break; + /* down (left) */ + dbg[3]=i=get_line2(x, y,(2*dx+4)/8,dy-1,bp,cs,100); if(i<95) Break; // straight up + /* check for lower gap at bottom */ + dbg[4]=i=get_bw( x, x,dy-1-dy/8,dy-1,bp,cs,1); if(i==1) Break; + dbg[5]=i=get_line2( dx/4,dy/4, 0,0,bp,cs,101); if(i<95) Break; // upper left gap + dbg[6]=i=get_line2(dx-1-dx/4,dy/4,dx-1,0,bp,cs,101); if(i<95) Break; // upper right gap + MSG(fprintf(stderr,"%d %d %d %d %d %d %d",dbg[0],dbg[1],dbg[2],dbg[3],dbg[4],dbg[5],dbg[6]);) + Setac(box1,(wchar_t)'*',ad); + break; + } + // --- test * six egdes (jagges? beames?) what is the right english word? 
---- + for(ad=d=100;dx>4 && dy>4;){ + DBG( wchar_t c_ask='*'; ) + if (sdata->holes.num > 0) Break; /* tolerant against a tiny hole */ + if( num_cross(0,dx-1, dy/8, dy/8,bp,cs) != 3 + && num_cross(0,dx-1, 1+dy/8, 1+dy/8,bp,cs) != 3) Break; + if( num_cross(0,dx-1,dy-2-dy/8,dy-2-dy/8,bp,cs) != 3) Break; + if( num_cross(0 , 0, 0,dy-1,bp,cs) != 2) Break; + if( num_cross(dx-1,dx-1, 0,dy-1,bp,cs) != 2) Break; + if( num_cross(0,dx-1,dy/2,dy/2,bp,cs) != 1) Break; + if( num_cross( 0 ,dx/8,dy/2,dy/2,bp,cs) != 0) Break; + if( num_cross(dx-1-dx/8,dx-1,dy/2,dy/2,bp,cs) != 0) Break; + if (dx>5) { + dbg[0]=i=get_line2(0,dy-2-dy/8,dx-1,dy/8,bp,cs,100); if(i<95) Break; // black upwarts beam + dbg[1]=i=get_line2(0,dy/8,dx-1,dy-2-dy/8,bp,cs,100); if(i<95) Break; // black downwards beam + /* check vertical line */ + dbg[2]=i=get_line2(dx/2,0,dx/2, dy-1,bp,cs,100); if(i<95) Break; + } + MSG(fprintf(stderr,"%d %d %d %d %d %d",dbg[0],dbg[1],dbg[2],dbg[3],dbg[4],dbg[5]);) + Setac(box1,(wchar_t)'*',98); + break; + } + // --- test @ - a popular char should be detectable! added in version v0.2.4a5 + if(bc==UNKNOWN) + for(ad=d=99;dx>5 && dy>7;){ + DBG( wchar_t c_ask='@'; ) + if (sdata->holes.num > 3) Break; /* tolerant against a tiny hole */ + if (loop(bp, 0,dy/2,dx,cs,0,RI)>dx/4) Break; + if (loop(bp,dx-1,dy/2,dx,cs,0,LE)>dx/4) Break; + if (loop(bp,dx/2,dy-1,dy,cs,0,UP)>dx/8) Break; + if (loop(bp,dx/2, 0,dy,cs,0,DO)>dx/8) Break; + /* ..@@@@..<- 8*10 example + .@@..@@. + @@....@@ + @@..@@@@< + @@.@@.@@ + @@.@@.@@ + @@..@@@. + @@...... 
+ .@@...@@ + ..@@@@@.<- */ + x=6*dx/16; + y=dy/2; + i=num_cross(0,dx-1,y,y,bp,cs); + if (i<3 || i>4) Break; + if( i != 4 && dx>8 ) ad=98*ad/100; + + i=num_cross(x,x,0,dy-1,bp,cs); if (i<2) Break; + if (i!=4) { j=num_cross(x+1,x+1,0,dy-1,bp,cs); + if (abs(4-j)<abs(i-4)) i=j; } + if (i!=4) { j=num_cross(x+2,x+2,0,dy-1,bp,cs); + if (abs(4-j)<abs(i-4)) i=j; } + if (i<3 || i>4) Break; + if (i!=4) ad=97*ad/100; + if( num_cross(0, x,y,y,bp,cs) != 2 ) Break; + if( num_cross(x,dx-1,y,y,bp,cs) != 2 ) Break; + if( num_cross(x,x,0, y,bp,cs) != 2 ) Break; + if( num_cross(x,x,y,dy-1,bp,cs) != 2 ) Break; + if (dx>7) { + // if( num_hole(x0,x1,y0,y1,box1->p,cs,NULL) != 1 ) Break; + if (sdata->holes.num != 1) Break; + if( num_hole(x0+dx/8,x1-3*dx/16,y0+dy/8,y1-dy/8,box1->p,cs,NULL) != 1 ) Break; + } + Setac(box1,(wchar_t)'@',ad); + break; + } + // --- test paragraph v0.2.6 + if(bc==UNKNOWN && hchar) + for(ad=d=100;dx>4 && dy>15;){ + DBG( wchar_t c_ask='$'; ) + if (sdata->holes.num > 3) break; /* tolerant against a tiny hole */ + if( get_bw( 0,dx/2,3*dy/4,3*dy/4,bp,cs,1) == 1 ) break; + if( get_bw(3*dx/4,dx-1,3*dy/4,3*dy/4,bp,cs,1) == 0 ) break; + if( get_bw( 0,dx/4, dy/4, dy/4,bp,cs,1) == 0 ) break; + if( get_bw( dx/2,dx-1, dy/4, dy/4,bp,cs,1) == 1 ) break; + if( get_bw(dx/2,dx/2, 0, dy/4,bp,cs,1) == 0 ) break; + if( get_bw(dx/2,dx/2,dy-1-dy/4, dy-1,bp,cs,1) == 0 ) break; + if( num_cross(dx/2,dx/2,0,dy-1,bp,cs) != 4 ) break; + if( num_cross(x0,x1,y0+dy/2,y0+dy/2,box1->p,cs) != 2 ) break; + if( num_hole( x0,x1,y0+dy/4,y1-dy/4,box1->p,cs,NULL) != 1 ) break; + Setac(box1,SECTION_SIGN,96); + break; // paragraph=0xA7=167 + } + + return bc; +} + +/* ----------------------- partx -------------------------------- */ +static wchar_t ocr0px(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int i,j,d,x,y,x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + i1,i2,i3,i4,j1,cs=sdata->cs; /* tmp-vars */ + int ya,ad; /* used for 
store significant points of char */ + wchar_t ac,bc=UNKNOWN; // bestletter + int hchar; // char is higher than e + int gchar; // char has ink lower than m3 + // --- hchar --- gchar ------------------------- + hchar=0;if( 2*y0<=2*box1->m2-(box1->m2-box1->m1) ) hchar=1; + gchar=0;if( 2*y1>=2*box1->m3+(box1->m4-box1->m3) ) gchar=1; + // if the char is slightly moved down correction can be done + if ( y0<box1->m2 && y1>box1->m3 && 2*y1<box1->m3+box1->m4) // moved + if( 2*(y0-(y1-box1->m3))<=2*box1->m2-(box1->m2-box1->m1) ) hchar=1; + + /* reserved for special chars, to test at the end */ + // --- test 'ff' --------------------------------------------------- + // ToDo: better check and call test 'f' and 'f' with subboxes + if( bc==UNKNOWN ) + for(ad=98;dx>4 && dy>6;){ // Dec00 body copied from H + DBG( wchar_t c_ask='f'; ) + if (sdata->holes.num > 2) break; /* tolerant against a tiny hole */ + if( num_cross(0,dx-1, dy/4 , dy/4 ,bp,cs) != 2 + && num_cross(0,dx-1,3*dy/16,3*dy/16,bp,cs) != 2 ) break; + if( num_cross(0,dx-1,3*dy/4 ,3*dy/4 ,bp,cs) != 2 + && num_cross(0,dx-1,3*dy/4+1,3*dy/4+1,bp,cs) != 2 ) break; + if( loop(bp,0 ,dy/8,dx,cs,0,RI) + + loop(bp,dx-1,dy/8,dx,cs,0,LE)>dx/2 ) break; // ~A + for( j1=0,i=1,y=y0+dy/10; y<y1-dy/10 && i; y++ ) // 2 vertikal lines + { j=loop(box1->p,x0 ,y,dx,cs,0,RI) + +loop(box1->p,x1 ,y,dx,cs,0,LE); + if( j>10*dx/16 ) i=0; if ( j>j1 ) j1=j; } + if( !i ) break; + for( x=dx/4; x<dx/2; x++ ){ // lower gap + y=loop(bp,x ,dy-1,dy,cs,0,UP); + if ( y > 3*dy/8 ) break; + if ( 10*y > dy ){ /* italic */ + i=loop(bp,x ,dy-y,dx,cs,0,RI); + if( i>1 && y+loop(bp,x+i-1,dy-y,dy,cs,0,UP)>3*dy/8 ) break; + } + } if( x>=dx/2 ) break; + x=loop(box1->p,x0 ,y1-dy/8,dx,cs,0,RI) + +loop(box1->p,x1 ,y1-dy/8,dx,cs,0,LE); + for( i=1,y=dy/4; y<dy-1-dy/4 && i; y++ ) // max - min width + { j=loop(bp,0 ,y,dx,cs,0,RI) + +loop(bp,dx-1,y,dx,cs,0,LE); if( j-x>dx/5 ) i=0; } + if( !i ) break; // ~K Jul00 + for( i=0,ya=y=y0+dy/4; y<y1-dy/3; y++ ) // horizontal line + { 
j=loop(box1->p,x0 ,y,dx,cs,0,RI); + j=loop(box1->p,x0+j,y,dx,cs,1,RI); if( j>i ) { i=j; ya=y; } } + if( i<=dx/2 ) break; ya-=y0; + if( num_cross(0,dx-1,ya ,ya ,bp,cs) != 1 + && num_cross(0,dx-1,ya+1,ya+1,bp,cs) != 1 ) break; /* Dec00 */ + for( y=ya; y<dy-dy/4; y++ ) // ~M Dec00 + if( num_cross(0,dx-1,y ,y ,bp,cs) > 2 + && num_cross(0,dx-1,y+1,y+1,bp,cs) > 2 ) break; + if ( y<dy-dy/4 ) break; + for(i=1,x=x0+dx/2;x<=x1-dx/4 && i;x++){ + if( get_bw( x, x,y0 ,y0+dy/4,box1->p,cs,1) == 0 ) i=0; + } if( !i ) break; + for(i=1,x=x0+dx/4;x<=x1-dx/4 && i;x++){ + if( get_bw( x, x,y1-dy/4,y1 ,box1->p,cs,1) == 0 ) i=0; + } if( i ) break; + for(i=1,x=x0+dx/4;x<=x1-dx/4 && i;x++){ + if( num_cross(x,x,y0+dy/8,y1-dy/8, box1->p,cs) == 1 ) i=0; + } if( i ) break; + for(i=1,y=y0;y<=y0+dy/4 && i;y++){ + if( num_cross(x0,x1,y,y, box1->p,cs) == 2 ) i=0; + } if( i ) break; + for(i=1,y=y1-dy/4;y<=y1 && i;y++){ + if( num_cross(x0,x1,y,y, box1->p,cs) == 2 ) i=0; + } if( i ) break; + if( num_cross(x0 ,x0+dx/8 ,y0+dy/8 ,y0 ,box1->p,cs) != 0 ) ad=96*ad/100; + if( get_bw(x1-dx/8, x1 , y0, y0+dy/8,box1->p,cs,1) != 1 ) break; + if( get_bw(x0 , x0+dx/8, y1-dy/8, y1,box1->p,cs,1) != 1 ) break; + i1=loop(bp,dx-1, dy/4,dx,cs,0,LE); if(i1>dx/2) break; + i2=loop(bp,dx-1, dy/2,dx,cs,0,LE); if(i2<i1-dx/4 || i2>i1+dx/8) break; + i3=loop(bp,dx-1,dy-1-dy/4,dx,cs,0,LE); if(i3<i2-dx/4 || i3>i2+dx/8) break; + if(abs(i1+i3-2*i2)>dx/16+1) break; + if( num_hole(x0,x1,y0+dy/4,y1,box1->p,cs,NULL) != 0 ) break; + if (!hchar) ad=96*ad/100; + if (!gchar) ad=99*ad/100; + ac=LATIN_SMALL_LIGATURE_FF; + Setac(box1,ac,ad); + break; + } + // --- test ae --------------------------------------------------- + if( bc==UNKNOWN ) + for(ad=98;dx>4 && dy>6;){ // provisorium + DBG( wchar_t c_ask=LATIN_SMALL_LETTER_AE; ) + if (sdata->holes.num > 4) Break; /* tolerant against a tiny hole */ + if( num_cross( dx/4,dx-1,3*dy/16,3*dy/16,bp,cs) != 2 + && num_cross(dx-1-dx/4,dx-1,3*dy/16,3*dy/16,bp,cs) != 1 ) Break; + if( 
num_cross(0,dx-1,3*dy/ 4,3*dy/ 4,bp,cs) < 2 ) Break; + if( num_cross(0,dx-1, 0, dy-1,bp,cs) < 3 ) Break; + if( num_cross(dx-1,0, 0, dy-1,bp,cs) < 3 ) Break; + if( num_cross(0,dx-1, dy/16, dy/16,bp,cs) < 2 ) + if( num_cross(0,dx-1,1+dy/16,1+dy/16,bp,cs) < 2 ) Break; + if( num_cross(0,dx-1,dy-1-dy/16,dy-1-dy/16,bp,cs) < 2 ) Break; + for( x=0,i2=y=dy/4; y<3*dy/4; y++ ){ + j=loop(bp,0,y,dx,cs,0,RI); if(j>x) { i2=y; x=j; } + } if( x<dx/4 || x>3*dx/4 ) Break; + for( x=0,i4=y=dy/4; y<3*dy/4; y++ ){ + j=loop(bp,dx-1,y,dx,cs,0,LE); if(j>x) { i4=y; x=j; } + } if( x<dx/4 || x>3*dx/4 ) Break; + for( x=0,i4=y=dy/8; y<3*dy/4; y++ ){ + j=loop(bp,dx-1 ,y,dx,cs,0,LE); + j=loop(bp,dx-1-j,y,dx,cs,1,LE); + if(j>x) { i4=y; x=j; } + } if( x<dx/4 ) Break; + if( num_hole(x0,x0+3*dx/4,y0+dy/4,y1,box1->p,cs,NULL) != 1 ) Break; + if( num_hole(x0+dx/2-1,x1,y0,y1-dy/4,box1->p,cs,NULL) != 1 ) Break; + ac=LATIN_SMALL_LETTER_AE; + Setac(box1,ac,ad); + if (ad>=100) return ac; + break; + + } + // --- test AE --------------------------------------------------- + if( bc==UNKNOWN ) + for(ad=98;dx>5 && dy>6;){ // provisorium + DBG( wchar_t c_ask=LATIN_CAPITAL_LETTER_AE; ) + if (sdata->holes.num > 2) Break; /* tolerant against a tiny hole */ + if( num_cross(0,dx-1,3*dy/16,3*dy/16,bp,cs) < 2 ) Break; + if( num_cross(0,dx-1,3*dy/ 4,3*dy/ 4,bp,cs) < 2 ) Break; + if( num_cross(0,dx-1, 0, dy-1,bp,cs) < 3 ) Break; + if( num_cross(0,dx-1, dy/16, dy/16,bp,cs) != 1 + && num_cross(0,dx-1, dy/32, dy/32,bp,cs) != 1 + && num_cross(0,dx-1, 0, 0,bp,cs) != 1 ) Break; + // check for upper horizontal line + j=loop(bp,dx-1 ,0,dx,cs,0,LE); x=j; + j=loop(bp,dx-1-j,0,dx,cs,1,LE); + i=loop(bp,dx-1 ,1,dx,cs,0,LE); if (i<x) x=i; + i=loop(bp,dx-1-i,1,dx,cs,1,LE); + if (i>j) j=i; + if (x>dx/8) Break; + if (j<dx/4) Break; + for( x=dx,i1=i3=0,i2=y=dy/4; y<3*dy/4; y++ ){ + j=loop(bp, 0,y,dx,cs,0,RI); if(j>x) break; x=j; + j=loop(bp, j,y,dx,cs,1,RI); if(j>i1) { i1=j; i2=y; } + j=loop(bp,dx-1 ,y,dx,cs,0,LE); + 
j=loop(bp,dx-1-j,y,dx,cs,1,LE); if(j>i3) { i3=j; i4=y; } + } if( y<3*dy/4 || i1<dx/4-1 || i3<dx/4-1) Break; + for( i1=i3=0,y=0; y<dy/8; y++ ){ + j=loop(bp,dx-1 , y,dx,cs,0,LE); + j=loop(bp,dx-1-j, y,dx,cs,1,LE); if(j>i1) { i1=j; } + j=loop(bp,dx-1 ,dy-1-y,dx,cs,0,LE); + j=loop(bp,dx-1-j,dy-1-y,dx,cs,1,LE); if(j>i3) { i3=j; } + } if( i1<=dx/4 || i3<=dx/4 ) Break; + for( x=dx-1-dx/8; x>dx/2; x-- ){ // look for right the E + if( num_cross(x,x, 0,dy-1,bp,cs) == 3 ) + if( num_cross(x,x, 0,dy/4,bp,cs) == 1 ) + if( num_cross(x-1,dx-1-dx/8,3*dy/4,3*dy/4,bp,cs) == 0 ) + if( num_cross(x,x,3*dy/4,dy-1,bp,cs) == 1 ) break; + } if (x<=dx/2) Break; // not found + if (sdata->holes.num != 1) Break; + if( num_hole(x0,x0+3*dx/4,y0,y1-dy/4,box1->p,cs,NULL) != 1 ) Break; + // if( num_hole(x0, x1,y0,y1 ,box1->p,cs,NULL) != 1 ) Break; + ac=LATIN_CAPITAL_LETTER_AE; + Setac(box1,ac,ad); + if (ad>=100) return ac; + break; + + } + // --- test /0 /o /O O_WITH_STROKE ----------------------------------------- + for(ad=99;dx>4 && dy>4;){ // provisorium + DBG( wchar_t c_ask=LATIN_SMALL_LETTER_O_WITH_STROKE; ) + if (sdata->holes.num > 3) Break; /* tolerant against a tiny hole */ + if( num_cross( 0,dx-1,dy/2,dy/2,bp,cs) != 3 ) Break; + if( num_cross(dx/2,dx/2, 0,dy-1,bp,cs) != 3 ) Break; + if (loop(bp,dx-1,3*dy/8,dx,cs,0,RI)>dx/8) Break; + if (loop(bp, 0,5*dy/8,dx,cs,0,RI)>dx/8) Break; + if( num_cross( 0,dx-1, 0, 0,bp,cs) > 2 ) Break; + if( num_cross(dx/4,dx-1, 0, 0,bp,cs) > 2 ) Break; + if( num_cross( 0,dx-1,dy-1,dy-1,bp,cs) > 2 ) Break; + if( num_cross( 0,3*dx/4,dy-1,dy-1,bp,cs) > 2 ) Break; + if( num_cross( 0, 0, 0,dy-1,bp,cs) > 2 ) Break; + if( num_cross(dx-1,dx-1, 0,dy-1,bp,cs) > 2 ) Break; + if( num_cross( 0, 0,dy/4,dy-1,bp,cs) > 2 ) Break; + if( num_cross(dx-1,dx-1, 0,3*dy/4,bp,cs) > 2 ) Break; + i1 =loop(bp,dx-1 , 0,dx,cs,0,LE); if( i1>dx/8 ) Break; + i1+=loop(bp,dx-1-i1, 0,dx,cs,1,LE); if( i1>dx/3 ) Break; i1=dx-1-i1; + i2 =loop(bp, 0,dy-1,dx,cs,0,RI); if( i2>dx/8 ) Break; + 
for(y=1;y<dy-1;y++){ + x=i1+y*(i2-i1)/dy-dx/8; if(x<0)x=0; + j=loop(bp,x,y,dx,cs,0,RI); if( j>3*dx/16 ) break; + } if( y<dy-1 ) Break; + if( num_cross( 0 ,dx/4,dy/2,dy/2,bp,cs) != 1 ) Break; + if( num_cross(dx-1-dx/4,dx-1,dy/2,dy/2,bp,cs) != 1 ) Break; + if( num_cross(dx/4,dx-1-dx/4,dy/2,dy/2,bp,cs) != 1 ) Break; + if (sdata->holes.num != 2) Break; + // if( num_hole(x0,x1,y0,y1,box1->p,cs,NULL) != 2 ) Break; + + if ( hchar && 2*y0<box1->m1+box1->m2 ) + ac=LATIN_CAPITAL_LETTER_O_WITH_STROKE; + else ac=LATIN_SMALL_LETTER_O_WITH_STROKE; + Setac(box1,ac,ad); + if (ad>=100) return ac; + break; + + } + // --- test /c /C C_WITH_STROKE CENT_SIGN -------------------------- + // here only the version with a continuously vertical line (not broken variant) + if( bc==UNKNOWN ) + for(ad=98;dx>4 && dy>4;){ // provisorium + DBG( wchar_t c_ask=CENT_SIGN; ) + if (sdata->holes.num > 2) Break; /* tolerant against a tiny hole */ + if( num_cross( 0,dx-1,dy/2,dy/2,bp,cs) != 2 ) Break; + if( num_cross(0,dx-1-dx/4,dy/2,dy/2,bp,cs) != 2 ) Break; + if( num_cross(dx/2,dx/2, 0,dy-1,bp,cs) != 3 ) Break; + if( num_cross( 0,dx-1, 0, 0,bp,cs) > 2 ) Break; + if( num_cross(dx/4,dx-1, 0, 0,bp,cs) > 2 ) Break; + if( num_cross( 0,dx-1,dy-1,dy-1,bp,cs) > 2 ) Break; + if( num_cross( 0,3*dx/4,dy-1,dy-1,bp,cs) > 2 ) Break; + if( num_cross( 0, 0, 0,dy-1,bp,cs) > 2 ) Break; + if( num_cross(dx-1,dx-1, 0,dy-1,bp,cs) > 3 ) Break; + if( num_cross( 0, 0,dy/4,dy-1,bp,cs) > 2 ) Break; + if( num_cross(dx-1,dx-1, 0,3*dy/4,bp,cs) > 3 ) Break; + i1 =loop(bp,dx-1 , 0,dx,cs,0,LE); if( i1>dx/4 ) Break; + i1+=loop(bp,dx-1-i1, 0,dx,cs,1,LE); if( i1>dx/4 ) Break; i1=dx-1-i1; + i2 =loop(bp, 0,dy-1,dx,cs,0,RI); if( i2>dx/4 ) Break; + for(y=0;y<dy;y++){ + x=i1+y*(i2-i1)/dy; if(x>dx/16+1) x-=dx/16+1; + j=loop(bp,x,y,dx,cs,0,RI); // fprintf(stderr,"\n x=%d j=%d",x,j); + if( j>(dx+4)/8 ) ad=96*ad/100; + if( j>(dx+2)/4 ) break; + } if( y<dy ) Break; + if( num_cross( 0 ,dx/4,dy/2,dy/2,bp,cs) != 1 ) Break; + if( 
num_cross(dx-1-dx/4,dx-1,dy/2,dy/2,bp,cs) != 0 ) Break; + if( num_cross(dx/4,dx-1-dx/4,dy/2,dy/2,bp,cs) != 1 ) Break; + // if( num_hole(x0,x1,y0,y1,box1->p,cs,NULL) != 1 ) Break; + if (sdata->holes.num != 1) Break; + + ac=CENT_SIGN; + Setac(box1,ac,ad); + if (ad>=100) return ac; + break; + + } + // --- test EURO_CURRENCY_SIGN ----------------------------------------- + if( bc==UNKNOWN ) + for(ad=98;dx>4 && dy>6;){ // provisorium + DBG( wchar_t c_ask='&'; ) + if (sdata->holes.num > 1) break; /* tolerant against a tiny hole */ + if( num_cross(dx/2,dx/2, 0,dy-1,bp,cs) != 4 ) break; + if( num_cross( 0,dx-1, 0, 0,bp,cs) != 1 ) break; + if( num_cross( 0,dx-1,dy-1,dy-1,bp,cs) != 1 ) break; + if( num_cross( 0,dx-1,dy/2,dy/2,bp,cs) != 1 ) break; + for(i=0,y=dy/4;y<dy-dy/4-1;y++){ // check if no gap on left side + x=loop(bp,0,y,dx,cs,0,RI); if( x>dx/4 ) break; + j=loop(bp,x,y,dx,cs,1,RI); if( j>i ) i=j; + } if( y<dy-dy/4-1 || i<dx/2 ) break; + for(y=dy/4;y<dy-dy/4-1;y++){ // check for right horizontal gap + x=loop(bp,dx-1,y,dx,cs,0,LE); if( x>dx/2 ) break; + } if( y>=dy-dy/4-1 ) break; + // if( num_hole(x0,x1,y0,y1,box1->p,cs,NULL) != 0 ) break; + if (sdata->holes.num != 0) break; + ac=EURO_CURRENCY_SIGN; + Setac(box1,ac,ad); + if (ad>=100) return ac; + break; + } + // --- test LETTER_C_WITH_CEDILLA --------------------------------------------------- + if (bc==UNKNOWN) + if (gchar) + for(ad=98;dx>3 && dy>6;){ // provisorium + DBG( wchar_t c_ask='c'; ) + if (sdata->holes.num > 0) break; /* no tolerant against tiny holes */ + j=loop(bp,dx-1,dy/16 ,dy,cs,0,LE); + x=loop(bp,dx-1,dy/16+1,dy,cs,0,LE); if (x<j) j=x; + if (3*x>dx) Break; // ~4 ocr-b + if( num_cross(0,dx-1,3*dy/16,3*dy/16,bp,cs) > 2 ) break; + if( num_cross(0,dx-1, 0, dy-1,bp,cs) < 2 ) break; + if( num_cross(0,dx-1, dy/16, dy/16,bp,cs) > 2 ) break; + for( x=dx,i2=y=dy/4; y<3*dy/4; y++ ){ + j=loop(bp,0,y,dx,cs,0,RI); if(j<x) { i2=y; x=j; } + } if( x>0 ) break; i1=x; + for( x=0,i4=y=dy/4; y<5*dy/8; y++ ){ + 
j=loop(bp,dx-1,y,dx,cs,0,LE); if(j>x) { i4=y; x=j; } + } if( x<dx/2 ) break; i3=x; + j =loop(bp,dx/2,0,dy,cs,0,DO); + j+=loop(bp,dx/2,j,dy,cs,1,DO); if(j>dy/4) break; + j =loop(bp,dx/2,j,dy,cs,0,DO); if(j<dy/2) break; + j =loop(bp,dx-1 ,dy-1-dy/8,dx,cs,0,LE); if(j<dx/4 || 4*j>3*dx) break; + j =loop(bp,dx-1-j/2,dy-1-dy/8,dy,cs,0,UP); if(j>dy/2) break; // ~() + // if( num_hole(x0,x1,y0,y1,box1->p,cs,NULL) != 0 ) break; + if (sdata->holes.num) break; + if( hchar ) ac= LATIN_CAPITAL_LETTER_C_WITH_CEDILLA; + else ac= LATIN_SMALL_LETTER_C_WITH_CEDILLA; + Setac(box1,ac,ad); + if (ad>=100) return ac; + break; + + } + // --- test # --------------------------------------------------- + for(ad=99;dx>4 && dy>4;){ // never sure? + DBG( wchar_t c_ask='#'; ) + if (sdata->holes.num > 2) Break; /* tolerant against a tiny hole */ + if (sdata->holes.num < 1) Break; + if( num_cross(0,dx-1, dy/8, dy/8,bp,cs) != 2 ) Break; + if( num_cross(0,dx-1,dy-1-dy/8,dy-1-dy/8,bp,cs) != 2 ) Break; + if( num_cross(0,dx-1, dy/2, dy/2,bp,cs) != 2 ) Break; + if( num_cross(0,dx/2, dy/2, dy/2,bp,cs) != 1 ) Break; + /* fat "#" have only small ends on left and right side, we tolerate this */ + j=loop(bp, 0,dy/8,dx,cs,0,RI); if(j<1 || j<dx/16) Break; if (j<dx/8) {ad=ad*96/100;} + j=loop(bp, 0,dy/2,dx,cs,0,RI); if(j<1 || j<dx/16 || j>=dx/2) Break; if (j<dx/8) {ad=ad*96/100;} + j=loop(bp,dx-1,dy/2,dx,cs,0,LE); if(j<1 || j<dx/16 || j>=dx/2) Break; if (j<dx/8) {ad=ad*96/100;} + j=loop(bp,dx-1,dy-1,dx,cs,0,LE); if(j<1 || j<dx/16) Break; if (j<dx/8) {ad=ad*96/100;} + for( i1=i3=0,y=dy/4; y<dy/2; y++ ){ + j=loop(bp,0, y,dx,cs,0,RI); if(j>3*dx/4) { i1=0; break; } + j=loop(bp,j, y,dx,cs,1,RI); if(j>i1) { i1=j; } + j=loop(bp,0,dy-1-y,dx,cs,0,RI); if(j>3*dx/4) { i1=0; break; } + j=loop(bp,j,dy-1-y,dx,cs,1,RI); if(j>i3) { i3=j; } + } + if (i1<dx-dx/4 || i3<dx-dx/4) Break; + if (i1<dx-dx/8) ad=97*ad/100; + if (i3<dx-dx/8) ad=97*ad/100; + if (sdata->holes.num != 1) {ad=95*ad/100;} + if( 
num_hole(x0+dx/8,x1-dx/8,y0+dy/8,y1-dy/8,box1->p,cs,NULL) != 1 ) Break; + // if( num_hole(x0 ,x1 ,y0 ,y1 ,box1->p,cs,NULL) != 1 ) Break; + + ac=(wchar_t) '#'; + if( gchar ) {ad=99*ad/100;} + Setac(box1,ac,ad); + if (ad>=100) return ac; + break; + } + // --- test bullet, full_box, grabbed cursor, ZapfDingBats_156 + if (bc==UNKNOWN) + for(ad=96;dx>4 && dy>4 && 2*dx>dy;){ // provisorium + DBG( wchar_t c_ask='#'; ) + if( get_bw(x0,x1,y0,y1,box1->p,cs,2) != 0 ) break; + ac=BULLET; + if (gchar && !hchar) ad=80*ad/100; + Setac(box1,ac,ad); + if (ad>=100) return ac; + break; + } + /* --- test | (vertical line, could be a I or l) --- */ + for(ad=99;dy>4 && 2*dx<dy;){ /* v0.44 */ + DBG( wchar_t c_ask='|'; ) + /* test if everything is filled black */ + if( get_bw(x0+dx/8,x1-dx/8,y0+dy/9,y1-dy/9,box1->p,cs,2) != 0 ) break; + /* more unsure if the borders are not exact */ + if( get_bw(x0 ,x0+dx/8,y0+dy/9,y1-dy/9,box1->p,cs,2) != 0 ) ad=99*ad/100; + if( get_bw(x1-dx/8,x1 ,y0+dy/9,y1-dy/9,box1->p,cs,2) != 0 ) ad=99*ad/100; + if( get_bw(x0+dx/8,x1-dx/8,y0 ,y0+dy/8,box1->p,cs,2) != 0 ) ad=99*ad/100; + if( get_bw(x0+dx/8,x1-dx/8,y1-dy/8,y1 ,box1->p,cs,2) != 0 ) ad=99*ad/100; + if (3*dx<dy) ad=98*ad/100; + if (4*dx<dy) ad=99*ad/100; + if (box1->m2 && 2*y1> box1->m2+box1->m3) Break; + if (box1->m2 && 3*y1>2*box1->m2+box1->m3) ad=95*ad/100; + ac='|'; + if (!hchar) ad=98*ad/100; + Setac(box1,ac,ad); + break; + } + // --- test % --------------------------------------------------- + for(ad=100;dx>5 && dy>7;){ // provisorium + DBG( wchar_t c_ask='%'; ) + if (sdata->holes.num > 2) break; /* tolerant against a tiny hole */ + if( num_cross(x0,x1 ,y0+dy/4,y0+dy/4,box1->p,cs) != 3 + && num_cross(x0,x1 ,y0+dy/8,y0+dy/8,box1->p,cs) != 3 ) Break; + if( num_cross(x0,x1+dx/4,y1-dy/4,y1-dy/4,box1->p,cs) != 3 + && num_cross(x0,x1+dx/4,y1-dy/8,y1-dy/8,box1->p,cs) != 3 ) Break; + if( num_cross(x0,x1, y0, y1,box1->p,cs) < 4 + && num_cross(x0+dx/8,x1, y0, y1,box1->p,cs) < 4 + && num_cross(x0,x1+dx/4, y0, 
y1,box1->p,cs) < 4 + && dx>7 && dy>15) Break; + if( num_cross(x0,x1, y0, y1,box1->p,cs) !=5 ) ad=99*ad/100; + + if (dx>7 && dy>12) { + if( num_hole(x0 ,x1 ,y0,y1-dy/4,box1->p,cs,NULL) != 1 ) Break; + if( num_hole(x0+dx/4,x1+dx/4,y0+dy/4,y1,box1->p,cs,NULL) != 1 ) Break; + if( num_hole(x0 ,x1+dx/4,y0,y1 ,box1->p,cs,NULL) != 2 ) Break; + } else ad=98*ad/100; + // use box1->p instead of b, because % is a sum of 3 objects + if ( loop(box1->p,x0,y0 ,dx,cs,0,RI) + <= loop(box1->p,x0,y0+dy/16+1,dx,cs,0,RI) ) ad=96*ad/100; // X + if ( loop(box1->p,x1,y1 ,dx,cs,0,LE) + <= loop(box1->p,x1,y1-1-dy/16,dx,cs,0,LE) ) ad=96*ad/100; // X + for (x=0;x<dx;x++) { /* look for a vertical line and break if found */ + if ( get_bw(x0+x,x0+x,y0+dy/8,y1-dy/8,box1->p,cs,2) != 2 ) break; + } if (x<dx) Break; // ~gluedVI + if (gchar) ad=98*ad/100; + ac=(wchar_t) '%'; + Setac(box1,ac,ad); + if (ad>=100) return ac; + break; + } + // --- test Omega --------------------------------------------------- + for(ad=d=99;dx>7 && dy>7;){ // min 3x4 + DBG( wchar_t c_ask=GREEK_CAPITAL_LETTER_OMEGA; ) + if( get_bw(x0 , x0+dx/2,y0+dy/2 , y0+dy/2,box1->p,cs,1) != 1 ) Break; + if( get_bw(x1-dx/2 , x1 ,y0+dy/2 , y0+dy/2,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2 , x0+dx/2,y0 , y0+dy/2,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2 , x0+dx/2,y0+dy/3 , y1-dy/3,box1->p,cs,1) != 0 ) Break; + + if( num_cross(x0+dx/2,x0+dx/2,y0 , y1-dy/3,box1->p,cs) != 1 ) Break; + if( num_cross(x0+dx/3,x1-dx/3,y0 , y0 ,box1->p,cs) != 1 ) // AND + if( num_cross(x0+dx/3,x1-dx/3,y0+1 , y0+1 ,box1->p,cs) != 1 ) Break; + if( num_cross(x0+dx/3,x1-dx/3,y1 , y1 ,box1->p,cs) != 2 ) // against "rauschen" + if( num_cross(x0+dx/3,x1-dx/3,y1-1 , y1-1 ,box1->p,cs) != 2 ) Break; + if( num_cross(x0 ,x0 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) + if( num_cross(x0+1 ,x0+1 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) Break; + if( num_cross(x1 ,x1 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) + if( num_cross(x1-1 ,x1-1 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) Break; 
+ if (sdata->holes.num) Break; + // if( num_hole(x0,x1,y0,y1,box1->p,cs,NULL) != 0 ) break; + + if( loop(bp,0 ,0 ,x1-x0,cs,0,RI)<= + loop(bp,0 ,2 ,x1-x0,cs,0,RI) ) Break; + if( loop(bp,dx/2,dy-dy/4,x1-x0,cs,0,RI)>dx/4 + || loop(bp,dx/2,dy-dy/4,x1-x0,cs,0,LE)>dx/4 ) Break; + if( loop(bp,dx/2,3*dy/8,x1-x0,cs,0,RI)<dx/4 + || loop(bp,dx/2,3*dy/8,x1-x0,cs,0,LE)<dx/4 ) Break; + + i=loop(bp,0,dy-1-dy/16,x1-x0,cs,0,RI); if(i>dx/8) Break; + x=loop(bp,i,dy-1-dy/16,x1-x0,cs,1,RI); i+=x; if(i<3*dx/8 || i>dx/2) Break; + x=loop(bp,i,dy-1-dy/16,x1-x0,cs,0,RI); i+=x; if(i<dx/2 || i>5*dx/8) Break; + x=loop(bp,i,dy-1-dy/16,x1-x0,cs,1,RI); i+=x; if(i<7*dx/8) Break; + + /* look for a vertikal gap at lower end */ + for( x=dx/4;x<3*dx/4;x++ ){ + i=loop(bp,x,dy-1,y1-y0,cs,0,UP); + if( i>3*dy/4 ) break; + } + if( x>=3*dx/4 ) Break; + + if( !hchar ) ad=60*ad/100; + bc=GREEK_CAPITAL_LETTER_OMEGA; + Setac(box1,bc,ad); + break; + } + + return bc; +} + +// -------------------- OCR engine ;) ---------------------------- +wchar_t ocr0(struct box *box1, pix *bp, int cs){ + // pix p=*(box1->p); + int i,j,d,x,y,x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1; + int dx=x1-x0+1,dy=y1-y0+1, /* size */ + rx,ry,r1,r2,i1,i2,ad; /* tmp-vars */ + // ad,ac will be used in future + wchar_t bc = UNKNOWN; // bestletter + wchar_t um = SPACE; // modifier '" + int hchar; // char is higher than e + int gchar; // char has ink lower than m3 + int aa[4][4]; /* corner points, see xX, (x,y,dist^2,vector_idx) v0.41 */ + ocr0_shared_t sdata; // data used in all subfunctions + + sdata.box1=box1; + sdata.bp=bp; + sdata.cs=cs; + // --- hchar --- gchar ------------------------- + hchar=0;if( y0 < box1->m2-(box1->m2-box1->m1)/2 ) hchar=1; + gchar=0;if( y1 > box1->m3+(box1->m4-box1->m3)/2 ) gchar=1; + // if the char is slightly moved down correction can be done + if ( y0<box1->m2 && y1>box1->m3 && 2*y1<box1->m3+box1->m4) // moved + if( 2*(y0-(y1-box1->m3))<=2*box1->m2-(box1->m2-box1->m1) ) hchar=1; + + sdata.hchar=hchar; + 
sdata.gchar=gchar; + + /* search for nearest points to the 4 courners, typical for xX */ + /* this is faster as calling nearest_frame_vector 4 times */ + aa[0][0]=aa[1][0]=aa[2][0]=aa[3][0]=(x0+x1)/2; /* set to center */ + aa[0][1]=aa[1][1]=aa[2][1]=aa[3][1]=(y0+y1)/2; /* set to center */ + aa[0][2]=aa[1][2]=aa[2][2]=aa[3][2]=2*sq(128); /* distance to box edges */ + aa[0][3]=aa[1][3]=aa[2][3]=aa[3][3]=0; /* vector index */ + /* searching for 4 diagonal line ends */ + for (i=0;i<box1->num_frame_vectors[0];i++) { + x=box1->frame_vector[i][0]; /* take a vector */ + y=box1->frame_vector[i][1]; + /* distance to upper left end, normalized to 128 */ + j=0; d=sq((x-x0)*128/dx)+sq((y-y0)*128/dy); + // fprintf(stderr," setaa i= %2d xy= %3d %3d d=%5d aa[3]=%2d\n",i,x-x0,y-y0,d,aa[0][3]); + if (d<aa[j][2]) { aa[j][0]=x; aa[j][1]=y; aa[j][2]=d; aa[j][3]=i; } + /* distance to lower left end */ + j=1; d=sq((x-x0)*128/dx)+sq((y-y1)*128/dy); + if (d<aa[j][2]) { aa[j][0]=x; aa[j][1]=y; aa[j][2]=d; aa[j][3]=i; } + /* distance to lower right end */ + j=2; d=sq((x-x1)*128/dx)+sq((y-y1)*128/dy); + if (d<aa[j][2]) { aa[j][0]=x; aa[j][1]=y; aa[j][2]=d; aa[j][3]=i; } + /* distance to upper right end */ + j=3; d=sq((x-x1)*128/dx)+sq((y-y0)*128/dy); + if (d<aa[j][2]) { aa[j][0]=x; aa[j][1]=y; aa[j][2]=d; aa[j][3]=i; } + } + for (i=0;i<16;i++) sdata.aa[i/4][i%4]=aa[i/4][i%4]; + + /* extract number position and size of holes and store in a table + * - hole coordinates are relative to box (x-x0,y-y0) + */ + sdata.holes.num=0; + if (box1->num_frames>0) // speedup v0.42 + num_hole(x0,x1,y0,y1,box1->p,cs,&sdata.holes); // call once + // printf(" num_holes=%d\n",sdata.holes.num); + + /* + after division of two glued chars, boundaries could be wrong, + check this first (ToDo: only if a flag set?) 
+ */ + if (2*y0 < box1->m2+box1->m3) + if (box1->m4>box1->m3 && 2*box1->y1>box1->m4+box1->m3){ + /* could be a "I" from divided "Ij" or "Ig" */ + for(y=(box1->m3+box1->m2)/2;2*y<box1->m3+box1->m4;y++) + if( get_bw(x0,x1,y,y,box1->p,cs,1)==0 ) break; + if(2*y<box1->m3+box1->m4) + if( get_bw((x0+x1)/2,(x0+x1)/2,y,box1->m4,box1->p,cs,1)==0 ){ + /* be sure, ~_ */ + if (y>y0) y1=box1->y1=y; + } + } + + DBG( IFV fprintf(stderr,"\nDBG L%d (%d,%d): ",__LINE__,box1->x0,box1->y0); ) + DBG( IFV out_b(box1,sdata.bp,0,0,dx,dy,160); ) + DBG( IFV fprintf(stderr,"# aa[] %d %d %d %d %d %d %d %d (4 corners)" + " d= %d %d %d %d", + aa[0][0]-x0,aa[0][1]-y0,aa[1][0]-x0,aa[1][1]-y0, + aa[2][0]-x0,aa[2][1]-y0,aa[3][0]-x0,aa[3][1]-y0, + aa[0][2], aa[1][2], aa[2][2], aa[3][2]);) + DBG( IFV fprintf(stderr,"\n# holes %d gchar=%d hchar=%d",sdata.holes.num, gchar, hchar);) + + // --- test thin lines - --------------------------------- + for( ad=100; 2*dy<box1->m3-box1->m2 && 3*dx>=4*dy && dx>2; ){ // min 3x3 (small font) + DBG( wchar_t c_ask='-'; ) + if( get_bw(x0+dx/8+1,x1-dx/8-1,y0+dy/8+((dy>2)?1:0), + y1-dy/8-((dy>2)?1:0),box1->p,cs,2)==2 ) break; + if( box1->dots ) { Setac(box1,'=',97);break; } + if (dx<=2*dy) ad=98*ad/100; + if (dx<=3*dy) ad=99*ad/100; + if (!box1->m4) ad=96*ad/100; + else { + if (y1>=box1->m3) { + if ( dx<2*dy) ad=98*ad/100; + if (2*dx<3*dy) ad=98*ad/100; + Setac(box1,'_',ad); + break; + } + } + Setac(box1,'-',ad); if (ad>=100) return '-'; + break; + } + // --- test thin lines = --------------------------------- + for( ; dy>2 && dx>2; ){ // min 3x3 (small font) + DBG( wchar_t c_ask='='; ) + for( y=y0;y<y1;y++) // remove upper empty space + if( get_bw(x0+dx/10,x1-dx/10,y ,y ,box1->p,cs,1)==1 ) break; + if( get_bw(x0+dx/10,x1-dx/10,y ,y ,box1->p,cs,2)==2 ) break; + if( get_bw(x0 ,x1 ,(y+y1)/2,(y+y1)/2,box1->p,cs,1)==1 ) break; + if( get_bw(x0+dx/10,x1-dx/10,y1 ,y1 ,box1->p,cs,2)==2 ) break; + Setac(box1,'=',100); + return '='; + } + // --- test dots : 
--------------------------------- + for( ad=100; dy>2 && dy>=2*dx; ){ // max 3x3 (small font) + + DBG( wchar_t c_ask=':'; ) + // check the gap hight + for( i1=dy/16;i1<dy/2;i1++) + if( get_bw(x0+dx/8,x1-dx/8,y0+i1,y0+i1,box1->p,cs,1)==0 ) break; + if (i1>=dy/2) break; + for( i2=dy/16;i2<dy/2;i2++) + if( get_bw(x0+dx/8,x1-dx/8,y1-i2,y1-i2,box1->p,cs,1)==0 ) break; + if (i2>=dy/2) Break; + MSG(fprintf(stderr,"gap y12 %d %d",i1,i2);) + + if (box1->m3 && y1>box1->m3) ad=98*ad/100; // ~; + if (box1->m3 && 2*y0> box1->m2+box1->m1) ad=98*ad/100; // ~i + if (gchar) ad=99*ad/100; + ad=ad-abs(i1-i2)/dy*20; + if (abs(i1-dx)>dy/4) Break; // round or quadratic dots? + if (abs(i1-dx)>dy/8) ad=98*ad/100; + if (abs(i2-dx)>dy/4) Break; // round or quadratic dots? + if (abs(i2-dx)>dy/8) ad=98*ad/100; + if (box1->dots!=1) ad=96*ad/100; + Setac(box1,':',ad); // dx<=3 ad-- + if (ad>=100) return ':'; + break; + } + // --- test dots ; --------------------------------- + if( 2*y0> box1->m2+box1->m1 ) // ~i + if( 4*y1>=3*box1->m3+box1->m2 ) // ~: + for( ad=100; dy>5 && dx>1 && dy>2*dx; ){ // max 3x3 (small font) + DBG( wchar_t c_ask=';'; ) + // better would it be to detect round pixelcluster on top + // check high of upper and lower dot + for( i1=0;i1<dy/2;i1++) + if( get_bw(x0,x1,y0+i1,y0+i1,box1->p,cs,1)==0 ) break; + if (i1>=dy/2) break; + for( i2=0;i2<dy/2;i2++) + if( get_bw(x0,x1,y1-i2,y1-i2,box1->p,cs,1)==0 ) break; + if (i2<i1) break; + + /* test for horizontal symmetry ~i */ + for (y=0;y<dy;y++) for (x=0;x<dx/2;x++) + if ((getpixel(bp,x,y)<cs)!=(getpixel(bp,dx-1-x,y)<cs)) { y=dy+1; break; } + if (y==dy) ad=96*ad/100; /* ~i */ + + if (i2==i1 && y1<=box1->m3) ad=97*ad/100; + if (i2-i1<dy/8) ad=99*ad/100; + Setac(box1,';',ad); // dx<=3 ad-- + if (ad>=100) return ';'; + break; + } + // --- first test small dots . --------------------------------- + if( 3*dy<box1->m4-box1->m1 && abs(dx-dy)<(dx+dy)/4+2 + && 3*y1>=(2*box1->m3+ box1->m2) // dot near baseline? 
+ && 5*y0>=(3*box1->m3+2*box1->m2) ){ // Jul00 + DBG( wchar_t c_ask='.'; ) + d=0; r1=60;r2=140; ad=99; + for(x=x0;x<=x1;x++)for(y=y0;y<=y1;y++){ /* circle equation */ + rx=100*(2*x-(x0+x1))/dx; // normalize to 15bit number + ry=100*(2*y-(y0+y1))/dy; + if( rx*rx + ry*ry < r1*r1 ) if( getpixel(box1->p,x,y)>=cs ){ d++;x=x1+1;y=y1+1; } + if( rx*rx + ry*ry > r2*r2 ) if( getpixel(box1->p,x,y)< cs ){ d++;x=x1+1;y=y1+1; } + // fprintf(stderr,"\nDBG . x= %3d %3d r= %6d %6d %6d", rx, ry, rx*rx+ry*ry, r1*r1, r2*r2); + } + if(d==0) + if( loop(box1->p,x0,y0,x1-x0,cs,0,RI) + <= loop(box1->p,x0,y1,x1-x0,cs,0,RI) + || loop(box1->p,x1,y0,x1-x0,cs,0,LE) + >= loop(box1->p,x1,y1,x1-x0,cs,0,LE) ) + { + bc='.'; if (box1->dots) { Setac(box1,':',ad); ad=98*ad/100; } + Setac(box1,bc,ad); + } + } + // --- first test small dots , --------------------------------- + if( 3*dy<2*(box1->m4-box1->m1) + && 2*y0> box1->m2+box1->m3 + && (2*dx<3*dy + || get_bw(0,dx/2,dy/2,dy-1,bp,cs,1)==0) ){ // ocr-a-, + DBG( wchar_t c_ask=','; ) + ad=100; bc=','; + if (dy==1 && dx==1) ad=98*ad/100; + if (dy==2 && dx==1) ad=99*ad/100; // this is a problem case + if (dx>=dy) ad=99*ad/100; + if( 2*dy >= box1->m4-box1->m1) ad=98*ad/100; + if( loop(box1->p,x0,y0,x1-x0,cs,0,RI) /* simple line */ + > loop(box1->p,x0,y1,x1-x0,cs,0,RI) + && loop(box1->p,x1,y0,x1-x0,cs,0,LE) + < loop(box1->p,x1,y1,x1-x0,cs,0,LE) ) { ad=99*ad/100; } + else { /* with upper circle */ + if( loop(box1->p,x0,(y0+y1+1)/2,x1-x0,cs,0,RI)<dx/2 ) ad=98*ad/100; + if( loop(box1->p,x1, y1 ,x1-x0,cs,0,LE)<dx/2 ) ad=98*ad/100; + if( loop(box1->p,x0,y1-((dy>5)?1:0),x1-x0,cs,0,LE)>(dx+1)/2 ) + if( loop(box1->p,x0, y1 ,x1-x0,cs,0,LE)>(dx+1)/2 ) ad=96*ad/100; + } + if(box1->dots==1) { Setac(box1,';',ad); ad=99*ad/100; } + Setac(box1,bc,ad); + } + // --- first test small dots '" --------------------------------- + if( 2*dy < box1->m4 -box1->m1+1 + && 2*y0 < box1->m2 +box1->m3 + && 3*y1 < box1->m2+2*box1->m3+2 ){ + DBG( wchar_t c_ask='\''; ) + ad=100; bc='\''; + 
if (2*y1 >= box1->m2+box1->m3) { ad=96*ad/100; MSG({}) } // ~! + if (3*y1>=2*box1->m2+box1->m3) { ad=96*ad/100; MSG({}) } + if (get_bw(x0,x1,(box1->m2+box1->m3)/2,box1->m4,box1->p,cs,1)!=0) + { ad=98*ad/100; MSG({}) } + if (dx>4 + && num_cross(x0,x1,y1,y1,box1->p,cs) == 2) { // " " + bc='"'; + // ocr-a-" has no gap! + if ( get_bw((x0+x1)/2,(x0+x1)/2,y0,y1,box1->p,cs,1)!=0 ) ad=96*ad/100; + } else { + if ( num_cross(x0,x1, y0 , y0 ,box1->p,cs)!=1) ad=96*ad/100; + if ( num_cross(x0,x1,(y0+y1)/2,(y0+y1)/2,box1->p,cs)!=1) ad=98*ad/100; + if (dx>dy) { ad=96*ad/100; MSG({}) } + } + if (2*y0 > box1->m1+box1->m2) ad=99*ad/100; + Setac(box1,bc,ad); + if (ad>=100) return bc; + } + // --- TILDE ~ --------------------------------- + if( 2*dy<box1->m4-box1->m1 && dx>=dy && dx>3 && dy>1 + && 2*y0< box1->m1+box1->m2 + && 3*y1<2*box1->m2+box1->m3 ){ + if( loop(box1->p,x0,y0,dx,cs,0,RI) + > loop(box1->p,x0,y1,dx,cs,0,RI) + && loop(box1->p,x1,y0,dx,cs,0,LE) + < loop(box1->p,x1,y1,dx,cs,0,LE) + && num_cross(x0,x1,y0,y0,box1->p,cs) == 2 + && num_cross(x0,x1,y1,y1,box1->p,cs) == 2 ) { + DBG( wchar_t c_ask='~'; ) + bc=TILDE; + Setac(box1,bc,99); + } + } + // --- CIRCUMFLEX, hat ^ --------------------------------- + if( 2*dy<box1->m4-box1->m1 && dx>=dy && dx>2 && dy>1 + && 2*y0< box1->m1+box1->m2 + && 3*y1<2*box1->m2+box1->m3 ){ + DBG( wchar_t c_ask='^'; ) + if( ( loop(box1->p,x0,y0 ,dx,cs,0,RI) + > loop(box1->p,x0,y1 ,dx,cs,0,RI)-dx/8 + || loop(box1->p,x0,y0 ,dx,cs,0,RI) + > loop(box1->p,x0,y1-1,dx,cs,0,RI)-dx/8 ) + && ( loop(box1->p,x1,y0 ,dx,cs,0,LE) + > loop(box1->p,x1,y1 ,dx,cs,0,LE)-dx/8 + || loop(box1->p,x1,y0 ,dx,cs,0,LE) + > loop(box1->p,x1,y1-1,dx,cs,0,LE)-dx/8 ) + && num_cross(x0,x1,y0 ,y0 ,box1->p,cs) == 1 + && ( num_cross(x0,x1,y1 ,y1 ,box1->p,cs) == 2 + || num_cross(x0,x1,y1-1,y1-1,box1->p,cs) == 2 )) { + bc='^'; + Setac(box1,bc,99); + } + } + // ------------------------------------------------------ +// if( dots==1 ){ um='\''; } +#if 0 /* ToDo: change to vectors, call here 
or in whatletter */ + if (box1->dots==0) { // i-dots ??? (if dots==0 is wrong) + y=box1->m1; + for(;y<y0+dy/2;y++)if( get_bw(x0+dx/4,x1,y,y,box1->p,cs,1)==1) break; + { i1=y; + if( y<y0+dy/4 ) + for(;y<y0+dy/2;y++)if( get_bw(x0,x1,y,y,box1->p,cs,1)==0) break; + if( y<y0+dy/2 && 5*(y-i1+1)>box1->m2-box1->m1){ + testumlaut(box1,cs,2,&um); // set modifier + new y0 ??? + + } + } + } +#else + um = box1->modifier; +#endif + if ( /* um==ACUTE_ACCENT || */ um==DIAERESIS){ + for(y=y1;y>y0;y--) + if( get_bw(x0,x1,y,y,box1->p,cs,1)==0) { y0=y; dy=y1-y0+1; break; } // scan "a "o "u + } + + // --- test numbers 0..9 --- separated for faster compilation + if( JOB->cfg.only_numbers ) return ocr0n(&sdata); + + // bc=ocr1(box1,bp,cs); + if(bc!=UNKNOWN && box1->num_ac>0 && box1->wac[0]==100) + return bc; // for fast compilable tests + + // ------ separated for faster compilation + // ToDo: inser ocr0_shared_t here and split into a,b,cC,d,e,f,g9,... +#define IF_NOT_SURE if(bc==UNKNOWN || box1->num_ac==0 || box1->wac[0]<100) + + IF_NOT_SURE bc=ocr0_eE(&sdata); + IF_NOT_SURE bc=ocr0_f(&sdata); + IF_NOT_SURE bc=ocr0_bB(&sdata); + IF_NOT_SURE bc=ocr0_dD(&sdata); + IF_NOT_SURE bc=ocr0_F(&sdata); + IF_NOT_SURE bc=ocr0_uU(&sdata); + IF_NOT_SURE bc=ocr0_micro(&sdata); + IF_NOT_SURE bc=ocr0_vV(&sdata); + IF_NOT_SURE bc=ocr0_rR(&sdata); + IF_NOT_SURE bc=ocr0_m(&sdata); + IF_NOT_SURE bc=ocr0_tT(&sdata); + IF_NOT_SURE bc=ocr0_sS(&sdata); + IF_NOT_SURE bc=ocr0_gG(&sdata); + IF_NOT_SURE bc=ocr0_xX(&sdata); + IF_NOT_SURE bc=ocr0_yY(&sdata); + IF_NOT_SURE bc=ocr0_zZ(&sdata); + IF_NOT_SURE bc=ocr0_wW(&sdata); + IF_NOT_SURE bc=ocr0_aA(&sdata); + IF_NOT_SURE bc=ocr0_cC(&sdata); + IF_NOT_SURE bc=ocr0_lL(&sdata); + IF_NOT_SURE bc=ocr0_oO(&sdata); + IF_NOT_SURE bc=ocr0_pP(&sdata); + IF_NOT_SURE bc=ocr0_qQ(&sdata); + IF_NOT_SURE bc=ocr0_iIjJ(&sdata); + IF_NOT_SURE bc=ocr0_n(&sdata); + IF_NOT_SURE bc=ocr0_M(&sdata); + IF_NOT_SURE bc=ocr0_N(&sdata); + IF_NOT_SURE bc=ocr0_h(&sdata); + IF_NOT_SURE 
bc=ocr0_H(&sdata); + IF_NOT_SURE bc=ocr0_k(&sdata); + IF_NOT_SURE bc=ocr0_K(&sdata); + IF_NOT_SURE bc=ocr0n(&sdata); + IF_NOT_SURE bc=ocr0_brackets(&sdata); + IF_NOT_SURE bc=ocr0p9(&sdata); + IF_NOT_SURE bc=ocr0px(&sdata); + + + if(box1->num_ac==0 && bc!=UNKNOWN) fprintf(stderr,"<!--ERROR 576-->"); + if(box1->num_ac>0 && box1->wac[0]>95) box1->c=bc=box1->tac[0]; + /* will be removed later, only fix old things */ + for (i=0;i<box1->num_ac;i++) if (box1->tac[i]==bc) { bc=box1->tac[0]; } + + return bc; +} + + diff --git a/lib/gocr/ocr0.h b/lib/gocr/ocr0.h new file mode 100644 index 00000000..4c67d77b --- /dev/null +++ b/lib/gocr/ocr0.h @@ -0,0 +1,63 @@ +#ifndef _OCR0_H +#define _OCR0_H +#include "pgm2asc.h" + +/* ---------------------------------------------------------------- + - functions with thousands of lines make the compilation very slow, + therefore the ocr0-function is split into subfunctions + - shared data often used by the ocr0-subroutines is stored + in the ocr0_shared structure. + * ------------------------------------------------------------ */ + +typedef struct ocr0_shared { /* shared variables and properties */ + + struct box *box1; /* box in whole image */ + pix *bp; /* temporarily extracted box, cleaned */ + int cs; /* global threshold value (gray level) */ + + /* ToDo: or MACROS: X0 = box1->x0 */ + int x0, x1, y0, y1; /* box coordinates related to box1 */ + int dx, dy; /* size of box */ + int hchar, gchar; /* relation to m1..m4: hchar = char is higher than 'e', gchar = char has ink lower than m3 */ + int aa[4][4]; /* corner points, see xX (x,y,dist^2,vector_idx) v0.41 */ + holes_t holes; /* list of holes (max MAX_HOLES) */ + +} ocr0_shared_t; + +/* tests for umlaut */ +int testumlaut(struct box *box1, int cs, int m, wchar_t *modifier); +/* detect chars */ +wchar_t ocr0(struct box *box1, pix *b, int cs); +/* detect numbers 0..9 (rule based engine in ocr0n.c) */ +wchar_t ocr0n(ocr0_shared_t *sdata); + +static inline int sq(int x) { return x*x; } /* square of x */ + +/* + * go from vector j1 to vector j2 and measure the maximum deviation of + * the steps from the straight line 
connecting j1 and j2 + * returns the squared maximum distance + * in units of the box size times 1024 + */ +int line_deviation( struct box *box1, int j1, int j2 ); + +/* + * search the vectors between j1 and j2 for the point a nearest to point r + * example: + * + * r-> $$...$$ $ - mark vectors + * @@$..@@ @ - black pixels + * @@$..@@ . - white pixels + * @@@@.$@ + * a-> @@$@$@@ + * @$.@@@@ + * @@..$@@ + * @@..$@@ + * j1 --> $$...$$ <-- j2 + * + * ToDo: vector aa[5] = {rx,ry,x,y,d^2,idx} instead of rx,ry? + * j1 and j2 must be in the same frame + * return aa? + */ +int nearest_frame_vector( struct box *box1, int j1, int j2, int rx, int ry); +#endif diff --git a/lib/gocr/ocr0n.c b/lib/gocr/ocr0n.c new file mode 100644 index 00000000..c833c588 --- /dev/null +++ b/lib/gocr/ocr0n.c @@ -0,0 +1,1254 @@ +/* ocr-engine numbers only */ +/* +This is a Optical-Character-Recognition program +Copyright (C) 2000-2007 Joerg Schulenburg + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ + see README for EMAIL-address + + OCR engine (c) Joerg Schulenburg + first engine: rule based --- numbers 0..9 + +*/ + +#include <stdlib.h> +#include <stdio.h> +/* #include "pgm2asc.h" */ +#include "ocr0.h" +#include "ocr1.h" +#include "gocr.h" + +/* only for debugging and development: IFV guards a statement with (JOB->cfg.verbose&4), MM prints a DBG prefix with c_ask, __LINE__ and box1->x0,y0 */ +#define IFV if(JOB->cfg.verbose&4) +#define MM {IFV fprintf(stderr,"\nDBG %c L%04d (%d,%d): ",(char)c_ask,__LINE__,box1->x0,box1->y0);} + +/* the old debug mode (0.40) worked only for one special char; for another char + * the code had to be recompiled with C_ASK='char' + * the new debug mode (0.41) explains why a char is declined or accepted as ABC... + * the output can be filtered by external scripts + * ToDo: we could reduce output to filter string + */ +#ifndef DO_DEBUG /* can be defined outside */ +#define DO_DEBUG 0 /* 0 is the default */ +#endif + +/* these macros are for debugging output: "if char is declined, why?" */ +#if DO_DEBUG /* 0=Work mode, 1=debugging mode */ +// Setac: output that the char is chosen with a probability +// Break: output why the char is not chosen +// MSG: debugging functions for char C_ASK, mostly messages +// DBG: definitions useful only for debugging +#define Setac(box1,ac,ad) { MM;IFV fprintf(stderr,"setac %d",ad);setac(box1,ac,ad); } +#define Break { MM;IFV fprintf(stderr,"break"); break; } +#define MSG(x) { MM;IFV x } +#define DBG(x) x +#else +#define Setac(box1,ac,ad) setac(box1,ac,ad) +#define Break break +#define MSG(x) +#define DBG(x) +#endif + +/* extern "C"{ */ + +// OCR engine ;) +wchar_t ocr0n(ocr0_shared_t *sdata){ + struct box *box1=sdata->box1; + pix *bp=sdata->bp; + int d,x,y,x0=box1->x0,x1=box1->x1,y0=box1->y0,y1=box1->y1; + int dx=x1-x0+1,dy=y1-y0+1,cs=sdata->cs; // size + int xa,xb,ya,yb, /* tmp-vars */ + i1,i2,i3,i4,i,j; + int (*aa)[4]=sdata->aa; /* corner-points, (x,y,dist^2,vector_idx) */ + wchar_t bc=UNKNOWN; // best char + int ad=0; // propability 0..100 + int hchar=sdata->hchar; // char is higher than 'e' + int gchar=sdata->gchar; // 
char has ink lower than m3 + int dots=box1->dots; + // --- test 5 near S --------------------------------------------------- + for(ad=d=100;dx>2 && dy>4;){ // min 3x4 + DBG( char c_ask='5'; ) + if (sdata->holes.num > 1) Break; /* be tolerant */ + if( num_cross( dx/2, dx/2,0,dy-1,bp,cs)!=3 + && num_cross(5*dx/8,3*dx/8,0,dy-1,bp,cs)!=3 ) Break; + + i1=loop(bp,dx-1,dy-1,dx,cs,0,LE); + i2=loop(bp,dx-1,dy-2,dx,cs,0,LE); + if (i2-i1 >= dx/4) Break; // ~{ 5x7font + + // get the upper and lower hole koords, y around dy/4 ??? + x=5*dx/8; + y =loop(bp,x,0,dy,cs,0,DO); if(y>dy/8) Break; + y +=loop(bp,x,y,dy,cs,1,DO); if(y>dy/4) Break; + i1 =loop(bp,x,y,dy,cs,0,DO)+y; if(i1>5*dy/8) Break; + i3=y=(y+i1)/2; // upper end can be shifted to the right for italic + x =loop(bp,0,y,dx,cs,0,RI); if(x>4*dx/8) Break; + x +=loop(bp,x,y,dx,cs,1,RI); if(x>5*dx/8) Break; + i1 =loop(bp,x,y,dx,cs,0,RI); i1=(i1+2*x)/2; // upper center (i1,i3) + y=11*dy/16; + x =loop(bp,dx-1 ,y,dx,cs,0,LE); if(x>dx/4) Break; + x +=loop(bp,dx-1-x,y,dx,cs,1,LE); if(x>dx/2) Break; + i2 =loop(bp,dx-1-x,y,dx,cs,0,LE); i2=dx-1-(i2+2*x)/2; // lower center x + + MSG( fprintf(stderr,"i1,i3=%d,%d i2=%d (upper+lower center)",i1,i3,i2);) + + y =loop(bp,i1,0,dy,cs,0,DO); + y +=loop(bp,i1,y,dy,cs,1,DO); + y =(3*y+i3)/4; + if( num_cross( i1, dx-1, y, y,bp,cs)>0 ){ /* S or serif5 ? */ + y =loop(bp,i1 ,i3,dy,cs,0,DO); + i =loop(bp,i1-1,i3,dy,cs,0,DO); + if (y>i ) ad=99*ad/100; /* looks like S */ + y =loop(bp,i1 ,i3,dy,cs,0,UP); + i =loop(bp,i1+1,i3,dy,cs,0,UP); + if (i<y ) ad=99*ad/100; /* looks like S */ + x =loop(bp,dx-1,0,dx,cs,0,LE); + i =loop(bp,dx-1,1,dx,cs,0,LE); + if (x>i ) ad=99*ad/100; /* looks like S */ + if( num_cross( 0, dx/2, dy-1, dy-1,bp,cs)>1 + && num_cross( dx/2,dx-1, 0, 0,bp,cs)>1 ) ad=98*ad/100; /* serifs */ + if (loop(bp,0,dy-1,dx,cs,0,RI)==0) ad=98*ad/100; /* S or 7segment */ + ad=99*ad/100; + } + + for(y=dy/5;y<3*dy/4;y++) // right gap? 
+ if( num_cross(i1,dx-1,y,y,bp,cs)==0 ) break; + if( y==3*dy/4 ) Break; + + for(y=dy/4;y<=11*dy/16;y++) // left gap? + if( num_cross(0,i2,y,y,bp,cs)==0 ) break; + if( y>11*dy/16 ) Break; + + // if( num_hole( x0, x1, y0, y1, box1->p,cs,NULL) > 0 ) break; + if (sdata->holes.num>0) Break; + + // sS5 \sl z left upper v-bow ? + for(x=dx,i=y=dy/4;y<dy/2;y++){ + j=loop(bp,0,y,dx,cs,0,RI); if(j<x) { x=j; i=y; } + } y=i; + i1=loop(bp,0, dy/16 ,dx,cs,0,RI); + i2=loop(bp,0,(y+dy/16)/2 ,dx,cs,0,RI); + i =loop(bp,0,(y+dy/16)/2+1,dx,cs,0,RI); if( i>i2 ) i2=i; + i3=loop(bp,0, y ,dx,cs,0,RI); + i =loop(bp,0, y-1,dx,cs,0,RI); if( i<i3 ) i3=i; + if( 2*i2+1+dx/16 < i1+i3 ) Break; + + if( dy>=20 && dx<16 ) /* tall S */ + if( loop(bp,0, dy/5 ,dx,cs,0,RI) + ==loop(bp,0, dy/4 ,dx,cs,0,RI) + && + loop(bp,0, dy/10 ,dx,cs,0,RI) + >loop(bp,0, dy/4 ,dx,cs,0,RI) + && + loop(bp,0, 1 ,dx,cs,0,RI) + >loop(bp,0, dy/4 ,dx,cs,0,RI)+1 + && + loop(bp,dx-1, 0 ,dx,cs,0,LE) + >loop(bp,dx-1, 1 ,dx,cs,0,LE) ) Break; + + if( dy>=30 && dx>15 ) /* large S */ + if( loop(bp,dx/4,3*dy/10,dy,cs,1,DO)>0 ) // check start + if( loop(bp,dx-2,3*dy/4 ,dy,cs,1,UP)>0 ) // check end + if( num_cross(dx/4,dx-2,3*dy/10,3*dy/4,bp,cs)==1 ) Break; // connected? 
+ + if( dy>17 && dx>9 ) /* S */ + if( loop(bp, 0,dy/2 ,dx,cs,0,RI)<dx/2 + || loop(bp, 0,dy/2-1 ,dx,cs,0,RI)<dx/2 ) + if( loop(bp,dx/4,3*dy/10,dy,cs,1,DO)>0 ) // check start + if( loop(bp,dx-2,2*dy/3 ,dy,cs,1,UP)>0 ) // check end + if( loop(bp, 0, dy/16,dx,cs,0,RI) + >= loop(bp,dx-1, dy-1-dy/16,dx,cs,0,LE) ) ad=ad*98/100; + if( loop(bp,dx-1, dy/16,dx,cs,0,LE) + >= loop(bp, 0, dy-1-dy/16,dx,cs,0,RI) + && loop(bp,dx-1, dy/16,dx,cs,0,LE) + >= loop(bp, 0, dy-1,dx,cs,0,RI) ) ad=ad*98/100; + + if ( gchar) ad=99*ad/100; + if (!hchar) ad=99*ad/100; + Setac(box1,(wchar_t)'5',ad); + if (ad==100) return '5'; + break; + + } + // --- test 1 --------------------------------------------------- + for(ad=d=100;dy>4 && dy>dx && 2*dy>box1->m3-box1->m2;){ // min 3x4 + DBG( char c_ask='1'; ) + if( dots==1 ) Break; + if (sdata->holes.num > 1) Break; /* be tolerant */ + + if( num_cross(0, dx-1, 0 , 0 ,bp,cs) != 1 + && num_cross(0, dx-1, 1 , 1 ,bp,cs) != 1 ) Break; + if( num_cross(0, dx-1,dy/2,dy/2,bp,cs) != 1 ) Break; + if( num_cross(0, dx-1,dy-1,dy-1,bp,cs) != 1 + && num_cross(0, dx-1,dy-2,dy-2,bp,cs) != 1 ) Break; + /* 5x7 micr + + ooo + .$. ooo + $@. oo + .$. oo + .@. ooooo + .$. ooooo + $@$ ooooo + + */ + + i4=0; // human font + if( num_cross(0, dx-1,3*dy/4,3*dy/4,bp,cs) != 2 ) { // except ocr-a + for( y=1; y<dy/2; y++ ){ + if( num_cross(0, dx-1, y , y ,bp,cs) == 2 ) break; + } if (y>=dy/2) ad=98*ad/100; + for( i=dy/8,y=7*dy/16;y<dy-1 && i;y++ ){ + if( num_cross(0, dx-1, y , y ,bp,cs) != 1 ) i--; + } if( dy>8 && !i ) Break; + } else { // ocr-a-1 + /* @@@.. + ..@.. + ..@.. + ..@.. 
+ ..@.@ + ..@.@ + @@@@@ */ + i= loop(bp,dx/2,0,dy,cs,0,DO); + if (loop(bp,dx/2,i,dy,cs,1,DO)<dy-1) Break; + i= loop(bp,dx -1,dy-1-dy/16,dx,cs,0,LE); + if (loop(bp,dx-i-1,dy-1-dy/16,dx,cs,1,LE)<dx-1) Break; + i= loop(bp,0,dy/16,dx,cs,0,RI); + if (loop(bp,i,dy/16,dx,cs,1,RI)<dx/2) Break; + i4=1; + } + + if( num_cross(0, dx-1, 0 , 0 ,bp,cs) > 1 + && num_cross(0, dx-1, 1 , 1 ,bp,cs) > 1 ) Break; // ~/it_7 + + // calculate upper and lower mass center (without lower serif) + + x =loop(bp,0,7*dy/8-1,dx,cs,0,RI); i2=x; + x+=loop(bp,x,7*dy/8-1,dx,cs,1,RI)-1; i2=(i2+x)/2; + + i1=loop(bp,dx-1 ,1+0* dy/4,dx,cs,0,LE); i1=dx-1-i1-(x-i2)/2; + + x =(i1-i2+4)/8; i1+=x; i2-=x; + + if( get_line2(i1,0,i2,dy-1,bp,cs,100)<95 ) { // dont work for ocr-a-1 + i1=loop(bp,dx-1 ,1+0* dy/4,dx,cs,0,LE); i1=dx-1-i1; + if( get_line2(i1,0,i2,dy-1,bp,cs,100)<95 ) Break; + } + // upper and lower width + x =loop(bp,(i1+i2)/2,dy/2,dx,cs,1,RI); i=x; i3=0; + for(y=0;y<7*dy/8;y++) + if( loop(bp,i1+y*(i2-i1)/dy, y,dx,cs,1,RI)-i > 1+dx/8 ) break; + if(y<7*dy/8) ad=98*ad/100; // serif or ocr-a-1 ? + if(y<6*dy/8) ad=99*ad/100; /* MICR E-13B font Jan07 */ + if(y<4*dy/8) Break; +// out_x(box1); printf(" i12=%d %d\n",i1,i2); + x =loop(bp,i2,dy-1,dx,cs,1,LE); j=x; + x =loop(bp,i2,dy-2,dx,cs,1,LE); if(x>j)j=x; i=j; + x =loop(bp,i2,dy-1,dx,cs,1,RI); j=x; + x =loop(bp,i2,dy-2,dx,cs,1,RI); if(x>j)j=x; + if(abs(i-j)>1+dx/8) i3|=1; + if(i3) Break; +// out_x(box1);printf(" 11 i=%d j=%d i2=%d dx=%d\n",i,j,i1,dx); + // get most left upper point (i,j) + for(i=dx,j=y=0;y<7*dy/16;y++){ + x =loop(bp,0,y,dx,cs,0,RI); if(x<i) { i=x;j=y; } + } + if ( i1-i<7*dx/16 ) ad=ad*98/100; + if ( i1-i<6*dx/16 ) ad=ad*98/100; // 4*dx/8 => 7*dx/16 MICR E-13B font + if ( i1-i<4*dx/16 ) Break; + x =loop(bp,0,dy/2,dx,cs,0,RI); // right distance + j =loop(bp,x,dy/2,dx,cs,1,RI); // thickness + if( j>x+(dy+16)/32 ) ad=98*ad/100; // ~l but MICR E-13B font + x =loop(bp,0,0,dx,cs,0,RI); // straight line ??? 
+ j =loop(bp,0,1,dx,cs,0,RI); if( j>x ) Break; // ~l + if( x==j ) j =loop(bp,0,dy/8,dx,cs,0,RI); if( j>x && !i4) Break; + if( x==j ) if(loop(bp,0,dy/4,dx,cs,0,RI)>x) { // ~l + // check micr-1 first before taken as 'l' + if (loop(bp,dx-1,dy/8,dx,cs,0,LE)<=dx/4 + && loop(bp,0,3*dy/4,dx,cs,1,RI)<dx-1) ad=97*ad/100; + } + x=j; +// j =loop(bp,0,2,dx,cs,0,RI); if( j>=x ) Break; x=j; // ~l +// j =loop(bp,0, 0,dx,cs,0,DO); if( !j ) Break; // ~7 + if( !hchar ) // ~ right part of n + if( loop(bp,dx-1, 1,dx,cs,0,LE)-dy/6 + > loop(bp,dx-1,dy/4,dx,cs,0,LE) + || get_bw(x1+1,x1+2,y0,y0+dy/8,box1->p,cs,1)==1 ) Break; // Mai00 + if( loop(bp,dx-1,3*dy/4,dx,cs,0,LE) > dx/2 + && get_bw(x1-dx/4,x1,y1-1,y1,box1->p,cs,1)==1 ) Break; // ~z Jun00 + + i=loop(bp, dx/8,0,dy,cs,0,DO); + for (y=dy,x=dx/2;x<3*dx/4;x++){ /* get upper end */ + j=loop(bp,x,0,dy,cs,0,DO); if (j<y) { y=j; } + } + if(y<dy/2 && y+dy/16>=i) ad=97*ad/100; // ~\tt l ??? ocr-a_1 + + if( loop(bp, 0, dy/8,dx,cs,0,RI) + -(dx-loop(bp,dx-1,7*dy/8,dx,cs,0,LE)) > dx/4 ) Break; // ~/ + + i= loop(bp, 0, 0,dy,cs,0,DO); // horizontal line? 
+ if(dy>=12 && i>dy/8 && i<dy/2){ + if( loop(bp,dx-1,3*dy/16,dx,cs,0,LE)-dx/8 + >loop(bp,dx-1, i,dx,cs,0,LE) + || loop(bp,dx-1,3*dy/16,dx,cs,0,LE)-dx/8 + >loop(bp,dx-1, i+1,dx,cs,0,LE) ) Break; // ~t,~f + i= loop(bp, 0,dy-1-dy/32,dx,cs,0,RI); + x= loop(bp, 0,dy-2-dy/32,dx,cs,0,RI); if (i<x) x=i; + if( x-loop(bp, 0, 3*dy/4,dx,cs,0,RI)>dx/8 + && loop(bp,dx-1, 3*dy/4,dx,cs,0,LE)-dx/8 + >loop(bp,dx-1,dy-1-dy/32,dx,cs,0,LE) ) Break; // ~t + if( loop(bp, 0,i-1,dx,cs,0,RI)>1 && dx<6) { + ad=99*ad/100; + if ( loop(bp,dx-1,i-1,dx,cs,0,LE)>1 ) Break; // ~t + } + } + + if (dx>8){ + if (loop(bp,0,3*dy/4,dx,cs,0,RI)- + loop(bp,0,dy/2-1,dx,cs,0,RI)>dx/4) ad=95*ad/100; // ~3 + if (loop(bp,dx-1,dy/2-1,dx,cs,0,LE)- + loop(bp,dx-1,3*dy/4,dx,cs,0,LE)>dx/8) ad=95*ad/100; // ~3 + if (loop(bp,dx-1, dy/16,dx,cs,0,LE)- + loop(bp,dx-1, dy/4,dx,cs,0,LE)>dx/8) ad=95*ad/100; // ~23 + } + /* font 5x9 "2" recognized as "1" */ + i=loop(bp,dx-1-dx/8,dy-1,dy,cs,0,UP); + if (i<=dy/4) { + i+=loop(bp,dx-1-dx/8,dy-1-i,dy,cs,1,UP); + if (i<=dy/4) { + i=loop(bp,dx-1-dx/8,dy-1-i,dy,cs,0,LE); + if (2*i>=dx && loop(bp,dx/4,0,dy,cs,0,DO)<dy/2) { + if (dx<17) ad=98*ad/100; + if (dx<9) ad=97*ad/100; + } + } + } + + // looking for ### + // ..# pattern (its important, we dont want perp. lines as 1) + // ToDo: better check that we have exact one on top + for (i2=0,i=dx,y=0;y<dy/2;y++) { + j=loop(bp,0,y,dx,cs,0,RI); if (j<i) i=j; + if (j>i+dx/8) { break; } + } if (y>=dy/2) ad=95*ad/100; // Feb07 care plates, right black border + + if (sdata->holes.num > 0) Break; // mini holes should be filtered + if (!box1->m3 && ad>98) ad=98; else { + if (!hchar) ad=99*ad/100; + if (box1->y0>box1->m2) ad=98*ad/100; + if (box1->y1<(1*box1->m2+3*box1->m3)/4) ad=98*ad/100; + if (box1->y1-box1->y0<(box1->m3-box1->m1)/2) ad=98*ad/100; + if ( gchar) ad=99*ad/100; + } + + Setac(box1,(wchar_t)'1',ad); + break; + } + // --- test 2 old pixelbased - remove! 
----------------------------- +#ifdef Old_pixel_based + for(ad=d=100;dx>2 && dy>4;){ // min 3x4 + DBG( char c_ask='2'; ) + if (sdata->holes.num > 1) Break; /* be tolerant */ + if( get_bw(x0+dx/2, x0+dx/2 , y1-dy/5, y1 ,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2, x0+dx/2 , y0 , y0+dy/5,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/8, x1-dx/3 , y1-dy/3, y1-dy/3,box1->p,cs,1) != 1 ) Break; + + if( get_bw(x1-dx/3, x1 , y0+dy/3 , y0+dy/3,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0 , x0+dx/ 8, y1-dy/16, y1 ,box1->p,cs,1) != 1 ) Break; + if( num_cross(x0, x1-dx/8, y0+dy/2, y0+dy/2,box1->p,cs) != 1 ) Break; + if( get_bw(x0, x0+dx/9 , y0 , y0 ,box1->p,cs,1) == 1 + && get_bw(x0, x0+dx/2 ,y0+3*dy/16,y0+3*dy/16,box1->p,cs,1) == 1 ) Break; + if( get_bw(x0, x0+dx/9 , y0 , y0 ,box1->p,cs,1) + != get_bw(x1-dx/9, x1 , y0 , y0 ,box1->p,cs,1) ) + { if (dx<6 && dy<9) ad=99*ad/100; else Break; } + // out_x(box1); + + for( x=x0+dx/4;x<x1-dx/6;x++ ) // C + if( num_cross( x, x, y0, y0+dy/2,box1->p,cs) == 2 ) break; + if( x>=x1-dx/6 ) Break; + + for( x=x0+dx/4;x<x1-dx/6;x++ ) // C, but acr-a + if( num_cross( x, x, y0+3*dy/8,y1,box1->p,cs) == 2 ) break; + if( x>=x1-dx/6 ) Break; + + for(i=1,y=y0;y<y0+dy/2;y++ ) + if( num_cross( x0, x1, y, y,box1->p,cs) == 2 ) i=0; + if( i ) ad=99*ad/100; // ToDo: ocr-a-2 should have 100% + + for(i=1,y=y0+dy/5;y<y0+3*dy/4;y++ ) + if( get_bw( x0, x0+dx/3, y, y,box1->p,cs,1) == 0 ) i=0; + if( i ) Break; + + x=x1-dx/3,y=y1; /* center bottom */ + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,UP,ST); if( y<y1-dy/5 ) Break; + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,ST,UP); if( y<y1-dy/4 ) ad=99*ad/100; + if( y<y1-dy/3 ) Break; + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,UP,ST); if( y<y0+dy/3 ) Break; y++; + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,RI,ST); + if( x<x1 ){ x--; // hmm thick font and serifs + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,UP,ST); if( y<y0+dy/2 ) Break; y++; + turmite(box1->p,&x,&y,x0,x1,y0,y1,cs,RI,ST); + if( x<x1 ) Break; + } + + // test ob rechte Kante 
ansteigend + for(x=0,y=dy/18;y<=dy/3;y++){ // rechts abfallende Kante/Rund? + i=loop(box1->p,x1,y0+y,dx,cs,0,LE); // use p (not b) for broken chars + if( i<x ) break; // rund + if( i>x ) x=i; + } + if (y>dy/3 ) Break; // z + + // hole is only allowed in beauty fonts + // if( num_hole( x0, x1, y0, y1,box1->p,cs,NULL) > 0 ) // there is no hole + // if( num_hole( x0, x0+dx/2, y0, y0+dy/2,box1->p,cs,NULL) == 0 ) // except in some beauty fonts + if (sdata->holes.num>0) + if (sdata->holes.hole[0].x1 >= dx/2 || sdata->holes.hole[0].y1 >= dy/2) + Break; + + i1=loop(bp,dx-1-dx/16,0,dy,cs,0,DO); // Jul00 + i2=loop(bp, dx/ 2,0,dy,cs,0,DO); if( i2+dy/32>=i1 ) Break; // ~z + i1=loop(bp,dx-1,dy-3*dy/16,dx,cs,0,LE); + i2=loop(bp, 0,dy-3*dy/16,dx,cs,0,RI); if( i2>i1 ) ad=98*ad/100; // ~i + if (dots) ad=98*ad/100; // i + if (loop(bp,dx-1,dy-1-dy/16,dx,cs,0,LE)>dx/4) ad=96*ad/100; // \it i + + if ((!hchar) && box1->m4!=0) ad=80*ad/100; + Setac(box1,(wchar_t)'2',ad); + if (ad==100) return '2'; + break; + } +#endif + // --- test 2 new edge based v0.44 -------------------------------------- + for(ad=d=100;dx>2 && dy>4;){ // min 3x4 + // rewritten for vectors 0.42 + int ld, i1, i2, i3, i4, i5, i6, i7; // line derivation + corners + DBG( wchar_t c_ask='2'; ) + if (sdata->holes.num > 0) Break; /* no hole */ + /* half distance to the center */ + d=2*sq(128/4); + /* now we check for the lower ends, must be near to the corner */ + if (aa[1][2]>d/4) Break; /* [2] = distance, ~7... */ + if (aa[2][2]>d/2) Break; /* [2] = distance, ~r... */ + if (aa[0][2]>d/1) Break; /* [2] = distance, ~d... */ + if (aa[3][2]>d/1) Break; /* [2] = distance, ~bhk... 
*/ + /* searching for 4 notches between neighbouring ends */ + +/* + type A B + + 1OOO OO + 2 1 2 <- 6 + 7-> OOOO O + O O <- 5 + 3OO4 3OO4 +*/ + + /* get a point on the inner low left side of the J */ + i =box1->num_frame_vectors[0] - 1; + /* rightmost point on upper left side */ + i2=nearest_frame_vector(box1, aa[0][3], aa[1][3], x1+dx, y0+dy/4); + /* upper leftmost vector */ + i1=nearest_frame_vector(box1, aa[0][3], i2, x0-dx, (y0+y1)/2); + i3=aa[1][3]; + /* low leftmost vector */ + i5=nearest_frame_vector(box1, aa[2][3], aa[3][3], x0, y1); + /* low mostright vector */ + i4=nearest_frame_vector(box1, aa[1][3], i5, x1+dx, y1); + /* next local max_x-point after i5 */ + i6=i5; + for (i=i5;i!=aa[0][3];i=(i+1)%box1->num_frame_vectors[0]) { + if (box1->frame_vector[ i][0] + >box1->frame_vector[i6][0]) i6=i; // get next maximum + if (box1->frame_vector[ i][0]<x0+dx/3 + && box1->frame_vector[ i][1]<y0+dy/3 + && box1->frame_vector[i6][0]>x0+dx/2) break; // 5 + } + /* which type? ToDo: have a more sure algorithm */ + i7=nearest_frame_vector(box1, i2, i3, x0-dx/8, (y0+y1)/2); + if (box1->frame_vector[i7][0]<=x0+ dx/4 + && box1->frame_vector[i7][1]<=y0+2*dy/3) { + MSG(fprintf(stderr,"7-segment-type");) + } else { /* regular-book-type */ + if (aa[3][0]>=x1-dx/8 + && aa[3][1]<=y0+dy/8) ad=99*ad/100; + if (aa[0][0]<=x0+dx/8 + && aa[0][1]<=y0+dy/8) ad=99*ad/100; + if (aa[3][2]<=aa[1][2]) ad=97*ad/100; + } + // ToDo: output no=(x,y) + MSG(fprintf(stderr,"i1-7 %d %d %d %d %d %d %d",i1,i2,i3,i4,i5,i6,i7);) + if (i5==i6) Break; // ~+ + + if (box1->frame_vector[i5][1] + -box1->frame_vector[i6][1]<dy/4) Break; // ~5 + if (box1->frame_vector[i1][1]>y0+dy/2) Break; // not to low + if (box1->frame_vector[i1][0]>x0+dx/8) Break; + if (box1->frame_vector[i2][1]>(y0+ y1)/2) Break; + if (box1->frame_vector[i2][1]>(5*y0+3*y1)/8) ad=99*ad/100; + if (box1->frame_vector[i2][0]<(x0+x1+1)/2) Break; // fat tiny fonts? 
+ if (box1->frame_vector[i2][0]<(x0+2*x1)/3) ad=99*ad/100; + if (box1->frame_vector[i3][0]>(3*x0+x1)/4) Break; + if (box1->frame_vector[i3][0]>(7*x0+x1)/8) ad=99*ad/100; + if (box1->frame_vector[i3][1]<(y0+3*y1)/4) Break; + if (box1->frame_vector[i3][1]>(y0+7*y1)/8) ad=99*ad/100; + /* check lower leftmost point from right side */ + if (box1->frame_vector[i5][0]>(x0+2*x1)/3) Break; + if (box1->frame_vector[i5][0]>(x0+ x1)/2) ad=98*ad/100; + if (box1->frame_vector[i5][0]>(2*x0+x1)/3) ad=99*ad/100; + if (box1->frame_vector[i5][1]<(3*y0+5*y1)/8) Break; + if (box1->frame_vector[i5][1]<(y0+3*y1)/4) ad=99*ad/100; + if (box1->frame_vector[i6][1]>(y0+2*y1)/3) Break; + if (box1->frame_vector[i6][1]>(y0+ y1)/2) ad=99*ad/100; + if (box1->frame_vector[i6][0]<(x0+3*x1)/4) Break; + if (box1->frame_vector[i6][0]<(x0+7*x1)/8) ad=99*ad/100; + + /* check for zZ */ + + /* check if lower left and right points are joined directly */ + ld=line_deviation(box1, i3, i4); + MSG(fprintf(stderr," i1-i2 %d %d dist= %d/%d",i1,i2,ld,2*sq(1024/4));) + if (ld >2*sq(1024/4)) Break; + if (ld > sq(1024/4)) ad=99*ad/100; + + if (box1->m3) { + if(!hchar){ ad=99*ad/100; } + if( gchar){ ad=99*ad/100; } + } else { if (ad==100) ad=99; } /* not 100% sure */ + Setac(box1,'2',ad); + if (ad==100) return '2'; + break; + } + // --- test 3 ------- + for(ad=d=100;dx>3 && dy>4;){ // dy<=dx nicht perfekt! 
besser mittleres + // min-suchen fuer m + DBG( char c_ask='3'; ) + if (sdata->holes.num > 1) Break; /* be tolerant */ + // if( get_bw(x0+dx/2,x0+dx/2,y0,y0+dy/4,box1->p,cs,1) == 0 ) Break; // ~4 + // if( get_bw(x0+dx/2,x0+dx/2,y1-dy/8,y1,box1->p,cs,1) == 0 ) Break; // ~4 + // if( num_cross(x0+dx/2,x0+dx/2,y0 ,y1,box1->p,cs) < 2 ) Break; + // if( num_cross(x0+dx/4,x0+dx/4,y1-dy/2,y1,box1->p,cs) == 0 ) Break; + if( get_bw(dx/2,dx/2, 0,dy/6,bp,cs,1) == 0 ) Break; // ~4 + if( get_bw(dx/2,dx-1, dy/6,dy/6,bp,cs,1) == 0 ) Break; // ~j + if( get_bw(dx/2,dx/2,dy-1-dy/8,dy-1,bp,cs,1) == 0 ) Break; // ~4 + if( num_cross(dx/2,dx/2,0 ,dy-1,bp,cs) < 2 // normal + && num_cross(dx/3,dx/3,0 ,dy-1,bp,cs) < 2 ) Break; // fat LCD + if( num_cross(dx/4,dx/4,dy-1-dy/2,dy-1,bp,cs) == 0 ) Break; + if( loop(bp,dx/2, 0 ,dy,cs,0,DO)>dy/4 ) Break; + if( loop(bp,dx/2, dy-1,dy,cs,0,UP)>dy/4 ) Break; + if( loop(bp,dx-1, dy/3,dy,cs,0,LE)>dy/4 /* 3 with upper bow */ + && loop(bp,dx-1, dy/8,dy,cs,0,LE)>dy/4 /* 3 with horizontal line */ + && loop(bp,dx/4, dy/8,dy,cs,1,RI)<dy/2 ) Break; + if( loop(bp,dx-1,2*dy/3,dy,cs,0,LE)>dy/4 ) Break; + if( loop(bp,dx-1,3*dy/4,dy,cs,0,LE)>dy/2 ) Break; // ~2 Feb06 + if( loop(bp,dx-1,7*dy/8,dy,cs,0,LE)>dy/2 ) Break; // ~2 Feb06 + // search upper right half circle + for( i3=x=0,i1=y=dy/5;y<dy/2;y++ ){ + i=loop(bp,0,y,dx,cs,0,RI); + if (i>x) { i3=x=i; i1=y; } + } i3--; if (i3<dx/3 && i3+1+loop(bp,i3+1,i1,dx,cs,1,RI)<3*dx/4) Break; + if (loop(bp,dx-1,i1,dx,cs,0,LE)>1+dx/8) ad=ad*99/100; // ~1 with a pixel + // search lower right half circle + for( i4=x=0,i2=y=dy-1-dy/8;y>=dy/2;y-- ){ + i=loop(bp,0,y,dx,cs,0,RI); + if( i>x ) { i4=x=i;i2=y; } + } i4--; if(i4<dx/3 && i4+1+loop(bp,i4+1,i2,dx,cs,1,RI)<3*dx/4) Break; + if (loop(bp,dx-1,i2,dx,cs,0,LE)>1+dx/8) ad=ad*99/100; // ~1 with a pixel + + for( x=xa=0,ya=y=dy/4;y<3*dy/4;y++ ){ // right gap, not on LCD-font + i=loop(bp,dx-1,y,dx,cs,0,LE); + if (i>=xa) { xa=i;ya=y;x=xa+loop(bp,dx-1-xa,y,dx,cs,1,LE); } + } if (dy>3*dx) if 
(xa<2 && x-xa<dx/2) Break; // ~] + if (xa>1+dx/8 // noLCD + && xa<=loop(bp,dx-1,i2,dx,cs,0,LE)) ad=ad*99/100; // ~1 with a pixel + if (xa>1+dx/8 // noLCD + && xa<=loop(bp,dx-1,i1,dx,cs,0,LE)) ad=ad*99/100; // ~1 with a pixel + + + if( get_bw(i3,i3,i1,i2 ,bp,cs,1) != 1 ) Break; + if( get_bw(i4,i4,i1,i2 ,bp,cs,1) != 1 ) Break; + if( get_bw(i3,i3,0 ,i1 ,bp,cs,1) != 1 ) Break; + if( get_bw(i4,i4,i1,dy-1,bp,cs,1) != 1 ) Break; // m like + // hole is only allowed in beauty fonts + // if( num_hole( x0, x1, y0, y1,box1->p,cs,NULL) > 0 ) // there is no hole + // if( num_hole( x0, x0+dx/2, y0, y0+dy/2,box1->p,cs,NULL) == 0 ) // except in some beauty fonts + if (sdata->holes.num>0) + if (sdata->holes.hole[0].x1 >= dx/2 || sdata->holes.hole[0].y1 >= dy/2) + Break; + Setac(box1,(wchar_t)'3',ad); + if (ad==100) return '3'; + break; + } + // --- test 4 --------------------------------------------------- 25Nov06 + for(ad=d=100;dy>3 && dx>2;){ // min 3x4 ~<gA', + // rewritten for vectors 0.42 + int ld, i1, i2, i3, i4, i5, i6, i7; // line derivation + corners + DBG( wchar_t c_ask='4'; ) + if (sdata->holes.num > 1) Break; /* no or one hole */ + /* half distance to the center */ + d=2*sq(128/4); + /* now we check for the lower left end, must be far away */ + if (aa[1][2]<d/8) Break; /* [2] = distance, ~ABDEF... 
*/ + /* searching for 4 notches between neighbouring ends */ + +/* + type A B C D + + 1 5 1 1 1 + O O O O5 O5 + 2OO3 O 5 O O O O <- 7 6 + O 2O3O 2OO3 2OO3O + 4 4 4 4 +*/ + + /* Warning: aa0 can be left upper or left lower point for type B */ + /* get a point on the inner low left side of the J */ + i =box1->num_frame_vectors[0] - 1; + /* leftmost upper point */ + i1=nearest_frame_vector(box1, 0, i, x0, y0-dy); + /* lowest from leftmost vector can be very low (20/23) */ + i2=nearest_frame_vector(box1, 0, i, x0-2*dx, (y0+7*y1)/8); + /* lowest vector */ + i4=nearest_frame_vector(box1, 0, i, (x0+2*x1)/3, y1+dy); + /* right center crossing point */ + i3=nearest_frame_vector(box1, i2, i4, x1, (3*y0+y1)/4); + /* get a point on the outer right side below top serif */ + /* next local max_y-point after i4 */ + i5=i4; + for (i=i4;i!=i2;i=(i+1)%box1->num_frame_vectors[0]) { + if (box1->frame_vector[ i][1] + <box1->frame_vector[i5][1]) i5=i; // get next maximum + if (box1->frame_vector[ i][1] + >box1->frame_vector[i5][1]+1) break; // break after maximum + if (box1->frame_vector[ i][0]<x0+dx/4) break; // type A B + } + if (box1->num_frames>1) { // type C D + i = box1->num_frame_vectors[0] - 1; // end outer loop + j = box1->num_frame_vectors[1] - 1; // end inner loop + i6=nearest_frame_vector(box1, i+1, j, x1, y1); + i7=nearest_frame_vector(box1, i+1, j, x0, y1); + if (box1->frame_vector[i1][0] + -box1->frame_vector[i2][0]<dx/4+1) ad=96*ad/100; // ~4x6q + i =nearest_frame_vector(box1, i+1, j, x0, y0); // top left + MSG(fprintf(stderr,"triangle type top-left i=%d",i);) + if (box1->frame_vector[i ][0]-x0<dx/4+1 + && box1->frame_vector[i ][1]-y0<dy/4+1 + && dx>7) ad=97*ad/100; // q + + } else { // type A B + i6=nearest_frame_vector(box1, i5, i1, (x0+3*x1)/4, y1-dy/8); + i7=nearest_frame_vector(box1, i5, i1, x0 , y1-dy/8); + MSG(fprintf(stderr,"open type");) + } + // ToDo: output no=(x,y) + MSG(fprintf(stderr,"i1-7 %d %d %d %d %d %d %d",i1,i2,i3,i4,i5,i6,i7);) + if (i5==i6) Break; // 
~+ + + if (box1->frame_vector[i1][1]>y0+dy/8) Break; // not to low + if (box1->frame_vector[i2][1] + -box1->frame_vector[i1][1]<dy/2) Break; + if (box1->frame_vector[i3][0] + -box1->frame_vector[i2][0]<dx/4) Break; + if (abs(box1->frame_vector[i3][1] + -box1->frame_vector[i2][1])>dy/4) Break; + if (box1->frame_vector[i2][0]>x0+dx/8) Break; + if (box1->frame_vector[i2][1]>y1-dy/8) Break; + if (box1->frame_vector[i4][1] + -box1->frame_vector[i2][1]<dy/8) Break; + if (box1->frame_vector[i4][1] + -box1->frame_vector[i2][1]<dy/6) ad=99*ad/100; + /* min. distance of the horizontal bar to the ground */ + if (box1->frame_vector[i4][1] + -box1->frame_vector[i3][1]<1+dy/16) Break; + if (box1->frame_vector[i4][1] + -box1->frame_vector[i3][1]<dy/6) ad=99*ad/100; /* tall chars */ + if (box1->frame_vector[i4][1] + -box1->frame_vector[i3][1]<dy/8) ad=99*ad/100; + if (box1->frame_vector[i4][1]<y1-1-dy/8) Break; + if (box1->frame_vector[i3][0]<x0+dx/4) Break; + if (box1->frame_vector[i3][0]<x0+dx/2) ad=98*ad/100; + /* on very tall chars the i3 point can be near to the groundline */ + if (box1->frame_vector[i3][1]>y1-1) Break; + if (box1->frame_vector[i3][1]>y1-dy/16) Break; + if (box1->frame_vector[i3][1]>=y1) Break; // ~5x5# + if (box1->frame_vector[i5][0]<x0+dx/3) Break; + /* upper end of right vertical line */ + if (box1->frame_vector[i5][1]>y0+2*dy/3) Break; + if (box1->frame_vector[i6][1] + -box1->frame_vector[i5][1]<1+dy/16) Break; + if (box1->frame_vector[i6][0]<x0+dx/3) Break; + if (box1->frame_vector[i7][0]>x0+dx/2) Break; + if (box1->frame_vector[i7][0]>x0+dx/3) ad=ad*99/100; + if (box1->frame_vector[i6][1]<y0+dy/3) Break; + if (box1->frame_vector[i6][0]<x0+dx/2) ad=96*ad/100; // ~ 42 + if (box1->frame_vector[i6][0]<aa[2][0]-dx/2 + && aa[2][1]>=y1-1-dy/8) ad=96*ad/100; // ~ 42 + if (box1->frame_vector[i7][1]<y0+dy/3) Break; + if (abs(box1->frame_vector[i3][1] + -box1->frame_vector[i2][1])>dy/4) Break; + + /* check if upper left and lower left points are joined directly */ 
+ ld=line_deviation(box1, i1, i2); + MSG(fprintf(stderr," i1-i2 %d %d dist= %d/%d",i1,i2,ld,2*sq(1024/4));) + if (ld >2*sq(1024/4)) Break; + /* check if lower right and upper right points are joined directly */ + ld=line_deviation(box1, i2, i3); + MSG(fprintf(stderr," i2-i3 %d %d dist= %d/%d",i2,i3,ld,2*sq(1024/4));) + if (ld > sq(1024/4)) Break; + /* check if lower right and upper right points are joined directly */ + ld=line_deviation(box1, i3, i4); + MSG(fprintf(stderr," i3-i4 %d %d dist= %d/%d",i3,i4,ld,2*sq(1024/4));) + if (ld > sq(1024/4)) Break; + /* check if lower right and upper right points are joined directly */ + ld=line_deviation(box1, i6, i7); + MSG(fprintf(stderr," i6-i7 %d %d dist= %d/%d",i6,i7,ld,2*sq(1024/4));) + if (ld >2*sq(1024/4)) Break; + + // 4 exists as gchar and ~gchar + if(!hchar){ ad=99*ad/100; } + Setac(box1,'4',ad); + break; + } +#ifdef Old_pixel_based + // --- old test 4 pixelbased ------- remove! + for(ad=d=100;dx>3 && dy>5;){ // dy>dx, min 4x6 font + DBG( char c_ask='4'; ) + if (sdata->holes.num > 2) Break; /* be tolerant */ + if (sdata->holes.num > 1) ad=97*ad/100; + // upper raising or vertical line + if( loop(bp,0 ,3*dy/16,dx,cs,0,RI) + < loop(bp,0 ,2*dy/4 ,dx,cs,0,RI)-dx/8 ) Break; + // search for a vertical line on lower end + for (y=0;y<dy/4;y++) + if( loop(bp,0 ,dy-1-y,dx,cs,0,RI) + + loop(bp,dx-1,dy-1-y,dx,cs,0,LE) >= dx/2 ) break; + if (y>=dy/4) Break; + if( loop(bp,0 ,dy-1-dy/8,dx,cs,0,RI) < dx/4 ) Break; + // --- follow line from (1,0) to (0,.7) + y=0; x=loop(bp,0,0,dx,cs,0,RI); + if (x<=dx/4) { // ocr-a-4 + i=loop(bp,0,dy/4,dx,cs,0,RI); if (i>dx/4) Break; + i=loop(bp,i,dy/4,dx,cs,1,RI); if (i>dx/2) Break; + j=loop(bp,i,dy/4,dy,cs,0,DO)+dy/4; if (j>7*dy/8) Break; + } + turmite(bp,&x,&y,0,dx-1,0,dy-1,cs,DO,LE); if( x>=0 ) Break; + + y=loop(bp,0,0,dy,cs,0,DO); + if( (y+loop(bp,0,y,dy,cs,1,DO)) < dy/2 ) Break; + if( get_bw(x0 , x0+3*dx/8, y1-dy/7, y1-dy/7,box1->p,cs,1) == 1 ) Break; + if( get_bw(x0+dx/2, x1 , y1-dy/3, 
y1-dy/3,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2, x0+dx/2, y0+dy/3, y1-dy/5,box1->p,cs,1) != 1 ) Break; + i=loop(bp,bp->x-1, bp->y/4,dx,cs,0,LE); + if( i > loop(bp,bp->x-1,2*bp->y/4,dx,cs,0,LE)+1 + && i > loop(bp,bp->x-1,3*bp->y/8,dx,cs,0,LE)+1 ) Break; + if (loop(bp,0,0,dx,cs,0,RI)>dx/4) { + for(i=dx/8+1,x=0;x<dx && i;x++){ + if( num_cross(x ,x ,0 ,dy-1, bp,cs) == 2 ) i--; + } if( i ) Break; + } + for(i=dy/6+1,y=dy/4;y<dy && i;y++){ + if( num_cross(0 ,dx-1,y ,y , bp,cs) == 2 ) i--; + } if( dy>15 && i ) Break; + for(i=dy/10+1,y=dy-1-dy/4;y<dy && i;y++){ + if( num_cross(0 ,dx-1,y ,y , bp,cs) == 1 ) + if( num_cross(dx/2,dx-1,y ,y , bp,cs) == 1 ) i--; + } if( i ) Break; + // i4 = num_hole ( x0, x1, y0, y1,box1->p,cs,NULL); + // ToDo: + // - get start and endpoint of left edge of left vert. line + // and check if that is an streight line + // - check the right edge of the inner hole (if there) too + i4 = sdata->holes.num; + if (sdata->holes.num >0) { // ~q + i = loop(bp,0,dy/16,dx,cs,0,RI); + if (i < dx/3) Break; + if (i < dx/2) ad=98*ad/100; // hole? 
+ if ( loop(bp, 0,dy-1,dy,cs,0,UP) + -loop(bp,dx/8+1,dy-1,dy,cs,0,UP)>dy/16) ad=97*ad/100; + } + // thickness of left vertical line + for (j=y=0;y<dy/6;y++) { + i=loop(bp,dx-1 ,y,dx,cs,0,LE); + i=loop(bp,dx-1-i,y,dx,cs,1,LE); if (i>j) j=i; + } + if (j>=dx/2) ad=98*ad/100; // ~q handwritten a (or very thinn 4) + // ToDo: check y of masscenter of the hole q4 + + if( i4 ) if( dx > 15 ) + if( loop(bp, dx/2, 0,dy,cs,0,DO)<dy/16 + && loop(bp, dx/4, 0,dy,cs,0,DO)<dy/8 + && loop(bp,3*dx/4, 0,dy,cs,0,DO)<dy/8 + && loop(bp, dx/4,dy-1,dy,cs,0,UP)<dy/8 + && loop(bp, dx/2,dy-1,dy,cs,0,UP)<dy/8 + && loop(bp,3*dx/4,dy-1,dy,cs,0,UP)<dy/4 ) Break; // ~9 + + i =loop(bp,dx-1 ,dy-1,dx,cs,0,LE); // ~9 + i+=loop(bp,dx-1-i,dy-1,dx,cs,1,LE); + if( i>3*dx/4 + && i-loop(bp,dx-1,dy-1-dy/8,dx,cs,0,LE)>dx/4 ) Break; + + i =loop(bp,dx-1-dx/4,dy-1,dx,cs,0,UP); + if (i> dy/2) ad=97*ad/100; + if (i>3*dy/4) ad=97*ad/100; /* handwritten n */ + + if( num_cross(0 ,dx-1,dy/16 ,dy/16 , bp,cs) == 2 // ~9 + && loop(bp,dx-1,dy/16 ,dx,cs,0,LE)> + loop(bp,dx-1,dy/16+1+dy/32,dx,cs,0,LE) ) Break; + if ( !hchar) ad=99*ad/100; + if (gchar && !hchar) ad=98*ad/100; // ~q + Setac(box1,(wchar_t)'4',ad); + if (ad>99) bc='4'; + break; + } +#endif + // --- test 6 ------- ocr-a-6 looks like a b :( + for(ad=d=100;dx>3 && dy>4;){ // dy>dx + DBG( char c_ask='6'; ) + if (sdata->holes.num > 2) Break; /* be tolerant */ + if( loop(bp, 0, dy/4,dx,cs,0,RI)>dx/2 // ocr-a=6 + && loop(bp,dx-1, 0,dy,cs,0,DO)>dy/4 ) Break; // italic-6 + if( loop(bp, 0, dy/2,dx,cs,0,RI)>dx/4 ) Break; + if( loop(bp, 0,3*dy/4,dx,cs,0,RI)>dx/4 ) Break; + if( loop(bp,dx-1,3*dy/4,dx,cs,0,LE)>dx/2 ) Break; + if( num_cross(x0+ dx/2,x0+ dx/2,y0 ,y1 ,box1->p,cs) != 3 + && num_cross(x0+5*dx/8,x0+5*dx/8,y0 ,y1 ,box1->p,cs) != 3 ) { + if( num_cross(x0+ dx/2,x0+ dx/2,y0+dy/4,y1 ,box1->p,cs) != 2 + && num_cross(x0+5*dx/8,x0+5*dx/8,y0+dy/4,y1 ,box1->p,cs) != 2 ) Break; + // here we have the problem to decide between ocr-a-6 and b + if ( 
loop(box1->p,(x0+x1)/2,y0,dy,cs,0,DO)<dy/2 ) Break; + ad=99*ad/100; + } else { + if (loop(box1->p,x0+dx/2,y0,dx,cs,0,DO)>dy/8 + && loop(box1->p,x1-dx/4,y0,dx,cs,0,DO)>dy/8 ) Break; + } + if( num_cross(x0 ,x1 ,y1-dy/4,y1-dy/4,box1->p,cs) != 2 ) Break; + for( y=y0+dy/6;y<y0+dy/2;y++ ){ + x =loop(box1->p,x1 ,y ,dx,cs,0,LE); if( x>dx/2 ) break; + x+=loop(box1->p,x1-x+1,y-1,dx,cs,0,LE); if( x>dx/2 ) break; + } if( y>=y0+dy/2 ) Break; + if (loop(box1->p,x0,y1-dy/3,dx,cs,0,RI)>dx/4 ) Break; + if (loop(box1->p,x1,y1-dy/3,dx,cs,0,LE)>dx/4 ) Break; + + if (sdata->holes.num != 1) Break; + if (sdata->holes.hole[0].y1 < dy/2) ad=95*ad/100; // whats good for? + if (sdata->holes.hole[0].y0 < dy/4) Break; +// if( num_hole ( x0, x1, y0, y0+dy/2,box1->p,cs,NULL) > 0 ) ad=95*ad/100; +// if( num_hole ( x0, x1, y0+dy/4, y1,box1->p,cs,NULL) != 1 ) Break; +// if( num_hole ( x0, x1, y0 , y1,box1->p,cs,NULL) != 1 ) Break; +// out_x(box1); printf(" x0 y0 %d %d\n",x0,y0); + /* check left vertical bow */ + i1=loop(bp,0,dy/8 ,dx,cs,0,RI); + i3=loop(bp,0,dy-1-dy/8,dx,cs,0,RI); + i2=loop(bp,0,dy/2 ,dx,cs,0,RI); + if(i1+i3-2*i2<-2-dx/16 && i1+i2+i3>0) Break; // convex from left + if(i1+i3-2*i2<1 && i1+i2+i3>0) ad=99*ad/100; // 7-segment-font + for( x=dx,y=0;y<dy/4;y++ ){ // ~ b (serife?) 
+ i1=loop(bp, 0,y,dx,cs,0,RI); + i2=loop(bp,i1,y,dx,cs,1,RI); + if (i2+i1>dx/2 && i2>dx/4) break; /* its a 6 (example: 7-segment) */ + if (i1<x) x=i1; else if (i1>x) break; /* may be serifen b */ + } if (y<dy/4 && i1+i2<=dx/2) Break; + // ~& (with open upper loop) + for( i=0,y=dy/2;y<dy;y++){ + if( num_cross(dx/2,dx-1,y,y,bp,cs) > 1 ) i++; if( i>dy/8 ) break; + } if( y<dy ) Break; + if ( gchar) ad=99*ad/100; + if (!hchar) ad=98*ad/100; + if ( box1->dots ) ad=98*ad/100; + Setac(box1,(wchar_t)'6',ad); + bc='6'; + break; + } + // --- test 7 --------------------------------------------------- + for(ad=d=100;dx>2 && dy>4;){ // dx>1 dy>2*dx + DBG( char c_ask='7'; ) + if (sdata->holes.num > 1) Break; /* be tolerant */ + if( loop(bp,dx/2,0,dy,cs,0,DO)>dy/8 ) Break; + if( num_cross(0,dx-1,3*dy/4,3*dy/4,bp,cs) != 1 ) Break; // preselect + for( yb=xb=y=0;y<dy/2;y++){ // upper h-line and gap + j=loop(bp,0,y,dx,cs,0,RI);if(xb>0 && j>dx/4) break; // gap after h-line + j=loop(bp,j,y,dx,cs,1,RI);if(j>xb){ xb=j;yb=y; } // h-line + } if( xb<dx/4 || y==dy/2 ) Break; + j=loop(bp,0,dy/2,dx,cs,0,RI); + j=loop(bp,j,dy/2,dx,cs,1,RI); if(xb<2*j) Break; // minimum thickness + for(x=0,y+=dy/16;y<dy;y++){ // one v-line? 
+ if( num_cross(0,dx-1,y,y,bp,cs) != 1 ) break; + j=loop(bp,dx-1,y,dx,cs,0,LE); if( j<x ) break; if( j-1>x ) x=j-1; + } if( y<dy || x<dx/3 ) { + MSG( fprintf(stderr,"xy= %d %d",x,y); ) + Break; + } + j =loop(bp,dx-1,0,dy,cs,0,DO); // ~T + j+=loop(bp,dx-1,j,dy,cs,1,DO)+dy/16; + i =loop(bp,dx-1,j,dx,cs,0,LE); if(j<dy/2) { + if (i>j) Break; + j=loop(bp, 0,j,dx,cs,0,RI); + if(j>dx/4 && j<=i+dx/16) Break; // tall T + } + + MSG( fprintf(stderr,"7: ad= %d",ad); ) + if( loop(bp, 0,3*dy/8,dx,cs,0,RI) + <=loop(bp,dx-1,3*dy/8,dx,cs,0,LE)+dx/8 ) ad=ad*98/100; // l + MSG( fprintf(stderr,"7: ad= %d",ad); ) + if( num_cross(0,dx-1,dy/4,dy/4,bp,cs) == 1 + && loop(bp,0,dy/4,dx,cs,0,RI) < dx/2 ) ad=ad*96/100; // J + MSG( fprintf(stderr,"7: ad= %d",ad); ) + + if (box1->m3 && dy<box1->m3-box1->m2) ad=99*ad/100; // too small + if (box1->m3 && 2*dy<box1->m3-box1->m2) ad=96*ad/100; // too small + if (dy>3*dx) ad=99*ad/100; // ) + if ( gchar) ad=99*ad/100; // J + if (!hchar) ad=99*ad/100; + Setac(box1,(wchar_t)'7',ad); + break; + } + // --- test 8 --------------------------------------------------- + // last change: May15th,2000 JS + for(ad=d=100;dx>2 && dy>4;){ // or we need large height + DBG( char c_ask='8'; ) + if (sdata->holes.num != 2) Break; + if( num_cross(x0,x1,y0 +dy/4,y0 +dy/4,box1->p,cs) != 2 ) Break; // ~gr (glued) + if( num_cross(x0,x1,y1 -dy/4,y1 -dy/4,box1->p,cs) != 2 + && num_cross(x0,x1,y1-3*dy/8,y1-3*dy/8,box1->p,cs) != 2 ) Break; + if( get_bw(x0,x0+dx/4,y1-dy/4,y1-dy/4,box1->p,cs,1) == 0 ) Break; // ~9 + if( get_bw(x0,x0+dx/2,y0+dy/4,y0+dy/4,box1->p,cs,1) == 0 ) Break; + if( get_bw(x0+dx/2,x0+dx/2,y0+dy/4,y1-dy/4,box1->p,cs,1) == 0 ) Break; // ~0 +// MSG( printf(" x0 y0 %d %d\n",x0,y0); ) + for( i2=i1=x=0,i=y=y0+dy/3;y<=y1-dy/3;y++){ // check left middle nick + j=loop(box1->p,x0,y,dx,cs,0,RI); + if (j>x || (abs(j-x)<=dx/8 /* care about MICR E-13B font */ + && (i1=loop(box1->p,x0+j,y,dx,cs,1,RI))>dx/2)) { + if (j>x) x=j; i=y; if (i1>i2) i2=i1; } + } if(i>=y1-dy/3 || 
(x<dx/8 && i2<=dx/2) || x>dx/2) Break; // no gB + if (x< dx/4) ad=99*ad/100; // no B + if (x<=dx/8) ad=98*ad/100; // no B + j = loop(box1->p,x1,y1- dy/4,dx,cs,0,LE); + if( j>loop(box1->p,x1,y1- dy/5,dx,cs,0,LE) + && j>loop(box1->p,x1,y1-2*dy/5,dx,cs,0,LE) ) Break; // & + // check for upper hole + for (j=0;j<sdata->holes.num;j++) { + if (sdata->holes.hole[j].y1 < i-y0+1 ) break; + if (sdata->holes.hole[j].y1 < i-y0+dy/8) break; + } if (j==sdata->holes.num) Break; // not found + // if( num_hole(x0,x1,y0,i+1 ,box1->p,cs,NULL)!=1 ) + // if( num_hole(x0,x1,y0,i+dy/8,box1->p,cs,NULL)!=1 ) Break; // upper hole + // check for lower hole + for (j=0;j<sdata->holes.num;j++) { + if (sdata->holes.hole[j].y0 > i-y0-1 ) break; + } if (j==sdata->holes.num) Break; // not found + // if( num_hole(x0,x1,i-1,y1,box1->p,cs,NULL)!=1 ) Break; + i1=i; // left middle nick + /* find the middle right nick */ + for( x=0,i2=i=y=y0+dy/3;y<=y1-dy/3;y++){ + j=loop(box1->p,x1,y,dx,cs,0,LE); if( j>=x ) i2=y; + /* we care also for 7-segment and unusual fonts */ + if (j>x || (abs(j-x)<=(dx+4)/8 + && loop(box1->p,x1-j,y,dx,cs,1,LE)>dx/2)){ + if (j>x) x=j; i=y; } + // MSG(fprintf(stderr," yjix %d %d %d %d %d %d",y-y0,j,i-y0,x,loop(box1->p,x1-j,y,dx,cs,1,LE),dx/2);) + } + if( i>y0+dy/2+dy/10 ) Break; + // if( x<dx/8 ) Break; + if( x>dx/2 ) Break; + MSG(fprintf(stderr,"center bar at y= %d %d x=%d+%d i1=%d",i-y0,i2-y0,x,j,i1);) + if( num_cross(x0,x1, i , i ,box1->p,cs) != 1 + && num_cross(x0,x1, i+1 , i+1 ,box1->p,cs) != 1 + && num_cross(x0,x1,(i+i2)/2,(i+i2)/2,box1->p,cs) != 1 ) Break; // no g + if(abs(i1-i)>(dy+5)/10) ad=99*ad/100; // y-distance right-left-nick + if(abs(i1-i)>(dy+4)/8) ad=99*ad/100; // y-distance right-left-nick + if(abs(i1-i)>(dy+2)/4) Break; + // ~B ff + for(i=dx,y=0;y<dy/8+2;y++){ + j=loop(bp,0,y,dx,cs,0,RI); if( j<i ) i=j; if( j>i+dx/16 ) break; + } if( y<dy/8+2 ) Break; + for(i=dx,y=0;y<dy/8+2;y++){ + j=loop(bp,0,dy-1-y,dx,cs,0,RI); + if( j<i ) i=j; if( j>i+dx/16 ) break; + } if( 
y<dy/8+2 ) Break; + if( dy>16 && num_cross(0,dx-1,dy-1,dy-1,bp,cs) > 1 + && loop(bp,0,dy-1,dx,cs,0,RI) <dx/8+1 ) Break; // no fat serif S + for( i=0,y=dy/2;y<dy;y++){ + if( num_cross(0,dx-1,y,y,bp,cs) > 2 ) i++; if( i>dy/8 ) break; + } if( y<dy ) Break; + if ( loop(bp,dx-1,0,dx,cs,0,LE)==0 ) ad=99*ad/100; + if (num_cross( 0,dx-1,dy-1,dy-1,bp,cs) > 1) ad=98*ad/100; // & + if (num_cross(dx-1,dx-1,dy/2,dy-1,bp,cs) > 1) ad=98*ad/100; // & + if (num_cross( 0,dx-1, 0, 0,bp,cs) > 1) ad=98*ad/100; + if (dy>15) + if (num_cross( 0,dx-1, 1, 1,bp,cs) > 1) ad=98*ad/100; + /* if m1..4 is unsure ignore hchar and gchar ~ga */ + if (!hchar) { + if ((box1->m2-box1->y0)*8>=dy) ad=98*ad/100; + else ad=99*ad/100; + } + if ( gchar + && (box1->y1-box1->m3)*8>=dy) ad=99*ad/100; + Setac(box1,(wchar_t)'8',ad); + break; + } + // --- test 9 \it g --------------------------------------------------- + /* + * lcd micr + * ooo ooo + * o o o o + * ooo ooo + * o o + * ooo o + */ + for(ad=d=100;dx>2 && dy>4;){ // dx>1 dy>2*dx + DBG( char c_ask='9'; ) + if (sdata->holes.num > 1) Break; + if( num_cross(x0+ dx/2,x0+ dx/2,y0,y1-dy/4,box1->p,cs) != 2 // pre select + && num_cross(x0+ dx/2,x0+ dx/2,y0, y1,box1->p,cs) != 3 // pre select + && num_cross(x0+3*dx/8,x0+3*dx/8,y0,y1,box1->p,cs) != 3 + && num_cross(x0+ dx/4,x1 -dx/4,y0,y1,box1->p,cs) != 3 ) Break; + if( num_cross(x0+ dx/2,x0 +dx/2,y0,y0+dy/4,box1->p,cs) < 1 ) Break; + if( num_cross(x0+ dx/2,x1, y0+dy/2 ,y0+dy/2,box1->p,cs) < 1 ) Break; + if( num_cross(x0,x1, y0+ dy/4 ,y0+ dy/4,box1->p,cs) != 2 + && num_cross(x0,x1, y0+3*dy/8 ,y0+3*dy/8,box1->p,cs) != 2 ) Break; + if( num_cross(x1-dx/8,x1,y0+dy/4,y0+dy/4,box1->p,cs) == 0) ad=ad*97/100; // ~4 + for( x=0,i=y=y0+dy/2;y<=y1-dy/4;y++){ // find notch (suche kerbe) + j=loop(box1->p,x0,y,dx,cs,0,RI); + if( j>x ) { x=j; i=y; } + } if (x<1 || x<dx/8) Break; y=i; + // fprintf(stderr," debug 9: %d %d\n",x,i-y0); + if( x<dx/2 ) { /* big bow? 
*/ + j=loop(box1->p,x0+x-1,y,dy/8+1,cs,0,DO)/2; y=i=y+j; + j=loop(box1->p,x0+x-1,y,dx/2 ,cs,0,RI); x+=j; + if (x<dx/2) Break; + } + // check for the right lower bow + MSG( fprintf(stderr,"bow-y0= %d",i-y0); ) + if (dx>5) + if( num_cross(x0+dx/2,x1,i,y1 ,box1->p,cs) != 1 /* fails on 5x8 */ + && num_cross(x0+dx/2,x1,i,y1-dy/8,box1->p,cs) != 1 ) Break; + if( num_cross(x0+dx/2,x0+dx/2,i,y1,box1->p,cs) > 1 ) Break; + if( num_cross(x0+dx/2,x1 ,i, i,box1->p,cs) != 1 ) Break; + + if (sdata->holes.num < 1) { /* happens for 5x7 font */ + if (dx<8) ad=98*ad/100; else Break; } + else { + if (sdata->holes.hole[0].y1 >= i+1) Break; + if (sdata->holes.hole[0].y0 > i-1) Break; + if (sdata->holes.num > 1) + if (sdata->holes.hole[1].y0 > i-1) Break; + // if( num_hole(x0,x1,y0,i+1,box1->p,cs,NULL)!=1 ) Break; + // if( num_hole(x0,x1,i-1,y1,box1->p,cs,NULL)!=0 ) Break; + } + if( loop(box1->p,x0,y1 ,dy,cs,0,RI)>dx/3 && + loop(box1->p,x0,y1-1,dy,cs,0,RI)>dx/3 + && (box1->m3==0 || (box1->m3!=0 && (!hchar || gchar)))) ad=98*ad/100; // no q OR ocr-a-9 + for( x=0,i=y=y0+dy/3;y<=y1-dy/3;y++){ // suche kerbe + j=loop(box1->p,x1,y,dx,cs,0,LE); + if( j>x ) { x=j; i=y; } + } if( x>dx/2 ) Break; // no g + i1=loop(bp,dx-1,dy/8 ,dx,cs,0,LE); if(i1>dx/2) Break; + i3=loop(bp,dx-1,dy-1-dy/8,dx,cs,0,LE); + i2=loop(bp,dx-1,dy/2 ,dx,cs,0,LE); if(i1+i3-2*i2<-1-dx/16) Break; // konvex + i1=loop(bp,dx-1,dy/4 ,dx,cs,0,LE); if(i1>dx/2) Break; + i3=loop(bp,dx-1,dy-1-dy/8,dx,cs,0,LE); + for(y=dy/4;y<dy-1-dy/4;y++){ + i2=loop(bp,dx-1,y,dx,cs,0,LE); + if(i1+i3-2*i2<-1-dx/16) break; // konvex from right ~g ~3 + } if(y<dy-1-dy/4) Break; + x=loop(bp,dx -1,6*dy/8,dx,cs,0,LE); if(x>0){ + x--; // robust + y=loop(bp,dx-x-1, dy-1,dy,cs,0,UP); + if(y<dy/8) Break; // ~q (serif!) + } + if (box1->m3) { + if( gchar) ad=99*ad/100; /* unsure */ + if(!hchar) ad=99*ad/100; /* unsure */ + } else { if (ad==100) ad=99; } /* not 100% sure */ + Setac(box1,(wchar_t)'9',ad); + break; + } + // 0 is same as O !? 
+ // --- test 0 (with one big hole in it ) ----------------------------- + for(d=ad=100;dx>2 && dy>3;){ // min 3x4 + DBG( char c_ask='0'; ) + if (sdata->holes.num > 1) Break; /* be tolerant */ + if( get_bw(x0 , x0+dx/3,y0+dy/2 , y0+dy/2,box1->p,cs,1) != 1 ) Break; + if( get_bw(x1-dx/3 , x1 ,y0+dy/2 , y0+dy/2,box1->p,cs,1) != 1 ) Break; + /* could be an O, unless we find a dot in the center */ + if( get_bw(x0 , x1 ,y0+dy/2 , y0+dy/2,box1->p,cs,1) != 3 ) ad=99; + if( get_bw(x0+dx/2 , x0+dx/2,y1-dy/3 , y1,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2 , x0+dx/2,y0 , y0+dy/3,box1->p,cs,1) != 1 ) Break; + /* accept 0 with dot in center, accept \/0 too ... */ + if( get_bw(x0+dx/2 , x0+dx/2,y0+dy/3 , y1-dy/3,box1->p,cs,1) != 0 ) Break; + + if( num_cross(x0+dx/2,x0+dx/2,y0 , y1 ,box1->p,cs) != 2 ) Break; + if( num_cross(x0+dx/3,x1-dx/3,y0 , y0 ,box1->p,cs) != 1 ) // AND + if( num_cross(x0+dx/3,x1-dx/3,y0+1 , y0+1 ,box1->p,cs) != 1 ) Break; + if( num_cross(x0+dx/3,x1-dx/3,y1 , y1 ,box1->p,cs) != 1 ) // against "rauschen" + if( num_cross(x0+dx/3,x1-dx/3,y1-1 , y1-1 ,box1->p,cs) != 1 ) Break; + if( num_cross(x0 ,x0 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) + if( num_cross(x0+1 ,x0+1 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) Break; + if( num_cross(x1 ,x1 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) + if( num_cross(x1-1 ,x1-1 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) Break; + // if( num_hole(x0,x1,y0,y1,box1->p,cs,NULL) != 1 ) Break; + if (sdata->holes.num != 1) Break; + + i= loop(bp,0 ,0 ,x1-x0,cs,0,RI)- + loop(bp,0 ,2 ,x1-x0,cs,0,RI); + if (i<0) Break; + if (i==0) { + if (loop(bp,dx-1,0 ,x1-x0,cs,0,LE)> + loop(bp,dx-1,2 ,x1-x0,cs,0,LE) ) ad=98*ad/100; + ad=99*ad/100; /* LCD-type? 
*/ + } + + x=loop(bp,dx-1,dy-1-dy/3,x1-x0,cs,0,LE); // should be minimum + for (y=dy-1-dy/3;y<dy;y++){ + i=loop(bp,dx-1,y,x1-x0,cs,0,LE); + if (i<x-dx/16-1) break; if (i>x) x=i; + } + if( y<dy ) Break; + + // ~D (but ocr-a-font) + i= loop(bp, 0, dy/16,dx,cs,0,RI) + + loop(bp, 0,dy-1-dy/16,dx,cs,0,RI) + - 2*loop(bp, 0, dy/2 ,dx,cs,0,RI); + j= loop(bp,dx-1, dy/16,dx,cs,0,LE) + + loop(bp,dx-1,dy-1-dy/16,dx,cs,0,LE) + <= 2*loop(bp,dx-1, dy/2 ,dx,cs,0,LE); + if (i<-dx/8 || i+dx/8<j) Break; // not konvex + + if( loop(bp,dx-1, dy/16,dx,cs,0,LE)>dx/8 ) + if( loop(bp,0 , dy/16,dx,cs,0,RI)<dx/16 ) Break; + if( loop(bp,dx-1,dy-1-dy/16,dx,cs,0,LE)>dx/8 ) + if( loop(bp,0 ,dy-1-dy/16,dx,cs,0,RI)<dx/16 ) Break; + if( get_bw(x1-dx/32,x1,y0,y0+dy/32,box1->p,cs,1) == 0 + && get_bw(x1-dx/32,x1,y1-dy/32,y1,box1->p,cs,1) == 0 + && ( get_bw(x0,x0+dx/32,y0,y0+dy/32,box1->p,cs,1) == 1 + || get_bw(x0,x0+dx/32,y1-dy/32,y1,box1->p,cs,1) == 1 ) ) { + if (dx<32) ad=ad*99/100; else Break; // ~D + } + + // search lowest inner white point + for(y=dy,j=x=0;x<dx;x++) { + i =loop(bp,x,dy-1 ,y1-y0,cs,0,UP); + i+=loop(bp,x,dy-1-i,y1-y0,cs,1,UP); + if (i<=y) { y=i; j=x; } + } i=y; + // italic a + for(y=dy-1-i;y<dy-1;y++) + if( num_cross(j,dx-1,y,y,bp,cs) > 1 ) ad=99*ad/100; // ~a \it a + + if (loop(bp, 0, 0,x1-x0,cs,0,RI)>=dx/8) { // round, notLCD + if (loop(bp,dx-1,dy-1,x1-x0,cs,0,LE)<dx/8) ad=98*ad/100; // \it a + if (loop(bp,dx-1, 0,x1-x0,cs,0,LE)<dx/8) ad=98*ad/100; // \it a + } + + if (abs(loop(bp,dx/2, 0,dy,cs,0,DO) + -loop(bp,dx/2,dy-1,dy,cs,0,UP))>dy/8 + || num_cross(0,dx-1, 0, 0,bp,cs) > 1 + || num_cross(0,dx-1,dy-1,dy-1,bp,cs) > 1 + ) ad=98*ad/100; // ~bq + + if (box1->m3) { + if (!hchar) ad=98*ad/100; else // ~o + if ( gchar) ad=99*ad/100; // wrong line detection? + } else { if (ad==100) ad=99; } /* not 100% sure */ + if (ad>99) ad=99; /* we can never be sure having a O, + let context correction decide, see below! 
*/ + Setac(box1,(wchar_t)'0',ad); + break; + } + // --- test 0 with a straight line in it ------------------- + for(ad=100;dx>4 && dy>5;){ /* v0.3.1+ */ + DBG( char c_ask='0'; ) + if (sdata->holes.num > 3) Break; /* be tolerant */ + if (sdata->holes.num < 1) Break; + if (sdata->holes.num != 2) ad=95*ad/100; + if( get_bw(x0 , x0+dx/2,y0+dy/2 , y0+dy/2,box1->p,cs,1) != 1 ) Break; + if( get_bw(x1-dx/2 , x1 ,y0+dy/2 , y0+dy/2,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2 , x0+dx/2,y1-dy/2 , y1,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2 , x0+dx/2,y0 , y0+dy/2,box1->p,cs,1) != 1 ) Break; + if( get_bw(x0+dx/2 , x0+dx/2,y0+dy/3 , y1-dy/3,box1->p,cs,1) != 1 ) Break; + // out_x(box1); printf(" x0 y0 %d %d\n",x0,y0); + if( num_cross(x0+dx/2,x0+dx/2,y0 , y1 ,box1->p,cs) != 3 ) Break; + if( num_cross(x0+dx/3,x1-dx/3,y0 , y0 ,box1->p,cs) != 1 ) // AND + if( num_cross(x0+dx/3,x1-dx/3,y0+1 , y0+1 ,box1->p,cs) != 1 ) Break; + if( num_cross(x0+dx/3,x1-dx/3,y1 , y1 ,box1->p,cs) != 1 ) // against "rauschen" + if( num_cross(x0+dx/3,x1-dx/3,y1-1 , y1-1 ,box1->p,cs) != 1 ) Break; + if( num_cross(x0 ,x0 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) + if( num_cross(x0+1 ,x0+1 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) Break; + if( num_cross(x1 ,x1 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) + if( num_cross(x1-1 ,x1-1 ,y0+dy/3 , y1-dy/3,box1->p,cs) != 1 ) Break; + // if( num_hole(x0,x1,y0,y1,box1->p,cs,NULL) != 2 ) Break; + if (sdata->holes.num != 2) ad=85*ad/100; + + if( loop(bp,0 , 0,x1-x0,cs,0,RI)<= + loop(bp,0 , 2+dy/32,x1-x0,cs,0,RI) ) Break; + x= loop(bp,0 ,dy/2 ,x1-x0,cs,0,RI); + i= loop(bp,0 ,dy/2-1,x1-x0,cs,0,RI); if (i>x) x=i; + i= loop(bp,0 ,dy/2-2,x1-x0,cs,0,RI); if (i>x && dy>8) x=i; + if( loop(bp,0 , dy/4,x1-x0,cs,0,RI)<x ) Break; // ~8 + x= loop(bp,dx-1,dy/2 ,x1-x0,cs,0,LE); + i= loop(bp,dx-1,dy/2-1,x1-x0,cs,0,LE); if(i>x) x=i; + i= loop(bp,dx-1,dy/2-1,x1-x0,cs,0,LE); if(i>x && dy>8) x=i; + if( loop(bp,dx-1,3*dy/4,x1-x0,cs,0,LE)<x) Break; // ~8 + + 
x=loop(bp,dx-1,dy-1-dy/3,x1-x0,cs,0,LE); // should be minimum + for( y=dy-1-dy/3;y<dy;y++ ){ + i=loop(bp,dx-1,y,x1-x0,cs,0,LE); + if (i<x-dx/16) break; + if (i>x) x=i; + } + if( y<dy ) Break; + + /* test for straight line */ + y =loop(bp,dx/2,dy-1 ,y1-y0,cs,0,UP); if(y>dy/4) Break; + y+=loop(bp,dx/2,dy-1-y,y1-y0,cs,1,UP); if(y>dy/3) Break; if (y>dy/4) ad=ad*99/100; + y+=loop(bp,dx/2,dy-1-y,y1-y0,cs,0,UP); if(3*y>2*dy) Break; + x =loop(bp,dx/2,dy-y,dx/2,cs,0,RI); if(x==0) Break; + // MM; fprintf(stderr," y=%d x=%d\n",y-1,x); + if( loop(bp,dx/2+x-1-dx/16,dy-y,y1-y0,cs,0,UP)==0 ) Break; + // $ + for(i=0,y=dy/4;y<dy-dy/4-1;y++) + if( loop(bp, 0,y,dx-1,cs,0,RI) > dx/4 + || loop(bp,dx-1,y,dx-1,cs,0,LE) > dx/4 ) break; + if( y<dy-dy/4-1 ) Break; + + // ~D + if( loop(bp,0, dy/16,dx,cs,0,RI) + + loop(bp,0,dy-1-dy/16,dx,cs,0,RI) + <= 2*loop(bp,0, dy/2 ,dx,cs,0,RI)+dx/8 ) Break; // not konvex + + if( loop(bp,dx-1, dy/16,dx,cs,0,LE)>dx/8 ) + if( loop(bp,0 , dy/16,dx,cs,0,RI)<dx/16 ) Break; + if( loop(bp,dx-1,dy-1-dy/16,dx,cs,0,LE)>dx/8 ) + if( loop(bp,0 ,dy-1-dy/16,dx,cs,0,RI)<dx/16 ) Break; + if( get_bw(x1-dx/32,x1,y0,y0+dy/32,box1->p,cs,1) == 0 + && get_bw(x1-dx/32,x1,y1-dy/32,y1,box1->p,cs,1) == 0 + && ( get_bw(x0,x0+dx/32,y0,y0+dy/32,box1->p,cs,1) == 1 + || get_bw(x0,x0+dx/32,y1-dy/32,y1,box1->p,cs,1) == 1 ) ) Break; // ~D + + /* 5x9 font "9" is like "0" */ + if (dx<16) + if ( num_cross(x0,x0,y0,y1,box1->p,cs) != 1 ) ad=98*ad/100; + + // italic a + for(i=0,y=6*dy/8;y<dy-dy/16;y++) + if( num_cross(0,dx-1,y,y,bp,cs) > 2 ) i++; else i--; + if(i>0) ad=ad*98/100; // ~'a' \it a + if( !hchar ) ad=90*ad/100; + Setac(box1,(wchar_t)'0',ad); + break; + } + return box1->c; +} diff --git a/lib/gocr/ocr1.c b/lib/gocr/ocr1.c new file mode 100644 index 00000000..7ddcc970 --- /dev/null +++ b/lib/gocr/ocr1.c @@ -0,0 +1,84 @@ +// test routines - faster to compile +#include <stdlib.h> +#include <stdio.h> +#include "pgm2asc.h" +#include "unicode.h" +#include "gocr.h" + +// for 
learn_mode/analyze_mode high, with, yoffset, num of pattern_i, +// - holes (center,radius in relative coordinates) etc. => cluster analyze +// num_hole => min-volume, tolerance border +// pattern: @@ @. @@ +// .@ @. .. +// regular filter for large resolutions to make edges more smooth (on boxes) +// extra-filter (only if not recognized?) +// map + same color to (#==change) +// - anti color +// . not used +// strongest neighbour pixels (3x3) => directions +// second/third run with more and more tolerance!? + +/* FIXME jb: following is unused */ +#if 0 +struct lobj { // line-object (for fitting to near lines) + int x0,y0; // starting point (left up) + int x1,y1; // end point (right down) + int mt; // minimum thickness + int q; // quality, overlapp +}; + +/* FIXME jb global */ +struct lobj obj1; +#endif + +// that is the first draft of feature extraction +// detect main lines and bows +// seems bad implemented, looking for better algorithms (ToDo: use autotrace) +#define MAXL 10 +void ocr2(pix *b,int cs){ + int x1,y1,x2,y2,l,i,j,xa[MAXL],ya[MAXL],xb[MAXL],yb[MAXL],ll[MAXL]; + for(i=0;i<MAXL;i++)xa[i]=ya[i]=xb[i]=yb[i]=ll[i]=0; + for(x1=0;x1<b->x;x1++) // very slowly, but simple to program + for(y1=0;y1<b->y;y1++) // brute force + for(x2=0;x2<b->x;x2++) + for(y2=y1+1;y2<b->y;y2++) + { + if( get_line2(x1,y1,x2,y2,b,cs,100)>99 ) + { // line ??? + l=(x2-x1)*(x2-x1)+(y2-y1)*(y2-y1); // len + for(i=0;i<MAXL;i++) + { // remove similar lines (same middle point) IMPROVE IT !!!!!! ??? 
+ if( + abs(x1+x2-xa[i]-xb[i])<1+b->x/2 + && abs(y1+y2-ya[i]-yb[i])<1+b->y/2 + && abs(y1-ya[i])<1+b->y/4 + && abs(x1-xa[i])<1+b->x/4 + ) + { + if( l>ll[i] ) + { + for(j=i;j<MAXL-1;j++) + { // shift table + xa[j]=xa[j+1];ya[j]=ya[j+1]; + xb[j]=xb[j+1];yb[j]=yb[j+1];ll[j]=ll[j+1]; + } + ll[MAXL-1]=0; + } + else break; // forget it if shorter + } + if( l>ll[i] ){ // insert if larger + for(j=MAXL-1;j>i;j--){ // shift table + xa[j]=xa[j-1];ya[j]=ya[j-1]; + xb[j]=xb[j-1];yb[j]=yb[j-1];ll[j]=ll[j-1]; + } + xa[i]=x1;ya[i]=y1;xb[i]=x2;yb[i]=y2;ll[i]=l; + break; + } + } + } + } + for(i=0;i<MAXL;i++){ + printf(" %2d %2d %2d %2d %3d\n",xa[i],ya[i],xb[i],yb[i],ll[i]); + } +} + diff --git a/lib/gocr/ocr1.h b/lib/gocr/ocr1.h new file mode 100644 index 00000000..63a46e2d --- /dev/null +++ b/lib/gocr/ocr1.h @@ -0,0 +1,3 @@ +/* #include "pgm2asc.h" */ +#include "pnm.h" +/* wchar_t ocr1(struct box *box1, pix *b, int cs); */ diff --git a/lib/gocr/otsu.c b/lib/gocr/otsu.c new file mode 100644 index 00000000..50c754ad --- /dev/null +++ b/lib/gocr/otsu.c @@ -0,0 +1,284 @@ +/* +This is a Optical-Character-Recognition program +Copyright (C) 2000-2007 Joerg Schulenburg + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ + see README for EMAIL-address + + the following code was send by Ryan Dibble <dibbler@umich.edu> + + The algorithm is very simple but works good hopefully. + + Compare the grayscale histogram with a mass density diagram: + I think the algorithm is a kind of + divide a body into two parts in a way that the mass + centers have the largest distance from each other, + the function is weighted in a way that same masses have a advantage + + - otsu algorithm is failing on diskrete multi color images + + TODO: + RGB: do the same with all colors (CMYG?) seperately + + test: hardest case = two colors + bbg: test done, using a two color gray file. Output: + # threshold: Value = 43 gmin=43 gmax=188 + + my changes: + - float -> double + - debug option added (vvv & 1..2) + - **image => *image, &image[i][1] => &image[i*cols+1] + - do only count pixels near contrast regions + this makes otsu much better for shadowed fonts or multi colored text + on white background + + (m) Joerg Schulenburg (see README for email address) + + ToDo: + - measure contrast + - detect low-contrast regions + + */ + +#include <stdio.h> +#include <string.h> + +#define Abs(x) ((x<0)?-(x):x) + +/*======================================================================*/ +/* global thresholding routine */ +/* takes a 2D unsigned char array pointer, number of rows, and */ +/* number of cols in the array. 
/*======================================================================*/
/* global thresholding routine                                          */
/*                                                                      */
/* image : gray image, one byte per pixel, row length = cols            */
/* rows  : number of rows of the image (not used by this routine)       */
/* cols  : number of columns of the image                               */
/* x0,y0 : upper left corner of the region to analyze                   */
/* dx,dy : width and height of that region                              */
/* vvv   : if bit 0 is set, debug information is written to stderr      */
/*                                                                      */
/* returns the threshold: 0..threshold-1 is foreground,                 */
/*                        threshold..255 is background                  */
/* returns 160 (with a message on stderr) if no pixel near a contrast   */
/* edge was found.                                                      */
/* side effect: if the dark pixels outnumber the bright pixels at least */
/* 4:1 the region is inverted in place (v -> 255-v) and the threshold   */
/* is mirrored accordingly.                                             */
/*======================================================================*/
int
otsu (unsigned char *image, int rows, int cols,
      int x0, int y0, int dx, int dy, int vvv) {

  int ihist[256];          /* histogram of all sampled pixels */
  int chist[256];          /* histogram of pixels near contrast edges */
  int prev1 = 0;           /* previously visited pixel */
  int prev2 = 0;           /* pixel visited before prev1 */
  int maxc = 0;            /* maximum contrast seen */
  int gmin = 255;          /* darkest sampled value */
  int gmax = 0;            /* brightest sampled value */
  int thresholdValue = 1;  /* value we will threshold at */
  int step = dy/512 + 1;   /* row stride used for sampling */
  int limit;               /* contrast needed to count a pixel in chist */
  int row, col, g;
  int nc = 0;              /* number of pixels in chist */
  int ni = 0;              /* number of pixels in ihist */
  int left, right;         /* chist mass below / above the candidate */
  int dark, bright;        /* ihist mass below / at-or-above the threshold */
  double total = 0.0;      /* first moment of chist */
  double lsum = 0.0;       /* first moment of the left part of chist */
  double best = -1.0;      /* best separation value so far */
  double m1, m2, sb;

  /* zero out the histograms */
  for (g = 0; g < 256; g++) {
    ihist[g] = 0;
    chist[g] = 0;
  }

  /* pass 1: gray range, full histogram and maximum contrast.
   * v0.43: done separately from pass 2 because a combined pass fails
   * on images with a pattern as background (on top) */
  for (row = 0; row < dy; row += step) {
    const unsigned char *line = &image[(y0 + row)*cols + x0];
    for (col = 0; col < dx; col++) {
      int v = line[col];
      int d1 = v - prev1;
      int d2 = v - prev2;
      if (d1 < 0) d1 = -d1;
      if (d2 < 0) d2 = -d2;
      ihist[v]++;
      if (v > gmax) gmax = v;
      if (v < gmin) gmin = v;
      if (d1 > maxc) maxc = d1;   /* new maximum contrast */
      if (d2 > maxc) maxc = d2;   /* new maximum contrast */
      prev2 = prev1;              /* the two predecessors are carried   */
      prev1 = v;                  /* over row ends and into pass 2      */
    }
  }

  /* pass 2: contrast histogram.
   * Aug06: images with large homogeneous areas give bad results,
   * so only pixels on contrast edges are counted */
  limit = maxc/4;
  for (row = 0; row < dy; row += step) {
    const unsigned char *line = &image[(y0 + row)*cols + x0];
    for (col = 0; col < dx; col++) {
      int v = line[col];
      int d1 = v - prev1;
      int d2 = v - prev2;
      if (d1 < 0) d1 = -d1;
      if (d2 < 0) d2 = -d2;
      if (d1 > limit || d2 > limit)
        chist[v]++;               /* count only relevant pixels */
      prev2 = prev1;
      prev1 = v;
    }
  }

  /* masses and first moment */
  for (g = 0; g <= 255; g++) {
    total += (double) g * (double) chist[g];  /* x*f(x) cmass moment */
    nc += chist[g];                           /* f(x)   cmass */
    ni += ihist[g];                           /* f(x)   imass */
  }

  if (nc == 0) {
    /* no contrast pixel at all, nothing to separate */
    fprintf (stderr, "NOT NORMAL, thresholdValue = 160\n");
    return (160);
  }

  /* search the split which maximizes n1*n2*(m2-m1) */
  if ((vvv&1)) // Debug
    fprintf(stderr,"# threshold: value ihist chist mass_dipol_moment\n");
  left = 0;
  for (g = 0; g < 255; g++) {
    left += chist[g];             /* left mass (integration) */
    if (left == 0) continue;      /* we need at least one foreground pixel */
    right = nc - left;            /* right mass */
    if (right == 0) break;        /* we need at least one background pixel */
    lsum += (double) g *chist[g]; /* left mass moment */
    m1 = lsum / left;             /* left mass center (black chars) */
    m2 = (total - lsum) / right;  /* right mass center (white background) */
    /* the classic form uses (m1-m2)^2, the linear form seemed to be
     * better (Aug06) */
    sb = (double) left *(double) right * (m2 - m1);
    if (sb > best) {
      best = sb;
      thresholdValue = g + 1;
    }
    if ((vvv&1) && ihist[g]) // Debug
      fprintf(stderr,"# threshold: %3d %6d %6d %8.2f\n",
              g, ihist[g], chist[g],
              sb/(dx*dy));  /* normalized dipol moment */
  }

  /* now count all pixels for the background detection */
  dark = 0;
  for (g = 0; g < thresholdValue; g++)
    dark += ihist[g];             /* left mass (integration) */
  bright = ni - dark;             /* right mass */

  /* black_char: value<cs,  white_background: value>=cs
   * keep the threshold inside gmin+1..gmax */
  if (thresholdValue > gmax) {
    fprintf(stderr,"# threshold: Value >gmax\n");
    thresholdValue = gmax;
  }
  if (thresholdValue <= gmin) {
    fprintf(stderr,"# threshold: Value<=gmin\n");
    thresholdValue = gmin+1;
  }

  // debug code to display thresholding values
  if ( vvv & 1 )
    fprintf(stderr,"# threshold: Value = %d gmin=%d gmax=%d cmax=%d"
                   " i= %d %d\n",
            thresholdValue, gmin, gmax, maxc, dark, bright);

  if (dark >= 4*bright) { // black>=4*white, obviously black is background
    if ( vvv & 1 )
      fprintf(stderr,"# threshold: invert the image\n");
    /* the inversion is done on every row of the region (no data lost) */
    for (row = 0; row < dy; row++) {
      unsigned char *line = &image[(y0 + row)*cols + x0];
      for (col = 0; col < dx; col++)
        line[col] = 255 - line[col];
    }
    thresholdValue = 255 - thresholdValue + 1;
  }

  return (thresholdValue);
  /* range: 0 < thresholdValue <= 255, example: 1 on b/w images */
}

/*======================================================================*/
/* thresholding the image  (set threshold to 128+32=160=0xA0)           */
/*   now we have a fixed thresholdValue good to recognize on gray image */
/*   - so lower bits can used for other things (bad design?)            */
/* ToDo: different foreground colors, gray on black/white background    */
/*                                                                      */
/* image          : gray image, one byte per pixel, row length = cols   */
/* rows           : number of rows of the image (not used here)         */
/* cols           : number of columns of the image                      */
/* x0,y0,dx,dy    : region which is rescaled in place                   */
/* thresholdValue : first background value; if it is outside of         */
/*                  gmin+1..gmax it is reset to (gmin+gmax+1)/2 and a   */
/*                  message is written to stderr                        */
/*                                                                      */
/* Pixels below the threshold are mapped to 0..149, pixels at or above  */
/* it to 176..255.  Returns the new normalized threshold 160.           */
/*======================================================================*/
int
thresholding (unsigned char *image, int rows, int cols,
  int x0, int y0, int dx, int dy, int thresholdValue) {

  unsigned char *np; // pointer to position in the image we are working with

  int i, j;          // various counters
  int gmin=255,gmax=0;
  int xs, xe, ys, ye; // scan window for the min/max search
  int v;              // pixel value clamped to gmin..gmax

  // The gray range is measured on the interior of the region, the one
  // pixel wide border is skipped (presumably to keep edge artefacts out
  // of the range - TODO confirm).  A region narrower than 3 pixels has
  // no interior, so the whole region is scanned in that case.
  if (dx > 2 && dy > 2) {
    xs = x0 + 1; xe = x0 + dx - 1;
    ys = y0 + 1; ye = y0 + dy - 1;
  } else {
    xs = x0; xe = x0 + dx;
    ys = y0; ye = y0 + dy;
  }
  for (i = ys; i < ye; i++) {
    np = &image[i*cols+xs];
    for (j = xs; j < xe; j++) {
      if(*np > gmax) gmax=*np;
      if(*np < gmin) gmin=*np;
      np++; /* next pixel */
    }
  }

  /* allowed_threshold=gmin+1..gmax v0.43 */
  if (thresholdValue<=gmin || thresholdValue>gmax){
    thresholdValue=(gmin+gmax+1)/2; /* range=0..1 -> threshold=1 */
    fprintf(stderr,"# thresholdValue out of range %d..%d, reset to %d\n",
     gmin, gmax, thresholdValue);
  }

  /* b/w: min=0,tresh=1,max=1 v0.43 */
  // actually performs the thresholding of the image...
  //  later: grayvalues should also be used, only rescaling threshold=160=0xA0
  for (i = y0; i < y0+dy; i++) {
    np = &image[i*cols+x0];
    for (j = x0; j < x0+dx; j++) {
      // Border pixels were not part of the min/max search and can be
      // outside of gmin..gmax.  Without clamping the result would leave
      // 0..255 and wrap in the conversion to unsigned char (dark border
      // pixels turning bright and vice versa), and a uniform interior
      // (thresholdValue == gmin) would divide by zero for darker pixels.
      v = *np;
      if (v < gmin) v = gmin;
      else if (v > gmax) v = gmax;
      *np = (unsigned char) (v >= thresholdValue ?
         (255-(gmax - v)* 80/(gmax - thresholdValue + 1)) :
         (  0+(v - gmin)*150/(thresholdValue - gmin    )) );
      np++;
    }
  }

  return(128+32); // return the new normalized threshold value
  /*   0..159 is foreground */
  /* 160..255 is background */
}

+ +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + see README for EMAIL address +*/ + +#include <string.h> +#include "unicode.h" +#include "output.h" +#include "gocr.h" /* extern job_t JOB; */ + +/* function is only for debugging and for developing + it prints out a part of pixmap b at point x0,y0 to stderr + using dots .,; if no pixel, and @xoO for pixels + modify n_run and print out what would happen on 2nd, 3th loop! + new: output original and copied pixmap in the same figure + */ +void out_b(struct box *px, pix *b, int x0, int y0, int dx, int dy, int cs ){ + int x,y,x2,y2,yy0,tx,ty,n1,i; + char c1, c2; + yy0=y0; + if(px){ /* overwrite rest of arguments */ + if (!b) { + b=px->p; + x0=px->x0; dx=px->x1-px->x0+1; + y0=px->y0; dy=px->y1-px->y0+1; yy0=y0; + } + if(cs==0) cs=JOB->cfg.cs; + fprintf(stderr,"\n# list box x= %4d %4d d= %3d %3d r= %3d %3d" + " nrun=%d p=%p", /* ToDo: r,nrun is obsolete */ + px->x0, px->y0, px->x1 - px->x0 + 1, px->y1 - px->y0 + 1, + px->x - px->x0, px->y - px->y0, JOB->tmp.n_run, (void*)px); + fprintf(stderr,"\n# dots=%d boxes=%d subboxes=%d c=%s mod=%s" + " line=%d m= %d %d %d %d", + px->dots, px->num_boxes, px->num_subboxes, + decode(px->c,ASCII), decode(px->modifier,ASCII), px->line, + px->m1 - px->y0, px->m2 - px->y0, px->m3 - px->y0, px->m4 - px->y0); + if (px->num_frames) { + int i,j,jo; + fprintf(stderr,"\n# frames= %d (sumvects=%d)",px->num_frames, + ((px->num_frames)?px->num_frame_vectors[px->num_frames-1]:-1)); + for (jo=j=i=0; i<px->num_frames; i++, jo=j) { + fprintf(stderr,"\n# frame %d (%+4d,%3d,%2d) ", + i, px->frame_vol[i], 
px->frame_per[i], + px->num_frame_vectors[i]-jo); + /* print only the first vectors of each frame */ + for (;j<px->num_frame_vectors[i] && j<MaxFrameVectors; j++) + fprintf(stderr," #%02d %2d %2d", j, + px->frame_vector[j][0] - px->x0, + px->frame_vector[j][1] - px->y0); + } + } + if (px->num_ac){ /* output table of chars and its probabilities */ + fprintf(stderr,"\n# list box char: "); + for(i=0;i<px->num_ac && i<NumAlt;i++) + /* output the (xml-)string (picture position, barcodes, glyphs, ...) */ + if (px->tas[i]) + fprintf(stderr," %s(%d)", px->tas[i] ,px->wac[i]); + else + fprintf(stderr," %s(%d)",decode(px->tac[i],ASCII),px->wac[i]); + } + fprintf(stderr,"\n"); + if (px->dots && px->m2 && px->m1<y0) { yy0=px->m1; dy=px->y1-yy0+1; } + } + tx=dx/80+1; + ty=dy/40+1; /* step, usually 1, but greater on large maps */ + fprintf(stderr,"# list pattern x= %4d %4d d= %3d %3d t= %d %d\n", + x0,y0,dx,dy,tx,ty); + if (dx>0) + for(y=yy0;y<yy0+dy;y+=ty) { /* reduce the output to max 78x40 */ + /* first image is the copied and modified bitmap of the box */ + if (px) + for(x=x0;x<x0+dx;x+=tx){ /* by merging sub-pixels */ + n1=0; c1='.'; + for(y2=y;y2<y+ty && y2<y0+dy;y2++) /* sub-pixels */ + for(x2=x;x2<x+tx && x2<x0+dx;x2++) + { + if((getpixel(px->p,x2-x0+px->x0, + y2-y0+px->y0)<cs)) c1='@'; + } + if (px->num_frames) { /* mark vectors */ + int i; + if (c1!='$' && c1!='S') /* dont mark twice */ + for (i=0;i<px->num_frame_vectors[px->num_frames-1];i++) + if ((px->frame_vector[i][0]-px->x0)/tx==(x-x0)/tx + && (px->frame_vector[i][1]-px->y0)/ty==(y-y0)/ty) + { c1=((c1=='@')?'$':'S'); break; } + } + fprintf(stderr,"%c", c1 ); + } + + /* 2nd image is the boxframe in the original bitmap */ + if (dx<40) fprintf(stderr," "); + if (dx<40) /* do it only, if we have enough place */ + for(x=x0;x<x0+dx;x+=tx){ /* by merging sub-pixels */ + c1='.'; + for(y2=y;y2<y+ty && y2<y0+dy;y2++) /* sub-pixels */ + for(x2=x;x2<x+tx && x2<x0+dx;x2++) + { if((getpixel(b,x2,y2)<cs)) c1='@'; } + 
fprintf(stderr,"%c", c1 ); + } + + c1=c2=' '; + /* mark lines with < */ + if (px) if (y-y0+px->y0==px->m1 || y-y0+px->y0==px->m2 + || y-y0+px->y0==px->m3 || y-y0+px->y0==px->m4) c1='<'; + if (y==y0 || y==yy0+dy-1) c2='-'; /* boxmarks */ + + fprintf(stderr,"%c%c\n",c1,c2); + } +} + +/* same as out_b, but for faster use, only a box as argument + */ +void out_x(struct box *px) { + out_b(px,NULL,0, 0, 0, 0, JOB->cfg.cs); +} + + +/* print out two boxes side by side, for debugging comparision algos */ +void out_x2(struct box *box1, struct box *box2){ + int x,y,i,tx,ty,dy; + /*FIXME jb static*/static char *c1="OXXXXxx@.,,,,,,,"; + pix *b=&JOB->src.p; + dy=(box1->y1-box1->y0+1); + if(dy<box2->y1-box2->y0+1)dy=box2->y1-box2->y0+1; + tx=(box1->x1-box1->x0)/40+1; + ty=(box1->y1-box1->y0)/40+1; /* step, usually 1, but greater on large maps */ + if(box2)fprintf(stderr,"\n# list 2 patterns"); + for(i=0;i<dy;i+=ty) { /* reduce the output to max 78x40??? */ + fprintf(stderr,"\n"); y=box1->y0+i; + for(x=box1->x0;x<=box1->x1;x+=tx) + fprintf(stderr,"%c", c1[ ((getpixel(b,x,y)<JOB->cfg.cs)?0:8)+marked(b,x,y) ] ); + if(!box2) continue; + fprintf(stderr," "); y=box2->y0+i; + for(x=box2->x0;x<=box2->x1;x+=tx) + fprintf(stderr,"%c", c1[ ((getpixel(b,x,y)<JOB->cfg.cs)?0:8)+marked(b,x,y) ] ); + } +} + + +/* ---- list output ---- for debugging --- + * list all boxes where the results can be found within the c-option + */ +int output_list(job_t *job) { + int i = 0, j; + struct box *box2; + pix *pp = &job->src.p; + char *lc = job->cfg.lc; + + fprintf(stderr,"\n# list shape for charlist %s",lc); + for_each_data(&(JOB->res.boxlist)) { + box2 = (struct box *) list_get_current(&(JOB->res.boxlist)); + for (j=0; j<box2->num_ac; j++) + if (!lc || (box2->tac[j] && strchr(lc, box2->tac[j])) + || (box2->tas[j] && strstr(lc, box2->tas[j]))) break; + if (j<box2->num_ac) + fprintf(stderr,"\n# box found in charlist"); + if (!lc || (strchr(lc, box2->c) && box2->c < 256 && box2->c) + || (strchr(lc, '_') && 
box2->c==UNKNOWN) /* for compability */ + || j<box2->num_ac ){ /* also list alternative chars */ + if (!pp) pp=box2->p; + fprintf(stderr, + "\n# list shape %3d x=%4d %4d d= %3d %3d vf=%d ac=%d %04x %s", + i, box2->x0, box2->y0, + box2->x1 - box2->x0 + 1, + box2->y1 - box2->y0 + 1, + box2->num_frames, box2->num_ac, + (int)box2->c, /* wchar_t -> char ???? */ + decode(box2->c,ASCII) ); + if (JOB->cfg.verbose & 4) out_x(box2); + } + i++; + } end_for_each(&(JOB->res.boxlist)); + fprintf(stderr,"\n"); + return 0; +} + diff --git a/lib/gocr/output.h b/lib/gocr/output.h new file mode 100644 index 00000000..68b00ecd --- /dev/null +++ b/lib/gocr/output.h @@ -0,0 +1,36 @@ +/* +This is a Optical-Character-Recognition program +Copyright (C) 2000 Joerg Schulenburg + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ + see README for EMAIL-address */ + +#ifndef OUTPUT_H +#define OUTPUT_H + +#include <stdlib.h> +#include <stdio.h> +#include "pnm.h" +#include "gocr.h" +#include "list.h" + +void out_b(struct box *px, pix *b, int x0, int y0, int dx, int dy, int cs ); +void out_x(struct box *px); +void out_x2(struct box *box1,struct box *box2); +int output_list(job_t *job); + + +#endif diff --git a/lib/gocr/pgm2asc.c b/lib/gocr/pgm2asc.c new file mode 100644 index 00000000..9d7a3ef4 --- /dev/null +++ b/lib/gocr/pgm2asc.c @@ -0,0 +1,2875 @@ +/* +This is a Optical-Character-Recognition program +Copyright (C) 2000-2007 Joerg Schulenburg + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + see README for EMAIL-address + + sometimes I have written comments in german language, sorry for that + + - look for ??? for preliminary code + - space: avX=22 11-13 (empirical estimated) + avX=16 5-7 + avX= 7 5-6 + + ToDo: - add filter (r/s mismatch) g300c1 + - better get_line2 function (problems on high resolution) + - write parallelizable code! + - learnmode (optimize filter) + - use ispell for final control or if unsure + - better line scanning (if not even) + - step 5: same chars differ? => expert mode + - chars dx>dy and above 50% hor-crossing > 4 is char-group ? 
+ - detect color of chars and background + - better word space calculation (look at the examples) + (distance: left-left, middle-middle, left-right, thickness of e *0.75) + + GLOBAL DATA (mostly structures) + - pix : image - one byte per pixel bits0-2=working + - lines : rows of the text (points to pix) + - box : list of bounding box for character + - obj : objects (lines, splines, etc. building a character) + */ + + +#include <stdlib.h> +#include <stdio.h> +#include <assert.h> +#include <string.h> +#include <ctype.h> +#include "../../config.h" +#ifdef HAVE_WCHAR_H +#include <wchar.h> +#endif + +#include "list.h" +#include "pgm2asc.h" +// #include "pcx.h" /* needed for writebmp (removed later) */ +/* ocr1 is the test-engine - remember: this is development version */ +#include "ocr1.h" +/* first engine */ +#include "ocr0.h" +#include "otsu.h" +#include "progress.h" + +#include "gocr.h" + +/* wew: will be exceeded by capitals at 1200dpi */ +#define MaxBox (100*200) // largest possible letter (buffersize) +#define MAX(a,b) ((a) >= (b) ? (a) : (b)) + +/* if the system does not know about wchar.h, define functions here */ +#ifndef HAVE_WCHAR_H +/* typedef unsigned wchar_t; */ +/* Find the first occurrence of WC in WCS. 
*/ +const wchar_t *wcschr (const wchar_t *wcs, const wchar_t wc) { + int i; for(i=0;wcs[i];i++) if (wcs[i]==wc) return wcs+i; return NULL; +} +const wchar_t *wcscpy (wchar_t *dest, const wchar_t *src) { + int i; for(i=0;src[i];i++) dest[i]=src[i]; dest[i]=0; return dest; +} +size_t wcslen (const wchar_t *s){ + size_t i; for(i=0;s[i];i++); return i; +} +#endif +#ifndef HAVE_WCSDUP +wchar_t * wcsdup (const wchar_t *WS) { /* its a gnu extension */ + wchar_t *copy; + copy = (wchar_t *) malloc((wcslen(WS)+1)*sizeof(wchar_t)); + if (!copy)return NULL; + wcscpy(copy, WS); + return copy; +} +#endif + +// ------------------------ feature extraction ----------------- +// ------------------------------------------------------------- +// detect maximas in of line overlaps (return in %) and line coordinates +// this is for future use +#define HOR 1 // horizontal +#define VER 2 // vertical +#define RIS 3 // rising=steigend +#define FAL 4 // falling=fallend + +/* exchange two variables */ +static void swap(int *a, int *b) { + int c = *a; + *a = *b; + *b = c; +} + +// calculate the overlapping of the line (0-1) with black points +// by recursive bisection +// line: y=dy/dx*x+b, implicit form: d=F(x,y)=dy*x-dx*y+b*dx=0 +// incremental y(i+1)=m*(x(i)+1)+b, F(x+1,y+1)=f(F(x,y)) +// ret & 1 => inverse pixel! +// d=2*F(x,y) integer numbers +int get_line(int x0, int y0, int x1, int y1, pix *p, int cs, int ret){ + int dx,dy,incrE,incrNE,d,x,y,r0,r1,ty,tx, + *px,*py,*pdx,*pdy,*ptx,*pty,*px1; + dx=abs(x1-x0); tx=((x1>x0)?1:-1); // tx=x-spiegelung (new) + dy=abs(y1-y0); ty=((y1>y0)?1:-1); // ty=y-spiegelung (new) + // rotate coordinate system if dy>dx +/*bbg: can be faster if instead of pointers we use the variables and swaps? 
*/ +/*js: Do not know, I am happy that the current code is working and is small */ + if(dx>dy){ pdx=&dx;pdy=&dy;px=&x;py=&y;ptx=&tx;pty=&ty;px1=&x1; } + else { pdx=&dy;pdy=&dx;px=&y;py=&x;ptx=&ty;pty=&tx;px1=&y1; } + if( *ptx<0 ){ swap(&x0,&x1);swap(&y0,&y1);tx=-tx;ty=-ty; } + d=((*pdy)<<1)-(*pdx); incrE=(*pdy)<<1; incrNE=((*pdy)-(*pdx))<<1; + x=x0; y=y0; r0=r1=0; /* dd=tolerance (store max drift) */ + while( (*px)<=(*px1) ){ + if( ((getpixel(p,x,y)<cs)?1:0)^(ret&1) ) r0++; else r1++; + (*px)++; if( d<=0 ){ d+=incrE; } else { d+=incrNE; (*py)+=(*pty); } + } + return (r0*(ret&~1))/(r0+r1); // ret==100 => percentage % +} + +// this function should detect whether a direct connection between points +// exists or not, not finally implemented +// ret & 1 => inverse pixel! +// d=2*F(x,y) integer numbers, ideal line: ,I pixel: I@ +// ..@ @@@ .@. ...,@2@. +1..+3 floodfill around line ??? +// ..@ .@@ .@. ...,.@@@ +2..+4 <= that's not implemented yet +// ..@ ..@ .@. ...,.@@@ +2..+4 +// @.@ @.. .@. ...,@@@. +1..+3 +// @.@ @@. .@. ...I@@@. 0..+3 +// @@@ @@@ .@. ..@1@@.. 
0..+2 +// 90% 0% 100% 90% r1-r2 +// I am not satisfied with it +int get_line2(int x0, int y0, int x1, int y1, pix *p, int cs, int ret){ + int dx,dy,incrE,incrNE,d,x,y,r0,r1,ty,tx,q,ddy,rx,ry, + *px,*py,*pdx,*pdy,*ptx,*pty,*px1; + dx=abs(x1-x0); tx=((x1>x0)?1:-1); // tx=x-spiegelung (new) + dy=abs(y1-y0); ty=((y1>y0)?1:-1); // ty=y-spiegelung (new) + // rotate coordinate system if dy>dx + if(dx>dy){ pdx=&dx;pdy=&dy;px=&x;py=&y;ptx=&tx;pty=&ty;px1=&x1;rx=1;ry=0; } + else { pdx=&dy;pdy=&dx;px=&y;py=&x;ptx=&ty;pty=&tx;px1=&y1;rx=0;ry=1; } + if( *ptx<0 ){ swap(&x0,&x1);swap(&y0,&y1);tx=-tx;ty=-ty; } + d=((*pdy)<<1)-(*pdx); incrE=(*pdy)<<1; incrNE=((*pdy)-(*pdx))<<1; + x=x0; y=y0; r0=r1=0; ddy=3; // tolerance = bit 1 + bit 0 = left+right + // int t=(*pdx)/16,tl,tr; // tolerance, left-,right delimiter + while( (*px)<=(*px1) ){ // not finaly implemented + q=((getpixel(p,x,y)<cs)?1:0)^(ret&1); + if ( !q ){ // tolerance one pixel perpenticular to the line + // what about 2 or more pixels tolerance??? + ddy&=(~1)|(((getpixel(p,x+ry,y+rx)<cs)?1:0)^(ret&1)); + ddy&=(~2)|(((getpixel(p,x-ry,y-rx)<cs)?1:0)^(ret&1))*2; + } else ddy=3; + if( ddy ) r0++; else r1++; + (*px)++; if( d<=0 ){ d+=incrE; } else { d+=incrNE; (*py)+=(*pty); } + } + return (r0*(ret&~1))/(r0+r1); // ret==100 => percentage % +} + +/* Look for dots in the rectangular region x0 <= x <= x1 and y0 <= y + <= y1 in pixmap p. The two low order bits in mask indicate the color + of dots to look for: If mask==1 then look for black dots (where a + pixel value less than cs is considered black). If mask==2 then look + for white dots. If mask==3 then look for both black and white dots. + If the dots are found, the corresponding bits are set in the returned + value. 
Heavily used by the engine ocr0*.cc */ +char get_bw(int x0, int x1, int y0, int y1, pix * p, int cs, int mask) { + char rc = 0; // later with error < 2% (1 dot) + int x, y; + + if (x0 < 0) x0 = 0; + if (x1 >= p->x) x1 = p->x - 1; + if (y0 < 0) y0 = 0; + if (y1 >= p->y) y1 = p->y - 1; + + for ( y = y0; y <= y1; y++) + for ( x = x0; x <= x1; x++) { + rc |= ((getpixel(p, x, y) < cs) ? 1 : 2); // break if rc==3 + if ((rc & mask) == mask) + return mask; // break loop + } + return (rc & mask); +} + +/* more general Mar2000 (x0,x1,y0,y1 instead of x0,y0,x1,y1! (history)) + * look for black crossings throw a line from x0,y0 to x1,y1 and count them + * follow line and count crossings ([white]-black-transitions) + * ex: horizontal num_cross of 'm' would return 3 */ +int num_cross(int x0, int x1, int y0, int y1, pix *p, int cs) { + int rc = 0, col = 0, k, x, y, i, d; // rc=crossings col=0=white + int dx = x1 - x0, dy = y1 - y0; + + d = MAX(abs(dx), abs(dy)); + for (i = 0, x = x0, y = y0; i <= d; i++) { + if (d) { + x = x0 + i * dx / d; + y = y0 + i * dy / d; + } + k = ((getpixel(p, x, y) < cs) ? 
1 : 0); // 0=white 1=black + if (col == 0 && k == 1) + rc++; + col = k; + } + return rc; +} + +/* check if test matches pattern + * possible pattern: "a-zA-Z0-9+\-\\" (x-y dont work for c>127) + * ToDo: wchar_t cc + matching UTF-8 pattern for nonASCII + */ +int my_strchr( char *pattern, wchar_t cc ) { + char *s1; + if (pattern==(char *)NULL) return 0; + + /* if (!(cc&0x80)) s1=strchr(pattern,(char)cc); else */ + s1=strstr(pattern,decode(cc, UTF8)); + switch (cc) { + case '-': + case '\\': + if ((!s1) || s1-pattern<1 || *(s1-1)!='\\') return 0; + else return 1; + default: + if (s1) return 1; /* cc simply matches */ + s1=pattern+1; + while (s1) { + if ((!s1[0]) || (!s1[1])) return 0; /* end of string */ + if (*(s1-1)!='\\' && *(s1-1)<=cc && *(s1+1)>=cc) return 1; + s1=strchr(s1+1,'-'); /* look for next '-' */ + } + } + return 0; +} + +/* set alternate chars and its weight, called from the engine + if a char is recognized to (weight) percent + can be used for filtering (only numbers etc) + often usefull if Il1 are looking very similar + should this function stay in box.c ??? + weight is between 0 and 100 in percent, 100 means absolutely sure + - not final, not time critical (js) + - replace it by a string-function setaobj(*b,"string",weight) + and let call setac the setas function + */ + +int setas(struct box *b, char *as, int weight){ + int i,j; + if (b->num_ac > NumAlt || b->num_ac<0) { + fprintf(stderr,"\nDBG: There is something wrong with setas()!"); + b->num_ac=0; + } + if (as==NULL) { + fprintf(stderr,"\nDBG: setas(NULL) makes no sense!"); return 0; } + if (as[0]==0) { + fprintf(stderr,"\nDBG: setas(\"\") makes no sense!" 
+ " x= %d %d", b->x0, b->y0); + // out_x(b); + return 0; + } + + /* char filter (ex: only numbers) ToDo: cfilter as UTF-8 */ + if (JOB->cfg.cfilter) { + /* do not accept chars which are not in the cfilter string */ + if ( as[0]>0 && as[1]==0 ) + if ( !my_strchr(JOB->cfg.cfilter,as[0]) ) return 0; + } +#if 0 /* obsolete, done in setac */ + /* not sure that this is the right place, but where else? */ + if ( as[0]>0 && as[1]==0 ) + if (b->modifier != SPACE && b->modifier != 0) { + wchar_t newac; + newac = compose(as[0], b->modifier); + as = (char *)decode(newac, UTF8); /* was (const char *) */ + if (newac == as[0]) { /* nothing composed */ + fprintf(stderr, "\nDBG setas compose was useless %d %d",b->x0,b->y0); + // out_x(b); + } + } +#endif + + /* only the first run gets the full weight */ + weight=(100-JOB->tmp.n_run)*weight/100; + + /* remove same entries from table */ + for (i=0;i<b->num_ac;i++) + if (b->tas[i]) + if (strcmp(as,b->tas[i])==0) break; + if (b->num_ac>0 && i<b->num_ac){ + if (weight<=b->wac[i]) return 0; /* if found + less weight ignore it */ + /* to insert the new weigth on the right place, we remove it first */ + if (b->tas[i]) free(b->tas[i]); + for (j=i;j<b->num_ac-1;j++){ /* shift lower entries */ + b->tac[j]=b->tac[j+1]; /* copy the char */ + b->tas[j]=b->tas[j+1]; /* copy the pointer to the string */ + b->wac[j]=b->wac[j+1]; /* copy the weight */ + } + b->num_ac--; /* shrink table */ + } + /* sorting and add it to the table */ + for (i=0;i<b->num_ac;i++) if (weight>b->wac[i]) break; + if (b->num_ac<NumAlt-1) b->num_ac++; /* enlarge table */ + for (j=b->num_ac-1;j>i;j--){ /* shift lower entries */ + b->tac[j]=b->tac[j-1]; /* copy the char */ + b->tas[j]=b->tas[j-1]; /* copy the pointer to the string */ + b->wac[j]=b->wac[j-1]; /* copy the weight */ + } + if (i<b->num_ac) { /* insert new entry */ + b->tac[i]=0; /* insert the char=0 ... */ + b->tas[i]=(char *)malloc(strlen(as)+1); /* ... 
string */ + if (b->tas[i]) memcpy(b->tas[i],as,strlen(as)+1); + b->wac[i]=weight; /* ... and its weight */ + } + if (i==0) b->c=b->tac[0]; /* char or 0 for string */ + return 0; +} + +/* ToDo: this function will be replaced by a call of setas() later */ +int setac(struct box *b, wchar_t ac, int weight){ + int i,j; + if ((!b) || b->num_ac > NumAlt || b->num_ac<0) { + fprintf(stderr,"\nDBG: This is a bad call to setac()!"); + b->num_ac=0; + } + if (ac==0 || ac==UNKNOWN) { + fprintf(stderr,"\nDBG: setac(0) makes no sense!"); + return 0; + } + /* char filter (ex: only numbers) ToDo: cfilter as UTF-8 */ + if (JOB->cfg.cfilter) { + /* do not accept chars which are not in the cfilter string */ + /* if ( ac>255 || !strchr(JOB->cfg.cfilter,(char)ac) ) return 0; */ + if ( !my_strchr(JOB->cfg.cfilter,ac) ) return 0; + } + /* not sure that this is the right place, but where else? */ + if (b->modifier != SPACE && b->modifier != 0) { + wchar_t newac; + newac = compose(ac, b->modifier); + if (newac == ac) { /* nothing composed */ + if(JOB->cfg.verbose & 7) + fprintf(stderr, "\nDBG setac(%s): compose was useless @ %d %d", + decode(ac,ASCII), b->x0, b->y0); + } + ac = newac; + } + + /* only the first run gets the full weight */ + weight=(100-JOB->tmp.n_run)*weight/100; + + /* remove same entries from table */ + for (i=0;i<b->num_ac;i++) if (ac==b->tac[i]) break; + if (b->num_ac>0 && i<b->num_ac){ + if (weight<=b->wac[i]) return 0; + if (b->tas[i]) free(b->tas[i]); + for (j=i;j<b->num_ac-1;j++){ /* shift lower entries */ + b->tac[j]=b->tac[j+1]; /* copy the char */ + b->tas[j]=b->tas[j+1]; /* copy the pointer to the string */ + b->wac[j]=b->wac[j+1]; /* copy the weight */ + } + b->num_ac--; /* shrink table */ + } + /* sorting it to the table */ + for (i=0;i<b->num_ac;i++) if (weight>b->wac[i]) break; + if (b->num_ac<NumAlt-1) b->num_ac++; /* enlarge table */ + for (j=b->num_ac-1;j>i;j--){ /* shift lower entries */ + b->tac[j]=b->tac[j-1]; /* copy the char */ + b->tas[j]=b->tas[j-1]; 
/* copy the pointer to the string */ + b->wac[j]=b->wac[j-1]; /* copy the weight */ + } + if (i<b->num_ac) { /* insert new entry */ + b->tac[i]=ac; /* insert the char ... */ + b->tas[j]=NULL; /* ... no string (?) */ + b->wac[i]=weight; /* ... and its weight */ + } + if (i==0) b->c=ac; /* store best result to b->c (will be obsolete) */ + + return 0; +} + +/* test if ac in wac-table + usefull for contextcorrection and box-splitting + return 0 if not found + return wac if found (wac>0) + */ +int testac(struct box *b, wchar_t ac){ + int i; + if (b->num_ac > NumAlt || b->num_ac<0) { + fprintf(stderr,"\n#DEBUG: There is something wrong with testac()!"); + b->num_ac=0; + } + /* search entries in table */ + for (i=0;i<b->num_ac;i++) if (ac==b->tac[i]) return b->wac[i]; + return 0; +} + + +/* look for edges: follow a line from x0,y0 to x1,y1, record the + * location of each transition, and return their number. + * ex: horizontal num_cross of 'm' would return 6 + * remark: this function is not used, obsolete? ToDo: remove? + */ +int follow_path(int x0, int x1, int y0, int y1, pix *p, int cs, path_t *path) { + int rc = 0, prev, x, y, i, d, color; // rc=crossings col=0=white + int dx = x1 - x0, dy = y1 - y0; + + d = MAX(abs(dx), abs(dy)); + prev = getpixel(p, x0, y0) < cs; // 0=white 1=black + path->start = prev; + for (i = 1, x = x0, y = y0; i <= d; i++) { + if (d) { + x = x0 + i * dx / d; + y = y0 + i * dy / d; + } + color = getpixel(p, x, y) < cs; // 0=white 1=black + if (color != prev){ + if (rc>=path->max){ + int n=path->max*2+10; + path->x = (int *) xrealloc(path->x, n*sizeof(int)); + path->y = (int *) xrealloc(path->y, n*sizeof(int)); + path->max = n; + } + path->x[rc]=x; + path->y[rc]=y; + rc++; + } + prev = color; + } + path->num=rc; + return rc; +} + +/* ToDo: only used in follow_path, which is obsolete, remove? 
*/ +void *xrealloc(void *ptr, size_t size){ + void *p; + p = realloc(ptr, size); + if (size>0 && (!p)){ + fprintf(stderr, "insufficient memory"); + exit(1); + } + return p; +} + +/* + * ------------------------------------------------------------- + * mark edge-points + * - first move forward until b/w-edge + * - more than 2 pixel? + * - loop around + * - if forward pixel : go up, rotate right + * - if forward no pixel : rotate left + * - stop if found first 2 pixel in same order + * go_along_the_right_wall strategy is very similar and used otherwhere + * -------------------------------------------------------------- + * turmite game: inp: start-x,y, regel r_black=UP,r_white=RIght until border + * out: last-position + * + * could be used to extract more features: + * by counting stepps, dead-end streets ,xmax,ymax,ro-,ru-,lo-,lu-edges + * + * use this little animal to find features, I first was happy about it + * but now I prefer the loop() function + */ + +void turmite(pix *p, int *x, int *y, + int x0, int x1, int y0, int y1, int cs, int rw, int rb) { + int r; + if (outbounds(p, x0, y0)) // out of pixmap + return; + while (*x >= x0 && *y >= y0 && *x <= x1 && *y <= y1) { + r = ((getpixel(p, *x, *y) < cs) ? rb : rw); // select rule + switch (r) { + case UP: (*y)--; break; + case DO: (*y)++; break; + case RI: (*x)++; break; + case LE: (*x)--; break; + case ST: break; + default: assert(0); + } + if( r==ST ) break; /* leave the while-loop */ + } +} + +/* search a way from p0 to p1 without crossing pixels of type t + * only two directions, useful to test if there is a gap 's' + * labyrinth algorithm - do you know a faster way? */ +int joined(pix *p, int x0, int y0, int x1, int y1, int cs){ + int t,r,x,y,dx,dy,xa,ya,xb,yb; + x=x0;y=y0;dx=1;dy=0; + if(x1>x0){xa=x0;xb=x1;} else {xb=x0;xa=x1;} + if(y1>y0){ya=y0;yb=y1;} else {yb=y0;ya=y1;} + t=((getpixel(p,x,y)<cs)?1:0); + for(;;){ + if( t==((getpixel(p,x+dy,y-dx)<cs)?1:0) // right free? 
+ && x+dy>=xa && x+dy<=xb && y-dx>=ya && y-dx<=yb) // wall + { r=dy;dy=-dx;dx=r;x+=dx;y+=dy; } // rotate right and step forward + else { r=dx;dx=-dy;dy=r; } // rotate left + // fprintf(stderr," path xy %d-%d %d-%d %d %d %d %d\n",xa,xb,ya,yb,x,y,dx,dy); + if( x==x1 && y==y1 ) return 1; + if( x==x0 && y==y0 && dx==1) return 0; + } + // return 0; // endless loop ? +} + +/* move from x,y to direction r until pixel of color col is found + * or maximum of l steps + * return the number of steps done */ +int loop(pix *p,int x,int y,int l,int cs,int col, DIRECTION r){ + int i=0; + if(x>=0 && y>=0 && x<p->x && y<p->y){ + switch (r) { + case UP: + for( ;i<l && y>=0;i++,y--) + if( (getpixel(p,x,y)<cs)^col ) + break; + break; + case DO: + for( ;i<l && y<p->y;i++,y++) + if( (getpixel(p,x,y)<cs)^col ) + break; + break; + case LE: + for( ;i<l && x>=0;i++,x--) + if( (getpixel(p,x,y)<cs)^col ) + break; + break; + case RI: + for( ;i<l && x<p->x;i++,x++) + if( (getpixel(p,x,y)<cs)^col ) + break; + break; + default:; + } + } + return i; +} + +/* Given a point, frames a rectangle containing all points of the same + * color surrounding it, and mark these points. + * ToDo: obsolate and replaced by frame_vector + * + * looking for better algo: go horizontally and look for upper/lower non_marked_pixel/nopixel + * use lowest three bits for mark + * - recursive version removed! AmigaOS has no Stack-OVL-Event + * run around the chape using laby-robot + * bad changes can lead to endless loop! + * - this is not absolutely sure but mostly works well + * diag - 0: only pi/2 direction, 1: pi/4 directions (diagonal) + * mark - 3 bit marker, mark each valid pixel with it + */ +int frame_nn(pix *p, int x, int y, + int *x0, int *x1, int *y0, int *y1, // enlarge frame + int cs, int mark,int diag){ +#if 1 /* flood-fill to detect black objects, simple and faster? 
*/ + int rc = 0, dx, col, maxstack=0; static int overflow=0; + int bmax=1024, blen=0, *buf; /* buffer as replacement for recursion stack */ + + /* check bounds */ + if (outbounds(p, x, y)) return 0; + /* check if already marked (with mark since v0.4) */ + if ((marked(p,x,y)&mark)==mark) return 0; + + col = ((getpixel(p, x, y) < cs) ? 0 : 1); + buf=(int *)malloc(bmax*sizeof(int)*2); + if (!buf) { fprintf(stderr,"malloc failed (frame_nn)\n");return 0;} + buf[0]=x; + buf[1]=y; + blen=1; + + g_debug(fprintf(stderr,"\nframe_nn x=%4d y=%4d",x,y);) + for ( ; blen ; ) { + /* max stack depth is complexity of the object */ + if (blen>maxstack) maxstack=blen; + blen--; /* reduce the stack */ + x=buf[blen*2+0]; + y=buf[blen*2+1]; + if (y < *y0) *y0 = y; + if (y > *y1) *y1 = y; + /* first go to leftmost pixel */ + for ( ; x>0 && (col == ((getpixel(p, x-1, y) < cs) ? 0 : 1)) ; x--); + if ((marked(p,x,y)&mark)==mark) continue; /* already scanned */ + for (dx=-1;dx<2;dx+=2) /* look at upper and lower line, left */ + if ( diag && x<p->x && x-1>0 && y+dx >=0 && y+dx < p->y + && col != ((getpixel(p, x , y+dx) < cs) ? 0 : 1) + && col == ((getpixel(p, x-1, y+dx) < cs) ? 0 : 1) + && !((marked(p,x-1,y+dx)&mark)==mark) + ) { + if (blen+1>=bmax) { overflow|=1; continue; } + buf[blen*2+0]=x-1; + buf[blen*2+1]=y+dx; + blen++; + } + if (x < *x0) *x0 = x; + /* second go right, mark and get new starting points */ + for ( ; x<p->x && (col == ((getpixel(p, x , y) < cs) ? 0 : 1)) ; x++) { + p->p[x + y * p->x] |= (mark & 7); rc++; /* mark pixel */ + /* enlarge frame */ + if (x > *x1) *x1 = x; + for (dx=-1;dx<2;dx+=2) /* look at upper and lower line */ + if ( col == ((getpixel(p, x , y+dx) < cs) ? 0 : 1) + && ( + col != ((getpixel(p, x-1, y ) < cs) ? 0 : 1) + || col != ((getpixel(p, x-1, y+dx) < cs) ? 
0 : 1) ) + && !((marked(p,x,y+dx)&mark)==mark) && y+dx<p->y && y+dx>=0 + ) { + if (blen+1>=bmax) { overflow|=1; continue; } + buf[blen*2+0]=x; + buf[blen*2+1]=y+dx; + blen++; + } + } + for (dx=-1;dx<2;dx+=2) /* look at upper and lower line, right */ + if ( diag && x<p->x && x-1>0 && y+dx >=0 && y+dx < p->y + && col == ((getpixel(p, x-1, y ) < cs) ? 0 : 1) + && col != ((getpixel(p, x , y ) < cs) ? 0 : 1) + && col != ((getpixel(p, x-1, y+dx) < cs) ? 0 : 1) + && col == ((getpixel(p, x , y+dx) < cs) ? 0 : 1) + && !((marked(p,x,y+dx)&mark)==mark) + ) { + if (blen+1>=bmax) { overflow|=1; continue; } + buf[blen*2+0]=x; + buf[blen*2+1]=y+dx; + blen++; + } + } + + /* debug, ToDo: use info maxstack and pixels for image classification */ + g_debug(fprintf(stderr," maxstack= %4d pixels= %6d",maxstack,rc);) + if (overflow==1){ + overflow|=2; + fprintf(stderr,"# Warning: frame_nn stack oerflow\n"); + } + free(buf); +#else /* old version, ToDo: improve it for tmp04/005*.pgm.gz */ + int i, j, d, dx, ox, oy, od, nx, ny, rc = 0, rot = 0, x2 = x, y2 = y, ln; + + static const int d0[8][2] = { { 0, -1} /* up */, {-1, -1}, + {-1, 0} /* left */, {-1, 1}, + { 0, 1} /* down */, { 1, 1}, + { 1, 0} /* right */, { 1, -1}}; + + /* check bounds */ + if (outbounds(p, x, y)) + return 0; + /* check if already marked */ + if ((marked(p,x,y)&mark)==mark) + return 0; + + i = ((getpixel(p, x, y) < cs) ? 0 : 1); + rc = 0; + + g_debug(fprintf(stderr," start frame:");) + + for (ln = 0; ln < 2 && rot >= 0; ln++) { // repeat if right-loop + g_debug(fprintf(stderr," ln=%d diag=%d cs=%d x=%d y=%d - go to border\n",ln,diag,cs,x,y);) + + od=d=(8+4*ln-diag)&7; // start robot looks up, right is a wall + // go to right (left) border + if (ln==1) { + x=x2; y=y2; + } + /* start on leftmost position */ + for (dx = 1 - 2*ln; x + dx < p->x && x + dx >= 0 /* bounds */ && + i == ((getpixel(p, x + dx, y) < cs) ? 
0 : 1) /* color */; + x += dx); + + g_debug(fprintf(stderr," ln=%d diag=%d cs=%d x=%d y=%d\n",ln,diag,cs,x,y);) + + /* robot stores start-position */ + ox = x; oy = y; + for (rot = 0; abs(rot) <= 64; ) { /* for sure max. 8 spirals */ + /* leftmost position */ + if (ln == 0 && x < x2) { + x2 = x; y2 = y; + } + + g_debug(fprintf(stderr," x=%3d y=%3d d=%d i=%d p=%3d rc=%d\n",x,y,d,i,getpixel(p,x,y),rc);) + + if ( abs(d0[d][1]) ) { /* mark left (right) pixels */ + for (j = 0, dx = d0[d][1]; x + j >= 0 && x + j < p->x + && i == ((getpixel(p, x + j, y) < cs) ? 0 : 1); j += dx) { + if (!((marked(p, x + j, y)&mark)==mark)) + rc++; + p->p[x + j + y * p->x] |= (mark & 7); + } + } + /* look to the front of robot */ + nx = x + d0[d][0]; + ny = y + d0[d][1]; + /* if right is a wall */ + if ( outbounds(p, nx, ny) || i != ((getpixel(p,nx,ny)<cs) ? 0 : 1) ) { + /* rotate left */ + d=(d+2-diag) & 7; rot-=2-diag; + } + else { /* if no wall, go, turn back and rotate left */ + x=nx; y=ny; d=(d+4+2-diag) & 7; rot+=2-diag+4; + /* enlarge frame */ + if (x < *x0) *x0 = x; + if (x > *x1) *x1 = x; + if (y < *y0) *y0 = y; + if (y > *y1) *y1 = y; + } + if(x==ox && y==oy && d==od) break; // round trip finished + } + } + g_debug(fprintf(stderr," rot=%d\n",rot);) +#endif + return rc; +} + +/* obsolete! replaced by vectors + * mark neighbouring pixel of same color, return number + * better with neighbours of same color (more general) ??? + * parameters: (&~7)-pixmap, start-point, critical_value, mark + * recursion is removed */ +int mark_nn(pix * p, int x, int y, int cs, int r) { + /* out of bounds or already marked? */ + if (outbounds(p, x, y) || (marked(p, x, y)&r)==r) + return 0; + { + int x0, x1, y0, y1; + x0 = x1 = x; + y0 = y1 = y; // not used + return frame_nn(p, x, y, &x0, &x1, &y0, &y1, cs, r, JOB->tmp.n_run & 1); + // using same scheme + } +} + +/* ToDo: finish to replace old frame by this new one + * + * @...........#@@@@@@@. 
# = marked as already scanned black pixels + * @........@@@@@@@@@@@# only left and right border + * .......#@@@@@@@@@@@@@ left side on even y + * ......@@@@@@@@#.@@@@# right side on odd y + * .....#@@@@@......#@@@ no border is marked twice + * ....@@@@@#......@@@#. works also for thinn lines + * ...#@@@@........#@@@. - outer loop is stored as first + * ..@@@@#........@@@#.. - inner loop is stored as second + * .#@@@@........#@@@@.. 1st in an extra box (think on white chars) + * @@@@#.......@@@@#.... 2nd merge in an extra step + * #@@@@@....#@@@@@..... + * @@@@@@@@@@@@@@#...... + * .#@@@@@@@@@@@@....... + * + * run around the chape using laby-robot + * - used for scanning boxes, look for horizontal b/w transitions + * with unmarked black pixels and call this routine + * - stop if crossing a marked box in same direction (left=up, right=down) + * box - char box, store frame_vectors and box + * x,y - starting point + * mark - 3 bit marker, mark each valid pixel with it + * diag - 0: only pi/2 direction, 1: pi/4 directions (diagonal) + * ds - start direction, 6=right of right border, 2=left of left border + * ret - 0=ok, -1=already marked, -2=max_num_frames_exceeded + * -7=no border in direction ds + */ +#if 0 +#undef g_debug +#define g_debug(x) x +#endif +/* grep keywords: scan_vectors frame_vector */ +int frame_vector(struct box *box1, int x, int y, + int cs, int mark, int diag, int ds) { + int i1, i2, i2o, + new_x=1, /* flag for storing the vector x,y */ + steps=1, /* steps between stored vectors, speedup for big frames */ + d, /* direction */ + ox, oy, /* starting point */ + nx, ny, mx, my, /* used for simplification */ + /* ToDo: add periphery to box (german: Umfang?) 
*/ + rc = 1, /* return code, circumference, sum vector lengths */ + rot = 0, /* memory for rotation, rot=8 means one full rotation */ + vol = 0; /* volume inside frame, negative for white inside black */ + pix *p=box1->p; + + /* translate the 8 directions to (x,y) pairs, + * if only four directions are used, only every 2nd vector is accessed, + * +1 turn left, -1 turn right + */ + static const int d0[8][2] = + { { 0, -1}, /* up */ {-1, -1}, /* up-le */ + {-1, 0}, /* left */ {-1, 1}, /* do-le */ + { 0, 1}, /* down */ { 1, 1}, /* do-ri */ + { 1, 0}, /* right */ { 1, -1} }; /* up-ri */ + + /* check bounds */ + if (outbounds(p, x, y)) + return 0; + + /* pixel color we are looking for, 0=black, 1=white */ + d = ds; + i1 = ((getpixel(p, x, y ) < cs) ? 0 : 1); + i2 = ((getpixel(p, x + d0[d][0], y + d0[d][1]) < cs) ? 0 : 1); + + g_debug(fprintf(stderr,"\nLEV2 frame_vector @ %3d %3d d%d %2d %2d" + " %d-%d pix=%3d mark=%d cs=%d",\ + x,y,ds,d0[ds][0],d0[ds][1],i1,i2,getpixel(p,x,y),mark,cs);) + + if (i1==i2){ + fprintf(stderr,"ERROR frame_vector: no border\n"); + return -7; /* no border detected */ + } + + /* initialize boxframe outside this function + box1->x0=box1->x1=x; + box1->y0=box1->y1=y; + */ + + /* initialize boxvector outside this function + box1->num_frames=0 + num_frame_vectors[0]=0 ??? + and store start value + */ + if (box1->num_frames > MaxNumFrames) return -2; + /* index to next (x,y) */ + i2o=i2=( (box1->num_frames==0)?0: + box1->num_frame_vectors[ box1->num_frames ] ); +#if 0 // obsolete v0.43 + box1->frame_vector[i2][0]=x; + box1->frame_vector[i2][1]=y; + i2++; + box1->num_frame_vectors[ box1->num_frames ]=i2; +#endif + box1->num_frames++; + + /* robot stores start-position */ + ox = x; oy = y; /* look forward to white pixel */ + + for (;;) { /* stop if same marked pixel touched */ + + g_debug(fprintf(stderr,"\nLEV3: x= %3d %3d d= %d rot= %2d %3d",x,y,d,rot,i2);) + + /* ToDo: store max. abs(rot) ??? 
for better recognition */ + if (new_x) { + g_debug(fprintf(stderr,"\nLEV2: markB xy= %3d %3d ", x, y);) + p->p[x + y * p->x] |= (mark & 7); /* mark black pixel */ + } + + /* store a new vector or enlarge the predecessor */ + if (new_x && (rc%steps)==0) { /* dont store everything on big chars */ + if (i2>=MaxFrameVectors) { + box1->num_frame_vectors[ box1->num_frames-1 ]=i2; + reduce_vectors(box1,1); /* simplify loop */ + i2=box1->num_frame_vectors[ box1->num_frames-1 ]; + /* enlarge steps on big chars getting speedup */ + steps=(box1->y1-box1->y0+box1->x1-box1->x0)/32+1; + } + /* store frame-vector */ + if (i2<MaxFrameVectors) { + box1->frame_vector[i2][0]=x; + box1->frame_vector[i2][1]=y; + /* test if older vector points to the same direction */ + if (i2>1) { + /* get predecessor */ + nx=box1->frame_vector[i2-1][0]-box1->frame_vector[i2-2][0]; + ny=box1->frame_vector[i2-1][1]-box1->frame_vector[i2-2][1]; + mx=x -box1->frame_vector[i2-1][0]; + my=y -box1->frame_vector[i2-1][1]; + /* same direction? */ + if (nx*my-ny*mx==0 && nx*mx>=0 && ny*my>=0) { + /* simplify by removing predecessor */ + i2--; + box1->frame_vector[i2][0]=x; + box1->frame_vector[i2][1]=y; + } /* do not simplify */ + } + i2++; + box1->num_frame_vectors[ box1->num_frames-1 ]=i2; + } + g_debug(fprintf(stderr," stored @ %3d steps= %d", i2-1, steps);) + } + new_x=0; /* work for new pixel (x,y) done */ + + /* check if round trip is finished */ + if (x==ox && y==oy && abs(rot)>=8) break; + + /* look to the front of robot (turtle or ant) */ + nx = x + d0[d][0]; + ny = y + d0[d][1]; + + /* next step, if right is a wall turn the turtle left */ + if ( outbounds(p, nx, ny) || i1 != ((getpixel(p,nx,ny)<cs) ? 
0 : 1) ) { + if (y==ny && nx>=0 && nx<p->x) { /* if inbound */ + g_debug(fprintf(stderr,"\nLEV2: markW xy= %3d %3d ", nx, ny);) + p->p[nx + ny * p->x] |= (mark & 7); /* mark white pixel */ + } + /* rotate left 90 or 45 degrees */ + d=(d+2-diag) & 7; rot+=2-diag; + /* calculate volume inside frame */ + switch (d+diag) { + case 2+2: vol-=x-1; break; + case 6+2: vol+=x; break; + } + } + else { /* if no wall, go forward and turn right (90 or 45 degrees) */ + x=nx; y=ny; + /* turn back and rotate left */ + d=(d+4+2-diag) & 7; rot+=2-diag-4; + rc++; /* counting steps, used for speedup */ + + /* enlarge frame */ + if (x < box1->x0) box1->x0 = x; + if (x > box1->x1) box1->x1 = x; + if (y < box1->y0) box1->y0 = y; + if (y > box1->y1) box1->y1 = y; + + new_x=1; + } + } + + /* to distinguish inner and outer frames, store volume as +v or -v */ + box1->frame_vol[ box1->num_frames-1 ] = vol; + box1->frame_per[ box1->num_frames-1 ] = rc-1; + + /* dont count and store the first vector twice */ + if (i2-i2o>1) { + i2--; rc--; box1->num_frame_vectors[ box1->num_frames-1 ]=i2; + } + /* output break conditions */ + g_debug(fprintf(stderr,"\nLEV2 o= %3d %3d x= %3d %3d r=%d v=%d",ox,oy,x,y,rot,vol);) + /* rc=1 for a single point, rc=2 for a two pixel sized point */ + g_debug(fprintf(stderr," steps= %3d vectors= %3d",rc,i2);) + /* out_x(box1); ToDo: output only the first thousend */ + return rc; /* return number of bordering pixels = periphery? */ +} + + + +/* clear lowest 3 (marked) bits (they are used for marking) */ +void clr_bits(pix * p, int x0, int x1, int y0, int y1) { + int x, y; + for ( y=y0; y <= y1; y++) + for ( x=x0; x <= x1; x++) + p->p[x+y*p->x] &= ~7; +} + +/* look for white holes surrounded by black points + * at the moment look for white point with black in all four directions + * - store position of hole in coordinates relativ to box! + * ToDo: count only holes with vol>10% ??? 
+ * ToDo: rewrite for frame vectors (faster, no malloc) + * holes are frames rotating left hand + * obsolete, do it with vectors + */ +int num_hole(int x0, int x1, int y0, int y1, pix * p, int cs, holes_t *holes) { + int num_holes = 0, x, y, hole_size; + pix b; // temporary mini-page + int dx = x1 - x0 + 1, dy = y1 - y0 + 1; + unsigned char *buf; // 2nd copy of picture, for working + + if (holes) holes->num=0; + if(dx<3 || dy<3) return 0; + b.p = buf = (unsigned char *) malloc( dx * dy ); + if( !buf ){ + fprintf( stderr, "\nFATAL: malloc(%d) failed, skip num_hole", dx*dy ); + return 0; + } + if (copybox(p, x0, y0, dx, dy, &b, dx * dy)) + { free(b.p); return -1;} + + // printf(" num_hole("); + /* --- mark white-points connected with border */ + for (x = 0; x < b.x; x++) { + if (getpixel(&b, x, 0) >= cs) + mark_nn(&b, x, 0, cs, AT); + if (getpixel(&b, x, b.y - 1) >= cs) + mark_nn(&b, x, b.y - 1, cs, AT); + } + for (y = 0; y < b.y; y++) { + if (getpixel(&b, 0, y) >= cs) + mark_nn(&b, 0, y, cs, AT); + if (getpixel(&b, b.x - 1, y) >= cs) + mark_nn(&b, b.x - 1, y, cs, AT); + } + + //g_debug(out_b(NULL,&b,0,0,b.x,b.y,cs);) + // --- look for unmarked white points => hole + for (x = 0; x < b.x; x++) + for (y = 0; y < b.y; y++) + if (!((marked(&b, x, y)&AT)==AT)) // unmarked + if (getpixel(&b, x, y) >= cs) { // hole found +#if 0 + hole_size=mark_nn(&b, x, y, cs, AT); /* old version */ + if (hole_size > 1 || dx * dy <= 40) + num_holes++; +#else + { /* new version, for future store of hole characteristics */ + int x0, x1, y0, y1, i, j; + x0 = x1 = x; + y0 = y1 = y; // not used + hole_size=frame_nn(&b, x, y, &x0, &x1, &y0, &y1, cs, AT, JOB->tmp.n_run & 1); + // store hole for future use, num is initialized with 0 + if (hole_size > 1 || dx * dy <= 40){ + num_holes++; + if (holes) { + // sort in table + for (i=0;i<holes->num && i<MAX_HOLES;i++) + if (holes->hole[i].size < hole_size) break; + for (j=MAX_HOLES-2;j>=i;j--) + holes->hole[j+1]=holes->hole[j]; + if (i<MAX_HOLES) { + // 
printf(" i=%d size=%d\n",i,hole_size); + holes->hole[i].size=hole_size; + holes->hole[i].x=x; + holes->hole[i].y=y; + holes->hole[i].x0=x0; + holes->hole[i].y0=y0; + holes->hole[i].x1=x1; + holes->hole[i].y1=y1; + } + holes->num++; + } + } + } +#endif + } + free(b.p); + // printf(")=%d",num_holes); + return num_holes; +} + +/* count for black nonconnected objects --- used for i,auml,ouml,etc. */ +/* ToDo: obsolete, replaced by vectors and box.num_boxes */ +int num_obj(int x0, int x1, int y0, int y1, pix * p, int cs) { + int x, y, rc = 0; // rc=num_obj + unsigned char *buf; // 2nd copy of picture, for working + pix b; + + if(x1<x0 || y1<y0) return 0; + b.p = buf = (unsigned char *) malloc( (x1-x0+1) * (y1-y0+1) ); + if( !buf ){ + fprintf( stderr, "\nFATAL: malloc(%d) failed, skip num_obj",(x1-x0+1)*(y1-y0+1) ); + return 0; + } + if (copybox(p, x0, y0, x1 - x0 + 1, y1 - y0 + 1, &b, (x1-x0+1) * (y1-y0+1))) + { free(b.p); return -1; } + // --- mark black-points connected with neighbours + for (x = 0; x < b.x; x++) + for (y = 0; y < b.y; y++) + if (getpixel(&b, x, y) < cs) + if (!((marked(&b, x, y)&AT)==AT)) { + rc++; + mark_nn(&b, x, y, cs, AT); + } + free(b.p); + return rc; +} + +#if 0 +// ---------------------------------------------------------------------- +// first idea for making recognition based on probability +// - start with a list of all possible chars +// - call recognition_of_char(box *) +// - remove chars from list which could clearly excluded +// - reduce probability of chars which have wrong features +// - font types list could also build +// at the moment it is only an idea, I should put it to the todo list +// +char *list="0123456789,.\0xe4\0xf6\0xfc" // "a=228 o=246 u=252 + "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; +int wert[100]; +int listlen=0,numrest=0; +// initialize a new character list (for future) +void ini_list(){ int i; + for(i=0;list[i]!=0 && i<100;i++) wert[i]=0; + numrest=listlen=i; } +// exclude??? 
(for future) oh it was long time ago, I wrote that :/ +void exclude(char *filt){ int i,j; + for(j=0;filt[j]!=0 && j<100;j++) + for(i=0;list[i]!=0 && i<100;i++) + if( filt[j]==list[i] ) { if(!wert[i])numrest--; wert[i]++; } } +// get the result after all the work (for future) +char getresult(){ int i; + if( numrest==1 ) + for(i=0;list[i]!=0 && i<100;i++) if(!wert[i]) return list[i]; + return '_'; + } +#endif + +// look at the environment of the pixel too (contrast etc.) +// detailed analysis only of diff pixels! +// +// 100% * "distance", 0 is ideal fit +// = similarity of two chars for recognition of garbled (verstuemmelter) chars +// weight of pixels with only one same neighbour set to 0 +// look at contours too! v0.2.4: B==H +// changed for v0.41, Mar06 +int distance( pix *p1, struct box *box1, + pix *p2, struct box *box2, int cs){ + int rc=0,x,y,v1,v2,i1,i2,rgood=0,rbad=0,x1,y1,x2,y2,dx,dy,dx1,dy1,dx2,dy2; + x1=box1->x0;y1=box1->y0;x2=box2->x0;y2=box2->y0; + dx1=box1->x1-box1->x0+1; dx2=box2->x1-box2->x0+1; dx=((dx1>dx2)?dx1:dx2); + dy1=box1->y1-box1->y0+1; dy2=box2->y1-box2->y0+1; dy=((dy1>dy2)?dy1:dy2); + if(abs(dx1-dx2)>1+dx/16 || abs(dy1-dy2)>1+dy/16) return 100; + // compare relations to baseline and upper line + if(2*box1->y1>box1->m3+box1->m4 && 2*box2->y1<box2->m3+box2->m4) rbad+=128; + if(2*box1->y0>box1->m1+box1->m2 && 2*box2->y0<box2->m1+box2->m2) rbad+=128; + // compare pixels + for( y=0;y<dy;y++ ) + for( x=0;x<dx;x++ ) { // try global shift too ??? + v1 =((getpixel(p1,x1+x ,y1+y )<cs)?1:0); i1=8; // better gray? + v2 =((getpixel(p2,x2+x ,y2+y )<cs)?1:0); i2=8; // better gray? + if(v1==v2) { rgood+=8; continue; } // all things are right! + // what about different pixel??? + // test overlap of 8 surounding pixels ??? bad if two nb. 
are bad + v1=-1; + for(i1=-1;i1<2;i1++) + for(i2=-1;i2<2;i2++)if(i1!=0 || i2!=0){ + if( ((getpixel(p1,x1+x+i1*(1+dx/32),y1+y+i2*(1+dy/32))<cs)?1:0) + !=((getpixel(p2,x2+x+i1*(1+dx/32),y2+y+i2*(1+dy/32))<cs)?1:0) ) v1++; + } + if (v1>0) rbad+=16*v1; + else rbad++; + } + if(rgood+rbad) rc= (100*rbad+(rgood+rbad-1))/(rgood+rbad); else rc=99; + if(rc<10 && JOB->cfg.verbose & 7){ + fprintf(stderr,"\n# distance rc=%d good=%d bad=%d",rc,rgood,rbad); +// out_x(box1);out_x(box2); + } + return rc; +} + + + +// ============================= call OCR engine ================== ;) +// nrun=0 from outside, nrun=1 from inside (allows modifications, oobsolete) +wchar_t whatletter(struct box *box1, int cs, int nrun){ + wchar_t bc=UNKNOWN; // best letter + wchar_t um=SPACE; // umlaut? '" => modifier + pix *p=box1->p; // whole image + int x,y,dots,xa,ya,x0,x1,y0,y1,dx,dy,i; + pix b; // box + struct box bbuf=*box1; // restore after modifikation! + + if (box1->num_ac>0 && box1->wac[0]>=JOB->cfg.certainty && bc==UNKNOWN) { + bc=box1->tac[0]; + } + // if (bc!=UNKNOWN) return bc; + // if whatletter() called again, only unknown chars are processed + // bad for splitting! + + // store box data, which can be modified for modified chars in 2nd run + bbuf.x0=box1->x0; bbuf.y0=box1->y0; + bbuf.x1=box1->x1; bbuf.y1=box1->y1; + + xa=box1->x; ya=box1->y; + x0=box1->x0; y0=box1->y0; + x1=box1->x1; y1=box1->y1; + // int vol=(y1-y0+1)*(x1-x0+1); // volume + // crossed l-m , divided chars + while( get_bw(x0,x1,y0,y0,p,cs,1)!=1 && y0+1<y1) y0++; + while( get_bw(x0,x1,y1,y1,p,cs,1)!=1 && y0+1<y1) y1--; + dx=x1-x0+1; + dy=y1-y0+1; // size + + // better to proof the white frame too!!! ???? + // --- test for german umlaut and points above, not robust enough??? + // if three chars are connected i-dots (ari) sometimes were not detected + // - therefore after division a test could be useful + // modify y0 only in second run!? 
+ // we need it here to have the right copybox + if (um==SPACE && dy>5 && box1->num_boxes>1) + testumlaut(box1,cs,2,&um); /* set box1->modifier + new y0 */ + + dots=box1->dots; + y0 =box1->y0; // dots==2 => y0 below double dots + dy =y1-y0+1; + + // move upper and lower border (for divided letters) + while( get_bw(x0,x1,y0,y0,p,cs,1)==0 && y0+1<y1) y0++; + while( get_bw(x0,x1,y1,y1,p,cs,1)==0 && y0+1<y1) y1--; + while( get_bw(x0,x0,y0,y1,p,cs,1)==0 && x0+1<x1) x0++; + while( get_bw(x1,x1,y0,y1,p,cs,1)==0 && x0+1<x1) x1--; + dx=x1-x0+1; + dy=y1-y0+1; // size + box1->x0=x0; box1->y0=y0; // set reduced frame + box1->x1=x1; box1->y1=y1; + + // set good startpoint (probably bad from division)? + if( xa<x0 || xa>x1 || ya<y0 || ya>y1 + || getpixel(p,xa,ya)>=cs /* || 2*ya<y0+y1 */ || dots>0 ){ + // subfunction? also called after division of two glued chars? + for(y=y1;y>=y0;y--) // low to high (not i-dot) + for(x=(x0+x1)/2,i=0;x>=x0 && x<=x1;i++,x+=((2*i&2)-1)*i) /* is that ok? */ + if (getpixel(p,x,y)<cs && (getpixel(p,x+1,y)<cs + || getpixel(p,x,y+1)<cs)){ xa=x;ya=y;y=-1;break; } + /* should box1->x,y be set? */ + } + + // ----- create char-only-box ------------------------------------- + // ToDo: this will be obsolete if vectors are used only + if(dx<1 || dy<1) return bc; /* should not happen */ + b.p = (unsigned char *) malloc( dx * dy ); + if (!b.p) fprintf(stderr,"Warning: malloc failed L%d\n",__LINE__); + if( copybox(p,x0,y0,dx,dy,&b,dx*dy) ) + { free(b.p); return bc; } + // clr_bits(&b,0,b.x-1,0,b.y-1); + // ------ use diagonal too (only 2nd run?) + /* following code failes on ! and ? 
obsolete if vectors are used + ToDo: + - mark pixels neighoured to pixels outside and remove them from &b + v0.40 + will be replaced by list of edge vectors + - mark accents, dots and remove them from &b + */ +#if 1 /* becomes obsolate by vector code */ + if (y0>0) // mark upper overlap + for ( x=x0; x<=x1; x++) { + if (getpixel(p,x,y0-1)<cs + && getpixel(p,x,y0 )<cs && (marked(&b,x-x0,0)&1)!=1) + mark_nn(&b,x-x0,0,cs,1); + } + if (x0>0) // mark left overlap + for ( y=y0; y<=y1; y++) { + if (getpixel(p,x0-1,y)<cs + && getpixel(p,x0 ,y)<cs && (marked(&b,0,y-y0 )&1)!=1) + mark_nn(&b,0,y-y0,cs,1); + } + if (x1<p->x-1) // mark right overlap + for ( y=y0; y<=y1; y++) { + if (getpixel(p,x1+1,y)<cs + && getpixel(p,x1 ,y)<cs && (marked(&b,x1-x0,y-y0)&1)!=1) + mark_nn(&b,x1-x0,y-y0,cs,1); + } + mark_nn(&b,xa-x0,ya-y0,cs,2); // not glued chars + for(x=0;x<b.x;x++) + for(y=0;y<b.y;y++){ + if ( (marked(&b,x,y )&3)==1 && getpixel(&b,x,y )<cs ) + b.p[x+y*b.x] = 255&~7; /* reset pixel */ + } +#endif + + // if (bc == UNKNOWN) // cause split to fail + bc=ocr0(box1,&b,cs); + + /* ToDo: try to change pixels near cs?? or melt? */ + if (box1->num_ac>0 && box1->wac[0]>=JOB->cfg.certainty && bc==UNKNOWN) { + bc=box1->tac[0]; + } + + if (um!=0 && um!=SPACE && bc<127) { /* ToDo: is that obsolete now? */ + wchar_t newbc; + newbc = compose(bc, um ); + if (newbc == bc) { /* nothing composed */ + if(JOB->cfg.verbose & 7) + fprintf(stderr, "\nDBG whatletter: compose(%s) was useless (%d,%d)", + decode(bc,ASCII), box1->x0, box1->y0); + // if(JOB->cfg.verbose & 6) out_x(box1); + } + bc = newbc; + } + // restore modified boxes + box1->x0=bbuf.x0; box1->y0=bbuf.y0; + box1->x1=bbuf.x1; box1->y1=bbuf.y1; +// if (box1->c==UNKNOWN) out_b(box1,&b,0,0,dx,dy,cs); // test + + free(b.p); + return bc; +} + +/* +** creates a list of boxes/frames around objects detected +** on the pixmap p for further work +** returns number of boxes created. +** - by the way: get average X, Y (avX=sumX/numC,..) 
+*/ +int scan_boxes( pix *p ){ + int x, y, nx, cs, rc, ds; + struct box *box3; + + if (JOB->cfg.verbose) + fprintf(stderr,"# scanning boxes"); + + cs = JOB->cfg.cs; + JOB->res.sumX = JOB->res.sumY = JOB->res.numC = 0; + + /* clear the lowest bits of each pixel, later used as "scanned"-marker */ + clr_bits( p, 0, p->x - 1, 0, p->y - 1); + + for (y=0; y < p->y; y++) + for (x=0; x < p->x; x++) + for (ds=2; ds<7; ds+=4) { // NO - dust of size 1 is not removed !!! + nx=x+((ds==2)?-1:+1); + if (nx<0 || nx>=p->x) continue; /* out of image, ex: recframe */ + if ( getpixel(p, x,y)>=cs || getpixel(p,nx,y)< cs) // b/w transition? + continue; + if ((marked(p, x,y) & 1)&&(marked(p, nx, y) & 1)) + continue; + /* check (and mark) only horizontal b/w transitions */ + // --- insert new box in list + box3 = (struct box *)malloc_box(NULL); + box3->x0=box3->x1=box3->x=x; + box3->y0=box3->y1=box3->y=y; + box3->num_frames=0; + box3->dots=0; + box3->num_boxes=1; + box3->num_subboxes=0; + box3->modifier='\0'; + box3->num=JOB->res.numC; + box3->line=0; // not used here + box3->m1=0; box3->m2=0; box3->m3=0; box3->m4=0; + box3->p=p; + box3->num_ac=0; // for future use + +/* frame, vectorize and mark only odd/even horizontal b/w transitions + * args: box, x,y, cs, mark, diag={0,1}, ds={2,6} + * ds - start direction, 6=right of right border, 2=left of left border + * ret - 0=ok, -1=already marked, -2=max_num_frames_exceeded + * -7=no border in direction ds + * ToDo: count errors and print out for debugging + */ + rc=frame_vector(box3, x, y, cs, 1, 1, ds); + g_debug(fprintf(stderr,"\n# ... scan xy= %3d %3d rc= %2d", x, y, rc);) + if (rc<0) { free_box(box3); continue; } + if (box3->num_frames && !box3->num_frame_vectors[0]) + fprintf(stderr,"\nERROR scan_boxes: no vector in frame (%d,%d)",x,y); + + JOB->res.numC++; + JOB->res.sumX += box3->x1 - box3->x0 + 1; + JOB->res.sumY += box3->y1 - box3->y0 + 1; + + box3->c=(((box3->y1-box3->y0+1) + *(box3->x1-box3->x0+1)>=MaxBox)? 
PICTURE : UNKNOWN); + list_app(&(JOB->res.boxlist), box3); // append to list + // ToDo: debug + // if (JOB->cfg.verbose && box3->y0==29) out_x(box3); + } + if(JOB->res.numC){ + if (JOB->cfg.verbose) + fprintf(stderr," nC= %3d avD= %2d %2d\n",JOB->res.numC, + (JOB->res.sumX+JOB->res.numC/2)/JOB->res.numC, + (JOB->res.sumY+JOB->res.numC/2)/JOB->res.numC); + } + return JOB->res.numC; +} + +/* compare ints for sorting. Return -1, 0, or 1 according to + whether *vr < *vs, vr == *vs, or *vr > *vs */ +int +intcompare (const void *vr, const void *vs) +{ + int *r=(int *)vr; + int *s=(int *)vs; + + if (*r < *s) return -1; + if (*r > *s) return 1; + return 0; +} + +/* + * measure_pitch - detect monospaced font and measure the pitch + * measure overall pitch for difficult lines, + * after that measure pitch per line + * dists arrays are limited to 1024 elements to reduce + * cpu usage for qsort on images with extreme high number of objects + * insert space if dist>=pitch in list_insert_spaces() + * ToDo: ??? + * - min/max distance-matrix a-a,a-b,a-c,a-d ... etc; td,rd > ie,el,es + * - OR measuring distance as min. pixel distance instead of box distance + * especially useful for italic font! + */ +void measure_pitch( job_t *job ){ + int numdists=0, spc=0, /* number of stored distances */ + pitch_p=2, pdist, pdists[1024], /* proportional distances */ + pitch_m=6, mdist, mdists[1024], /* monospaced distances */ + monospaced=0, l1; + struct box *box2, *prev=NULL; + + if(job->cfg.verbose){ fprintf(stderr,"# check for word pitch"); } + for (l1=0; l1<job->res.lines.num; l1++) + { /* 0 means all lines */ + if(job->cfg.verbose){ fprintf(stderr,"\n# line %2d",l1); } + numdists = 0; /* clear distance lists */ + for_each_data(&(job->res.boxlist)) { + box2 = (struct box *)list_get_current(&(job->res.boxlist)); + if (l1>0 && box2->line!=l1) continue; /* ignore other lines */ + /* ignore dots and pictures (min. 
font is 4x6) */ + if (box2->y1 - box2->y0 + 1 < 4 || box2->c==PICTURE) { prev=NULL; } + if (!prev) { prev=box2; continue; } /* we need a predecessor */ + /* use center distance for monospaced fonts */ + mdist = ((box2->x0 + box2->x1) - (prev->x0 + prev->x1) + 1)/2; + /* use gap for proportional fonts */ + pdist = box2->x0 - prev->x1 + 1; + /* ToDo: better take 3 instead of 2 neighbours?, smallest font 4x6 */ + /* fonts are expected to be 6 to 60 pixels high, which is about + 4 to 50 pixels wide. We allow some extra margin. */ + if (3 < mdist && mdist < 150) { /* better mdist < 3*Xaverage ? */ + /* two options for overflow: 1) ignore, 2) store randomly */ + if (numdists<1024) { /* we do ignore here */ + mdists[numdists] = mdist; + pdists[numdists] = pdist; + numdists++; + } + } + prev = box2; + } end_for_each(&(job->res.boxlist)); + + if(job->cfg.verbose){ fprintf(stderr," num_gaps= %2d",numdists); } + if( numdists<8 ){ + if (job->cfg.verbose && l1==0) /* only for all lines */ + fprintf(stderr," (WARNING num_gaps<8)"); + } + if (numdists>0) { + int i,diff,ni_min,max,best_m,best_p,ni; double v; + /* aware: takes long time for big data sets */ + /* dilute? (german: ausduennen?) */ + qsort (mdists, numdists, sizeof (int), intcompare); + qsort (pdists, numdists, sizeof (int), intcompare); + /* the new method, div0? 
*/ + v = (mdists[numdists*7/10]-mdists[numdists/5]) + /(double)mdists[numdists/5]; + /* measurements showed v=.09 for Courier and .44 for Times-Roman */ + if (l1==0) monospaced = (v < .22); + best_m= numdists/5; + best_p=4*numdists/5; + /* try to find better pitch for monospaced font (ok for prop) */ + for (i=numdists/5+1;i<numdists;i++) { + if (2*mdists[i]>=3*mdists[best_m]) { best_m=i-1; break; } + } + /* try to find better pitch for proportional font */ + // the largest diff could be the best, if diff is always 1, + // take the diff with the lowest weight + for (ni=ni_min=1024,max=0,i=numdists/2+1;i<numdists-numdists/16;i++) { + diff=pdists[i]-pdists[i-1]; + if (diff>max) { + max=diff; best_p=i-1; + if ((job->cfg.verbose&(32+16))==48) + fprintf(stderr," best_p=%d maxdiff=%d\n# ...", pdists[best_p], max); + if (max>3 && 3*pdists[i]>=4*pdists[i-1]) { break; } + } + if (diff) { + if (ni<ni_min) { + // do not try to divide one word per line + ni_min=ni; if (max<=1 && numdists>16) best_p=i-1; + if ((job->cfg.verbose&(32+16))==48) + fprintf(stderr," best_p=%d ni_min=%d\n# ...", pdists[best_p], ni_min); + } + ni=1; + } else ni++; + } + if (numdists<16 && max<=1 && ni_min>1) best_p=numdists-1; // one word +#if 1 /* debugging */ + if ((job->cfg.verbose&(32+16))==48) { + fprintf(stderr,"\n# ..."); + for (i=0;i<numdists;i++) fprintf(stderr," %2d",mdists[i]); + fprintf(stderr," <- mdist[%d]\n# ...",l1); + for (i=0;i<numdists;i++) fprintf(stderr," %2d",pdists[i]); + fprintf(stderr," <- pdist[%d]\n# ...",l1); + fprintf(stderr," maxdiff=%d min_samediffs=%d\n# ...",max,ni_min); + } +#endif + /* we measure spaces in two different ways (mono, prop) */ + /* prop: gap between boxes, mono: distance of middle */ + if (best_p<numdists-1) pitch_p = ((pdists[best_p]+pdists[best_p+1])/2+1); + else pitch_p = (pdists[best_p]+1 ); + pitch_m = (mdists[best_m]*4/3); + if (numdists) + if ( pdists[numdists-1]*2 <= pdists[0]*3 + || pdists[numdists-1] <= pdists[0]+3) { + /* line is just a single 
word */ + pitch_p = pdists[numdists-1]+10; + } + if (l1>0 && job->cfg.spc==0) { + job->res.lines.pitch[l1]=(monospaced?pitch_m:pitch_p); + job->res.lines.mono[l1]=monospaced; + } + if (job->cfg.verbose) { + fprintf(stderr,"\n# ..." + " mono: v=%f (v<0.22) line=%d numdists=%d\n# ...", + v, l1, numdists); + fprintf(stderr," mono: min=%3d max=%3d pitch=%3d @ %2d%%\n# ...", + mdists[0],mdists[numdists-1],pitch_m,best_m*100/numdists); + fprintf(stderr," prop: min=%3d max=%3d pitch=%3d @ %2d%%\n# ...", + pdists[0],pdists[numdists-1],pitch_p,best_p*100/numdists); + fprintf(stderr," result: distance >= %d considered space\n# ...", + job->res.lines.pitch[l1]); + } + } /* if (not) enough spaces */ + if (l1==0) { /* set default spaces to each line */ + int l2; + spc = job->cfg.spc; + if (spc==0) /* set only if not set by option */ + spc = ((monospaced)?pitch_m:pitch_p); + for (l2=0; l2<job->res.lines.num; l2++ ) + job->res.lines.pitch[l2]=spc; + } + } /* each line */ + if (job->cfg.spc==0) + job->cfg.spc = spc; + if (job->cfg.verbose) + fprintf(stderr," overall space width is %d %s\n", + spc, ((monospaced)?"monospaced":"proportional")); + + +} + +/* ---- count subboxes (white holes within black area) -------- + * new: count boxes lying inside another box (usually holes, ex: "aeobdg") + * needed for glue_boxes, dont glue textboxes, tables and other complex + * objects + * ToDo: count only frames of invers spin? do we need sorted list here? 
-> no + */ +int count_subboxes( pix *pp ){ + int ii=0, num_mini=0, num_same=0, cnt=0; + struct box *box2,*box4; + progress_counter_t *pc = NULL; + if (JOB->cfg.verbose) { fprintf(stderr,"# count subboxes\n# ..."); } + + pc = open_progress(JOB->res.boxlist.n,"count_subboxes"); + for_each_data(&(JOB->res.boxlist)) { + box2 = (struct box *)list_get_current(&(JOB->res.boxlist)); + box2->num_subboxes=0; + progress(cnt++,pc); + if ( (box2->x1 - box2->x0)<2 + || (box2->y1 - box2->y0)<2) continue; /* speedup for dotted bg */ + // holes inside box2 char, aoebdqg, 0.41 + for_each_data(&(JOB->res.boxlist)) { + box4=(struct box *)list_get_current(&(JOB->res.boxlist)); + if (box4->y0 > box2->y1) break; // faster, but boxes need to be sorted + // ToDo: better use binary tree (above/below x) to find near boxes? + if (box4==box2) continue; + if( box4->x0==box2->x0 && box4->x1==box2->x1 + && box4->y0==box2->y0 && box4->y1==box2->y1) + num_same++; /* erroneous!? */ + if ( box4->x0 >= box2->x0 && box4->x1 <= box2->x1 + && box4->y0 >= box2->y0 && box4->y1 <= box2->y1 + && box4->num_subboxes==0 ) /* box4 inside box2? */ + { + box2->num_subboxes++; ii++; + if ((box4->x1 - box4->x0 + 1) + *(box4->y1 - box4->y0 + 1)<17) num_mini++; + } + } end_for_each(&(JOB->res.boxlist)); +#if 0 + if (cnt < 1000 && JOB->cfg.verbose) + fprintf(stderr," %4d box %4d %4d %+3d %+3d subboxes %4d\n# ...", + cnt, box2->x0, box2->y0, box2->x1-box2->x0, + box2->y1-box2->y0, box2->num_subboxes); +#endif + } end_for_each(&(JOB->res.boxlist)); + close_progress(pc); + if (JOB->cfg.verbose) + fprintf(stderr," %3d subboxes counted (mini=%d, same=%d) nC= %d\n", + ii, num_mini, num_same/2 /* counted twice */, cnt); + return 0; +} + +/* ---- glue holes tochars( before step1 ) v0.42 ----------------------- + glue boxes lying inside another box (usually holes, ex: "aeobdg46890") + Dont add dust to a char! 
lines are not detected yet
    - calls count_subboxes() first, then for every box (box2) all boxes
      (box4) lying inside its frame are merged into it and removed from
      JOB->res.boxlist, JOB->res.numC is reduced by the removed boxes
    - the parameter pp is only passed on to count_subboxes()
    - always returns 0
*/
int glue_holes_inside_chars( pix *pp ){
  int ii, cs, x0, y0, x1, y1, cnt=0,
      glued_same=0, glued_holes=0;
  struct box *box2, *box4;
  progress_counter_t *pc = NULL;
  cs=JOB->cfg.cs;  /* cs is not used below */
  {
    count_subboxes( pp ); /* move to pgm2asc() later */

    pc = open_progress(JOB->res.boxlist.n,"glue_holes_inside_chars");
    if (JOB->cfg.verbose)
      fprintf(stderr,"# glue holes to chars nC= %d\n# ...",JOB->res.numC);
    ii=0;
    for_each_data(&(JOB->res.boxlist)) {
      // get the smaller box which may be extended by bigger boxes around it
      box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
      /* x0..y1 is a copy of the frame of box2, refreshed after each merge */
      x0 = box2->x0;    x1 = box2->x1;
      y0 = box2->y0;    y1 = box2->y1;

      progress(cnt++,pc);

      // would it better than moving vectors to build a sub-box-tree?

      // do not remove chars inside pictures (car plates on photos)
      /* boxes with more than 7 subboxes are left untouched too */
      if( box2->c == PICTURE || box2->num_subboxes > 7) continue;

      // holes inside char, aoebdqg, 0.41
      // dont merge boxes which have subboxes by itself!
      // search boxes inside box2
      // if (x1-x0+1>2 || y1-y0+1>2) /* skip tiny boxes, bad for 4x6 */
      for_each_data(&(JOB->res.boxlist)) {
        box4=(struct box *)list_get_current(&(JOB->res.boxlist));
        if(box4!=box2 && box4->c != PICTURE )
        {
          // ToDo: dont glue, if size differs by big factors (>16?)
          /* candidate: box4 has the same frame as box2, or it lies
           * inside box2 and has no subboxes itself */
          if (  (    box4->x0==x0 && box4->x1==x1
                  && box4->y0==y0 && box4->y1==y1 ) /* do not happen !? */
             || (    box4->x0>=x0 && box4->x1<=x1
                  && box4->y0>=y0 && box4->y1<=y1
                  && box4->num_subboxes==0 ) )  /* no or very small subboxes? */
          { // fkt melt(box2,box4)
            // same box, if very small but hollow char (4x5 o)
            if( box4->x0==x0 && box4->x1==x1
             && box4->y0==y0 && box4->y1==y1) glued_same++; else glued_holes++;
            // fprintf(stderr,"\n# DEBUG merge:");
            // out_x(box2);  // small
            // out_x(box4);  // big
            if ((JOB->cfg.verbose & 7)==7) // LEV3
              fprintf(stderr," glue hole (%4d %4d %+3d %+3d %+4d)"
                " (%4d %4d %+3d %+3d %+4d) %d\n# ...",
                x0, y0, x1-x0+1, y1-y0+1, box2->frame_vol[0],
                box4->x0, box4->y0,
                box4->x1-box4->x0+1, box4->y1-box4->y0+1,
                box4->frame_vol[0], glued_same);
            /* only the merge depends on this size test, the removal of
             * box4 below is done in any case.
             * NOTE(review): box4 lies inside box2 here, so it is never
             * wider or higher than box2 and the test looks always true
             * -- confirm */
            if ((box4->x1-box4->x0+1)< 8*(x1-x0+1)
             || (box4->y1-box4->y0+1)<12*(y1-y0+1)) // skip dust
            merge_boxes( box2, box4 ); // add box4 to box2
            // out_x(box2);
            x0 = box2->x0; x1 = box2->x1;
            y0 = box2->y0; y1 = box2->y1;
            JOB->res.numC--;  // dont count fragments as chars
            ii++;       // count removed
            /* box4 is the current element of the inner loop, it is
             * deleted from the list while iterating over it */
            list_del(&(JOB->res.boxlist), box4); // remove box4
            free_box(box4);
            // now search another hole inside box2
          }
        }
      } end_for_each(&(JOB->res.boxlist));

    } end_for_each(&(JOB->res.boxlist));

    if (JOB->cfg.verbose)
      fprintf(stderr," glued: %3d holes, %3d same, nC= %d\n",
        glued_holes, glued_same, JOB->res.numC);
    close_progress(pc);
  }
  return 0;
}


/* ---- glue broken chars ( before step1 ??? )  -----------------------
    use this carefully, do not destroy previous detection ~fi, broken K=k' g
    glue if boxes are near or diagonally connected
    other strategy: mark boxes for deleting and delete in extra loop at end
    faster: check only next two following boxes because list is sorted!
    ToDo: store m4 of upper line to m4_of_prev_line, and check that "-points are below
    done: glue boxes lying inside another box (usually holes, ex: "aeobdg")
    Dont add dust to a char!
    lines should be detected already (Test it for m1-m4 unknown)
    ToDo: divide in glue_idots, glue_thin_chars etc.
and optimize it +*/ +int glue_broken_chars( pix *pp ){ + int ii, y, cs, x0, y0, x1, y1, cnt=0, + num_frags=0, glued_frags=0, glued_hor=0; + struct box *box2, *box4; + progress_counter_t *pc = NULL; + cs=JOB->cfg.cs; + { + count_subboxes( pp ); /* move to pgm2asc() later */ + + pc = open_progress(JOB->res.boxlist.n,"glue_broken_chars"); + if (JOB->cfg.verbose) + fprintf(stderr,"# glue broken chars nC= %d\n# ...",JOB->res.numC); + ii=0; + for_each_data(&(JOB->res.boxlist)) { + // get the box which may be extended by boxes around it + box2 = (struct box *)list_get_current(&(JOB->res.boxlist)); + x0 = box2->x0; x1 = box2->x1; + y0 = box2->y0; y1 = box2->y1; + + progress(cnt++,pc); + + // vertical broken (g965T umlauts etc.) + // not: f, + + // would it better than moving vectors to build a sub-box-tree? + + // do not remove chars inside pictures (car plates on photos) + if( box2->c == PICTURE || box2->num_subboxes > 7) continue; + + /* continue loop if box is below or above line */ + if( box2->m4>0 && y0>box2->m4 ) continue; /* dust outside ? */ + if( box2->m1>0 && y0<box2->m1-(box2->m3-box2->m2) ) continue; + /* ToDo: + * - check that y0 is greater as m3 of the char/line above + */ + + // check small boxes (box2) whether they belong + // to near same size or bigger boxes (box4) + if( 2*(y1-y0) < box2->m4 - box2->m1 // care for dots etc. 
+ && ( 2*y1<=(box2->m3+box2->m2) // upper fragments + || 2*y0>=(box2->m3+box2->m2)) ) { // lower fragments + struct box *box5=NULL, *box6=NULL; // nearest and next nearest box + box4=NULL; + num_frags++; /* count for debugging */ + // get the [2nd] next x-nearest box in the same line + for_each_data(&(JOB->res.boxlist)) { + box4=(struct box *)list_get_current(&(JOB->res.boxlist)); + if (box4 == box2 || box4->c == PICTURE) continue; + /* 0.42 speed up for backround pixel pattern, box4 to small */ + if ( box4->x1 - box4->x0 + 1 < x1-x0+1 + && box4->y1 - box4->y0 + 1 < y1-y0+1 ) continue; + // have in mind that line number may be wrong for dust + if (box4->line>=0 && box2->line>=0 && box4->line==box2->line) + { + if (!box5) box5=box4; + if ( abs(box4->x0 + box4->x1 - 2*box2->x0) + <abs(box5->x0 + box5->x1 - 2*box2->x0)) + { box6=box5; box5=box4; } + } + } end_for_each(&(JOB->res.boxlist)); + box4=box5; // next nearest box within the same line + if (box4) { +#if 0 /* set this to 1 for debugging of melting bugs */ + if (JOB->cfg.verbose & 7) { + fprintf(stderr,"\n# next two boxes are candidates for melting "); + out_x(box2); + out_x(box4); } +#endif + if( /* umlaut "a "o "u, ij; box2 is the small dot, box4 the body */ + ( y1 <= box2->m2 + && box4->y1 >= box2->m2 // dont melt dots together + && 2* y1 < box4->y1 + box4->y0 // box2 above box4 + && box4->x1+JOB->res.avX/2>=x0 + && box4->x0-JOB->res.avX/2<=x1 + && (y1 < box4->y0 || x0 < box4->x1) // dont melt "d'" + && 3* ( y1 - box4->y0) + <= 2* (box4->y1 - box4->y0) // too far away? dust! 
+ && 8* ( x1 - x0 + 1) + >= (box4->x1 - box4->x0 + 1) // dot must have minimum size + && 10* ( y1 - y0 + 1) + >= (box4->y1 - box4->y0 + 1) // dot must have minimum size + ) || ( 0 && /* broken T */ + 3*(box2->x1 - box2->x0) > 2*JOB->res.avX + && 4*box4->x0>3*box2->x0+box2->x1 + && 4*box4->x1<box2->x0+3*box2->x1 + ) + || /* !?; box2 is the dot, box4 the body */ + ( 2*box4->x1>=x0+x1 /* test if box4 is around box2 */ + && 2*box4->x0<=2*x1 /* +x0+1 Jan00 */ + && ( x1-x0 <= box4->x1-box4->x0+2 ) + && 2*y0>=box2->m2+box2->m3 + && 4*y1>=box2->m2+3*box2->m3 + && 4*(y1-y0)<box2->m4-box2->m1 + && (8*box4->y1 < box4->m2+7*box4->m3 + || box4->m4-box4->m1<16) /* Jan00 */ + ) + || /* =;: box2 is the upper box, box4 the lower box */ + ( 2*box4->x1>=x0+x1 /* test if box4 is around box2 */ + && 2*box4->x0<=2*x1 /* +x0+1 */ + && ( x1-x0 <= box4->x1-box4->x0+4 ) + && ( 4*x0 <= 3*box4->x1+box4->x0 ) + && (( box2->m2 && box4->m2 + && y1< box2->m3 + && 2*box4->y1 > box4->m3+box4->m2 // can be bigger than m3 + && 4*box4->y0 >= 3*box4->m2+box4->m3 + && 2*box2->y0 < box2->m3+box2->m2 + ) + || ( (!box2->m2) || (!box4->m2) ) + ) + ) + ) + { // fkt melt(box2,box4) + if (JOB->cfg.verbose & 7) + fprintf(stderr," glue objects (%3d %3d %+3d %+3d)" + " (%3d %3d %+3d %+3d)\n# ...", + x0, y0, x1-x0+1, y1-y0+1, box4->x0, box4->y0, + box4->x1-box4->x0+1, box4->y1-box4->y0+1); + // fprintf(stderr,"\n# DEBUG merge:"); // d=7x34 @ (109,51) ??? + // out_x(box2); + // out_x(box4); + merge_boxes( box2, box4 ); // add box4 to box2 + x0 = box2->x0; x1 = box2->x1; + y0 = box2->y0; y1 = box2->y1; + // if (JOB->cfg.verbose & 4) out_x(box2); + // JOB->res.numC--; // dont count fragments as chars + ii++; glued_frags++; // remove + // output_list(JOB); + list_del(&(JOB->res.boxlist), box4); /* ret&1: error-message ??? 
*/ + // output_list(JOB); + free_box(box4); + } + } + } +// continue; + + // horizontally broken w' K' + if( 2*y1 < (box2->m3+box2->m2) ) + if( 2*(y1-y0) < (box2->m3+box2->m2) ) // fragment + for_each_data(&(JOB->res.boxlist)) { + box4=(struct box *)list_get_current(&(JOB->res.boxlist)); + if(box4!=box2 && box4->c != PICTURE ) + { + if( box4->line>=0 && box4->line==box2->line + && box4->x1>=x0-1 && box4->x1<x0 // do not glue 6- + && box4->x0+3*box4->x1<4*x0) + if( get_bw(x0 ,x0 ,y1,y1 ,pp,cs,1) == 1) + if( get_bw(x0-2,x0-1,y1,y1+2,pp,cs,1) == 1) + { // fkt melt(box2,box4) + put(pp,x0,y1+1,~(128+64),0); + merge_boxes( box2, box4 ); + x0 = box2->x0; x1 = box2->x1; + y0 = box2->y0; y1 = box2->y1; + JOB->res.numC--; ii++; // remove + glued_hor++; + list_del(&(JOB->res.boxlist), box4); + free_box(box4); + } + } + } end_for_each(&(JOB->res.boxlist)); + + // horizontally broken n h (h=l_) v0.2.5 Jun00 + if( abs(box2->m2-y0)<=(y1-y0)/8 ) + if( abs(box2->m3-y1)<=(y1-y0)/8 ) + if( num_cross(x0, x1,(y0+ y1)/2,(y0+ y1)/2,pp,cs) == 1) + if( num_cross(x0, x1,(y0+3*y1)/4,(y0+3*y1)/4,pp,cs) == 1) + if( get_bw((3*x0+x1)/4,(3*x0+x1)/4,(3*y0+y1)/4,y1,pp,cs,1) == 0) + if( get_bw(x0,(3*x0+x1)/4,(3*y0+y1)/4,(y0+3*y1)/4,pp,cs,1) == 0) + if( get_bw(x0, x0, y0,(3*y0+y1)/4,pp,cs,1) == 1) + for_each_data(&(JOB->res.boxlist)) { + box4=(struct box *)list_get_current(&(JOB->res.boxlist)); + if(box4!=box2 && box4->c != PICTURE ) + { + if( box4->line>=0 && box4->line==box2->line + && box4->x1>x0-3 && box4->x1-2<x0 + && abs(box4->y1-box2->m3)<2) + { // fkt melt(box2,box4) + y=loop(pp,x0,y0,y1-y0,cs,0,DO);if(2*y>y1-y0) continue; + put(pp,x0-1,y0+y ,~(128+64),0); + put(pp,x0-1,y0+y+1,~(128+64),0); + merge_boxes( box2, box4 ); // add box4 to box2 + x0 = box2->x0; x1 = box2->x1; + y0 = box2->y0; y1 = box2->y1; + JOB->res.numC--; ii++; // remove + glued_hor++; + list_del(&(JOB->res.boxlist), box4); + free_box(box4); + } + } + } end_for_each(&(JOB->res.boxlist)); + } end_for_each(&(JOB->res.boxlist)); + 
if (JOB->cfg.verbose) + fprintf(stderr," glued: %3d fragments (found %3d), %3d rest, nC= %d\n", + glued_frags, num_frags, glued_hor, JOB->res.numC); + close_progress(pc); + } + return 0; +} + +/* +** this is a simple way to improve results on noisy images: +** - find similar chars (build cluster of same chars) +** - analyze clusters (could be used for generating unknown font-base) +** - the quality of the result depends mainly on the distance function +*/ + // ---- analyse boxes, compare chars, compress picture ------------ + // ToDo: - error-correction only on large chars! +int find_same_chars( pix *pp){ + int i,k,d,cs,dist,n1,dx; struct box *box2,*box3,*box4,*box5; + pix p=(*pp); + cs=JOB->cfg.cs; + { + if(JOB->cfg.verbose)fprintf(stderr,"# packing"); + i = list_total(&(JOB->res.boxlist)); + for_each_data(&(JOB->res.boxlist)) { + box4 = box2 = (struct box *)list_get_current(&(JOB->res.boxlist)); + dist=1000; // 100% maximum + dx = box2->x1 - box2->x0 + 1; + + if(JOB->cfg.verbose)fprintf(stderr,"\r# packing %5d",i); + if( dx>3 ) + for(box3=(struct box *)list_next(&(JOB->res.boxlist),box2);box3; + box3=(struct box *)list_next(&(JOB->res.boxlist),box3)) { + if(box2->num!=box3->num){ + int d=distance(&p,box2,&p,box3,cs); + if ( d<dist ) { dist=d; box4=box3; } // best fit + if ( d<5 ){ // good limit = 5% ??? 
+ i--;n1=box3->num; // set all num==box2.num to box2.num + for_each_data(&(JOB->res.boxlist)) { + box5=(struct box *)(struct box *)list_get_current(&(JOB->res.boxlist)); + if(box5!=box2) + if( box5->num==n1 ) box5->num=box2->num; + } end_for_each(&(JOB->res.boxlist)); + // out_x2(box2,box5); + // fprintf(stderr," dist=%d\n",d); + } + } + } + // nearest dist to box2 has box4 + // out_b2(box2,box4); + // fprintf(stderr," dist=%d\n",dist); + } end_for_each(&(JOB->res.boxlist)); + k=0; + if(JOB->cfg.verbose)fprintf(stderr," %d different chars",i); + for_each_data(&(JOB->res.boxlist)) { + struct box *box3,*box4; + int j,dist; + box2=(struct box *)list_get_current(&(JOB->res.boxlist)); + for(box3=(struct box *)list_get_header(&(JOB->res.boxlist)); + box3!=box2 && box3!=NULL; + box3=(struct box *)list_next(&(JOB->res.boxlist), box3)) + if(box3->num==box2->num)break; + if(box3!=box2 && box3!=NULL)continue; + i++; + // count number of same chars + dist=0;box4=box2; + + for(box3=box2,j=0;box3; + box3=(struct box *)list_next(&(JOB->res.boxlist), box3)) { + if(box3->num==box2->num){ + j++; + d=distance(&p,box2,&p,box3,cs); + if ( d>dist ) { dist=d; box4=box3; } // worst fit + } + } + if(JOB->cfg.verbose&8){ + fprintf(stderr," no %d char %4d %5d times maxdist=%d\n",i,box2->num,j,dist); + } + // calculate mean-char (error-correction) + // ToDo: calculate maxdist in group + k+=j; + // if(j>1) + // out_b(box1,NULL,0,0,0,0,cs); + if(JOB->cfg.verbose&8) + fprintf(stderr," no %d char %4d %5d times sum=%d\n",i,box2->num,j,k); + } end_for_each(&(JOB->res.boxlist)); + if(JOB->cfg.verbose)fprintf(stderr," ok\n"); + } + return 0; +} + +/* +** call the first engine for all boxes and set box->c=result; +** +*/ +int char_recognition( pix *pp, int mo){ + int i,ii,ni,cs,x0,y0,x1,y1; + struct box *box2; + progress_counter_t *pc; + wchar_t cc; + cs=JOB->cfg.cs; + // ---- analyse boxes, find chars --------------------------------- + if (JOB->cfg.verbose) + fprintf(stderr,"# char recognition"); + 
i=ii=ni=0; + for_each_data(&(JOB->res.boxlist)) { /* count boxes */ + box2 = (struct box *)list_get_current(&(JOB->res.boxlist)); + /* wew: isn't this just JOB->res.numC? */ + /* js: The program is very complex. I am not sure anymore + wether numC is the number of boxes or the number of valid + characters. + Because its not time consuming I count the boxes here. */ + if (box2->c==UNKNOWN) i++; + if (box2->c==PICTURE) ii++; + ni++; + } end_for_each(&(JOB->res.boxlist)); + if(JOB->cfg.verbose) + fprintf(stderr," unknown= %d picts= %d boxes= %d\n# ",i,ii,ni); + if (!ni) return 0; + i=ii=0; + pc = open_progress(ni,"char_recognition"); + for_each_data(&(JOB->res.boxlist)) { + box2 = (struct box *)list_get_current(&(JOB->res.boxlist)); + x0=box2->x0;x1=box2->x1; + y0=box2->y0;y1=box2->y1; // box + cc=box2->c; + if (cc==PICTURE) continue; + + if ((mo&256)==0) { /* this case should be default (main engine) */ + if(cc==UNKNOWN || box2->num_ac==0 || box2->wac[0]<JOB->cfg.certainty) + cc=whatletter(box2,cs ,0); + } + + if(mo&2) + if(cc==UNKNOWN || box2->num_ac==0 || box2->wac[0]<JOB->cfg.certainty) + cc=ocr_db(box2); + + + // box2->c=cc; bad idea (May03 removed) + // set(box2,cc,95); ToDo: is that better? 
+ + if(cc==UNKNOWN) + i++; + ii++; + + if(JOB->cfg.verbose&8) { + fprintf(stderr,"\n# code= %04lx %c",(long)cc,(char)((cc<255)?cc:'_')); + //out_b(box2,pp,x0,y0,x1-x0+1,y1-y0+1,cs); + } + progress(ii,pc); /* ii = 0..ni */ + + } end_for_each(&(JOB->res.boxlist)); + close_progress(pc); + if(JOB->cfg.verbose)fprintf(stderr," %d of %d chars unidentified\n",i,ii); + return 0; +} + + +/* +** compare unknown with known chars, +** very similar to the find_similar_char_function but here only to +** improve the result +*/ +int compare_unknown_with_known_chars(pix * pp, int mo) { + int i, cs = JOB->cfg.cs, dist, d, ad, wac, ni, ii; + struct box *box2, *box3, *box4; + progress_counter_t *pc=NULL; + wchar_t bc; + i = ii = 0; // ---- ------------------------------- + if (JOB->cfg.verbose) + fprintf(stderr, "# try to compare unknown with known chars !(mode&8)"); + if (!(mo & 8)) + { + ii=ni=0; + for_each_data(&(JOB->res.boxlist)) { ni++; } end_for_each(&(JOB->res.boxlist)); + pc = open_progress(ni,"compare_chars"); + for_each_data(&(JOB->res.boxlist)) { + box2 = (struct box *)list_get_current(&(JOB->res.boxlist)); ii++; + if (box2->c == UNKNOWN || (box2->num_ac>0 && box2->wac[0]<97)) + if (box2->y1 - box2->y0 > 4 && box2->x1 - box2->x0 > 1) { // no dots! 
+ box4 = (struct box *)list_get_header(&(JOB->res.boxlist));; + dist = 1000; /* 100% maximum */ + bc = UNKNOWN; /* best fit char */ + for_each_data(&(JOB->res.boxlist)) { + box3 = (struct box *)list_get_current(&(JOB->res.boxlist)); + wac=((box3->num_ac>0)?box3->wac[0]:100); + if (box3 == box2 || box3->c == UNKNOWN + || wac<JOB->cfg.certainty) continue; + if (box2->y1 - box2->y0 < 5 || box2->x1 - box2->x0 < 3) continue; + d = distance(pp, box2, pp, box3, cs); + if (d < dist) { + dist = d; bc = box3->c; box4 = box3; + } + } end_for_each(&(JOB->res.boxlist)); + if (dist < 10) { + /* sureness can be maximal of box3 */ + if (box4->num_ac>0) ad = box4->wac[0]; + else ad = 97; + ad-=dist; if(ad<1) ad=1; + /* ToDo: ad should depend on ad of bestfit */ + setac(box2,(wchar_t)bc,ad); + i++; + } // limit as option??? + // => better max distance('e','e') ??? + if (dist < 50 && (JOB->cfg.verbose & 7)) { // only for debugging + fprintf(stderr,"\n# L%02d best fit was %04x=%c dist=%3d%% i=%d", + box2->line, (int)bc, (char)((bc<128)?bc:'_'), dist, i); + if(box4->num_ac>0)fprintf(stderr," w= %3d%%",box4->wac[0]); + } + progress(ii,pc); + } + } end_for_each(&(JOB->res.boxlist)); + close_progress(pc); + } + if (JOB->cfg.verbose) + fprintf(stderr, " - found %d (nC=%d)\n", i, ii); + return 0; +} + +/* +// ---- divide overlapping chars which !strchr("_,.:;",c); +// block-splitting (two ore three glued chars) +// division if dots>0 does not work properly! ??? +// +// what about glued "be"? +// what about recursive division? +// ToDo: mark divided boxes to give the engine a chance to +// handle wrong divisions +*/ +int try_to_divide_boxes( pix *pp, int mo){ + struct box *box2, boxa, boxb; + int cs=JOB->cfg.cs, ad=100, + a2[8], ar, // certainty of each part, ar = product of all certainties + cbest; // best certainty, skip search of certainty<cbest-1 for speed + wchar_t ci[8], // split max. 
8 chars + s1[]={ UNKNOWN, '_', '.', ',', '\'', '!', ';', '?', ':', '-', + '=', '(', ')', '/', '\\', '\0' }; // not accepted chars, \0-terminated! + int x0, x1, y0, y1, + xi[8+1]; // cutting positions + int i, ii, n1, dy, dx; + // pix p=(*pp); // remove! + if (JOB->cfg.verbose) + fprintf(stderr,"# try to divide unknown chars !(mode&16)"); + if(!(mo&16)) // put this to the caller + for_each_data(&(JOB->res.boxlist)) { + box2 = (struct box *)list_get_current(&(JOB->res.boxlist)); + // don't try to split simple structures (ex: 400x30 square) + if ((!box2->num_frames) + || box2->num_frame_vectors[ box2->num_frames-1 ]<9) continue; + if((box2->c==UNKNOWN || (box2->num_ac && box2->wac[0]<JOB->cfg.certainty)) + && box2->x1-box2->x0>5 && box2->y1-box2->y0>4){ + x0=box2->x0; x1=box2->x1; + y0=box2->y0; y1=box2->y1; + ad=100; + cbest=0; + + /* get minimum vertical lines */ + n1 = num_cross(x0,x1,( y1+y0)/2,( y1+y0)/2,pp,cs); + ii = num_cross(x0,x1,(3*y1+y0)/4,(3*y1+y0)/4,pp,cs); if (ii<n1) n1=ii; + if (box2->m2 && box2->m3 > box2->m2+2) + for (i=box2->m2+1;i<=box2->m3-1;i++) { + if (loop(pp,x0+1,i,x1-x0,cs,1,RI) > (x1-x0-2)) continue; // ll + ii = num_cross(x0,x1,i,i,pp,cs); if (ii<n1) n1=ii; + } if (n1<2) continue; // seems to make no sense to divide + if (n1<4) ad=99*ad/100; // not to strong because m2+m3 could be wrong + if (n1<3) ad=99*ad/100; + + if( 2*y1 < box2->m3+box2->m4 /* baseline char ? */ + && num_cross(x0,x1,y1-1,y1-1,pp,cs)==1 // -1 for slopes + && num_cross((x0+2*x1)/3,(x0+3*x1)/4,y0,y1,pp,cs)<3 // not exclude tz + && num_cross((3*x0+x1)/4,(2*x0+x1)/3,y0,y1,pp,cs)<3 // not exclude zl + && loop(pp,x0,y1-(y1-y0)/32,x1-x0,cs,0,RI) + +loop(pp,x1,y1-(y1-y0)/32,x1-x0,cs,0,LE) > (x1-x0+1)/2 + ) continue; /* do not try on bvdo"o etc. */ + + // one vertical line can not be two glued chars, lc? 
+ if ( num_cross(x0,x1,(y1+y0)/2,(y1+y0)/2,pp,cs)<=1 ) continue; + { // doublet = 2 letters + // char buf[4]="\0\0\0"; // 4th byte is string end == \0 + // buf[0]=c1; // c1 is wchar_t! (0xbf00 to 0) failes + // buf[1]=c2; + char buf[64]=""; // end == \0 + if (JOB->cfg.verbose&2){ + fprintf(stderr, "\n#\n# divide box: %4d %4d %3d %3d\n", + x0, y0, x1-x0+1, y1-y0+1); + } + // it would be better if testing is only if most right and left char + // is has no horizontal gap (below m2) ex: be + i=0; // num splittet chars + xi[0]=x0; xi[1]=x0+3; xi[2]=x1; + for ( ; ; xi[i+1]++) { // x[i] .. x[i+1], slower? but better v0.42 + /* break if x is to near to the right border */ + if (xi[i+1]>x1-3) { if (i==0) break; i--; xi[i+2]=x1; continue; } + // ToDo: skip if not a local dy-min for speedup + { int ymin=y1, ymax=y0, bow=0, // min max at cutting point + max0=y0, max1=y0, // max y on left and right side + min0=y1, min1=y1; // min y on left and right side + for (dy=0,ii=0;ii<box2->num_frame_vectors[ 0 ];ii++) { + int pre=ii-1, next=(ii+1)%box2->num_frame_vectors[ 0 ]; + if (pre<0) pre=box2->num_frame_vectors[ 0 ]-1; + // check if vector is inside box to cut + if ( box2->frame_vector[ii ][0]<=xi[i ]) continue; + if ( box2->frame_vector[ii ][0]> xi[i+2]) continue; + // 2nd derivation of y(x) + if (abs(box2->frame_vector[ii ][0]-xi[i+1])<2) { + dy= 2*box2->frame_vector[ii ][1] + -box2->frame_vector[next][1] + -box2->frame_vector[pre ][1]; + dx= box2->frame_vector[next][0] + -box2->frame_vector[pre ][0]; + // rotate 180 degree if dx<0 + if (((dx>0)?dy:-dy)<-abs(dx)/2) { bow=1; } + } + // its not the best if we think on glued fi fo etc. 
+ if (( box2->frame_vector[pre ][0]<=xi[i+1] + && box2->frame_vector[next][0]>=xi[i+1]) + || ( box2->frame_vector[pre ][0]>=xi[i+1] + && box2->frame_vector[next][0]<=xi[i+1])) { + if ( box2->frame_vector[ii ][1]>ymax) + ymax= box2->frame_vector[ii ][1]; + if ( box2->frame_vector[ii ][1]<ymin) + ymin= box2->frame_vector[ii ][1]; + } + // min and max of left and right side + if ( box2->frame_vector[ii ][1]>max0 + && box2->frame_vector[ii ][0]<=xi[i+1]) + max0=box2->frame_vector[ii ][1]; + if ( box2->frame_vector[ii ][1]>max1 + && box2->frame_vector[ii ][0]> xi[i+1]) + max1=box2->frame_vector[ii ][1]; + if ( box2->frame_vector[ii ][1]<min0 + && box2->frame_vector[ii ][0]<=xi[i+1]) + min0=box2->frame_vector[ii ][1]; + if ( box2->frame_vector[ii ][1]<min1 + && box2->frame_vector[ii ][0]> xi[i+1]) + min1=box2->frame_vector[ii ][1]; + } + if(JOB->cfg.verbose&2) + fprintf(stderr,"\n# test if to split at x%d= %2d %2d %2d" + " bow,(max-min)[i,0,1] %d %3d %3d %3d" + , i, xi[i]-x0, xi[i+1]-x0, xi[i+2]-x0, bow, ymax-ymin, max0-min0, max1-min1); + /* skip if no local minimum at xi[i+1] or if its not thin enough */ + if (bow==0 || 4*(ymax-ymin)>2*(y1-y0)) continue; + // cuttet parts should have about the same height (max-min) + // we dont want to cut an 'n' in three parts! + if (2*(max0-min0+1)<(y1-y0+1)) continue; // left height + if (2*(max1-min1+1)<(y1-y0+1)) continue; // right height + // ToDo: thickness on xi[i+1]? + } + // try to split successive right box if left box is recognised, + // else shift the splitting point further to the right border + // removing ->dots if dot only above one char !!! ??? not implemented + if(JOB->cfg.verbose&2) + fprintf(stderr,"\n# try to split, newbox[%d].x= %2d ... %2d " + "dy= %d ", i, xi[i]-x0, xi[i+1]-x0, dy); + boxa=*box2; // copy contents, ToDo: reset ac-list (in cut_box?) + boxa.x=xi[i]; boxa.y=y0; // obsolete? mark pixel, overlap? 
+ boxa.x0=xi[i];boxa.x1=xi[i+1]; // new horizontal box range + cut_box(&boxa); boxa.num_ac=0; + // out_x(&boxa); + // get wchar + certainty + ci[i]=whatletter(&boxa,cs,0); a2[i]=testac(&boxa,ci[i]); + if(JOB->cfg.verbose&2) + fprintf(stderr,"\n# certainty %d limit= %d cbest= %d ", + a2[i], JOB->cfg.certainty, cbest); + if (a2[i]<JOB->cfg.certainty || a2[i]<cbest-1 + || wcschr(s1,ci[i]) ) { continue; } // dont split here + + for (ar=ad,ii=0;ii<=i;ii++) { + ar=a2[ii]*ar/100; } // multiply all probabilities + if (ar<98*JOB->cfg.certainty/100 || ar<cbest) { + continue; } // dont go deeper, no longer string + + i++; if (i==8) break; // maximum splits + if (i==4) break; // at the moment its to slow to go further + if (i+1<8) xi[i+1]=x1; // right border of next box + if (i+2<8) xi[i+2]=x1; + + if(JOB->cfg.verbose&2) + fprintf(stderr,"\n try end split [%d]=%d [%d]=%d ", + i, xi[i]-x0, i+1, xi[i+1]-x0); + boxb=*box2; // try rest if it has to be split again + boxb.x=xi[i]+1; boxb.y=y0; + boxb.x0=xi[i]+1;boxb.x1=xi[i+1]; + cut_box(&boxb); boxb.num_ac=0; + ci[i]=whatletter(&boxb,cs,0); a2[i]=testac(&boxb,ci[i]); + if (a2[i]<JOB->cfg.certainty || a2[i]<cbest-1 + || wcschr(s1,ci[i]) ) { xi[i+1]=xi[i]+2; continue; } // split rest + // now we have everything splittet + + if(JOB->cfg.verbose&2) { + fprintf(stderr,"\n split at/to: "); + for (ii=0;ii<=i;ii++) + fprintf(stderr," %2d %s (%3d)", xi[ii+1]-x0, + decode(ci[ii],ASCII), a2[ii]); + fprintf(stderr,"\n"); + } + // boxa..c changed!!! dots should be modified!!! + // Question: cut it into boxes v0.40 or set a string v0.41? + // new way of building a string v0.41 (can call setas multiple) + // usefull if compare unknown with known strings (except barcode?) + // ToDo: also create alternate variants? 
ex: I <-> l + for (buf[0]=0,ar=ad,ii=0;ii<=i;ii++) { + ar=a2[ii]*ar/100; // multiply all probabilities + if (i>0 && ci[ii]=='n' && ci[ii-1]=='r') ar--; // m == rn + strncat(buf,decode(ci[ii],JOB->cfg.out_format),20); + } + + if (ar>cbest) cbest=ar; // best (highest) certainty found + // reduce, but not if we cross certainty border + if (99*ar/100 > JOB->cfg.certainty) ar=99*ar/100; + if (JOB->cfg.verbose&2) + fprintf(stderr,"\n split result= %s (%3d) ",buf, ar); + setas(box2,buf,ar); // char *, does it disturb further splitting? + buf[0]=0; + i--; xi[i+2]=x1; + } + } + } + } end_for_each(&(JOB->res.boxlist)); + if(JOB->cfg.verbose)fprintf(stderr,", numC %d\n",JOB->res.numC); + return 0; +} + +/* +// ---- divide vertical glued boxes (ex: g above T); +*/ +int divide_vert_glued_boxes( pix *pp, int mo){ + struct box *box2,*box3,*box4; + int y0,y1,y,dy,flag_found,dx; + if(JOB->cfg.verbose)fprintf(stderr,"# divide vertical glued boxes"); + for_each_data(&(JOB->res.boxlist)) { + box2 = (struct box *)list_get_current(&(JOB->res.boxlist)); + if (box2->c != UNKNOWN) continue; /* dont try on pictures */ + y0=box2->y0; y1=box2->y1; dy=y1-y0+1; + dx=4*(JOB->res.avX+box2->x1-box2->x0+1); // we want to be sure to look at 4ex distance + if ( dy>2*JOB->res.avY && dy<6*JOB->res.avY && box2->m1 + && y0<=box2->m2+2 && y0>=box2->m1-2 + && y1>=box2->m4+JOB->res.avY-2) + { // test if lower end fits one of the other lines? 
+ box4=box2; flag_found=0; + for_each_data(&(JOB->res.boxlist)) { + box4 = (struct box *)list_get_current(&(JOB->res.boxlist)); + if (box4->c != UNKNOWN) continue; /* dont try on pictures */ + if (box4->x1<box2->x0-dx || box4->x0>box2->x1+dx) continue; // ignore far boxes + if (box4->line==box2->line ) flag_found|=1; // near char on same line + if (box4->line==box2->line+1) flag_found|=2; // near char on next line + if (flag_found==3) break; // we have two vertical glued chars + } end_for_each(&(JOB->res.boxlist)); + if (flag_found!=3) continue; // do not divide big chars or special symbols + y=box2->m4; // lower end of the next line + if(JOB->cfg.verbose&2){ + fprintf(stderr,"\n# divide box below y=%4d",y-y0); + } + // --- insert box3 before box2 + box3= (struct box *) malloc_box(box2); + box3->y1=y; + box2->y0=y+1; box2->line++; // m1..m4 should be corrected! + if (box4->line == box2->line){ + box2->m1=box4->m1; box2->m2=box4->m2; + box2->m3=box4->m3; box2->m4=box4->m4; + } + box3->num=JOB->res.numC; + if (list_ins(&(JOB->res.boxlist), box2, box3)) { + fprintf(stderr,"ERROR list_ins\n"); }; + JOB->res.numC++; + } + } end_for_each(&(JOB->res.boxlist)); + if(JOB->cfg.verbose)fprintf(stderr,", numC %d\n",JOB->res.numC); + return 0; +} + + +/* + on some systems isupper(>255) cause a segmentation fault SIGSEGV + therefore this function + ToDo: should be replaced (?) by wctype if available on every system + */ +int wisupper(wchar_t cc){ return ((cc<128)?isupper(cc):0); } +int wislower(wchar_t cc){ return ((cc<128)?islower(cc):0); } +int wisalpha(wchar_t cc){ return ((cc<128)?isalpha(cc):0); } +int wisdigit(wchar_t cc){ return ((cc<128)?isdigit(cc):0); } +int wisspace(wchar_t cc){ return ((cc<128)?isspace(cc):0); } + +/* set box2->c to cc if cc is in the ac-list of box2, return 1 on success */ +int setc(struct box *box2, wchar_t cc){ + int ret=0, w1, w2; + w1=((box2->num_ac) ? 
box2->wac[0] : 0); // weight of replaced char + w2=testac(box2,cc); + if (JOB->cfg.verbose) + fprintf(stderr, "\n# change %s (%d) to %s (%d) at (%d,%d)", + decode(box2->c,ASCII), w1, decode(cc,ASCII), w2, box2->x0, box2->y0); + if (w2) { if (box2->c!=cc) { ret=1; setac(box2,cc,(100+w2)/2); } } + // if(JOB->cfg.verbose & 4) out_x(box2); + // ToDo: modify per setac (shift ac) + return ret; +} + + +/* ---- proof difficult chars Il1 by context view ---- + context: separator, number, vowel, nonvowel, upper case ???? + could be also used to find unknown chars if the environment (nonumbers) + can be found in other places! + ToDo: + - box->tac[] as set of possible chars, ac set by engine, example: + ac="l/" (not "Il|/\" because serifs detected and slant>0) + correction only to one of the ac-set (alternative chars)! + - should be language-settable; Unicode compatible + - box2->ad and wac should be changed? (not proper yet) + * ------------- */ +int context_correction( job_t *job ) { + // const static char + char *l_vowel="aeiouy"; + // *l_Vowel="AEIOU",chars if the environment (nonumbers) + char *l_nonvo = "bcdfghjklmnpqrstvwxz"; + struct box *box4, *box3, *box2, *prev, *next; + // pix *pp = &(job->src.p); + int nc=0, ns=0; // num corrections + + if (job->cfg.verbose) + fprintf(stderr, "# context correction Il1 0O"); + + for_each_data(&(job->res.boxlist)) { + box2 = (struct box *)list_get_current(&(job->res.boxlist)); + if (box2->c > 0xFF) continue; // temporary UNICODE fix + prev = (struct box *)list_get_cur_prev(&(job->res.boxlist)); + next = (struct box *)list_get_cur_next(&(job->res.boxlist)); + if( (prev) && (prev->c > 0xFF)) continue; // temporary UNICODE fix 2 + if( (next) && (next->c > 0xFF)) continue; // temporary UNICODE fix 3 + if (box2->num_ac<2) continue; // no alternatives + if (box2->wac[0]==100 && box2->wac[1]<100) continue; + if (box2->num_ac && box2->tas[0]) continue; // buggy space_remove 0.42 + + /* check for Il1| which are general difficult to 
distinguish */ + /* bbg: not very good. Should add some tests to check if is preceded by '.', + spelling, etc */ + /* ToDo: only correct if not 100% sure (wac[i]<100) + and new char is in wat[] */ + if (strchr("Il1|", box2->c) && next && prev) { +// if( strchr(" \n",prev->c) // SPC +// && strchr(" \n",next->c) ) box2->c='I'; else // bad idea! I have ... + if (wisalpha(next->c) && next->c!='i' && + ( prev->c == '\n' || + ( prev->c == ' ' && + ( box4=(struct box *)list_prev(&(job->res.boxlist), prev)) && + box4->c == '.' ) ) ) { nc+=setc(box2,(wchar_t)'I'); } + else if (box2->c!='1' && strchr(l_nonvo,next->c) && + strchr("\" \n",prev->c)) /* lnt => Int, but 1st */ + /* do not change he'll to he'Il! */ + { nc+=setc(box2,(wchar_t)'I'); } // set box2->c to 'I' if 'I' is in the ac-list + else if (strchr(l_vowel,next->c)) /* unusual? Ii Ie Ia Iy Iu */ + /* && strchr("KkBbFfgGpP",prev->c)) */ /* kle Kla Kli */ + { nc+=setc(box2,(wchar_t)'l'); } + else if (wisupper(next->c) + && !strchr("O0I123456789",next->c) + && !strchr("O0I123456789",prev->c)) /* avoid lO => IO (10) */ + { nc+=setc(box2,(wchar_t)'I'); } + else if (wislower(prev->c)) + { nc+=setc(box2,(wchar_t)'l'); } + else if (wisdigit(prev->c) || wisdigit(next->c) + || (next->c=='O' && !wisalpha(prev->c))) /* lO => 10 */ + { nc+=setc(box2,(wchar_t)'1'); } + } + + /* check for O0 */ + else if (strchr("O0", box2->c) && next && prev) { + if (wisspace(prev->c) && wisalpha(next->c)) /* initial letter */ + { nc+=setc(box2,(wchar_t)'O'); } + else if (wisalpha(prev->c) && wisalpha(next->c) + && wisupper(next->c)) /* word in upper case */ + { nc+=setc(box2,(wchar_t)'O'); } + else if (wisdigit(prev->c) || wisdigit(next->c)) + { nc+=setc(box2,(wchar_t)'0'); } + } + + /* check for 5S */ + else if (strchr("5S", box2->c) && next && prev) { + if (wisspace(prev->c) && wisalpha(next->c)) /* initial letter */ + { nc+=setc(box2,(wchar_t)'S'); } + else if (wisalpha(prev->c) && wisalpha(next->c) + && wisupper(next->c)) /* word in upper 
case */ + { nc+=setc(box2,(wchar_t)'S'); } + else if (wisdigit(prev->c) || wisdigit(next->c)) + { nc+=setc(box2,(wchar_t)'5'); } + } + + /* was a space not found? xXx => x Xx ??? */ + if (wisupper(box2->c) && next && prev) { + if (wislower(prev->c) && wislower(next->c) + && 2 * (box2->x0 - prev->x1) > 3 * (next->x0 - box2->x1)) { + struct box *box3 = malloc_box((struct box *) NULL); + box3->x0 = prev->x1 + 2; + box3->x1 = box2->x0 - 2; + box3->y0 = box2->y0; + box3->y1 = box2->y1; + box3->x = box2->x0 - 1; + box3->y = box2->y0; + box3->dots = 0; + box3->num_boxes = 0; + box3->num_subboxes = 0; + box3->c = ' '; + box3->modifier = 0; + setac(box3,' ',99); /* ToDo: weight depends from distance */ + box3->num = -1; + box3->line = prev->line; + box3->m1 = box3->m2 = box3->m3 = box3->m4 = 0; + box3->p = &(job->src.p); + list_ins(&(job->res.boxlist), box2, box3); + } + } + + /* a space before punctuation? but not " ./file" */ + if ( prev && next) + if (prev->c == ' ' && strchr(" \n" , next->c) + && strchr(".,;:!?)", box2->c)) + if (prev->x1 - prev->x0 < 2 * job->res.avX) { // carefully on tables + box3 = prev; + if ( !list_del(&(job->res.boxlist), box3) ) free_box(box3); + prev = (struct box *)list_get_cur_prev(&(job->res.boxlist)); + ns++; + } + + /* \'\' to \" */ + if ( prev ) + if ( (prev->c == '`' || prev->c == '\'') + && (box2->c == '`' || box2->c == '\'') ) + if (prev->x1 - box2->x0 < job->res.avX) { // carefully on tables + box2->c='\"'; + box3 = prev; + list_del(&(job->res.boxlist), box3); + free_box(box3); + } + } end_for_each(&(job->res.boxlist)); + if (job->cfg.verbose) + fprintf(stderr, " num_corrected= %d removed_spaces= %d\n", nc, ns); + return 0; +} + + +/* ---- insert spaces ---- + * depends strongly from the outcome of measure_pitch() + * ------------------------ */ +int list_insert_spaces( pix *pp, job_t *job ) { + int i=0, j1, j2, i1, maxline=-1, dy=0; char cc; + struct box *box2, *box3=NULL, *box4=NULL; + + // measure mean line height + 
for(i1=1;i1<job->res.lines.num;i1++) { + dy+=job->res.lines.m4[i1]-job->res.lines.m1[i1]+1; + } if (job->res.lines.num>1) dy/=(job->res.lines.num-1); + i=0; j2=0; + for(i1=1;i1<job->res.lines.num;i1++) { + j1=job->res.lines.m4[i1]-job->res.lines.m1[i1]+1; + if (j1>dy*120/100 || j1<dy*80/100) continue; // only most frequently + j2+=j1; i++; + } if (i>0 && j2/i>7) dy=j2/i; + if( job->cfg.verbose&1 ) + fprintf(stderr,"# insert space between words (dy=%d) ...",dy); + if (!dy) dy=(job->res.avY)*110/100+1; + + i=0; + for_each_data(&(job->res.boxlist)) { + box2 =(struct box *)list_get_current(&(job->res.boxlist)); + cc=0; + if (box2->line>maxline) { // lines and chars must be sorted! + if (maxline>=0) cc='\n'; // NL + maxline=box2->line; + } + if((box3 = (struct box *)list_prev(&(job->res.boxlist), box2))){ + if (maxline && !box2->line && cc==0) cc=' '; + if (box2->line<=maxline && cc==0) { // lines and chars must be sorted! + int thispitch = job->res.lines.pitch[box2->line]; + int thismono = job->res.lines.mono[box2->line]; + int mdist = (box2->x1 + box2->x0 - (box3->x1 + box3->x0) + 1)/2; + int pdist = box2->x0 - box3->x1 + 1; + if (box2->x1 - box2->x0 < thispitch) pdist=pdist*4/3; + /* allow extra pixels around small characters .,'!: etc */ + // fprintf(stderr,"#\n ... mono= %2d pitch= %2d mdist= %2d pdist= %2d", + // thismono, thispitch, mdist, pdist); + if ((thismono!=0 && mdist >= thispitch) + || (thismono==0 && pdist >= thispitch)) + cc=' '; // insert SPACE + } + } + if(cc){ + box4=(struct box *)list_prev(&(job->res.boxlist), box2); + box3=(struct box *)malloc_box(NULL); + box3->x0=box2->x0-2; box3->x1=box2->x0-2; + box3->y0=box2->y0; box3->y1=box2->y1; + if(cc!='\n' && box4) + box3->x0=box4->x1+2; + if(cc=='\n' || !box4) + box3->x0=job->res.lines.x0[box2->line]; + if(cc=='\n' && box4){ + box3->y0=box4->y1; // better use lines.y1[box2->pre] ??? 
+ box3->y1=box2->y0; + } + box3->x =box2->x0-1; box3->y=box2->y0; + box3->dots=0; box3->c=cc; + box3->num_boxes = 0; + box3->num_subboxes = 0; + box3->modifier='\0'; + box3->num=-1; box3->line=box2->line; + box3->m1=box2->m1; box3->m2=box2->m2; + box3->m3=box2->m3; box3->m4=box2->m4; + box3->p=pp; + setac(box3,cc,99); /* ToDo: weight depends from distance */ + list_ins(&(job->res.boxlist),box2,box3); + if( job->cfg.verbose&1 ) { + fprintf(stderr,"\n# insert space &%d; at x= %4d %4d box= %p", + (int)cc, box3->x0, box3->y0, (void*)box3); + /* out_x(box3); */ + } + i++; + } + } end_for_each(&(job->res.boxlist)); + if( job->cfg.verbose&1 ) fprintf(stderr," found %d\n",i); + return 0; +} + + +/* + add infos where the box is positioned to the box + this is useful for better recognition +*/ +int add_line_info(/* List *boxlist2 */){ + // pix *pp=&JOB->src.p; + struct tlines *lines = &JOB->res.lines; + struct box *box2; + int i,xx,m1,m2,m3,m4,num_line_members=0,num_rest=0; + if( JOB->cfg.verbose&1 ) fprintf(stderr,"# add line infos to boxes ..."); + for_each_data(&(JOB->res.boxlist)) { + box2 =(struct box *)list_get_current(&(JOB->res.boxlist)); + for(i=1;i<JOB->res.lines.num;i++) /* line 0 is a place holder */ + { + if (lines->dx) xx=lines->dy*((box2->x1+box2->x0)/2)/lines->dx; else xx=0; + m1=lines->m1[i]+xx; + m2=lines->m2[i]+xx; + m3=lines->m3[i]+xx; + m4=lines->m4[i]+xx; + // fprintf(stderr," test line %d m1=%d %d %d %d\n",i,m1,m2,m3,m4); + if (m4-m1==0) continue; /* no text line (line==0) */ +#if 0 + if( box2->y1+2*JOB->res.avY >= m1 + && box2->y0-2*JOB->res.avY <= m4 ) /* not to far away */ +#endif + /* give also a comma behind the line a chance */ + if( box2->x0 >= lines->x0[i] && box2->x1 <= lines->x1[i]+JOB->res.avX ) + if( box2->m2==0 || abs(box2->y0-box2->m2) > abs(box2->y0-m2) ) + { /* found nearest line */ + box2->m1=m1; + box2->m2=m2; + box2->m3=m3; + box2->m4=m4; + box2->line=i; + } + } + if( box2->y1+2 < box2->m1 + || box2->y0 < box2->m1 - 
(box2->m3-box2->m1)/2 + || box2->y0-2 > box2->m4 + || box2->y1 > box2->m3 + (box2->m3-box2->m1) + ) /* to far away */ + { /* reset */ + box2->m1=0; + box2->m2=0; + box2->m3=0; + box2->m4=0; + box2->line=0; + num_rest++; + } else num_line_members++; + } end_for_each(&(JOB->res.boxlist)); + if( JOB->cfg.verbose&1 ) + fprintf(stderr," done, num_line_chars=%d rest=%d\n", + num_line_members, num_rest); + return 0; +} + + +/* + * bring the boxes in right order + * add_line_info must be executed first! + */ +int sort_box_func (const void *a, const void *b) { + struct box *boxa, *boxb; + + boxa = (struct box *)a; + boxb = (struct box *)b; + + if ( ( boxb->line < boxa->line ) || + ( boxb->line == boxa->line && boxb->x0 < boxa->x0 ) ) + return 1; + return -1; +} + +// ------------------------------------------------------------- +// ------ use this for entry from other programs +// include pnm.h pgm2asc.h +// ------------------------------------------------------------- +// entry point for gocr.c or if it is used as lib +// better name is call_ocr ??? +// jb: OLD COMMENT: not removed due to set_options_* () +// args after pix *pp should be removed and new functions +// set_option_mode(int mode), set_option_spacewidth() .... etc. +// should be used instead, before calling pgm2asc(pix *pp) +// ! change if you can ! 
- used by X11 frontend +int pgm2asc(job_t *job) +{ + pix *pp; + progress_counter_t *pc; + + assert(job); + /* FIXME jb: remove pp */ + pp = &(job->src.p); + + if( job->cfg.verbose ) + fprintf(stderr, "# db_path= %s\n", job->cfg.db_path); + + pc = open_progress(100,"pgm2asc_main"); + progress(0,pc); /* start progress output 0% 0% */ + + /* ----- count colors ------ create histogram ------- + - this should be used to create a upper and lower limit for cs + - cs is the optimum gray value between cs_min and cs_max + - also inverse scans could be detected here later */ + if (job->cfg.cs==0) + job->cfg.cs=otsu( pp->p,pp->y,pp->x,0,0,pp->x,pp->y, job->cfg.verbose & 1 ); + /* renormalize the image and set the normalized threshold value */ + job->cfg.cs=thresholding( pp->p,pp->y,pp->x,0,0,pp->x,pp->y, job->cfg.cs ); + if( job->cfg.verbose ) + fprintf(stderr, "# thresholding new_threshold= %d\n", job->cfg.cs); + + progress(5,pc); /* progress is only estimated */ + +#if 0 /* dont vast memory */ + /* FIXME jb: malloc */ + if ( job->cfg.verbose & 32 ) { + // generate 2nd imagebuffer for debugging output + job->tmp.ppo.p = (unsigned char *)malloc(job->src.p.y * job->src.p.x); + // buffer + assert(job->tmp.ppo.p); + copybox(&job->src.p, + 0, 0, job->src.p.x, job->src.p.y, + &job->tmp.ppo, + job->src.p.x * job->src.p.y); + } +#else + job->tmp.ppo=job->src.p; /* temporarely, removed later */ +#endif + + /* load character data base */ + if ( job->cfg.mode&2 ) + load_db(); + + /* this is first step for reorganize the PG + ---- look for letters, put rectangular frames around letters + letter = connected points near color F + should be used by dust removing (faster) and line detection! 
+ ---- 0..cs = black letters, last change = Mai99 */ + + progress(8,pc); /* progress is only estimated */ + + scan_boxes( pp ); + if ( !job->res.numC ){ + fprintf( stderr,"# no boxes found - stopped\n" ); + //if(job->cfg.verbose&32) debug_img("out01",job,8); + /***** should free stuff, etc) */ + return(1); + } + // if (job->cfg.verbose&32) debug_img("out00",job,4+8); + + progress(10,pc); /* progress is only estimated */ + // if(job->cfg.verbose&32) debug_img("out01",job,4+8); + // output_list(job); // for debugging + // ToDo: matrix printer preprocessing + + remove_dust( job ); /* from the &(job->res.boxlist)! */ +// if(job->cfg.verbose&32) debug_img("out02",job,4+8); +// output_list(job); // for debugging + smooth_borders( job ); /* only for big chars */ + progress(12,pc); /* progress is only estimated */ +// if(job->cfg.verbose&32) debug_img("out03",job,4+8); +// output_list(job); // for debugging + + //detect_barcode( job ); /* mark barcode */ +// if(job->cfg.verbose&32) debug_img("out04",job,4+8); +// output_list(job); // for debugging + + detect_pictures( job ); /* mark pictures */ +// if(job->cfg.verbose&32) debug_img("out05",job,4+8); +// output_list(job); // for debugging + + remove_pictures( job ); /* do this as early as possible, before layout */ +// if(job->cfg.verbose&32) debug_img("out06",job,4+8); +// output_list(job); // for debugging + + glue_holes_inside_chars( pp ); /* including count subboxes (holes) */ + + detect_rotation_angle( job ); + +#if 1 /* Rotate the whole picture! move boxes */ + if( job->res.lines.dy!=0 ){ // move down lowest first, move up highest first + // in work! ??? (at end set dy=0) think on ppo! 
+ } +#endif + detect_text_lines( pp, job->cfg.mode ); /* detect and mark JOB->tmp.ppo */ +// if(job->cfg.verbose&32) debug_img("out07",job,4+8); + progress(20,pc); /* progress is only estimated */ + + add_line_info(/* &(job->res.boxlist) */); + //if (job->cfg.verbose&32) debug_img("out10",job,4+8); + + divide_vert_glued_boxes( pp, job->cfg.mode); /* after add_line_info, before list_sort! */ +// if(job->cfg.verbose&32) debug_img("out11",job,0); + + remove_melted_serifs( pp ); /* make some corrections on pixmap */ + /* list_ins seems to sort in the boxes on the wrong place ??? */ +// if(job->cfg.verbose&32) debug_img("out12",job,4+8); + + glue_broken_chars( pp ); /* 2nd glue */ +// if(job->cfg.verbose&32) debug_img("out14",job,4+8); + + remove_rest_of_dust( ); +// if(job->cfg.verbose&32) debug_img("out15",job,4+8); + + /* better sort after dust is removed (slow for lot of pixels) */ + list_sort(&(job->res.boxlist), sort_box_func); + + measure_pitch( job ); + + if(job->cfg.mode&64) find_same_chars( pp ); + progress(30,pc); /* progress is only estimated */ +// if(job->cfg.verbose&32) debug_img("out16",job,4+8); + + char_recognition( pp, job->cfg.mode); + progress(60,pc); /* progress is only estimated */ +// if(job->cfg.verbose&32) debug_img("out17",job,4+8); + + if ( adjust_text_lines( pp, job->cfg.mode ) ) { /* correct using chars */ + /* may be, characters/pictures have changed line number */ + list_sort(&(job->res.boxlist), sort_box_func); + // 2nd recognition call if lines are adjusted + char_recognition( pp, job->cfg.mode); + } + +#define BlownUpDrawing 1 /* german: Explosionszeichnung, temporarly */ +#if BlownUpDrawing == 1 /* german: Explosionszeichnung */ +{ /* just for debugging */ + int i,ii,ni; struct box *box2; + i=ii=ni=0; + for_each_data(&(JOB->res.boxlist)) { /* count boxes */ + box2 = (struct box *)list_get_current(&(JOB->res.boxlist)); + if (box2->c==UNKNOWN) i++; + if (box2->c==PICTURE) ii++; + ni++; + } end_for_each(&(JOB->res.boxlist)); + if 
(JOB->cfg.verbose) + fprintf(stderr,"# debug: unknown= %d picts= %d boxes= %d\n",i,ii,ni); +} +#endif + // ----------- write out20.pgm ----------- mark lines + boxes + //if (job->cfg.verbose&32) debug_img("out20",job,1+4+8); + + compare_unknown_with_known_chars( pp, job->cfg.mode); + progress(70,pc); /* progress is only estimated */ + + try_to_divide_boxes( pp, job->cfg.mode); + progress(80,pc); /* progress is only estimated */ + + /* --- list output ---- for debugging --- */ + //if (job->cfg.verbose&6) output_list(job); + + /* ---- insert spaces ---- */ + list_insert_spaces( pp , job ); + + // ---- proof difficult chars Il1 by context view ---- + if (JOB->cfg.verbose) + fprintf(stderr,"# context correction if !(mode&32)\n"); + if (!(job->cfg.mode&32)) context_correction( job ); + + store_boxtree_lines( job->cfg.mode ); + progress(90,pc); /* progress is only estimated */ + +/* 0050002.pgm.gz ca. 109 digits, only 50 recognized (only in lines?) + * ./gocr -v 39 -m 56 -e - -m 4 -C 0-9 -f XML tmp0406/0050002.pbm.gz + * awk 'BEGIN{num=0}/1<\/box>/{num++;}END{print num}' o + * 15*0 24*1 18*2 19*3 15*4 6*5 6*6 6*7 4*8 8*9 sum=125digits counted boxes + * 9*0 19*1 14*2 15*3 11*4 6*5 5*6 6*7 4*8 8*9 sum=97digits recognized + * 1*1 1*7 not recognized (Oct04) + * 33*SPC 76*NL = 109 spaces + 36*unknown sum=241 * 16 missed + */ +#if BlownUpDrawing == 1 /* german: Explosionszeichnung */ +{ /* just for debugging */ + int i,ii,ni; struct box *box2; const char *testc="0123456789ABCDEFGHIJK"; + i=ii=ni=0; + for_each_data(&(JOB->res.boxlist)) { /* count boxes */ + box2 = (struct box *)list_get_current(&(JOB->res.boxlist)); + if (box2->c==UNKNOWN) i++; + if (box2->c==PICTURE) ii++; + if (box2->c>' ' && box2->c<='z') ni++; + } end_for_each(&(JOB->res.boxlist)); + if(JOB->cfg.verbose) + fprintf(stderr,"# debug: (_)= %d picts= %d chars= %d",i,ii,ni); + for (i=0;i<20;i++) { + ni=0; + for_each_data(&(JOB->res.boxlist)) { /* count boxes */ + box2 = (struct box 
*)list_get_current(&(JOB->res.boxlist)); + if (box2->c==testc[i]) ni++; + } end_for_each(&(JOB->res.boxlist)); + if(JOB->cfg.verbose && ni>0) + fprintf(stderr," (%c)=%d",testc[i],ni); + } + if(JOB->cfg.verbose) + fprintf(stderr,"\n"); +} +#endif + + // ---- frame-size-histogram + // ---- (my own defined) distance between letters + // ---- write internal picture of textsite + // ----------- write out30.pgm ----------- + //if( job->cfg.verbose&32 ) debug_img("out30",job,2+4); + + progress(100,pc); /* progress is only estimated */ + + close_progress(pc); + + return 0; /* what should I return? error-state? num-of-chars? */ +} diff --git a/lib/gocr/pgm2asc.h b/lib/gocr/pgm2asc.h new file mode 100644 index 00000000..9cd8b1fd --- /dev/null +++ b/lib/gocr/pgm2asc.h @@ -0,0 +1,106 @@ +/* +This is a Optical-Character-Recognition program +Copyright (C) 2000-2006 Joerg Schulenburg + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ + see README for EMAIL-address + +*/ + +#ifndef PGM2ASC_H +#define PGM2ASC_H 1 + +#include "pnm.h" +//#include "output.h" +#include "list.h" +#include "unicode.h" +#include "gocr.h" + +#define pixel_at(pic, xx, yy) (pic).p[(xx)+((yy)*((pic).x))] +#define pixel_atp(pic, xx, yy) (pic)->p[(xx)+((yy)*((pic)->x))] + +#ifndef HAVE_WCHAR_H +const wchar_t *wcschr (const wchar_t *wcs, wchar_t wc); +const wchar_t *wcscpy (wchar_t *dest, const wchar_t *src); +size_t wcslen (const wchar_t *s); +#endif +#ifndef HAVE_WCSDUP +wchar_t * wcsdup (const wchar_t *WS); /* its a gnu extension */ +#endif + +/* declared in pgm2asc.c */ +/* set alternate chars and its weight, called from the engine + if a char is recognized to (weight) percent */ +int setas(struct box *b, char *as, int weight); /* string + xml */ +int setac(struct box *b, wchar_t ac, int weight); /* wchar */ + +/* for qsort() call */ +int intcompare (const void *vr, const void *vs); + +/* declared in box.c */ +int box_gt(struct box *box1, struct box *box2); +int reset_box_ac(struct box *box); /* reset and free char table */ +struct box *malloc_box( struct box *inibox ); /* alloc memory for a box */ +int free_box( struct box *box ); /* free memory of a box */ +int copybox( pix *p, int x0, int y0, int dx, int dy, pix *b, int len); +int reduce_vectors ( struct box *box1, int mode ); +int merge_boxes( struct box *box1, struct box *box2 ); +int cut_box( struct box *box1); + + +/* declared in database.c */ +int load_db(void); +wchar_t ocr_db(struct box *box1); + +/* declared in detect.c */ +int detect_lines1(pix * p, int x0, int y0, int dx, int dy); +int detect_lines2(pix *p,int x0,int y0,int dx,int dy,int r); +int detect_rotation_angle(job_t *job); +int detect_text_lines(pix * pp, int mo); +int adjust_text_lines(pix * pp, int mo); +int detect_pictures(job_t *job); + +/* declared in lines.c */ +void store_boxtree_lines( int mo ); + /* free memory for internal stored textlines. 
+ * Needs to be called _after_ having retrieved the text. + * After freeing, no call to getTextLine is possible any + * more + */ +void free_textlines( void ); + + /* get result of ocr for a given line number. + * If the line is out of range, the function returns 0, + * otherwise a pointer to a complete line. + */ +const char *getTextLine( int ); + +/* declared in remove.c */ +int remove_dust( job_t *job ); +int remove_pictures( job_t *job); +int remove_melted_serifs( pix *pp ); +int remove_rest_of_dust(); +int smooth_borders( job_t *job ); + +/* declared in pixel.c */ +int marked(pix * p, int x, int y); +int pixel(pix *p, int x, int y); +void put(pix * p, int x, int y, int ia, int io); + +/* start ocr on a image in job.src.p */ +int pgm2asc(job_t *job); + +#endif diff --git a/lib/gocr/pixel.c b/lib/gocr/pixel.c new file mode 100644 index 00000000..41647f39 --- /dev/null +++ b/lib/gocr/pixel.c @@ -0,0 +1,537 @@ +/* +This is a Optical-Character-Recognition program +Copyright (C) 2000-2006 Joerg Schulenburg + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + Joerg.Schulenburg@physik.uni-magdeburg.de */ + +/* Filter by tree, filter by number methods added by + * William Webber, william@williamwebber.com. 
*/ + +#include "pgm2asc.h" +#include <assert.h> +#include <string.h> + +/* + * Defining this causes assert() calls to be turned off runtime. + * + * This is normally taken care of by make. + */ +/* #define NDEBUG */ + +// ------------------ (&~7)-pixmap-functions ------------------------ + +/* test if pixel marked? + * Returns: 0 if not marked, least 3 bits if marked. + */ +int marked (pix * p, int x, int y) { + if (x < 0 || y < 0 || x >= p->x || y >= p->y) + return 0; + return (pixel_atp(p, x, y) & 7); +} + +#define Nfilt3 6 /* number of 3x3 filter */ +/* + * Filters to correct possible scanning or image errors. + * + * Each of these filters represents a 3x3 pixel area. + * 0 represents a white or background pixel, 1 a black or + * foreground pixel, and 2 represents a pixel of either value. + * Note that this differs from the meaning of pixel values in + * the image, where a high value means "white" (background), + * and a low value means "black" (foreground). + * + * These filters are applied to the 3x3 environment of a pixel + * to be retrieved from the image, centered around that pixel + * (that is, the to-be-retrieved pixel corresponds with the + * the fifth position of the filter). + * If the filter matches that pixel environment, then + * the returned value of the pixel is inverted (black->white + * or white->black). + * + * So, for instance, the second filter below matches this + * pattern: + * + * 000 + * X0X + * 000 + * + * and "fills in" the middle (retrieved) pixel to rejoin a line + * that may have been broken by a scanning or image error. + */ +const char filt3[Nfilt3][9]={ + {0,0,0, 0,0,1, 1,0,0}, /* (-1,-1) (0,-1) (1,-1) (-1,0) (0,0) ... */ + {0,0,0, 1,0,1, 0,0,0}, + {1,0,0, 0,0,1, 0,0,0}, + {1,1,0, 0,1,0, 2,1,1}, + {0,0,1, 0,0,0, 2,1,0}, + {0,1,0, 0,0,0, 1,2,0} +}; +/* 2=ignore_pixel, 0=white_background, 1=black_pixel */ + + +/* + * Filter by matrix uses the above matrix of filters directly. 
Pixel + * environments to be filtered are compared pixel by pixel against + * these filters. + * + * Filter by number converts these filters into integer representations + * and stores them in a table. Pixel environments are similarly + * converted to integers, and looked up in the table. + * + * Filter by tree converts these filters into a binary tree. Pixel + * environments are matched by traversing the tree. + * + * A typical performance ratio for these three methods is 20:9:7 + * respectively (i.e., the tree method takes around 35% of the + * time of the matrix method). + */ +#define FILTER_BY_MATRIX 0 +#define FILTER_BY_NUMBER 1 +#define FILTER_BY_TREE 2 + +#define FILTER_METHOD FILTER_BY_TREE + +/* + * Defining FILTER_CHECKED causes filter results from either the tree + * or the number method to be checked against results of the other + * two methods to ensure correctness. This is for bug checking purposes + * only. + */ +/* #define FILTER_CHECKED */ + +/* + * Defining FILTER_STATISTICS causes statistics to be kept on how many + * times the filters are tried, how many times a filter matches, and + * of these matches how many flip a black pixel to white, and how many + * the reverse. These statistics are printed to stderr at the end of + * the program run. Currently, statistics are only kept if the tree + * filter method is being used. + */ +/* #define FILTER_STATISTICS */ + +#ifdef FILTER_STATISTICS +static int filter_tries = 0; +static int filter_matches = 0; +static int filter_blackened = 0; +static int filter_whitened = 0; +#endif + +#ifdef FILTER_STATISTICS +void print_filter_stats() { + fprintf(stderr, "\n# Error filter statistics: tries %d, matches %d, " + "blackened %d, whitened %d\n", + filter_tries, filter_matches, filter_blackened, filter_whitened); +} +#endif + +#if FILTER_METHOD == FILTER_BY_MATRIX || defined(FILTER_CHECKED) +/* + * Filter the pixel at (x,y) by directly applying the matrix. 
+ */ +int pixel_filter_by_matrix(pix * p, int x, int y) { + int i; + static char c33[9]; + memset(c33, 0, sizeof(c33)); + /* copy environment of a point (only highest bit) +bbg: FASTER now. It has 4 ifs less at least, 8 at most. */ + if (x > 0) { c33[3] = pixel_atp(p,x-1, y )>>7; + if (y > 0) c33[0] = pixel_atp(p,x-1,y-1)>>7; + if (y+1 < p->y) c33[6] = pixel_atp(p,x-1,y+1)>>7; + } + if (x+1 < p->x) { c33[5] = pixel_atp(p,x+1, y )>>7; + if (y > 0) c33[2] = pixel_atp(p,x+1,y-1)>>7; + if (y+1 < p->y) c33[8] = pixel_atp(p,x+1,y+1)>>7; + } + if (y > 0) c33[1] = pixel_atp(p, x ,y-1)>>7; + c33[4] = pixel_atp(p, x , y )>>7; + if (y+1 < p->y) c33[7] = pixel_atp(p, x ,y+1)>>7; + + /* do filtering */ + for (i = 0; i < Nfilt3; i++) + if( ( (filt3[i][0]>>1) || c33[0]!=(1 & filt3[i][0]) ) + && ( (filt3[i][1]>>1) || c33[1]!=(1 & filt3[i][1]) ) + && ( (filt3[i][2]>>1) || c33[2]!=(1 & filt3[i][2]) ) + && ( (filt3[i][3]>>1) || c33[3]!=(1 & filt3[i][3]) ) + && ( (filt3[i][4]>>1) || c33[4]!=(1 & filt3[i][4]) ) + && ( (filt3[i][5]>>1) || c33[5]!=(1 & filt3[i][5]) ) + && ( (filt3[i][6]>>1) || c33[6]!=(1 & filt3[i][6]) ) + && ( (filt3[i][7]>>1) || c33[7]!=(1 & filt3[i][7]) ) + && ( (filt3[i][8]>>1) || c33[8]!=(1 & filt3[i][8]) ) ) { + return ((filt3[i][4])?JOB->cfg.cs:0); + } + return pixel_atp(p, x, y) & ~7; +} +#endif + +#if FILTER_METHOD == FILTER_BY_NUMBER || defined(FILTER_CHECKED) + +#define NUM_TABLE_SIZE 512 /* max value of 9-bit value */ +/* + * Recursively generates entries in the number table for a matrix filter. + * + * gen_num_filt is the number representation of the matrix filter. + * This generation is handled recursively because this is the easiest + * way to handle 2 (either value) entries in the filter, which lead + * to 2 distinct entries in the number table (one for each alternate + * value). 
+ */ +void rec_generate_number_table(char * num_table, const char * filter, + int i, unsigned short gen_num_filt) { + if (i == 9) { + /* Invert the value of the number representation, to reflect the + * fact that the "white" is 0 in the filter, 1 (high) in the image. */ + gen_num_filt = ~gen_num_filt; + gen_num_filt &= 0x01ff; + assert(gen_num_filt < NUM_TABLE_SIZE); + num_table[gen_num_filt] = 1; + } else { + if (filter[i] == 0 || filter[i] == 2) + rec_generate_number_table(num_table, filter, i + 1, gen_num_filt); + if (filter[i] == 1 || filter[i] == 2) { + gen_num_filt |= (1 << (8 - i)); + rec_generate_number_table(num_table, filter, i + 1, gen_num_filt); + } + } +} + +/* + * Filter the pixel at (x, y) using a number table. + * + * Each filter can be converted into a 9-bit representation, where + * filters containing 2 (either value) pixels are converted into + * a separate numerical representation for each pixel, where position + * i in the filter corresponds to bit i in the number. Each resulting + * numerical representation N is represented as a 1 value in the Nth + * position of a lookup table. A pixel's environment is converted in + * the same way to a numeric representation P, and that environment + * matches a filter if num_table[P] == 1. + */ +int pixel_filter_by_number(pix * p, int x, int y) { + unsigned short val = 0; + static char num_table[NUM_TABLE_SIZE]; + static int num_table_generated = 0; + if (!num_table_generated) { + int f; + memset(num_table, 0, sizeof(num_table)); + for (f = 0; f < Nfilt3; f++) + rec_generate_number_table(num_table, filt3[f], 0, 0); + num_table_generated = 1; + } + + /* calculate a numeric value for the 3x3 square around the pixel. 
*/ + if (x > 0) { val |= (pixel_atp(p,x-1, y )>>7) << (8 - 3); + if (y > 0) val |= (pixel_atp(p,x-1,y-1)>>7) << (8 - 0); + if (y+1 < p->y) val |= (pixel_atp(p,x-1,y+1)>>7) << (8 - 6); + } + if (x+1 < p->x) { val |= (pixel_atp(p,x+1, y )>>7) << (8 - 5); + if (y > 0) val |= (pixel_atp(p,x+1,y-1)>>7) << (8 - 2); + if (y+1 < p->y) val |= (pixel_atp(p,x+1,y+1)>>7) << (8 - 8); + } + if (y > 0) val |= (pixel_atp(p, x ,y-1)>>7) << (8 - 1); + val |= (pixel_atp(p, x , y )>>7) << (8 - 4); + if (y+1 < p->y) val |= (pixel_atp(p, x ,y+1)>>7) << (8 - 7); + assert(val < NUM_TABLE_SIZE); + + if (num_table[val]) + return (val & (1 << 4)) ? 0 : JOB->cfg.cs; + else + return pixel_atp(p, x, y) & ~7; +} +#endif + +#if FILTER_METHOD == FILTER_BY_TREE || defined(FILTER_CHECKED) + +#define TREE_ARRAY_SIZE 1024 +/* 1+ number of nodes in a complete binary tree of height 10 */ + +/* + * Recursively generate a tree representation of a filter. + */ +void rec_generate_tree(char * tree, const char * filter, int i, int n) { + assert(i >= 0 && i <= 9); + assert(n < TREE_ARRAY_SIZE); + if (i == 9) { + if (filter[4] == 0) + tree[n] = 2; + else + tree[n] = 1; + return; + } + /* first iteration has n == -1, does not set any values of the tree, + just to find whether to start to the left or the right */ + if (n != -1) + tree[n] = 1; + if (filter[i] == 0) + rec_generate_tree(tree, filter, i + 1, n * 2 + 2); + else if (filter[i] == 1) + rec_generate_tree(tree, filter, i + 1, n * 2 + 3); + else { + rec_generate_tree(tree, filter, i + 1, n * 2 + 2); + rec_generate_tree(tree, filter, i + 1, n * 2 + 3); + } +} + +/* + * Filter the pixel at (x, y) using the tree method. + * + * Each filter is represented by a single branch of a binary + * tree, except for filters contain "either value" entries, which + * bifurcate at that point in the branch. Each white pixel in the filter + * is a left branch in the tree, each black pixel a right branch. 
The + * final node of a branch indicates whether this filter turns a white + * pixel black, or a black one white. + * + * We match a pixel's environment against this tree by similarly + * using the pixels in that environment to traverse the tree. If + * we run out of nodes before getting to the end of a branch, then + * the environment doesn't match against any of the filters represented + * by the tree. Otherwise, we return the value specified by the + * final node. + * + * Since the total tree size, even including missing nodes, is small + * (2 ^ 10), we can use a standard array representation of a binary + * tree, where for the node tree[n], the left child is tree[2n + 2], + * and the right tree[2n + 3]. The only information we want + * from a non-leaf node is whether it exists (that is, is part of + * a filter-representing branch). We represent this with the value + * 1 at the node's slot in the array, the contrary by 0. For the + * leaf node, 0 again represents non-existence, 1 that the filter + * represented by this branch turns a black pixel white, and 2 a + * white pixel black. + */ +int pixel_filter_by_tree(pix * p, int x, int y) { + static char tree[TREE_ARRAY_SIZE]; + static int tree_generated = 0; + int n; + int pixel_val = pixel_atp(p, x, y) & ~7; +#ifdef FILTER_STATISTICS + static int registered_filter_stats = 0; + if (!registered_filter_stats) { + atexit(print_filter_stats); + registered_filter_stats = 1; + } + filter_tries++; +#endif /* FILTER_STATISTICS */ + if (!tree_generated) { + int f; + memset(tree, 0, sizeof(tree)); + for (f = 0; f < Nfilt3; f++) { + const char * filter = filt3[f]; + rec_generate_tree(tree, filter, 0, -1); + } + tree_generated = 1; + } + n = -1; + + /* Note that for the image, low is black, high is white, whereas + * for the filter, 0 is white, 1 is black. For the image, then, + * high (white) means go left, low (black) means go right. 
*/ + +#define IS_BLACK(_dx,_dy) !(pixel_atp(p, x + (_dx), y + (_dy)) >> 7) +#define IS_WHITE(_dx,_dy) (pixel_atp(p, x + (_dx), y + (_dy)) >> 7) +#define GO_LEFT n = n * 2 + 2 +#define GO_RIGHT n = n * 2 + 3 +#define CHECK_NO_MATCH if (tree[n] == 0) return pixel_val + + /* Top row */ + if (y == 0) { + /* top 3 pixels off edge == black == right + n = 2 * (2 * (2 * -1 + 3) + 3) + 3 = 13 */ + n = 13; + } else { + if (x == 0 || IS_BLACK(-1, -1)) + GO_RIGHT; + else + GO_LEFT; + + if (IS_WHITE(0, -1)) + GO_LEFT; + else + GO_RIGHT; + CHECK_NO_MATCH; + + if (x + 1 == p->x || IS_BLACK(+1, -1)) + GO_RIGHT; + else + GO_LEFT; + CHECK_NO_MATCH; + } + + /* Second row */ + if (x == 0 || IS_BLACK(-1, 0)) + GO_RIGHT; + else + GO_LEFT; + CHECK_NO_MATCH; + + if (IS_WHITE(0, 0)) + GO_LEFT; + else + GO_RIGHT; + CHECK_NO_MATCH; + + if (x + 1 == p->x || IS_BLACK(+1, 0)) + GO_RIGHT; + else + GO_LEFT; + CHECK_NO_MATCH; + + /* bottom row */ + if (y + 1 == p->y) { + /* bottom 3 pixels off edge == black == right + n' = 2 * (2 * (2n + 3) + 3) + 3 + = 2 * (4n + 9) + 3 + = 8n + 21 */ + n = 8 * n + 21; + } else { + if (x == 0 || IS_BLACK(-1, +1)) + GO_RIGHT; + else + GO_LEFT; + CHECK_NO_MATCH; + + if (IS_WHITE(0, 1)) + GO_LEFT; + else + GO_RIGHT; + CHECK_NO_MATCH; + + if (x + 1 == p->x || IS_BLACK(+1, +1)) + GO_RIGHT; + else + GO_LEFT; + } + assert(n < TREE_ARRAY_SIZE); + assert(tree[n] == 0 || tree[n] == 1 || tree[n] == 2); + CHECK_NO_MATCH; +#ifdef FILTER_STATISTICS + filter_matches++; +#endif + if (tree[n] == 1) { +#ifdef FILTER_STATISTICS + if (pixel_atp(p, x, y) < JOB->cfg.cs) + filter_whitened++; +#endif + return JOB->cfg.cs; + } else { +#ifdef FILTER_STATISTICS + if (pixel_atp(p, x, y) >= JOB->cfg.cs) + filter_blackened++; +#endif + return 0; + } +} +#endif /* FILTER_METHOD == FILTER_BY_TREE */ + +/* + * This simple filter attempts to correct "fax"-like scan errors. 
+ */ +int pixel_faxfilter(pix *p, int x, int y) { + int r; // filter + r = pixel_atp(p,x,y)&~7; + /* {2,2,2, 2,0,1, 2,1,0} */ + if ((r&128) && (~pixel_atp(p,x+1, y )&128) + && (~pixel_atp(p, x ,y+1)&128) + && ( pixel_atp(p,x+1,y+1)&128)) + r = 64; /* faxfilter */ + + else + /* {2,2,2, 1,0,2, 0,1,2} */ + if ((r&128) && (~pixel_atp(p,x-1, y )&128) + && (~pixel_atp(p, x ,y+1)&128) + && ( pixel_atp(p,x-1,y+1)&128)) + r = 64; /* faxfilter */ + return r & ~7; +} + +#ifdef FILTER_CHECKED +/* + * Print out the 3x3 environment of a pixel as a 9-bit binary. + * + * For debugging purposes only. + */ +void print_pixel_env(FILE * out, pix * p, int x, int y) { + int x0, y0; + for (y0 = y - 1; y0 < y + 2; y0++) { + for (x0 = x - 1; x0 < x + 2; x0++) { + if (x0 < 0 || x0 >= p->x || y0 < 0 || y0 >= p->y) + fputc('?', out); + else if (pixel_atp(p, x0, y0) >> 7) + fputc('0', out); + else + fputc('1', out); + } + } +} +#endif + +/* this function is heavily used + * test if pixel was set, remove low bits (marks) --- later with error-correction + * result depends on n_run, if n_run>0 filter are used + * Returns: pixel-color (without marks) + */ +int getpixel(pix *p, int x, int y){ + if ( x < 0 || y < 0 || x >= p->x || y >= p->y ) + return 255 & ~7; + + /* filter will be used only once later, when vectorization replaces pixel + * processing + */ + if (JOB->tmp.n_run > 0) { /* use the filters (correction of errors) */ +#if FILTER_METHOD == FILTER_BY_NUMBER + int pix = pixel_filter_by_number(p, x, y); +#ifdef FILTER_CHECKED + int pix2 = pixel_filter_by_matrix(p, x, y); + if (pix != pix2) { + fprintf(stderr, + "# BUG: pixel_filter: by number: %d; by matrix: %d, " + "by atp %d; env: ", pix, pix2, pixel_atp(p, x, y) & ~7); + print_pixel_env(stderr, p, x, y); + fputc('\n', stderr); + } +#endif /* FILTER_CHECKED */ + return pix; +#elif FILTER_METHOD == FILTER_BY_MATRIX + return pixel_filter_by_matrix(p, x, y); +#elif FILTER_METHOD == FILTER_BY_TREE + int pix = pixel_filter_by_tree(p, x, y); 
+#ifdef FILTER_CHECKED + int pix2 = pixel_filter_by_matrix(p, x, y); + int pix3 = pixel_filter_by_number(p, x, y); + if (pix != pix2 || pix != pix3) { + fprintf(stderr, + "# BUG: pixel_filter: tree: %d; matrix: %d, " + "number: %d, atp %d; env: ", pix, pix2, pix3, + pixel_atp(p, x, y) & ~7); + print_pixel_env(stderr, p, x, y); + fputc('\n', stderr); + } +#endif /* FILTER_CHECKED */ + return pix; +#else +#error FILTER_METHOD not defined +#endif /* FILTER_BY_NUMBER */ + } + + return (pixel_atp(p,x,y) & ~7); +} + +/* modify pixel, test if out of range */ +void put(pix * p, int x, int y, int ia, int io) { + if (x < p->x && x >= 0 && y >= 0 && y < p->y) + pixel_atp(p, x, y) = (pixel_atp(p, x, y) & ia) | io; +} diff --git a/lib/gocr/pnm.h b/lib/gocr/pnm.h new file mode 100644 index 00000000..7d5bc8a1 --- /dev/null +++ b/lib/gocr/pnm.h @@ -0,0 +1,24 @@ +/* Handle PNM-files Dez98 JS + * 0,0 = left up + * PAM-formats + * PAM any P7 + * PNM-formats + * PGM gray ASCII=P2 RAW=P5 dx dy col gray + * PPM RGB ASCII=P3 RAW=P6 dx dy col RGB + * PBM B/W ASCII=P1 RAW=P4 dx dy bitmap + */ + +#ifndef GOCR_PNM_H +#define GOCR_PNM_H 1 + +#include "../../config.h" + +struct pixmap { + unsigned char *p; /* pointer of image buffer (pixmap) */ + int x; /* xsize */ + int y; /* ysize */ + int bpp; /* bytes per pixel: 1=gray 3=rgb */ + }; +typedef struct pixmap pix; + +#endif diff --git a/lib/gocr/progress.c b/lib/gocr/progress.c new file mode 100644 index 00000000..14804ed8 --- /dev/null +++ b/lib/gocr/progress.c @@ -0,0 +1,87 @@ +/* ---------------------------- progress output ---------------------- */ +#include <stdlib.h> +#include <stdio.h> +#include "progress.h" + +FILE *fp=NULL; /* output stream for progress info */ +time_t printinterval = 10; /* approx. seconds between printouts, 1.. 
*/ + +/* initialization of progress output, fname="<fileID>","<filename>","-" */ +int ini_progress(char *fname){ + int fd; + if (fp) { fclose(fp); fp=NULL; } + if (fname) if (fname[0]) { + fd=atoi(fname); + if(fd>255 || fname[((fd>99)?3:((fd>9)?2:1))]) fd=-1; /* be sure */ + if (fname[0]=='-' && fname[1]==0) { fp=stdout; } +#ifdef __USE_POSIX + else if (fd>0) { fp=fdopen(fd,"w"); } /* not sure that "w" is ok ???? */ +#endif + else { fp=fopen(fname,"w");if(!fp)fp=fopen(fname,"a"); } + if (!fp) { + fprintf(stderr,"could not open %s for progress output\n",fname); + return -1; /* no success */ + } + } + /* fprintf(stderr,"# progress: fd=%d\n",fileno(fp)); */ + return 0; /* no error */ +} + +progress_counter_t *open_progress(int maxcount, const char *name){ + progress_counter_t *pc; + pc = (progress_counter_t*) malloc( sizeof(progress_counter_t) ); + if (!pc) return 0; /* nonfatal */ + pc->starttime = time(NULL); + pc->maxcount = maxcount; + pc->numskip = 0; + pc->lastprintcount = -1; + pc->name = name; + pc->lastprinttime = pc->starttime; + return pc; +} +/* free counter */ +int close_progress(progress_counter_t *counter){ + if (counter) free(counter); + return 0; +} +/* progress meter output + * only 1output/10s, + estimated endtime (test on pixelfields) + * ToDo: to stderr by default? remove subprogress, ini_progress? rm_progress? 
+ * test on tcl + */ +int progress(int counter, progress_counter_t *pc){ + /* we try to save computing time, so we skip early */ + if ((!fp) || counter - pc->lastprintcount <= pc->numskip) return 0; + { + char cr='\n'; + time_t now = time(NULL); +#if 0 /* debugging */ + if (counter) + fprintf(fp," progress %s %3d / %d time %d skip %d\n", + pc->name,counter,pc->maxcount,(int)(now - pc->starttime), + pc->numskip); fflush(fp); +#endif + if (5*(now - pc->lastprinttime) < 2*printinterval + && counter - pc->lastprintcount >= pc->numskip) { /* save for tests */ + if (pc->numskip < 1024) pc->numskip += pc->numskip+1; + } + if (3*(now - pc->lastprinttime) < 2*printinterval ) { + return 0; /* to early for printing */ + } + if (2*(now - pc->lastprinttime) > 3*printinterval ) { + pc->numskip >>= 1; /* to late for printing */ + } + if (fileno(fp)<3) cr='\r'; /* may be choosen in ini? */ + if (counter) + fprintf(fp," progress %s %5d / %d time[s] %5d / %5d (skip=%d)%c", + pc->name,counter,pc->maxcount, + (int)(now - pc->starttime), /* time gone since start */ + (int)(now - pc->starttime)*pc->maxcount/(counter), /* estimated */ + pc->numskip, cr); + fflush(fp); + pc->lastprintcount=counter; + pc->lastprinttime=now; + } + return 0; /* no error */ +} +/* --------------------- end of progress output ---------------------- */ diff --git a/lib/gocr/progress.h b/lib/gocr/progress.h new file mode 100644 index 00000000..d31e7a82 --- /dev/null +++ b/lib/gocr/progress.h @@ -0,0 +1,42 @@ +/* + ---------------------- progress output ---------------------- + output progress for GUIs to a pipe + format: "counter_name" counter maxcounter time estimated_time \r|\n + */ +#ifndef GOCR_PROGRESS_H +#define GOCR_PROGRESS_H "Oct06" +#include <time.h> + +/* initialization of progress output, fname="<fileID>","<filename>","-" */ +int ini_progress(char *fname); + +/* ToDo: add by open_* and close_* */ +/* place to store values for progress calculation, called often, but + * dont call systime so often + */ 
/* state of one progress meter, filled in by open_progress() and updated
 * by progress()
 */
typedef struct progress_counter {
  const char *name;     /* name of counter, printed with every output line */
  int lastprintcount;   /* last counter printed for extrapolation (starts at -1) */
  int maxcount;         /* max counter */
  int numskip;          /* num of counts to skip before timecall 0..maxcount */
  time_t starttime;     /* start time of this counter */
  time_t lastprinttime; /* last time printed in seconds */

} progress_counter_t;

/* progress output p1=main_progress_0..100% p2=sub_progress_0..100% */
/* ToDo: improved_progress: counter, maxcount(ini), counter_name(ini),
 *  printinterval=10 # time before printing out progressmeter
 *  *numskip=1 # if (counter-lastprintcounter<numskip) return; gettime() ...
 *  *startutime, *lastprintutime, *lastprintcounter # numskip*=2 or /=2
 *  only 1output/10s, + estimated endtime (test on pixelfields)
 * to stderr by default? remove subprogress, ini_progress? rm_progress?
 *   test on tcl
 */
/* allocate a counter; returns 0 (NULL) if no memory is available */
progress_counter_t *open_progress(int maxcount, const char *name);
/* free counter, always returns 0 */
int close_progress(progress_counter_t *counter);
/* output progress for pc, always returns 0 */
int progress(int counter, progress_counter_t *pc);
/* --------------------- end of progress output ---------------------- */
#endif
/* measure mean thickness as an criteria for big chars
 *
 * For every row strictly between box2->y0 and box2->y1 loop() is called
 * twice and the second result is summed up; the sum is divided (rounded)
 * by the number of rows y1-y0-1.  With no inner rows 0 is returned.
 * NOTE(review): loop() is defined elsewhere; presumably the first call
 * skips the leading white run and the second measures the following
 * black run -- confirm against loop().
 */
int mean_thickness( struct box *box2 ){
  int mt=0, i, y, dx=box2->x1-box2->x0+1, dy;
  for (y=box2->y0+1; y<box2->y1; y++) {
    i=loop(box2->p,box2->x0+0,y,dx,JOB->cfg.cs,0,RI);
    i=loop(box2->p,box2->x0+i,y,dx,JOB->cfg.cs,1,RI);
    mt+=i;
  }
  dy = box2->y1 - box2->y0 - 1;
  if (dy) mt=(mt+dy/2)/dy;
  return mt;
}

/* ---- remove dust ---------------------------------
   What is dust? I think, this is a very small pixel cluster without
   neighbours. Of course not all dust clusters can be detected correct.
   This feature should be possible to switch off via option.
   -> may be, all clusters should be stored here?
   speed is very slow, I know, but I am happy that it is working well

   Steps, as implemented below:
    1. build a histogram of abs(frame_vol[0]) over all boxes which have
       frames and a non-negative frame_vol[0]
    2. if cfg.dust_size is negative and at least one box higher than
       3 pixels was seen, derive cfg.dust_size from that histogram
    3. if cfg.dust_size is nonzero, paint every box with
       abs(frame_vol[0]) <= dust_size via put(), take it out of
       res.numC/sumX/sumY, delete it from the box list and free it
    4. in boxes wider than 17 and higher than 31 pixels set isolated
       pixels >= cs (all four direct neighbours < cs) to 0
   Returns: always 0
*/
int remove_dust( job_t *job ){
  /* new dust removing */
  /* FIXME jb:remove pp */
  pix *pp = &job->src.p;
  int i1,i,j,x,y,x0,x1,y0,y1,nC,sX,sY,sP, cs,vvv=job->cfg.verbose;
  struct box *box2;
#define HISTSIZE 220  /* histogramm */
  int histo[HISTSIZE];
  cs=job->cfg.cs; sP=sX=sY=nC=0;
  /*
   * count number of black pixels within a box and store it in .dots
   * later .dots is re-used for number of objects belonging to the character
   * should be done in the flood-fill algorithm
   * volume of white pixels is estimated to big here (left/right rot)
   * ToDo: mean thickness of char lines?
   *       or interval nesting (minP..maxP) to remove outriders
   */
  j=0;
  for (i1=0;i1<HISTSIZE;i1++) histo[i1]=0;
  /* mean value over every black object which is big enough */
  for_each_data(&(job->res.boxlist)) {
    box2 = (struct box *)list_get_current(&(job->res.boxlist));
    if (!box2->num_frames) continue;
    if (box2->frame_vol[0]<0) continue; /* don't count inner holes */
    j = abs(box2->frame_vol[0]);
    if ((box2->y1-box2->y0+1)>3) {
      nC++; /* only count potential chars v0.42 */
      sX+=box2->x1 - box2->x0 + 1;
      sY+=box2->y1 - box2->y0 + 1;
      sP+=j;
    }
    if (j<HISTSIZE) histo[j]++;
  } end_for_each(&(job->res.boxlist));

  if (job->cfg.dust_size < 0 && nC > 0) { /* auto detection */
    /* this formula is empirically, high resolution scans have bigger dust */
    /* maximum allowed dustsize (min=4*7 ca. 32)
     * does not work for background pattern!
     */
    job->cfg.dust_size = ( ( sX/nC ) * ( sY/nC ) + 16) / 32;
    if (vvv) fprintf(stderr, "# dust size detection, vol num"
         " #obj=%d maxDust=%d mpixel= %3d mxy= %2d %2d",
         nC, job->cfg.dust_size, sP/nC, sX/nC, sY/nC);
    /* we assume that for random dust applies histo[i+1]<histo[i] */
    for (i=1;i+3<HISTSIZE;i++){
      if (vvv) fprintf(stderr,"\n# dust size histogram %3d %5d",i,histo[i]);
      if (histo[i]>=nC) continue; /* v0.42 lot of pixels -> bg pattern < 3 */
      if (i>=job->cfg.dust_size) break; /* maximum = mean size / 32 */
      if (histo[i+1]==0) break; /* bad statistic */
      if ((histo[i+2]+histo[i+3])
        >=(histo[i] +histo[i+1])) break; /* no noise, but to late? */
      if ( histo[i-1] > 1024*histo[i] &&
         2*histo[i+1] >=histo[i]) break; /* bg pattern */
    }
    if (vvv) fprintf(stderr," break");
    if (vvv) for (i1=0,j=i+1;j<HISTSIZE;j++) {
      /* compressed, output only if something is changing */
      /* j==HISTSIZE-1 is tested first, so histo[j+1] stays inside the array */
      if (j==HISTSIZE-1 || histo[j]!=histo[j-1] || histo[j]!=histo[j+1]) {
        fprintf(stderr,"\n# dust size histogram %3d %5d",j,histo[j]);
        if (++i1>20) break; /* dont do excessive output */
      }
    }
    job->cfg.dust_size=i-1;
    /* what is the statistic of random dust?
     *    if we have p pixels on a x*y image we should have
     *    (p/(x*y))^1 * (x*y) = p         singlets
     *    (p/(x*y))^2 * (x*y) = p^2/(x*y) doublets and
     *    (p/(x*y))^3 * (x*y) = p^3/(x*y)^2 triplets
     */
    /* NOTE(review): the verbose output divides by job->res.numC, which
     * is not checked against 0 here -- confirm it cannot be 0 when nC>0 */
    if (vvv) fprintf(stderr,"\n# auto dust size = %d nC= %3d .. %3d"
                     " avD= %2d %2d .. %2d %2d\n",
        job->cfg.dust_size, nC, job->res.numC,
        (job->res.sumX+job->res.numC/2)/job->res.numC,
        (job->res.sumY+job->res.numC/2)/job->res.numC, sX/nC, sY/nC);
  }
  if (job->cfg.dust_size)
  { i=0;
    if(vvv){
      fprintf(stderr,"# remove dust of size %2d",job->cfg.dust_size);
      /* Warning: better use (1/(x*y))^2 as 1/((x*y)^2),
       * because (x*y)^2 may overflow */
      fprintf(stderr," histo=%d,%d(?=%d),%d(?=%d),...\n# ...",
           histo[1],histo[2],histo[1]*histo[1]/(pp->x*pp->y),
           histo[3],         histo[1]*histo[1]/(pp->x*pp->y)
                                     *histo[1]/(pp->x*pp->y));
    }
    i = 0;
    for_each_data(&(job->res.boxlist)) {
      box2 = (struct box *)list_get_current(&(job->res.boxlist));
      x0=box2->x0;x1=box2->x1;y0=box2->y0;y1=box2->y1; /* box */
      /* NOTE(review): unlike the histogram loop above, num_frames and
       * the sign of frame_vol[0] are not checked here -- verify that
       * this is intended */
      j=abs(box2->frame_vol[0]);
      if(j<=job->cfg.dust_size) /* remove this tiny object */
      { /* here we should distinguish dust and i-dots,
         * may be we should sort out dots to a seperate dot list and
         * after line detection decide, which is dust and which not
         * dust should be removed to make recognition easier (ToDo)
         */
#if 0
        if(get_bw((3*x0+x1)/4,(x0+3*x1)/4,y1+y1-y0+1,y1+8*(y1-y0+1),pp,cs,1))
            continue; /* this idea was to simple, see kscan003.jpg sample */
#endif
        /* remove from average */
        job->res.numC--;
        job->res.sumX-=x1-x0+1;
        job->res.sumY-=y1-y0+1;
        /* remove pixels (should only be done with dust) */
        /* put(..,0,255&~7) overwrites the pixel with 255&~7 */
        for(x=x0;x<=x1;x++)
        for(y=y0;y<=y1;y++){ put(pp,x,y,0,255&~7); }
        /* remove from list */
        list_del(&(job->res.boxlist),box2);
        /* free memory */
        free_box(box2);
        i++; /* count as dust particle */
        continue;
      }
    } end_for_each(&(job->res.boxlist));
    if(vvv)fprintf(stderr," %3d cluster removed, nC= %3d\n",i,job->res.numC);
  }
  /* reset dots to 0 and remove white pixels (new) */
  i=0;
  for_each_data(&(job->res.boxlist)) {
    box2 = ((struct box *)list_get_current(&(job->res.boxlist)));
    if (box2->frame_vol[0]<0) continue; /* for black areas only */
    x0=box2->x0;x1=box2->x1;y0=box2->y0;y1=box2->y1; /* box */
    if (x1-x0>16 && y1-y0>30) /* only on large enough chars */
    for(x=x0+1;x<=x1-1;x++)
    for(y=y0+1;y<=y1-1;y++){
      if( pixel_atp(pp,x  ,y  )>=cs
       && pixel_atp(pp,x-1,y  ) <cs
       && pixel_atp(pp,x+1,y  ) <cs
       && pixel_atp(pp,x  ,y-1) <cs
       && pixel_atp(pp,x  ,y+1) <cs )  /* remove it */
      {
        put(pp,x,y,0,0); i++;  /* (x and 0) or 0 */
      }
    }
  } end_for_each(&(job->res.boxlist));
  if (vvv) fprintf(stderr,"# ... %3d white pixels removed, cs=%d nC= %3d\n",
     i,cs,job->res.numC);
  return 0;
}
/* ---- smooth big chars ---------------------------------
 * Big chars often do not have smooth borders, which let fail
 * the engine. Here we smooth the borders of big chars (>7x16).
 * Smoothing is important for b/w scans, where we often have
 * comb like pattern on a vertikal border. I also received
 * samples with lot of white pixels (sample: 04/02/25).
 * ToDo: obsolete if vector code is complete
 *
 * Only boxes at least 7 wide and 16 high which are not PICTURE and have
 * mean_thickness() >= 3 are processed.  A pixel is flipped to the other
 * side of the threshold cs when its ring of 8 neighbours at distance 1
 * and the ring at distance 2 both satisfy the run-length test below.
 * Returns: always 0
 */
int smooth_borders( job_t *job ){
    pix *pp = &job->src.p;
    int ii=0,x,y,x0,x1,y0,y1,dx,dy,cs,i0,i1,i2,i3,i4,n1,n2,
        cn[8],cm,vvv=job->cfg.verbose; /* dust found */
    struct box *box2;
    cs=job->cfg.cs; n1=n2=0;
    if(vvv){ fprintf(stderr,"# smooth big chars 7x16 cs=%d",cs); }
    /* filter for each big box */
    for_each_data(&(job->res.boxlist)) { n2++; /* count boxes */
        box2 = (struct box *)list_get_current(&(job->res.boxlist));
        /* do not touch small characters! but how we define small characters? */
        if (box2->x1-box2->x0+1<7 || box2->y1-box2->y0+1<16 ) continue;
        if (box2->c==PICTURE) continue;
        if (mean_thickness(box2)<3) continue;
        n1++; /* count boxes matching big-char criteria */
        x0=box2->x0;        y0=box2->y0;
        x1=box2->x1;        y1=box2->y1;
        /* dx and dy are assigned here but not read again in this function */
        dx=x1-x0+1;         dy=y1-y0-1;
        /* out_x(box2);
         * dont change to much! only change if absolutely sure!
         *             .......   1 2 3
         *       ex:   .?#####   0 * 4
         *             .......   7 6 5
         * we should also avoid removing lines by sytematic remove
         * from left end to the right, so we concern also about distance>1
         */
        for(x=box2->x0;x<=box2->x1;x++)
         for(y=box2->y0;y<=box2->y1;y++){ /* filter out high frequencies */
           /* this is a very primitive solution, only for learning */
           cn[0]=getpixel(pp,x-1,y);
           cn[4]=getpixel(pp,x+1,y);   /* horizontal */
           cn[2]=getpixel(pp,x,y-1);
           cn[6]=getpixel(pp,x,y+1);   /* vertical */
           cn[1]=getpixel(pp,x-1,y-1);
           cn[3]=getpixel(pp,x+1,y-1); /* diagonal */
           cn[7]=getpixel(pp,x-1,y+1);
           cn[5]=getpixel(pp,x+1,y+1);
           cm=getpixel(pp,x,y);
           /* check for 5 other and 3 same surrounding pixels */
           for (i0=0;i0<8;i0++)
             if ((cn[i0        ]<cs)==(cm<cs)
              && (cn[(i0+7) & 7]<cs)!=(cm<cs)) break; /* first same */
           for (i1=0;i1<8;i1++)
             if ((cn[(i0+i1) & 7]<cs)!=(cm<cs)) break; /* num same */
           for (i2=0;i2<8;i2++)
             if ((cn[(i0+i1+i2) & 7]<cs)==(cm<cs)) break; /* num other */
           /* same test once more on the ring at distance 2 */
           cn[0]=getpixel(pp,x-2,y);
           cn[4]=getpixel(pp,x+2,y);   /* horizontal */
           cn[2]=getpixel(pp,x,y-2);
           cn[6]=getpixel(pp,x,y+2);   /* vertical */
           cn[1]=getpixel(pp,x-2,y-2);
           cn[3]=getpixel(pp,x+2,y-2); /* diagonal */
           cn[7]=getpixel(pp,x-2,y+2);
           cn[5]=getpixel(pp,x+2,y+2);
           /* check for 5 other and 3 same surrounding pixels */
           for (i0=0;i0<8;i0++)
             if ((cn[i0        ]<cs)==(cm<cs)
              && (cn[(i0+7) & 7]<cs)!=(cm<cs)) break; /* first same */
           for (i3=0;i3<8;i3++)
             if ((cn[(i0+i3) & 7]<cs)!=(cm<cs)) break; /* num same */
           for (i4=0;i4<8;i4++)
             if ((cn[(i0+i3+i4) & 7]<cs)==(cm<cs)) break; /* num other */
           if (i1<=3 && i2>=5 && i3>=3 && i4>=3) { /* change only on borders */
             ii++;          /*    white    : black */
             /* keeps the 3 mark bits (ia=7) and stores the new color */
             put(pp,x,y,7,((cm<cs)?(cs|32):cs/2)&~7);
#if 0
             printf(" x y i0 i1 i2 i3 i4 cm new cs %3d %3d"
             "  %3d %3d %3d %3d %3d  %3d %3d %3d\n",
               x-box2->x0,y-box2->y0,i0,i1,i2,i3,i3,cm,getpixel(pp,x,y),cs);
#endif
           }
        }
#if 0  /* debugging */
        out_x(box2);
#endif
    } end_for_each(&(job->res.boxlist));
    if(vvv)fprintf(stderr," ... %3d changes in %d of %d\n",ii,n1,n2);
    return 0;
}

/* test if a corner of box1 is within box2
 * Returns 1 if at least one of box1's x-limits lies in the x-range of
 * box2 widened by 1 pixel on each side AND at least one of box1's
 * y-limits lies in the equally widened y-range of box2, else 0.
 */
int box_nested( struct box *box1, struct box *box2){
 /* box1 in box2, +1..-1 frame for pixel-patterns */
  if (   (    ( box1->x0>=box2->x0-1 && box1->x0<=box2->x1+1 )
           || ( box1->x1>=box2->x0-1 && box1->x1<=box2->x1+1 ) )
      && (    ( box1->y0>=box2->y0-1 && box1->y0<=box2->y1+1 )
           || ( box1->y1>=box2->y0-1 && box1->y1<=box2->y1+1 ) ) )
    return 1;
  return 0;
}

/* test if box1 is within box2
 * Returns 1 if box1 lies completely inside box2 widened by 1 pixel on
 * each side, else 0.
 */
int box_covered( struct box *box1, struct box *box2){
  /* box1 in box2, +1..-1 frame for pixel-patterns */
   if (   ( box1->x0>=box2->x0-1 && box1->x1<=box2->x1+1 )
       && ( box1->y0>=box2->y0-1 && box1->y1<=box2->y1+1 ) )
    return 1;
  return 0;
}
/* ---- remove pictures ------------------------------------------
 *   may be, not deleting or moving to another list is much better!
 *   should be renamed to remove_pictures and border boxes
 *
 * Passes over job->res.boxlist, in this order:
 *  1. (only if res.numC > 8) delete PICTURE boxes larger than half the
 *     image in both directions when more than 8 other boxes are nested
 *     with them (table frames)
 *  2. (only if res.numC > 1) delete PICTURE boxes, not larger than half
 *     the image in both directions, which touch more than 2 image
 *     borders and have more than 2 corner pixels below cs
 *  3. repeatedly enlarge every PICTURE box with num_ac==0 by other
 *     PICTURE boxes nested with it and delete the absorbed box
 * With cfg.verbose set, statistics are written to stderr.
 * Returns: always 0
 */
int remove_pictures( job_t *job){
  struct box *box4,*box2;
  int j=0, j2=0, num_del=0;

  if (job->cfg.verbose)
    fprintf(stderr, "# "__FILE__" L%d: remove pictures\n# ...",
            __LINE__);

  /* ToDo: output a list for picture handle scripts */
  j=0; j2=0;
  if(job->cfg.verbose)
  for_each_data(&(job->res.boxlist)) {
    box4 = (struct box *)list_get_current(&(job->res.boxlist));
    if (box4->c==PICTURE) j++; else j2++;
  } end_for_each(&(job->res.boxlist));
  if (job->cfg.verbose)
    fprintf(stderr," status: pictures= %d  other= %d  nC= %d\n# ...",
            j, j2, job->res.numC);

  /* remove table frames */
  if (job->res.numC > 8)
  for_each_data(&(job->res.boxlist)) {
    box2 = (struct box *)list_get_current(&(job->res.boxlist));
    if (box2->c==PICTURE
     && box2->x1-box2->x0+1>box2->p->x/2  /* big table? */
     && box2->y1-box2->y0+1>box2->p->y/2 ){ j=0;
      /* count boxes nested with the picture */
      for_each_data(&(job->res.boxlist)) {
        box4 = (struct box *)list_get_current(&(job->res.boxlist));
        if( box4 != box2 )  /* not count itself */
        if (box_nested(box4,box2)) j++;  /* box4 in box2 */
      } end_for_each(&(job->res.boxlist));
      if( j>8 ){ /* remove box if more than 8 chars are within box */
        list_del(&(job->res.boxlist), box2); /* does not work proper ?! */
        free_box(box2); num_del++;
      }
    }
  } end_for_each(&(job->res.boxlist));
  if (job->cfg.verbose)
    fprintf(stderr, " deleted= %d pictures (table frames)\n# ...",
            num_del);
  num_del=0;

  /* remove dark-border-boxes (typical for hard copy of book site,
   *  or spam random border) */
  if (job->res.numC > 1) /* dont remove the only char */
  for_each_data(&(job->res.boxlist)) {
    box2 = (struct box *)list_get_current(&(job->res.boxlist));
    if (box2->c!=PICTURE) continue; // ToDo: PICTUREs set already?
    if ( box2->x1-box2->x0+1 > box2->p->x/2
      && box2->y1-box2->y0+1 > box2->p->y/2 ) continue;
    /* j counts the image borders touched by the box */
    j=0;
    if (box2->x0==0) j++;
    if (box2->y0==0) j++;  /* on border? */
    if (box2->x1==box2->p->x-1) j++;
    if (box2->y1==box2->p->y-1) j++;
    if (j>2){ /* ToDo: check corner pixel */
      int cs=job->cfg.cs;
      /* j is reused: now counts the dark corner pixels of the box */
      j=0;
      if (getpixel(box2->p,box2->x0,box2->y0)<cs) j++;
      if (getpixel(box2->p,box2->x1,box2->y0)<cs) j++;
      if (getpixel(box2->p,box2->x0,box2->y1)<cs) j++;
      if (getpixel(box2->p,box2->x1,box2->y1)<cs) j++;
      if (j>2) {
        list_del(&(job->res.boxlist), box2);
        free_box(box2); num_del++;
      }
    }
  } end_for_each(&(job->res.boxlist));
  if (job->cfg.verbose)
    fprintf(stderr, " deleted= %d pictures (on border)\n# ...",
            num_del);
  num_del=0;

  j=0; j2=0;
  if(job->cfg.verbose)
  for_each_data(&(job->res.boxlist)) {
    box4 = (struct box *)list_get_current(&(job->res.boxlist));
    if( box4->c==PICTURE ) j++; else j2++;
  } end_for_each(&(job->res.boxlist));
  if (job->cfg.verbose)
    fprintf(stderr," status: pictures= %d  other= %d  nC= %d\n# ...",
            j, j2, job->res.numC);

  for(j=1;j;){ j=0;  /* this is only because list_del does not work */
    /* can be slow on gray images */
    for_each_data(&(job->res.boxlist)) {
      box2 = (struct box *)list_get_current(&(job->res.boxlist));
      if( box2->c==PICTURE && box2->num_ac==0)
      for(j=1;j;){ /* let it grow to max before leave */
        j=0; box4=NULL;
        /* find boxes nested with the picture and remove */
        /* its for pictures build by compounds */
        for_each_data(&(job->res.boxlist)) {
          box4 = (struct box *)list_get_current(&(job->res.boxlist));
          if(  box4!=box2   /* not destroy self */
           && (box4->num_ac==0)    /* dont remove barcodes etc. */
           && (/* box4->c==UNKNOWN || */
                  box4->c==PICTURE) ) /* dont remove valid chars */
          if(
             /* box4 in box2, +1..-1 frame for pixel-patterns */
               box_nested(box4,box2)
             /* or box2 in box4 */
            || box_nested(box2,box4) /* same? */
               )
          if (  box4->x1-box4->x0+1>2*job->res.avX
             || box4->x1-box4->x0+1<job->res.avX/2
             || box4->y1-box4->y0+1>2*job->res.avY
             || box4->y1-box4->y0+1<job->res.avY/2
             || box_covered(box4,box2) ) /* box4 completely within box2 */
            /* dont remove chars! see rotate45.fig */
          {
            /* do not remove boxes in inner loop (bug?) ToDo: check why! */
            /* instead we leave inner loop and mark box4 as valid */
            if( box4->x0<box2->x0 ) box2->x0=box4->x0;
            if( box4->x1>box2->x1 ) box2->x1=box4->x1;
            if( box4->y0<box2->y0 ) box2->y0=box4->y0;
            if( box4->y1>box2->y1 ) box2->y1=box4->y1;
            j=1;   /* mark box4 as valid */
            break; /* and leave inner loop */
          }
        } end_for_each(&(job->res.boxlist));
        /* j is only set together with the break above, so box4 is the
         * box that has just been merged into box2 */
        if (j!=0 && box4!=NULL) { /* check for valid box4 */
          /* ToDo: melt */
          list_del(&(job->res.boxlist), box4); /* does not work proper ?! */
          free_box(box4); /* break;  ToDo: necessary to leave after del??? */
          num_del++;
        }

      }
    } end_for_each(&(job->res.boxlist));
  }

  if (job->cfg.verbose)
    fprintf(stderr, " deleted= %d nested pictures\n# ...", num_del);

  /* output a list for picture handle scripts */
  j=0; j2=0;
  if(job->cfg.verbose)
  for_each_data(&(job->res.boxlist)) {
    box4 = (struct box *)list_get_current(&(job->res.boxlist));
    if( box4->c==PICTURE ) {
      fprintf(stderr," found picture at %4d %4d size %4d %4d\n# ...",
         box4->x0, box4->y0, box4->x1-box4->x0+1, box4->y1-box4->y0+1 );
      j++;
    } else j2++;
  } end_for_each(&(job->res.boxlist));
  if (job->cfg.verbose)
    fprintf(stderr," status: pictures= %d  other= %d  nC= %d\n",
            j, j2, job->res.numC);
  return 0;
}
  /* ---- remove melted serifs --------------------------------- v0.2.5
                >>v<<
        ##########.######## <-y0
        ###################  like X VW etc.
        ...###.......###... <-y
        ...###......###....
        j1       j2       j3
  - can generate new boxes if two characters were glued

  For every box with c==UNKNOWN the top row (upper serifs) and the bottom
  row (lower serifs) are scanned for a black run which bridges two
  strokes.  A short vertical cut is written at column j2 with put() and,
  if get_bw() then reports a free vertical path, the box is divided at j2
  into box3 (left part, newly allocated) and box2 (right part).
  Returns: always 0
  */
int remove_melted_serifs( pix *pp ){
  int x,y,j1,j2,j3,j4,i2,i3,i,ii,ni,cs,x0,x1,xa,xb,y0,y1,vvv=JOB->cfg.verbose;
  struct box *box2, *box3;
  progress_counter_t *pc = NULL;

  cs=JOB->cfg.cs; i=0; ii=0; ni=0;
  /* count the boxes for the progress meter */
  for_each_data(&(JOB->res.boxlist)) {
    ni++;
  } end_for_each(&(JOB->res.boxlist));
  pc = open_progress(ni,"remove_melted_serifs");
  ni = 0;

  if(vvv){ fprintf(stderr,"# searching melted serifs ..."); }
  for_each_data(&(JOB->res.boxlist)) {
    box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
    if (box2->c != UNKNOWN) continue; /* dont try on pictures */
    x0=box2->x0; x1=box2->x1;
    y0=box2->y0; y1=box2->y1;	/* box */
    /* upper serifs */
    for(j1=x0;j1+4<x1;){
      j1+=loop(pp,j1,y0  ,x1-x0,cs,0,RI);
      x  =loop(pp,j1,y0  ,x1-x0,cs,1,RI); if(j1+x>x1+1) break;
      y  =loop(pp,j1,y0+1,x1-x0,cs,1,RI); if(y>x) x=y; if(j1+x>x1+1) break;
      /* measure mean thickness of serif */
      /* NOTE(review): the loop variable is i2 but both loop() calls
       * probe column j1, so every iteration measures the same column;
       * j4 is set but never read -- confirm whether i2 was intended */
      for(j2=j3=j4=0,i2=j1;i2<j1+x;i2++){
        i3 =loop(pp,j1,y0   ,y1-y0,cs,0,DO); if(8*i3>y1-y0) break;
        i3+=loop(pp,j1,y0+i3,y1-y0,cs,1,DO); if(8*i3>y1-y0) break;
        if(8*i3<y1-y0){ j2+=i3; j3++; }
      } if(j3==0){ j1+=x; continue; }
      y = y0+(j2+j3-1)/j3+(y1-y0+1)/32;

      /* check if really melted serifs */
      if( loop(pp,j1,y,x1-x0,cs,0,RI)<1 ) { j1+=x; continue; }
      if(num_cross(j1 ,j1+x,y,y,pp,cs) < 2 ){ j1+=x;continue; }
      j2 = j1 + loop(pp,j1,y,x1-x0,cs,0,RI);
      j2 = j2 + loop(pp,j2,y,x1-x0,cs,1,RI);
      i3 =      loop(pp,j2,y,x1-x0,cs,0,RI); if(i3<2){j1+=x;continue;}
      j2 += i3/2;
      j3 = j2 + loop(pp,j2,y  ,x1-j2,cs,0,RI);
      i3 = j2 + loop(pp,j2,y+1,x1-j2,cs,0,RI); if(i3>j3)j3=i3;
      j3 = j3 + loop(pp,j3,y  ,x1-j3,cs,1,RI);
      i3 =      loop(pp,j3,y  ,x1-j3,cs,0,RI);
      if(i3<2 || j3>=j1+x){j1+=x;continue;}
      j3 += i3/2;

      if(x>5)
      {
	i++; /* snip! */
	/* NOTE(review): put() computes (old & 255) | (128+64), i.e. bits
	 * 7 and 6 are set here, although the comment says "clear" */
	for(y=0;y<(y1-y0+1+4)/8;y++)put(pp,j2,y0+y,255,128+64); /* clear highest bit */
	if(vvv&4){
	  fprintf(stderr,"\n");
	  //out_x(box2);
	  fprintf(stderr,"# melted serifs corrected on %d %d j1=%d j3=%d",
	    j2-x0, y, j1-x0, j3-x0);
	}
	for(xb=0,xa=0;xa<(x1-x0+4)/8;xa++){ /* detect vertical gap */
	  i3=y1;
	  if(box2->m3>y0 && 2*y1>box2->m3+box2->m4) i3=box2->m3; /* some IJ */
	  if( loop(pp,j2-xa,i3,i3-y0,cs,0,UP) > (y1-y0+1)/2
	   && loop(pp,j2,(y0+y1)/2,xa+1,cs,0,LE) >=xa ){ xb=-xa; break; }
	  if( loop(pp,j2+xa,i3,i3-y0,cs,0,UP) > (y1-y0+1)/2
	   && loop(pp,j2,(y0+y1)/2,xa+1,cs,0,RI) >=xa ){ xb= xa; break; }
	}
	if( get_bw(j2   ,j2   ,y0,(y0+y1)/2,pp,cs,1) == 0
	 && get_bw(j2+xb,j2+xb,(y0+y1)/2,i3,pp,cs,1) == 0 )
	{ /* divide */
	  box3=malloc_box(box2);
	  box3->x1=j2-1;
	  box2->x0=j2+1; x1=box2->x1;
	  cut_box(box2); /* cut vectors outside the box */
	  cut_box(box3);
	  box3->num=JOB->res.numC;
	  list_ins(&(JOB->res.boxlist),box2,box3); JOB->res.numC++; ii++; /* insert box3 before box2 */
	  if(vvv&4) fprintf(stderr," => splitted");
	  j1=x0=box2->x0; x=0; /* hopefully ok, UVW */
	}
      }
      j1+=x;
    }
    /* same on lower serifs -- change this later to better function
    //   ####    ###
    //    #### v ###       # <-y
    //  #################### <-y1
    //  j1     j2     j3
     */
    for(j1=x0;j1<x1;){
      j1+=loop(pp,j1,y1  ,x1-x0,cs,0,RI);
      x  =loop(pp,j1,y1  ,x1-x0,cs,1,RI); if(j1+x>x1+1) break;
      y  =loop(pp,j1,y1-1,x1-x0,cs,1,RI); if(y>x) x=y; if(j1+x>x1+1) break;
      /* measure mean thickness of serif */
      /* NOTE(review): same pattern as in the upper part, column j1 is
       * probed in every iteration of the i2 loop */
      for(j2=j3=j4=0,i2=j1;i2<j1+x;i2++){
        i3 =loop(pp,j1,y1   ,y1-y0,cs,0,UP); if(8*i3>y1-y0) break;
        i3+=loop(pp,j1,y1-i3,y1-y0,cs,1,UP); if(8*i3>y1-y0) break;
        if(8*i3<y1-y0){ j2+=i3; j3++; }
      } if(j3==0){ j1+=x; continue; }
      y = y1-(j2+j3-1)/j3-(y1-y0+1)/32;

      /* check if really melted serifs */
      if( loop(pp,j1,y,x1-x0,cs,0,RI)<1 ) { j1+=x; continue; }
      if(num_cross(j1 ,j1+x,y,y,pp,cs) < 2 ){ j1+=x;continue; }
      j2 = j1 + loop(pp,j1,y,x1-x0,cs,0,RI);
      j2 = j2 + loop(pp,j2,y,x1-x0,cs,1,RI);
      i3 =      loop(pp,j2,y,x1-x0,cs,0,RI); if(i3<2){j1+=x;continue;}
      j2 += i3/2;
      j3 = j2 + loop(pp,j2,y  ,x1-j2,cs,0,RI);
      i3 = j2 + loop(pp,j2,y-1,x1-j2,cs,0,RI); if(i3>j3)j3=i3;
      j3 = j3 + loop(pp,j3,y  ,x1-j3,cs,1,RI);
      i3 =      loop(pp,j3,y,x1-j3,cs,0,RI);
      if(i3<2 || j3>=j1+x){j1+=x;continue;}
      j3 += i3/2;

      /* y  =y1-(y1-y0+1+4)/8; */
      if(x>5)
      {
	i++; /* snip! */
	for(i3=0;i3<(y1-y0+1+4)/8;i3++)
	  put(pp,j2,y1-i3,255,128+64); /* clear highest bit */
	if(vvv&4){
	  fprintf(stderr,"\n");
	  //out_x(box2);
	  fprintf(stderr,"# melted serifs corrected on %d %d j1=%d j3=%d",j2-x0,y-y0,j1-x0,j3-x0);
	}
	for(xb=0,xa=0;xa<(x1-x0+4)/8;xa++){ /* detect vertical gap */
	  if( loop(pp,j2-xa,y0,y1-y0,cs,0,DO) > (y1-y0+1)/2
	   && loop(pp,j2,(y0+y1)/2,xa+1,cs,0,LE) >=xa ){ xb=-xa; break; }
	  if( loop(pp,j2+xa,y0,y1-y0,cs,0,DO) > (y1-y0+1)/2
	   && loop(pp,j2,(y0+y1)/2,xa+1,cs,0,RI) >=xa ){ xb= xa; break; }
	}
	if( get_bw(j2   ,j2   ,(y0+y1)/2,y1,pp,cs,1) == 0
	 && get_bw(j2+xb,j2+xb,y0,(y0+y1)/2,pp,cs,1) == 0 )
	{ /* divide */
	  box3=malloc_box(box2);
	  box3->x1=j2-1;
	  /* NOTE(review): the upper-serif branch uses j2+1 here */
	  box2->x0=j2; x1=box2->x1;
	  cut_box(box2); /* cut vectors outside the box */
	  cut_box(box3);
	  box3->num=JOB->res.numC;
	  list_ins(&(JOB->res.boxlist),box2,box3); JOB->res.numC++; ii++;
	  /* box3,box2 in correct order??? */
	  if(vvv&4) fprintf(stderr," => splitted");
	  j1=x0=box2->x0; x=0; /* hopefully ok, NMK */
	}
      }
      j1+=x;
    }
    progress(ni++,pc);
  } end_for_each(&(JOB->res.boxlist));
  close_progress(pc);
  if(vvv)fprintf(stderr," %3d cluster corrected, %d new boxes\n",i,ii);
  return 0;
}

/*  remove black borders often seen on bad scanned copies of books
    - dust around the border

    Pass 1: delete UNKNOWN boxes whose height is below 1.5 times the mean
            height (res.sumY/res.numC), which lie completely in the upper
            or lower quarter of the image and have m4==0, as long as
            res.numC stays above 1.
    Pass 2: delete non-PICTURE boxes smaller than 3x3 pixels when no box
            of at least 3 pixels in one direction has its centre within
            1.5 times its own size of the small box's upper left corner.
    Returns: always 0
 */
int remove_rest_of_dust() {
  int i1, i2, vvv = JOB->cfg.verbose, x0, x1, y0, y1, cnt=0;
  struct box *box2, *box4;
  progress_counter_t *pc = NULL;

  i1 = i2 = 0; /* counter for removed boxes */
  if (vvv)
    fprintf(stderr, "# detect dust (avX,nC), ... ");
  /* remove fragments from border */
  for_each_data(&(JOB->res.boxlist)) {
    box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
    if (box2->c == UNKNOWN) {
      x0 = box2->x0; x1 = box2->x1;
      y0 = box2->y0; y1 = box2->y1;	/* box */
      /* box in char ??? */
      if (  2 * JOB->res.numC * (y1 - y0 + 1) < 3 * JOB->res.sumY
        && ( y1 < box2->p->y/4 || y0 > 3*box2->p->y/4 )  /* not single line */
        && JOB->res.numC > 1		/* do not remove everything */
        && ( box2->m4 == 0 ) ) /* remove this */
      {
         JOB->res.numC--; /* ToDo: dont count tiny pixels */
         /* ToDo: res.sumX,Y must also be corrected */
         i1++;
         list_del(&(JOB->res.boxlist), box2);
         free_box(box2);
      }
    }
  } end_for_each(&(JOB->res.boxlist));

  pc = open_progress(JOB->res.boxlist.n,"remove_dust2");
  for_each_data(&(JOB->res.boxlist)) {
    box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
    progress(cnt++,pc);
    if (box2->c == PICTURE) continue;
    x0 = box2->x0; x1 = box2->x1;
    y0 = box2->y0; y1 = box2->y1;	/* box */
    /* remove tiny box2 if to far away from bigger boxes */
    /* ToDo: remove clouds of tiny pixels (count near small, compare with num bigger) */
    /* 0.42: remove far away pixel? ToDo: do it at earlier? */
    if (x1-x0+1<3 && y1-y0+1<3){
      int xn, yn, xs, ys;
      int found=0;  /* nearest bigger box */
      /* search near bigger box */
      for_each_data(&(JOB->res.boxlist)) {
        box4 = (struct box *)list_get_current(&(JOB->res.boxlist));
        /* once found is set the remaining boxes are only skipped */
        if (found || box4 == box2) continue;
        if (box4->x1-box4->x0+1<3 && box4->y1-box4->y0+1<3) continue;
        xs = box4->x1-box4->x0+1;
        ys = box4->y1-box4->y0+1;
        xn = abs((box4->x0+box4->x1)/2 - box2->x0);
        yn = abs((box4->y0+box4->y1)/2 - box2->y0);
        if (2*xn < 3*xs && 2*yn < 3*ys) { found=1; }
      } end_for_each(&(JOB->res.boxlist));
      if (!found) { /* found nothing, box2 to far from big boxes */
        i2++;
        /* NOTE(review): res.numC is not decremented here, unlike in
         * the first pass -- confirm whether that is intended */
        list_del(&(JOB->res.boxlist), box2);
        free_box(box2);
      }
    }
  } end_for_each(&(JOB->res.boxlist));
  close_progress(pc);
  if (vvv)
    fprintf(stderr, " %3d + %3d boxes deleted, nC= %d ?\n",
	    i1, i2, JOB->res.numC);

  return 0;
}
+ + see README for EMAIL-address + */ + +#include "unicode.h" +#include <stdio.h> + +/* FIXME jb global */ +int warn=0; /* if 1 a message is generated if composition is not defined */ + +/* Arguments: the character (main), and the modifier (accent, etc). See the + function if you want to know the modifiers. + Description: This function intends to be a small helper, to avoid having + to write switches in functions. It's therefore mainly to accents, and + specially for the most usual ones. It supports the basic greek + characters too, which is actually not very helpful. + Returns: the unicode character corresponding to the composed character. + + ToDo: + - It seems to me, that tables should be more effectiv. + So we should use tables in future? (js) + */ +wchar_t compose(wchar_t main, wchar_t modifier) { +/* supported by now: part of ISO8859-1, basic greek characters */ + if( main == UNKNOWN || main == PICTURE ) return main; +#ifdef DEBUG + if(modifier!=UNICODE_NULL && modifier!=SPACE) + printf(" compose(%c,%d)",(char)main,(int)modifier); +#endif + if(main>127 && modifier!=0 && modifier!=SPACE && warn) + fprintf(stderr,"# Warning compose %04x + %04x>127\n", + (int)modifier,(int)main); + switch (modifier) { + case UNICODE_NULL: + case SPACE: + return (wchar_t)main; + + case APOSTROPHE: /* do NOT USE this. It's here for compatibility only. + Use ACUTE_ACCENT instead. 
*/ + fprintf( stderr, "COMPOSE: got APOSTROPHE instead of ACUTE_ACCENT"); + + case ACUTE_ACCENT: /* acute/cedilla */ + switch (main) { + case 'a': return LATIN_SMALL_LETTER_A_WITH_ACUTE; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_ACUTE; + case LATIN_SMALL_LETTER_AE: return LATIN_SMALL_LETTER_AE_WITH_ACUTE; + case LATIN_CAPITAL_LETTER_AE: return LATIN_CAPITAL_LETTER_AE_WITH_ACUTE; + case 'c': return LATIN_SMALL_LETTER_C_WITH_ACUTE; + case 'C': return LATIN_CAPITAL_LETTER_C_WITH_ACUTE; + case 'e': return LATIN_SMALL_LETTER_E_WITH_ACUTE; + case 'E': return LATIN_CAPITAL_LETTER_E_WITH_ACUTE; + case 'g': return LATIN_SMALL_LETTER_G_WITH_ACUTE; + case 'G': return LATIN_CAPITAL_LETTER_G_WITH_ACUTE; + case 'i': return LATIN_SMALL_LETTER_I_WITH_ACUTE; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_ACUTE; + case 'l': return LATIN_SMALL_LETTER_L_WITH_ACUTE; + case 'L': return LATIN_CAPITAL_LETTER_L_WITH_ACUTE; + case 'n': return LATIN_SMALL_LETTER_N_WITH_ACUTE; + case 'N': return LATIN_CAPITAL_LETTER_N_WITH_ACUTE; + case 'o': return LATIN_SMALL_LETTER_O_WITH_ACUTE; + case 'O': return LATIN_CAPITAL_LETTER_O_WITH_ACUTE; + case '0': return LATIN_CAPITAL_LETTER_O_WITH_ACUTE; + case 'r': return LATIN_SMALL_LETTER_R_WITH_ACUTE; + case 'R': return LATIN_CAPITAL_LETTER_R_WITH_ACUTE; + case 's': return LATIN_SMALL_LETTER_S_WITH_ACUTE; + case 'S': return LATIN_CAPITAL_LETTER_S_WITH_ACUTE; + case 'u': return LATIN_SMALL_LETTER_U_WITH_ACUTE; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_ACUTE; + case 'y': return LATIN_SMALL_LETTER_Y_WITH_ACUTE; + case 'Y': return LATIN_CAPITAL_LETTER_Y_WITH_ACUTE; + case 'z': return LATIN_SMALL_LETTER_Z_WITH_ACUTE; + case 'Z': return LATIN_CAPITAL_LETTER_Z_WITH_ACUTE; + default: + if(warn)fprintf( stderr, " COMPOSE: ACUTE_ACCENT+%04x not defined\n",(int)main); + } + break; + + case BREVE: /* caron (latin2) "u"-above-... 
(small bow) */ + switch (main) { + /* FIXME write separate heuristics for breve */ + case 'a': return LATIN_SMALL_LETTER_A_WITH_BREVE; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_BREVE; + case 'e': return LATIN_SMALL_LETTER_E_WITH_BREVE; + case 'E': return LATIN_CAPITAL_LETTER_E_WITH_BREVE; + case 'g': return LATIN_SMALL_LETTER_G_WITH_BREVE; + case 'G': return LATIN_CAPITAL_LETTER_G_WITH_BREVE; + case 'i': return LATIN_SMALL_LETTER_I_WITH_BREVE; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_BREVE; + case 'o': return LATIN_SMALL_LETTER_O_WITH_BREVE; + case 'O': return LATIN_CAPITAL_LETTER_O_WITH_BREVE; + case 'u': return LATIN_SMALL_LETTER_U_WITH_BREVE; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_BREVE; + default: + if(warn)fprintf( stderr, " COMPOSE: BREVE+%04x not defined\n",(int)main); + } + break; + + case CARON: /* caron (latin2) "v"-above-... */ + switch (main) { + case 'a': return LATIN_SMALL_LETTER_A_WITH_CARON; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_CARON; + case 'c': return LATIN_SMALL_LETTER_C_WITH_CARON; + case 'C': return LATIN_CAPITAL_LETTER_C_WITH_CARON; + case 'e': return LATIN_SMALL_LETTER_E_WITH_CARON; + case 'E': return LATIN_CAPITAL_LETTER_E_WITH_CARON; + case 'i': return LATIN_SMALL_LETTER_I_WITH_CARON; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_CARON; + case 'o': return LATIN_SMALL_LETTER_O_WITH_CARON; + case 'O': return LATIN_CAPITAL_LETTER_O_WITH_CARON; + case '0': return LATIN_CAPITAL_LETTER_O_WITH_CARON; + case 's': return LATIN_SMALL_LETTER_S_WITH_CARON; + case 'S': return LATIN_CAPITAL_LETTER_S_WITH_CARON; + case 'u': return LATIN_SMALL_LETTER_U_WITH_CARON; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_CARON; + case 'z': return LATIN_SMALL_LETTER_Z_WITH_CARON; + case 'Z': return LATIN_CAPITAL_LETTER_Z_WITH_CARON; + default: + if(warn)fprintf( stderr, " COMPOSE: CARON+%04x not defined\n",(int)main); + } + break; + + case CEDILLA: + switch (main) { + case 'c': return LATIN_SMALL_LETTER_C_WITH_CEDILLA; + case 'C': return 
LATIN_CAPITAL_LETTER_C_WITH_CEDILLA; + default: + if(warn)fprintf( stderr, " COMPOSE: CEDILLA+%04x not defined\n",(int)main); + } + break; + + case TILDE: + switch (main) { + case 'a': return LATIN_SMALL_LETTER_A_WITH_TILDE; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_TILDE; + case 'i': return LATIN_SMALL_LETTER_I_WITH_TILDE; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_TILDE; + case 'n': return LATIN_SMALL_LETTER_N_WITH_TILDE; + case 'N': return LATIN_CAPITAL_LETTER_N_WITH_TILDE; + case 'o': return LATIN_SMALL_LETTER_O_WITH_TILDE; + case 'O': return LATIN_CAPITAL_LETTER_O_WITH_TILDE; + case '0': return LATIN_CAPITAL_LETTER_O_WITH_TILDE; + case 'u': return LATIN_SMALL_LETTER_U_WITH_TILDE; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_TILDE; + default: + if(warn)fprintf( stderr, " COMPOSE: TILDE+%04x not defined\n",(int)main); + } + break; + + case GRAVE_ACCENT: + switch (main) { + case 'a': return LATIN_SMALL_LETTER_A_WITH_GRAVE; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_GRAVE; + case 'e': return LATIN_SMALL_LETTER_E_WITH_GRAVE; + case 'E': return LATIN_CAPITAL_LETTER_E_WITH_GRAVE; + case 'i': return LATIN_SMALL_LETTER_I_WITH_GRAVE; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_GRAVE; + case 'n': return LATIN_SMALL_LETTER_N_WITH_GRAVE; + case 'N': return LATIN_CAPITAL_LETTER_N_WITH_GRAVE; + case 'o': return LATIN_SMALL_LETTER_O_WITH_GRAVE; + case 'O': return LATIN_CAPITAL_LETTER_O_WITH_GRAVE; + case '0': return LATIN_CAPITAL_LETTER_O_WITH_GRAVE; + case 'u': return LATIN_SMALL_LETTER_U_WITH_GRAVE; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_GRAVE; + default: + if(warn)fprintf( stderr, " COMPOSE: GRAVE_ACCENT+%04x not defined\n",(int)main); + } + break; + + case QUOTATION_MARK: /* do NOT USE this. It's here for compatibility only. + Use DIAERESIS instead. 
*/ + fprintf( stderr, "COMPOSE: got APOSTROPHE instead of ACUTE_ACCENT"); + + case DIAERESIS: + switch (main) { + case 'a': return LATIN_SMALL_LETTER_A_WITH_DIAERESIS; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS; + case 'e': return LATIN_SMALL_LETTER_E_WITH_DIAERESIS; + case 'E': return LATIN_CAPITAL_LETTER_E_WITH_DIAERESIS; + case 'i': return LATIN_SMALL_LETTER_I_WITH_DIAERESIS; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_DIAERESIS; + case 'o': return LATIN_SMALL_LETTER_O_WITH_DIAERESIS; + case 'O': return LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS; + case '0': return LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS; + case 'u': return LATIN_SMALL_LETTER_U_WITH_DIAERESIS; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS; + case 'y': return LATIN_SMALL_LETTER_Y_WITH_DIAERESIS; + case 'Y': return LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; + default: + if(warn)fprintf( stderr, " COMPOSE: DIAERESIS+%04x (%c) not defined\n",(int)main,(char)main); + } + break; + + case CIRCUMFLEX_ACCENT: /* ^ */ + switch (main) { + case 'a': return LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX; + case 'c': return LATIN_SMALL_LETTER_C_WITH_CIRCUMFLEX; + case 'C': return LATIN_CAPITAL_LETTER_C_WITH_CIRCUMFLEX; + case 'e': return LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX; + case 'E': return LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX; + case 'g': return LATIN_SMALL_LETTER_G_WITH_CIRCUMFLEX; + case 'G': return LATIN_CAPITAL_LETTER_G_WITH_CIRCUMFLEX; + case 'h': return LATIN_SMALL_LETTER_H_WITH_CIRCUMFLEX; + case 'H': return LATIN_CAPITAL_LETTER_H_WITH_CIRCUMFLEX; + case 'i': return LATIN_SMALL_LETTER_I_WITH_CIRCUMFLEX; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_CIRCUMFLEX; + case 'j': return LATIN_SMALL_LETTER_J_WITH_CIRCUMFLEX; + case 'J': return LATIN_CAPITAL_LETTER_J_WITH_CIRCUMFLEX; + case 'o': return LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX; + case 'O': return LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX; + case '0': return 
LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX; + case 's': return LATIN_SMALL_LETTER_S_WITH_CIRCUMFLEX; + case 'S': return LATIN_CAPITAL_LETTER_S_WITH_CIRCUMFLEX; + case 'u': return LATIN_SMALL_LETTER_U_WITH_CIRCUMFLEX; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_CIRCUMFLEX; + case 'w': return LATIN_SMALL_LETTER_W_WITH_CIRCUMFLEX; + case 'W': return LATIN_CAPITAL_LETTER_W_WITH_CIRCUMFLEX; + case 'y': return LATIN_SMALL_LETTER_Y_WITH_CIRCUMFLEX; + case 'Y': return LATIN_CAPITAL_LETTER_Y_WITH_CIRCUMFLEX; + default: + if(warn)fprintf( stderr, " COMPOSE: CIRCUMFLEX_ACCENT+%04x not defined\n",(int)main); + } + break; + + case MACRON: /* a minus sign above the char (latin2) */ + switch (main) { + case 'a': return LATIN_SMALL_LETTER_A_WITH_MACRON; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_MACRON; + case 'e': return LATIN_SMALL_LETTER_E_WITH_MACRON; + case 'E': return LATIN_CAPITAL_LETTER_E_WITH_MACRON; + case 'i': return LATIN_SMALL_LETTER_I_WITH_MACRON; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_MACRON; + case 'o': return LATIN_SMALL_LETTER_O_WITH_MACRON; + case 'O': return LATIN_CAPITAL_LETTER_O_WITH_MACRON; + case 'u': return LATIN_SMALL_LETTER_U_WITH_MACRON; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_MACRON; + case 'y': return LATIN_SMALL_LETTER_Y_WITH_MACRON; + case 'Y': return LATIN_CAPITAL_LETTER_Y_WITH_MACRON; + case LATIN_SMALL_LETTER_AE: return LATIN_SMALL_LETTER_AE_WITH_MACRON; + case LATIN_CAPITAL_LETTER_AE: return LATIN_CAPITAL_LETTER_AE_WITH_MACRON; + case '=': return IDENTICAL_TO; + case '-': return '='; + case ' ': return MODIFIER_LETTER_MACRON; + default: + if(warn)fprintf( stderr, " COMPOSE: MACRON+%04x not defined\n",(int)main); + } + break; + + case DOT_ABOVE: /* latin2 */ + switch (main) { + case 'a': return LATIN_SMALL_LETTER_A_WITH_DOT_ABOVE; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_DOT_ABOVE; + case 'c': return LATIN_SMALL_LETTER_C_WITH_DOT_ABOVE; + case 'C': return LATIN_CAPITAL_LETTER_C_WITH_DOT_ABOVE; + case 'e': return 
LATIN_SMALL_LETTER_E_WITH_DOT_ABOVE; + case 'E': return LATIN_CAPITAL_LETTER_E_WITH_DOT_ABOVE; + case 'g': return LATIN_SMALL_LETTER_G_WITH_DOT_ABOVE; + case 'G': return LATIN_CAPITAL_LETTER_G_WITH_DOT_ABOVE; + case 'l': return 'i'; /* correct wrong recognition */ + case 'i': return 'i'; + case LATIN_SMALL_LETTER_DOTLESS_I: return 'i'; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE; + case 'j': return 'j'; + case 'o': return LATIN_SMALL_LETTER_O_WITH_DOT_ABOVE; + case 'O': return LATIN_CAPITAL_LETTER_O_WITH_DOT_ABOVE; + case 'z': return LATIN_SMALL_LETTER_Z_WITH_DOT_ABOVE; + case 'Z': return LATIN_CAPITAL_LETTER_Z_WITH_DOT_ABOVE; + case ',': return ';'; + case '.': return ':'; + default: + if(warn)fprintf( stderr, " COMPOSE: DOT_ABOVE+%04x not defined\n",(int)main); + } + break; + + case RING_ABOVE: + switch (main) { + case 'a': return LATIN_SMALL_LETTER_A_WITH_RING_ABOVE; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE; + case 'u': return LATIN_SMALL_LETTER_U_WITH_RING_ABOVE; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_RING_ABOVE; + default: + if(warn)fprintf( stderr, " COMPOSE: RING_ABOVE+%04x not defined\n",(int)main); + } + break; + + case 'e': /* e ligatures: ae, oe. */ + case 'E': + switch (main) { + case 'a': return LATIN_SMALL_LETTER_AE; + case 'A': return LATIN_CAPITAL_LETTER_AE; + case 'o': return LATIN_SMALL_LIGATURE_OE; + case 'O': return LATIN_CAPITAL_LIGATURE_OE; + case '0': return LATIN_CAPITAL_LIGATURE_OE; + default: + if(warn)fprintf( stderr, " COMPOSE: %04x+e/E not defined\n",(int)main); + } + break; + + case 'g': /* greek */ + switch (main) { + /* missing 0x37A-0x390 */ + /* weird cases: Q -> theta (it resembles a little, doesn't it?) + V -> psi (what can I do?) 
*/ + case 'A': return GREEK_CAPITAL_LETTER_ALPHA; + case 'B': return GREEK_CAPITAL_LETTER_BETA; + case 'G': return GREEK_CAPITAL_LETTER_GAMMA; + case 'D': return GREEK_CAPITAL_LETTER_DELTA; + case 'E': return GREEK_CAPITAL_LETTER_EPSILON; + case 'Z': return GREEK_CAPITAL_LETTER_ZETA; + case 'H': return GREEK_CAPITAL_LETTER_ETA; + case 'Q': return GREEK_CAPITAL_LETTER_THETA; + case 'I': return GREEK_CAPITAL_LETTER_IOTA; + case 'K': return GREEK_CAPITAL_LETTER_KAPPA; + case 'L': return GREEK_CAPITAL_LETTER_LAMDA; + case 'M': return GREEK_CAPITAL_LETTER_MU; + case 'N': return GREEK_CAPITAL_LETTER_NU; + case 'X': return GREEK_CAPITAL_LETTER_XI; + case 'O': return GREEK_CAPITAL_LETTER_OMICRON; + case 'P': return GREEK_CAPITAL_LETTER_PI; + case 'R': return GREEK_CAPITAL_LETTER_RHO; + case 'S': return GREEK_CAPITAL_LETTER_SIGMA; + case 'T': return GREEK_CAPITAL_LETTER_TAU; + case 'Y': return GREEK_CAPITAL_LETTER_UPSILON; + case 'F': return GREEK_CAPITAL_LETTER_PHI; + case 'C': return GREEK_CAPITAL_LETTER_CHI; + case 'V': return GREEK_CAPITAL_LETTER_PSI; + case 'W': return GREEK_CAPITAL_LETTER_OMEGA; +/* + case '': return GREEK_CAPITAL_LETTER_IOTA_WITH_DIALYTIKA; + case '': return GREEK_CAPITAL_LETTER_UPSILON_WITH_DIALYTIKA; + case '': return GREEK_SMALL_LETTER_ALPHA_WITH_TONOS; + case '': return GREEK_SMALL_LETTER_EPSILON_WITH_TONOS; + case '': return GREEK_SMALL_LETTER_ETA_WITH_TONOS; + case '': return GREEK_SMALL_LETTER_IOTA_WITH_TONOS; + case '': return GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS; +*/ + case 'a': return GREEK_SMALL_LETTER_ALPHA; + case 'b': return GREEK_SMALL_LETTER_BETA; + case 'g': return GREEK_SMALL_LETTER_GAMMA; + case 'd': return GREEK_SMALL_LETTER_DELTA; + case 'e': return GREEK_SMALL_LETTER_EPSILON; + case 'z': return GREEK_SMALL_LETTER_ZETA; + case 'h': return GREEK_SMALL_LETTER_ETA; + case 'q': return GREEK_SMALL_LETTER_THETA; + case 'i': return GREEK_SMALL_LETTER_IOTA; + case 'k': return GREEK_SMALL_LETTER_KAPPA; + case 'l': return 
GREEK_SMALL_LETTER_LAMDA; + case 'm': return GREEK_SMALL_LETTER_MU; + case 'n': return GREEK_SMALL_LETTER_NU; + case 'x': return GREEK_SMALL_LETTER_XI; + case 'o': return GREEK_SMALL_LETTER_OMICRON; + case 'p': return GREEK_SMALL_LETTER_PI; + case 'r': return GREEK_SMALL_LETTER_RHO; + case '&': return GREEK_SMALL_LETTER_FINAL_SIGMA; + case 's': return GREEK_SMALL_LETTER_SIGMA; + case 't': return GREEK_SMALL_LETTER_TAU; + case 'y': return GREEK_SMALL_LETTER_UPSILON; + case 'f': return GREEK_SMALL_LETTER_PHI; + case 'c': return GREEK_SMALL_LETTER_CHI; + case 'v': return GREEK_SMALL_LETTER_PSI; + case 'w': return GREEK_SMALL_LETTER_OMEGA; +/* + case '': return GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA; + case '': return GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA; + case '': return GREEK_SMALL_LETTER_OMICRON_WITH_TONOS; + case '': return GREEK_SMALL_LETTER_UPSILON_WITH_TONOS; + case '': return GREEK_SMALL_LETTER_OMEGA_WITH_TONOS; + case '': return GREEK_BETA_SYMBOL; + case '': return GREEK_THETA_SYMBOL; + case '': return GREEK_UPSILON_WITH_HOOK_SYMBOL; + case '': return GREEK_UPSILON_WITH_ACUTE_AND_HOOK_SYMBOL; + case '': return GREEK_UPSILON_WITH_DIAERESIS_AND_HOOK_SYMBOL; + case '': return GREEK_PHI_SYMBOL; + case '': return GREEK_PI_SYMBOL; +*/ + default: + if(warn)fprintf( stderr, " COMPOSE: GREEK %04x not defined\n",(int)main); + } + break; + + default: + fprintf( stderr, " COMPOSE: modifier %04x not defined\n",(int)modifier); + } + return (wchar_t)main; +} + +#define UNDEFINED "~" + +/* Arguments: character in Unicode format, type of format to convert to. + Returns: a string containing the Unicode character converted to the chosen + format. This string is statically allocated and should not be freed. + ToDo: better using tables? 
+ */ +const char *decode(wchar_t c, FORMAT type) { + /* static char d; --- js: big bug (missing \0) if &d returned */ + /*FIXME jb static*/ static char bbuf[8*32]; /* space for 8 buffers, rotating */ + /*FIXME jb static*/ static char *buf=bbuf; /* used for UTF8 sequences and undefined codes */ + buf+=32; if(buf>=bbuf+8*32) buf=bbuf; + buf[0]=buf[1]=buf[2]=0; + switch (type) { + case ISO8859_1: + if ( c <= 0xFF ) { /* UNICODE == ISO8859-1 */ + buf[0] = (char)c; + return buf; + } + switch (c) { /* not found in list, but perhaps we can describe it */ + /* todo: add greek. GREEK_SMALL_LETTER_ALPHA = alpha */ + + /* general puctuation */ + case HYPHEN: + return (const char *)"-"; + case FIGURE_DASH: + case EN_DASH: + return (const char *)"--"; + case EM_DASH: + return (const char *)"---"; + case LEFT_SINGLE_QUOTATION_MARK: + return (const char *)"`"; + case RIGHT_SINGLE_QUOTATION_MARK: + return (const char *)"'"; + case SINGLE_LOW_9_QUOTATION_MARK: + return (const char *)","; + case SINGLE_HIGH_REVERSED_9_QUOTATION_MARK: + return (const char *)UNDEFINED; + case LEFT_DOUBLE_QUOTATION_MARK: + return (const char *)"``"; + case RIGHT_DOUBLE_QUOTATION_MARK: + return (const char *)"''"; + case DOUBLE_LOW_9_QUOTATION_MARK: + return (const char *)",,"; + case DOUBLE_HIGH_REVERSED_9_QUOTATION_MARK: + return (const char *)UNDEFINED; + case DAGGER: + return (const char *)"+"; + case DOUBLE_DAGGER: + return (const char *)"*"; + case BULLET: + return (const char *)"*"; + case TRIANGULAR_BULLET: + return (const char *)"*"; + case HYPHENATION_POINT: + return (const char *)"-"; + case HORIZONTAL_ELLIPSIS: + return (const char *)"..."; + case PER_MILLE_SIGN: + return (const char *)"%%"; /* awk! */ + case SINGLE_LEFT_POINTING_ANGLE_QUOTATION_MARK: + return (const char *)"<"; + case SINGLE_RIGHT_POINTING_ANGLE_QUOTATION_MARK: + return (const char *)">"; + case EURO_CURRENCY_SIGN: + return (const char *)"EUR"; /* change it! 
*/ + + /* ligatures */ + case LATIN_SMALL_LIGATURE_FF: + return (const char *)"ff"; + case LATIN_SMALL_LIGATURE_FI: + return (const char *)"fi"; + case LATIN_SMALL_LIGATURE_FL: + return (const char *)"fl"; + case LATIN_SMALL_LIGATURE_FFI: + return (const char *)"ffi"; + case LATIN_SMALL_LIGATURE_FFL: + return (const char *)"ffl"; + case LATIN_SMALL_LIGATURE_LONG_S_T: + case LATIN_SMALL_LIGATURE_ST: + return (const char *)"st"; + + /* extra */ + case UNKNOWN: + return (const char *)"_"; + case PICTURE: + return (const char *)"_"; /* Due to Mobile OCR */ + + default: + /* snprintf seems to be no standard, so I use insecure sprintf */ + sprintf(buf,"\\code(%04x)",(unsigned)c); + return buf; /* UNDEFINED; */ + } + break; + case TeX: + if ( c >= SPACE && c <= TILDE ) { /* ASCII */ + switch (c) { + case '$': + return (const char *)"\\$"; + case '&': + return (const char *)"\\&"; + case '%': + return (const char *)"\\%"; + case '#': + return (const char *)"\\#"; + case '_': + return (const char *)"\\_"; + case '{': + return (const char *)"\\{"; + case '}': + return (const char *)"\\}"; + case '\\': + return (const char *)"$\\backslash$"; + case '~': + return (const char *)"\\~{}"; + case '^': + return (const char *)"\\^{}"; + default: + buf[0] = (char)c; + return (const char *)buf; + } + } + switch (c) { + /* ISO8859_1 */ + case NO_BREAK_SPACE: + return (const char *)"~"; + case INVERTED_EXCLAMATION_MARK: + return (const char *)"!'"; + case CENT_SIGN: + return (const char *)"\\textcent"; /* \usepackage{textcomp} */ + case POUND_SIGN: + return (const char *)"\\pounds"; + case EURO_CURRENCY_SIGN: + return (const char *)"\\euro"; /* \usepackage{eurosans} */ + case CURRENCY_SIGN: + return (const char *)"\\textcurrency"; /* \usepackage{textcomp} */ + case YEN_SIGN: + return (const char *)"\\textyen"; /* \usepackage{textcomp} */ + case BROKEN_BAR: + return (const char *)"\\textbrokenbar"; /* \usepackage{textcomp} */ + case SECTION_SIGN: + return (const char *)"\\S"; + case 
DIAERESIS: + return (const char *)"\""; + case COPYRIGHT_SIGN: + return (const char *)"\\copyright"; + case FEMININE_ORDINAL_INDICATOR: + return (const char *)"$^{\\underbar{a}}$"; + case LEFT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK: + return (const char *)"\\flqq{}"; + case NOT_SIGN: + return (const char *)"$\\lnot$"; + case SOFT_HYPHEN: + return (const char *)"\\-"; + case REGISTERED_SIGN: + return (const char *)"\\textregistered";/* \usepackage{textcomp} */ + case MACRON: + return (const char *)"\\textasciimacron";/* \usepackage{textcomp} */ + case DEGREE_SIGN: + return (const char *)"$^{o}$"; + case PLUS_MINUS_SIGN: + return (const char *)"$\\pm$"; + case SUPERSCRIPT_TWO: + return (const char *)"$^{2}$"; + case SUPERSCRIPT_THREE: + return (const char *)"$^{3}$"; + case ACUTE_ACCENT: + return (const char *)"\\( \\prime \\)"; + case MICRO_SIGN: + return (const char *)"$\\mu$"; + case PILCROW_SIGN: + return (const char *)"\\P"; + case MIDDLE_DOT: + return (const char *)"$\\cdot$"; + case CEDILLA: + return (const char *)"\\,"; + case SUPERSCRIPT_ONE: + return (const char *)"$^{1}$"; + case MASCULINE_ORDINAL_INDICATOR: + return (const char *)"$^{\\underbar{o}}$"; + case RIGHT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK: + return (const char *)"\\frqq{}"; + case VULGAR_FRACTION_ONE_QUARTER: /* these fractions are not good*/ + return (const char *)"\\( 1\\over 4 \\)"; + case VULGAR_FRACTION_ONE_HALF: + return (const char *)"\\( 1\\over 2 \\)"; + case VULGAR_FRACTION_THREE_QUARTERS: + return (const char *)"\\( 3\\over 4 \\)"; + case INVERTED_QUESTION_MARK: + return (const char *)"?'"; + case LATIN_CAPITAL_LETTER_A_WITH_GRAVE: + return (const char *)"\\`A"; + case LATIN_CAPITAL_LETTER_A_WITH_ACUTE: + return (const char *)"\\'A"; + case LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX: + return (const char *)"\\^A"; + case LATIN_CAPITAL_LETTER_A_WITH_TILDE: + return (const char *)"\\~A"; + case LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS: + return (const char *)"\\\"A"; + case 
LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + return (const char *)"\\AA"; + case LATIN_CAPITAL_LETTER_AE: + return (const char *)"\\AE"; + case LATIN_CAPITAL_LETTER_C_WITH_CARON: + return (const char *)"\\v{C}"; + case LATIN_CAPITAL_LETTER_C_WITH_CEDILLA: + return (const char *)"\\C"; + case LATIN_CAPITAL_LETTER_E_WITH_GRAVE: + return (const char *)"\\`E"; + case LATIN_CAPITAL_LETTER_E_WITH_ACUTE: + return (const char *)"\\'E"; + case LATIN_CAPITAL_LETTER_E_WITH_CARON: + return (const char *)"\\v{E}"; + case LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX: + return (const char *)"\\^E"; + case LATIN_CAPITAL_LETTER_E_WITH_DIAERESIS: + return (const char *)"\\\"E"; + case LATIN_CAPITAL_LETTER_I_WITH_GRAVE: + return (const char *)"\\`I"; + case LATIN_CAPITAL_LETTER_I_WITH_ACUTE: + return (const char *)"\\'I"; + case LATIN_CAPITAL_LETTER_I_WITH_CIRCUMFLEX: + return (const char *)"\\^I"; + case LATIN_CAPITAL_LETTER_I_WITH_DIAERESIS: + return (const char *)"\\\"I"; + case LATIN_CAPITAL_LETTER_ETH: + return (const char *)UNDEFINED; + case LATIN_CAPITAL_LETTER_N_WITH_TILDE: + return (const char *)"\\~N"; + case LATIN_CAPITAL_LETTER_O_WITH_GRAVE: + return (const char *)"\\`O"; + case LATIN_CAPITAL_LETTER_O_WITH_ACUTE: + return (const char *)"\\'O"; + case LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX: + return (const char *)"\\^O"; + case LATIN_CAPITAL_LETTER_O_WITH_TILDE: + return (const char *)"\\~O"; + case LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS: + return (const char *)"\\\"O"; + case MULTIPLICATION_SIGN: + return (const char *)"$\\times$"; + case LATIN_CAPITAL_LETTER_O_WITH_STROKE: + return (const char *)"\\O"; + case LATIN_CAPITAL_LETTER_S_WITH_CARON: + return (const char *)"\\v{S}"; + case LATIN_CAPITAL_LETTER_U_WITH_GRAVE: + return (const char *)"\\`U"; + case LATIN_CAPITAL_LETTER_U_WITH_ACUTE: + return (const char *)"\\'U"; + case LATIN_CAPITAL_LETTER_U_WITH_CIRCUMFLEX: + return (const char *)"\\^U"; + case LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS: + return (const char *)"\\\"U"; + case 
LATIN_CAPITAL_LETTER_Y_WITH_ACUTE: + return (const char *)"\\'Y"; + case LATIN_CAPITAL_LETTER_Z_WITH_CARON: + return (const char *)"\\v{Z}"; + case LATIN_CAPITAL_LETTER_THORN: + return (const char *)UNDEFINED; + case LATIN_SMALL_LETTER_SHARP_S: + return (const char *)"\\ss"; + case LATIN_SMALL_LETTER_A_WITH_GRAVE: + return (const char *)"\\`a"; + case LATIN_SMALL_LETTER_A_WITH_ACUTE: + return (const char *)"\\'a"; + case LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX: + return (const char *)"\\^a"; + case LATIN_SMALL_LETTER_A_WITH_TILDE: + return (const char *)"\\~a"; + case LATIN_SMALL_LETTER_A_WITH_DIAERESIS: + return (const char *)"\\\"a"; + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + return (const char *)"\\aa"; + case LATIN_SMALL_LETTER_AE: + return (const char *)"\\ae"; + case LATIN_SMALL_LETTER_C_WITH_CARON: + return (const char *)"\\v{c}"; + case LATIN_SMALL_LETTER_C_WITH_CEDILLA: + return (const char *)"\\c"; + case LATIN_SMALL_LETTER_E_WITH_GRAVE: + return (const char *)"\\`e"; + case LATIN_SMALL_LETTER_E_WITH_ACUTE: + return (const char *)"\\'e"; + case LATIN_SMALL_LETTER_E_WITH_CARON: + return (const char *)"\\v{e}"; + case LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX: + return (const char *)"\\^e"; + case LATIN_SMALL_LETTER_E_WITH_DIAERESIS: + return (const char *)"\\\"e"; + case LATIN_SMALL_LETTER_I_WITH_GRAVE: + return (const char *)"\\`i"; + case LATIN_SMALL_LETTER_I_WITH_ACUTE: + return (const char *)"\\'i"; + case LATIN_SMALL_LETTER_I_WITH_CIRCUMFLEX: + return (const char *)"\\^i"; + case LATIN_SMALL_LETTER_I_WITH_DIAERESIS: + return (const char *)"\\\"i"; + case LATIN_SMALL_LETTER_ETH: + return (const char *)UNDEFINED; + case LATIN_SMALL_LETTER_N_WITH_TILDE: + return (const char *)"\\~n"; + case LATIN_SMALL_LETTER_O_WITH_GRAVE: + return (const char *)"\\`o"; + case LATIN_SMALL_LETTER_O_WITH_ACUTE: + return (const char *)"\\'o"; + case LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX: + return (const char *)"\\^o"; + case LATIN_SMALL_LETTER_O_WITH_TILDE: + return (const char 
*)"\\~o"; + case LATIN_SMALL_LETTER_O_WITH_DIAERESIS: + return (const char *)"\\\"o"; + case DIVISION_SIGN: + return (const char *)"$\\div$"; + case LATIN_SMALL_LETTER_O_WITH_STROKE: + return (const char *)"\\o"; + case LATIN_SMALL_LETTER_S_WITH_CARON: + return (const char *)"\\v{s}"; + case LATIN_SMALL_LETTER_U_WITH_GRAVE: + return (const char *)"\\`u"; + case LATIN_SMALL_LETTER_U_WITH_ACUTE: + return (const char *)"\\'u"; + case LATIN_SMALL_LETTER_U_WITH_CIRCUMFLEX: + return (const char *)"\\^u"; + case LATIN_SMALL_LETTER_U_WITH_DIAERESIS: + return (const char *)"\\\"u"; + case LATIN_SMALL_LETTER_Y_WITH_ACUTE: + return (const char *)"\\'y"; + case LATIN_SMALL_LETTER_THORN: + return (const char *)UNDEFINED; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + return (const char *)"\\\"y"; + case LATIN_SMALL_LETTER_Z_WITH_CARON: + return (const char *)"\\v{z}"; + + /* greek */ + /* some (punctuation, accents, accented capital) greek letters missing*/ + case GREEK_CAPITAL_LETTER_ALPHA: + return (const char *)"A"; + case GREEK_CAPITAL_LETTER_BETA: + return (const char *)"B"; + case GREEK_CAPITAL_LETTER_GAMMA: + return (const char *)"\\( \\Gamma \\)"; + case GREEK_CAPITAL_LETTER_DELTA: + return (const char *)"\\( \\Delta \\)"; + case GREEK_CAPITAL_LETTER_EPSILON: + return (const char *)"E"; + case GREEK_CAPITAL_LETTER_ZETA: + return (const char *)"Z"; + case GREEK_CAPITAL_LETTER_ETA: + return (const char *)"H"; + case GREEK_CAPITAL_LETTER_THETA: + return (const char *)"\\( \\Theta \\)"; + case GREEK_CAPITAL_LETTER_IOTA: + return (const char *)"I"; + case GREEK_CAPITAL_LETTER_KAPPA: + return (const char *)"K"; + case GREEK_CAPITAL_LETTER_LAMDA: + return (const char *)"\\( \\Lambda \\)"; + case GREEK_CAPITAL_LETTER_MU: + return (const char *)"M"; + case GREEK_CAPITAL_LETTER_NU: + return (const char *)"N"; + case GREEK_CAPITAL_LETTER_XI: + return (const char *)"\\( \\Xi \\)"; + case GREEK_CAPITAL_LETTER_OMICRON: + return (const char *)"O"; + case GREEK_CAPITAL_LETTER_PI: + 
return (const char *)"\\( \\Pi \\)"; + case GREEK_CAPITAL_LETTER_RHO: + return (const char *)"P"; + case GREEK_CAPITAL_LETTER_SIGMA: + return (const char *)"\\( \\Sigma \\)"; + case GREEK_CAPITAL_LETTER_TAU: + return (const char *)"T"; + case GREEK_CAPITAL_LETTER_UPSILON: + return (const char *)"\\( \\Upsilon \\)"; + case GREEK_CAPITAL_LETTER_PHI: + return (const char *)"\\( \\Phi \\)"; + case GREEK_CAPITAL_LETTER_CHI: + return (const char *)"\\( \\Chi \\)"; + case GREEK_CAPITAL_LETTER_PSI: + return (const char *)"\\( \\Psi \\)"; + case GREEK_CAPITAL_LETTER_OMEGA: + return (const char *)"\\( \\Omega \\)"; + case GREEK_CAPITAL_LETTER_IOTA_WITH_DIALYTIKA: + return (const char *)UNDEFINED; + case GREEK_CAPITAL_LETTER_UPSILON_WITH_DIALYTIKA: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_ALPHA_WITH_TONOS: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_EPSILON_WITH_TONOS: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_ETA_WITH_TONOS: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_IOTA_WITH_TONOS: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_ALPHA: + return (const char *)"\\( \\alpha \\)"; + case GREEK_SMALL_LETTER_BETA: + return (const char *)"\\( \\beta \\)"; + case GREEK_SMALL_LETTER_GAMMA: + return (const char *)"\\( \\gamma \\)"; + case GREEK_SMALL_LETTER_DELTA: + return (const char *)"\\( \\delta \\)"; + case GREEK_SMALL_LETTER_EPSILON: + return (const char *)"\\( \\epsilon \\)"; + case GREEK_SMALL_LETTER_ZETA: + return (const char *)"\\( \\zeta \\)"; + case GREEK_SMALL_LETTER_ETA: + return (const char *)"\\( \\eta \\)"; + case GREEK_SMALL_LETTER_THETA: + return (const char *)"\\( \\theta \\)"; + case GREEK_SMALL_LETTER_IOTA: + return (const char *)"\\( \\iota \\)"; + case GREEK_SMALL_LETTER_KAPPA: + return (const char *)"\\( \\kappa \\)"; + case GREEK_SMALL_LETTER_LAMDA: + return (const char *)"\\( 
\\lambda \\)"; + case GREEK_SMALL_LETTER_MU: + return (const char *)"\\( \\mu \\)"; + case GREEK_SMALL_LETTER_NU: + return (const char *)"\\( \\nu \\)"; + case GREEK_SMALL_LETTER_XI: + return (const char *)"\\( \\xi \\)"; + case GREEK_SMALL_LETTER_OMICRON: + return (const char *)"\\( \\omicron \\)"; + case GREEK_SMALL_LETTER_PI: + return (const char *)"\\( \\pi \\)"; + case GREEK_SMALL_LETTER_RHO: + return (const char *)"\\( \\rho \\)"; + case GREEK_SMALL_LETTER_FINAL_SIGMA: + return (const char *)"\\( \\varsigma \\)"; + case GREEK_SMALL_LETTER_SIGMA: + return (const char *)"\\( \\sigma \\)"; + case GREEK_SMALL_LETTER_TAU: + return (const char *)"\\( \\tau \\)"; + case GREEK_SMALL_LETTER_UPSILON: + return (const char *)"\\( \\upsilon \\)"; + case GREEK_SMALL_LETTER_PHI: + return (const char *)"\\( \\varphi \\)"; + case GREEK_SMALL_LETTER_CHI: + return (const char *)"\\( \\chi \\)"; + case GREEK_SMALL_LETTER_PSI: + return (const char *)"\\( \\psi \\)"; + case GREEK_SMALL_LETTER_OMEGA: + return (const char *)"\\( \\omega \\)"; + case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_OMICRON_WITH_TONOS: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_UPSILON_WITH_TONOS: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_OMEGA_WITH_TONOS: + return (const char *)UNDEFINED; + case GREEK_BETA_SYMBOL: + return (const char *)UNDEFINED; + case GREEK_THETA_SYMBOL: + return (const char *)"\\( \\vartheta \\)"; + case GREEK_UPSILON_WITH_HOOK_SYMBOL: + return (const char *)UNDEFINED; + case GREEK_UPSILON_WITH_ACUTE_AND_HOOK_SYMBOL: + return (const char *)UNDEFINED; + case GREEK_UPSILON_WITH_DIAERESIS_AND_HOOK_SYMBOL: + return (const char *)UNDEFINED; + case GREEK_PHI_SYMBOL: + return (const char *)"\\( \\phi \\)"; + case GREEK_PI_SYMBOL: + return (const char *)"\\( \\varpi \\)"; + /* and some greek letters missing*/ + + /* 
punctuation (partial) */ + case HYPHEN: + return (const char *)"-"; + case NON_BREAKING_HYPHEN: + return (const char *)UNDEFINED; + case FIGURE_DASH: + case EN_DASH: + return (const char *)"--"; + case EM_DASH: + return (const char *)"---"; + case HORIZONTAL_BAR: + return (const char *)UNDEFINED; + case LEFT_SINGLE_QUOTATION_MARK: + return (const char *)"`"; + case RIGHT_SINGLE_QUOTATION_MARK: + return (const char *)"'"; + case SINGLE_LOW_9_QUOTATION_MARK: + return (const char *)"\\glq{}"; + case SINGLE_HIGH_REVERSED_9_QUOTATION_MARK: + return (const char *)UNDEFINED; + case LEFT_DOUBLE_QUOTATION_MARK: + return (const char *)"``"; + case RIGHT_DOUBLE_QUOTATION_MARK: + return (const char *)"''"; + case DOUBLE_LOW_9_QUOTATION_MARK: + return (const char *)"\\glqq{}"; + case DOUBLE_HIGH_REVERSED_9_QUOTATION_MARK: + return (const char *)UNDEFINED; + case DAGGER: + return (const char *)"\\dag"; + case DOUBLE_DAGGER: + return (const char *)"\\ddag"; + case BULLET: + return (const char *)"$\\bullet$"; + case TRIANGULAR_BULLET: + return (const char *)"$\\blacktriangleright"; + case HYPHENATION_POINT: + return (const char *)"\\-"; + case HORIZONTAL_ELLIPSIS: + return (const char *)"\\ldots"; + case PER_MILLE_SIGN: + return (const char *)UNDEFINED; + case SINGLE_LEFT_POINTING_ANGLE_QUOTATION_MARK: + return (const char *)"\\flq{}"; + case SINGLE_RIGHT_POINTING_ANGLE_QUOTATION_MARK: + return (const char *)"\\frq{}"; + /* ligatures */ + case LATIN_SMALL_LIGATURE_FF: + return (const char *)"ff"; + case LATIN_SMALL_LIGATURE_FI: + return (const char *)"fi"; + case LATIN_SMALL_LIGATURE_FL: + return (const char *)"fl"; + case LATIN_SMALL_LIGATURE_FFI: + return (const char *)"ffi"; + case LATIN_SMALL_LIGATURE_FFL: + return (const char *)"ffl"; + case LATIN_SMALL_LIGATURE_LONG_S_T: + case LATIN_SMALL_LIGATURE_ST: + return (const char *)"st"; + /* reserved */ + case 0: + return (const char *)""; + case UNKNOWN: + return (const char *)"\\_"; + case PICTURE: + return (const char 
*)"(PICTURE)"; + default: + /* snprintf seems to be no standard, so I use insecure sprintf */ + sprintf(buf,"\\symbol{%u}",(unsigned)c); + return buf; /* UNDEFINED; */ + } + case HTML: + if ( c >= SPACE && c <= TILDE ) { /* ASCII */ + switch (c) { + case '&': + return (const char *)"&"; + /* semicolon must not be coded */ + case '\'': + return (const char *)"'"; + case '"': + return (const char *)"""; + case '<': + return (const char *)"<"; + case '>': + return (const char *)">"; + } + buf[0] = (char)c; + return buf; + } + switch (c) { + case PICTURE: + return (const char *)"<!--PICTURE-->"; + case UNKNOWN: + return (const char *)"_"; /* better use colored symbol? */ + case LINE_FEED: + return (const char *)"<br />"; /* \n handled somwhere else? */ + case FORM_FEED: + case CARRIAGE_RETURN: + return (const char *)"<br />"; + case NO_BREAK_SPACE: + return (const char *)"<nobr />"; + case INVERTED_EXCLAMATION_MARK: + return (const char *)"¡"; + case CENT_SIGN: + return (const char *)"¢"; + case POUND_SIGN: + return (const char *)"£"; + case CURRENCY_SIGN: + return (const char *)"¤"; + case YEN_SIGN: + return (const char *)"¥"; + case BROKEN_BAR: + return (const char *)"¦"; + case SECTION_SIGN: + return (const char *)"§"; + case DIAERESIS: + return (const char *)"¨"; + case COPYRIGHT_SIGN: + return (const char *)"©"; + case FEMININE_ORDINAL_INDICATOR: + return (const char *)"ªem;"; + case LEFT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK: + return (const char *)"«"; + case NOT_SIGN: + return (const char *)"¬"; + case SOFT_HYPHEN: + return (const char *)"­"; + case REGISTERED_SIGN: + return (const char *)"®"; + case MACRON: + return (const char *)"¯"; + case DEGREE_SIGN: + return (const char *)"°"; + case PLUS_MINUS_SIGN: + return (const char *)"±"; + case SUPERSCRIPT_TWO: + return (const char *)"²"; + case SUPERSCRIPT_THREE: + return (const char *)"³"; + case ACUTE_ACCENT: + return (const char *)"´"; + case MICRO_SIGN: + return (const char *)"µ"; + case PILCROW_SIGN: + return 
(const char *)"¶"; + case MIDDLE_DOT: + return (const char *)"·"; + case CEDILLA: + return (const char *)"¸"; + case SUPERSCRIPT_ONE: + return (const char *)"¹"; + case MASCULINE_ORDINAL_INDICATOR: + return (const char *)"º"; + case RIGHT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK: + return (const char *)"»"; + case VULGAR_FRACTION_ONE_QUARTER: + return (const char *)"¼"; + case VULGAR_FRACTION_ONE_HALF: + return (const char *)"½"; + case VULGAR_FRACTION_THREE_QUARTERS: + return (const char *)"¾"; + case INVERTED_QUESTION_MARK: + return (const char *)"¿"; + case LATIN_CAPITAL_LETTER_A_WITH_GRAVE: + return (const char *)"À"; + case LATIN_CAPITAL_LETTER_A_WITH_ACUTE: + return (const char *)"Á"; + case LATIN_CAPITAL_LETTER_A_WITH_BREVE: + return (const char *)"Ă"; + case LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX: + return (const char *)"Â"; + case LATIN_CAPITAL_LETTER_A_WITH_TILDE: + return (const char *)"Ã"; + case LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS: + return (const char *)"Ä"; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + return (const char *)"Å"; + case LATIN_CAPITAL_LETTER_AE: + return (const char *)"Æ"; + case LATIN_CAPITAL_LETTER_C_WITH_CARON: + return (const char *)"Č"; + case LATIN_CAPITAL_LETTER_C_WITH_CEDILLA: + return (const char *)"Ç"; + case LATIN_CAPITAL_LETTER_E_WITH_GRAVE: + return (const char *)"È"; + case LATIN_CAPITAL_LETTER_E_WITH_ACUTE: + return (const char *)"É"; + case LATIN_CAPITAL_LETTER_E_WITH_CARON: + return (const char *)"Ě"; + case LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX: + return (const char *)"Ê"; + case LATIN_CAPITAL_LETTER_E_WITH_DIAERESIS: + return (const char *)"Ë"; + case LATIN_CAPITAL_LETTER_I_WITH_GRAVE: + return (const char *)"Ì"; + case LATIN_CAPITAL_LETTER_I_WITH_ACUTE: + return (const char *)"Í"; + case LATIN_CAPITAL_LETTER_I_WITH_CIRCUMFLEX: + return (const char *)"Î"; + case LATIN_CAPITAL_LETTER_I_WITH_DIAERESIS: + return (const char *)"Ï"; + case LATIN_CAPITAL_LETTER_ETH: + return (const char *)"Ð"; + case 
LATIN_CAPITAL_LETTER_N_WITH_TILDE: + return (const char *)"Ñ"; + case LATIN_CAPITAL_LETTER_O_WITH_GRAVE: + return (const char *)"Ò"; + case LATIN_CAPITAL_LETTER_O_WITH_ACUTE: + return (const char *)"Ó"; + case LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX: + return (const char *)"Ô"; + case LATIN_CAPITAL_LETTER_O_WITH_TILDE: + return (const char *)"Õ"; + case LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS: + return (const char *)"Ö"; + case MULTIPLICATION_SIGN: + return (const char *)"×"; + case LATIN_CAPITAL_LETTER_O_WITH_STROKE: + return (const char *)"Ø"; + case LATIN_CAPITAL_LETTER_S_WITH_CARON: + return (const char *)"Š"; + case LATIN_CAPITAL_LETTER_U_WITH_GRAVE: + return (const char *)"Ù"; + case LATIN_CAPITAL_LETTER_U_WITH_ACUTE: + return (const char *)"Ú"; + case LATIN_CAPITAL_LETTER_U_WITH_CIRCUMFLEX: + return (const char *)"Û"; + case LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS: + return (const char *)"Ü"; + case LATIN_CAPITAL_LETTER_Y_WITH_ACUTE: + return (const char *)"Ý"; + case LATIN_CAPITAL_LETTER_Z_WITH_CARON: + return (const char *)"Ž"; + case LATIN_CAPITAL_LETTER_THORN: + return (const char *)"Þ"; + case LATIN_SMALL_LETTER_SHARP_S: + return (const char *)"ß"; + case LATIN_SMALL_LETTER_A_WITH_GRAVE: + return (const char *)"à"; + case LATIN_SMALL_LETTER_A_WITH_ACUTE: + return (const char *)"á"; + case LATIN_SMALL_LETTER_A_WITH_BREVE: + return (const char *)"ă"; + case LATIN_SMALL_LETTER_A_WITH_CARON: + return (const char *)"&acaron;"; + case LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX: + return (const char *)"â"; + case LATIN_SMALL_LETTER_A_WITH_TILDE: + return (const char *)"ã"; + case LATIN_SMALL_LETTER_A_WITH_DIAERESIS: + return (const char *)"ä"; + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + return (const char *)"å"; + case LATIN_SMALL_LETTER_AE: + return (const char *)"æ"; + case LATIN_SMALL_LETTER_C_WITH_CARON: + return (const char *)"č"; + case LATIN_SMALL_LETTER_C_WITH_CEDILLA: + return (const char *)"ç"; + case LATIN_SMALL_LETTER_E_WITH_GRAVE: + return (const char 
*)"è"; + case LATIN_SMALL_LETTER_E_WITH_ACUTE: + return (const char *)"é"; + case LATIN_SMALL_LETTER_E_WITH_CARON: + return (const char *)"ě"; + case LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX: + return (const char *)"ê"; + case LATIN_SMALL_LETTER_E_WITH_DIAERESIS: + return (const char *)"ë"; + case LATIN_SMALL_LETTER_I_WITH_GRAVE: + return (const char *)"ì"; + case LATIN_SMALL_LETTER_I_WITH_ACUTE: + return (const char *)"í"; + case LATIN_SMALL_LETTER_I_WITH_CIRCUMFLEX: + return (const char *)"î"; + case LATIN_SMALL_LETTER_I_WITH_DIAERESIS: + return (const char *)"ï"; + case LATIN_SMALL_LETTER_ETH: + return (const char *)"ð"; + case LATIN_SMALL_LETTER_N_WITH_TILDE: + return (const char *)"ñ"; + case LATIN_SMALL_LETTER_O_WITH_GRAVE: + return (const char *)"ò"; + case LATIN_SMALL_LETTER_O_WITH_ACUTE: + return (const char *)"ó"; + case LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX: + return (const char *)"ô"; + case LATIN_SMALL_LETTER_O_WITH_TILDE: + return (const char *)"õ"; + case LATIN_SMALL_LETTER_O_WITH_DIAERESIS: + return (const char *)"ö"; + case DIVISION_SIGN: + return (const char *)"÷"; + case LATIN_SMALL_LETTER_O_WITH_STROKE: + return (const char *)"ø"; + case LATIN_SMALL_LETTER_S_WITH_CARON: + return (const char *)"š"; + case LATIN_SMALL_LETTER_U_WITH_GRAVE: + return (const char *)"ù"; + case LATIN_SMALL_LETTER_U_WITH_ACUTE: + return (const char *)"ú"; + case LATIN_SMALL_LETTER_U_WITH_CIRCUMFLEX: + return (const char *)"û"; + case LATIN_SMALL_LETTER_U_WITH_DIAERESIS: + return (const char *)"ü"; + case LATIN_SMALL_LETTER_Y_WITH_ACUTE: + return (const char *)"ý"; + case LATIN_SMALL_LETTER_THORN: + return (const char *)"þ"; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + return (const char *)"ÿ"; + case LATIN_SMALL_LETTER_Z_WITH_CARON: + return (const char *)"ž"; + case EURO_CURRENCY_SIGN: + return (const char *)"€"; + case 0: + return (const char *)""; + default: + sprintf(buf,"&#%u;",(unsigned)c); + return buf; /* undefined */ + } + /* break; unreachable code */ + case XML: 
/* only 5 &xxx;-ENTITIES ar defined by default */ + if ( c >= SPACE && c <= TILDE ) { /* ASCII */ + switch (c) { + case '&': + return (const char *)"&"; + case '\'': + return (const char *)"'"; + case '"': + return (const char *)"""; + case '<': + return (const char *)"<"; + case '>': + return (const char *)">"; + } + buf[0] = (char)c; + return buf; + } + switch (c) { /* subject of change! */ + case PICTURE: + return (const char *)"(PICTURE)"; + case UNKNOWN: + return (const char *)"_"; /* better use colored symbol? */ + case LINE_FEED: /* \n handled somwhere else? */ + case FORM_FEED: + case CARRIAGE_RETURN: + return (const char *)"<br />"; + case NO_BREAK_SPACE: + return (const char *)"<nobr />"; + case 0: + return (const char *)""; + default: + sprintf(buf,"&#x%03x;",(unsigned)c); + return buf; /* undefined */ + } + /* break; unreachable code */ + case SGML: + switch (c) { + default: + sprintf(buf,"&#%u;",(unsigned)c); + return buf; /* UNDEFINED */ + } + /* break; unreachable code */ + case ASCII: /* mainly used for debugging */ + if ( c=='\n' || (c>= 0x20 && c <= 0x7F) ) { + buf[0] = (char)c; + return buf; + } + switch (c) { + /* extra */ + case UNKNOWN: + return (const char *)"(?)"; + case PICTURE: + return (const char *)"(?)"; + + default: + /* snprintf seems to be no standard, so I use insecure sprintf */ + if ((unsigned)c>255) sprintf(buf,"(0x%04x)",(unsigned)c); + else sprintf(buf,"(0x%02x)",(unsigned)c); + return buf; /* UNDEFINED; */ + } + /* break; unreachable code */ + default: /* use UTF8 as default, test with xterm -u8 */ + /* extra */ + if ( c == UNKNOWN ) return (const char *)"_"; + if ( c == PICTURE ) return (const char *)"_"; /* Due to Mobile OCR */ + if ( c <= (wchar_t)0x0000007F ) { /* UTF8 == 7bit ASCII */ + buf[0] = (char)c; + return buf; + } + if ( c <= (wchar_t)0x000007FF ) { /* UTF8 == 11bit */ + buf[0] = (char)(0xc0|((c>> 6) & 0x1f)); /* 110xxxxx */ + buf[1] = (char)(0x80|( c & 0x3f)); /* 10xxxxxx */ + buf[2] = (char)0; /* terminate 
string */ + return buf; + } + /* wchar_t is 16bit for Borland-C !? Jan07 */ + if ( c <= (wchar_t)0x0000FFFF ) { /* UTF8 == 16bit */ + buf[0] = (char)(0xe0|((c>>12) & 0x0f)); /* 1110xxxx */ + buf[1] = (char)(0x80|((c>> 6) & 0x3f)); /* 10xxxxxx */ + buf[2] = (char)(0x80|( c & 0x3f)); /* 10xxxxxx */ + buf[3] = (char)0; /* terminate string */ + return buf; + } + if ( c <= (wchar_t)0x001FFFFF ) { /* UTF8 == 21bit */ + buf[0] = (char)(0xf0|((c>>18) & 0x07)); /* 11110xxx */ + buf[1] = (char)(0x80|((c>>12) & 0x3f)); /* 10xxxxxx */ + buf[2] = (char)(0x80|((c>> 6) & 0x3f)); /* 10xxxxxx */ + buf[3] = (char)(0x80|( c & 0x3f)); /* 10xxxxxx */ + buf[4] = (char)0; /* terminate string */ + return buf; + } + if ( c <= (wchar_t)0x03FFFFFF ) { /* UTF8 == 26bit */ + buf[0] = (char)(0xf8|((c>>24) & 0x03)); /* 111110xx */ + buf[1] = (char)(0x80|((c>>18) & 0x3f)); /* 10xxxxxx */ + buf[2] = (char)(0x80|((c>>12) & 0x3f)); /* 10xxxxxx */ + buf[3] = (char)(0x80|((c>> 6) & 0x3f)); /* 10xxxxxx */ + buf[4] = (char)(0x80|( c & 0x3f)); /* 10xxxxxx */ + buf[5] = (char)0; /* terminate string */ + return buf; + } + if ( c <= (wchar_t)0x7FFFFFFF ) { /* UTF8 == 31bit */ + buf[0] = (char)(0xfc|((c>>30) & 0x01)); /* 1111110x */ + buf[1] = (char)(0x80|((c>>24) & 0x3f)); /* 10xxxxxx */ + buf[2] = (char)(0x80|((c>>18) & 0x3f)); /* 10xxxxxx */ + buf[3] = (char)(0x80|((c>>12) & 0x3f)); /* 10xxxxxx */ + buf[4] = (char)(0x80|((c>> 6) & 0x3f)); /* 10xxxxxx */ + buf[5] = (char)(0x80|( c & 0x3f)); /* 10xxxxxx */ + buf[6] = (char)0; /* terminate string */ + return buf; + } + return (const char *)UNDEFINED; + } +} diff --git a/lib/gocr/unicode.h b/lib/gocr/unicode.h new file mode 100644 index 00000000..b85fd444 --- /dev/null +++ b/lib/gocr/unicode.h @@ -0,0 +1,1257 @@ +/* +This is a Optical-Character-Recognition program +Copyright (C) 2000-2007 Joerg Schulenburg + + The character codes in this file are Copyright (c) 1991-1999 Unicode, Inc. + All Rights reserved. + + This file is provided as-is by Unicode, Inc. 
(The Unicode Consortium). + No claims are made as to fitness for any particular purpose. No + warranties of any kind are expressed or implied. The recipient + agrees to determine applicability of information provided. If this + file has been provided on optical media by Unicode, Inc., the sole + remedy for any claim will be exchange of defective media within 90 + days of receipt. + + Unicode, Inc. hereby grants the right to freely use the information + supplied in this file in the creation of products supporting the + Unicode Standard, and to make copies of this file in any form for + internal or external distribution as long as this notice remains + attached. + +For the rest of the file, the following applies: + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + see README for EMAIL-address + */ + +/* + Only the codes judged necessary by the developers are present in this + file. It conforms with MES-1. You'll find also Greek characters, + mathematical symbols and some extra symbols. 
+ + Use the following regular expression to help add new codes from the + Unicode data files: ([\w\s]*);([\w\s]*);[^\n]* +*/ + +#ifndef G_UNICODE_H +#define G_UNICODE_H + +#include <stddef.h> + +/* Output formats that can be passed to decode() as its `type` argument. */ +enum format { + ISO8859_1, TeX, HTML, XML, SGML, UTF8, ASCII +}; +typedef enum format FORMAT; + +/* + * Prototypes + */ +/* compose() is defined outside this header; presumably it merges a base character with a modifier (e.g. an accent) into a single code -- TODO confirm against its definition. */ +wchar_t compose(wchar_t main, wchar_t modifier); +/* decode() returns the text that represents character code c in output format `type`. The result is either a string literal or a buffer filled in by decode(), so the caller must not free it; presumably that buffer is overwritten by the next call -- TODO confirm. */ +const char *decode(wchar_t c, FORMAT type); + +/* + * Unicode codes + */ + +/* E000-F8FF are for private use. We'll reserve E000-E0FF by now. */ +/* the next line isn't proper, but was the easiest way to fix a problem */ +#ifndef UNKNOWN +#define UNKNOWN 0xE000 +#endif +#define PICTURE 0xE001 +#define HEADER_FILE 0xE010 + +/* most codes 0x00-0x1F are not needed, but we provide them anyway. U0000 had + the name changed from NULL to UNICODE_NULL to avoid conflicts. */ +#define UNICODE_NULL 0x0000 +#define START_OF_HEADING 0x0001 +#define START_OF_TEXT 0x0002 +#define END_OF_TEXT 0x0003 +#define END_OF_TRANSMISSION 0x0004 +#define ENQUIRY 0x0005 +#define ACKNOWLEDGE 0x0006 +#define BELL 0x0007 +#define BACKSPACE 0x0008 +#define HORIZONTAL_TABULATION 0x0009 +#define LINE_FEED 0x000A +#define VERTICAL_TABULATION 0x000B +#define FORM_FEED 0x000C +#define CARRIAGE_RETURN 0x000D +#define SHIFT_OUT 0x000E +#define SHIFT_IN 0x000F +#define DATA_LINK_ESCAPE 0x0010 +#define DEVICE_CONTROL_ONE 0x0011 +#define DEVICE_CONTROL_TWO 0x0012 +#define DEVICE_CONTROL_THREE 0x0013 +#define DEVICE_CONTROL_FOUR 0x0014 +#define NEGATIVE_ACKNOWLEDGE 0x0015 +#define SYNCHRONOUS_IDLE 0x0016 +#define END_OF_TRANSMISSION_BLOCK 0x0017 +#define CANCEL 0x0018 +#define END_OF_MEDIUM 0x0019 +#define SUBSTITUTE 0x001A +#define ESCAPE 0x001B +#define FILE_SEPARATOR 0x001C +#define GROUP_SEPARATOR 0x001D +#define RECORD_SEPARATOR 0x001E +#define UNIT_SEPARATOR 0x001F + +/* ASCII */ +#define SPACE 0x0020 +#define EXCLAMATION_MARK 0x0021 +#define QUOTATION_MARK 0x0022 +#define NUMBER_SIGN 0x0023 +#define DOLLAR_SIGN 0x0024 +#define PERCENT_SIGN
0x0025 +#define AMPERSAND 0x0026 +#define APOSTROPHE 0x0027 +#define LEFT_PARENTHESIS 0x0028 +#define RIGHT_PARENTHESIS 0x0029 +#define ASTERISK 0x002A +#define PLUS_SIGN 0x002B +#define COMMA 0x002C +#define HYPHEN_MINUS 0x002D +#define FULL_STOP 0x002E +#define SOLIDUS 0x002F +#define DIGIT_ZERO 0x0030 +#define DIGIT_ONE 0x0031 +#define DIGIT_TWO 0x0032 +#define DIGIT_THREE 0x0033 +#define DIGIT_FOUR 0x0034 +#define DIGIT_FIVE 0x0035 +#define DIGIT_SIX 0x0036 +#define DIGIT_SEVEN 0x0037 +#define DIGIT_EIGHT 0x0038 +#define DIGIT_NINE 0x0039 +#define COLON 0x003A +#define SEMICOLON 0x003B +#define LESS_THAN_SIGN 0x003C +#define EQUALS_SIGN 0x003D +#define GREATER_THAN_SIGN 0x003E +#define QUESTION_MARK 0x003F +#define COMMERCIAL_AT 0x0040 +#define LATIN_CAPITAL_LETTER_A 0x0041 +#define LATIN_CAPITAL_LETTER_B 0x0042 +#define LATIN_CAPITAL_LETTER_C 0x0043 +#define LATIN_CAPITAL_LETTER_D 0x0044 +#define LATIN_CAPITAL_LETTER_E 0x0045 +#define LATIN_CAPITAL_LETTER_F 0x0046 +#define LATIN_CAPITAL_LETTER_G 0x0047 +#define LATIN_CAPITAL_LETTER_H 0x0048 +#define LATIN_CAPITAL_LETTER_I 0x0049 +#define LATIN_CAPITAL_LETTER_J 0x004A +#define LATIN_CAPITAL_LETTER_K 0x004B +#define LATIN_CAPITAL_LETTER_L 0x004C +#define LATIN_CAPITAL_LETTER_M 0x004D +#define LATIN_CAPITAL_LETTER_N 0x004E +#define LATIN_CAPITAL_LETTER_O 0x004F +#define LATIN_CAPITAL_LETTER_P 0x0050 +#define LATIN_CAPITAL_LETTER_Q 0x0051 +#define LATIN_CAPITAL_LETTER_R 0x0052 +#define LATIN_CAPITAL_LETTER_S 0x0053 +#define LATIN_CAPITAL_LETTER_T 0x0054 +#define LATIN_CAPITAL_LETTER_U 0x0055 +#define LATIN_CAPITAL_LETTER_V 0x0056 +#define LATIN_CAPITAL_LETTER_W 0x0057 +#define LATIN_CAPITAL_LETTER_X 0x0058 +#define LATIN_CAPITAL_LETTER_Y 0x0059 +#define LATIN_CAPITAL_LETTER_Z 0x005A +#define LEFT_SQUARE_BRACKET 0x005B +#define REVERSE_SOLIDUS 0x005C +#define RIGHT_SQUARE_BRACKET 0x005D +#define CIRCUMFLEX_ACCENT 0x005E +#define LOW_LINE 0x005F +#define GRAVE_ACCENT 0x0060 +#define LATIN_SMALL_LETTER_A 0x0061 
+#define LATIN_SMALL_LETTER_B 0x0062 +#define LATIN_SMALL_LETTER_C 0x0063 +#define LATIN_SMALL_LETTER_D 0x0064 +#define LATIN_SMALL_LETTER_E 0x0065 +#define LATIN_SMALL_LETTER_F 0x0066 +#define LATIN_SMALL_LETTER_G 0x0067 +#define LATIN_SMALL_LETTER_H 0x0068 +#define LATIN_SMALL_LETTER_I 0x0069 +#define LATIN_SMALL_LETTER_J 0x006A +#define LATIN_SMALL_LETTER_K 0x006B +#define LATIN_SMALL_LETTER_L 0x006C +#define LATIN_SMALL_LETTER_M 0x006D +#define LATIN_SMALL_LETTER_N 0x006E +#define LATIN_SMALL_LETTER_O 0x006F +#define LATIN_SMALL_LETTER_P 0x0070 +#define LATIN_SMALL_LETTER_Q 0x0071 +#define LATIN_SMALL_LETTER_R 0x0072 +#define LATIN_SMALL_LETTER_S 0x0073 +#define LATIN_SMALL_LETTER_T 0x0074 +#define LATIN_SMALL_LETTER_U 0x0075 +#define LATIN_SMALL_LETTER_V 0x0076 +#define LATIN_SMALL_LETTER_W 0x0077 +#define LATIN_SMALL_LETTER_X 0x0078 +#define LATIN_SMALL_LETTER_Y 0x0079 +#define LATIN_SMALL_LETTER_Z 0x007A +#define LEFT_CURLY_BRACKET 0x007B +#define VERTICAL_LINE 0x007C +#define RIGHT_CURLY_BRACKET 0x007D +#define TILDE 0x007E + +/* codes 0x7F-0xBF are not needed */ +#define NO_BREAK_SPACE 0x00A0 +#define INVERTED_EXCLAMATION_MARK 0x00A1 +#define CENT_SIGN 0x00A2 +#define POUND_SIGN 0x00A3 +#define CURRENCY_SIGN 0x00A4 +#define YEN_SIGN 0x00A5 +#define BROKEN_BAR 0x00A6 +#define SECTION_SIGN 0x00A7 +#define DIAERESIS 0x00A8 +#define COPYRIGHT_SIGN 0x00A9 +#define FEMININE_ORDINAL_INDICATOR 0x00AA +#define LEFT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK 0x00AB +#define NOT_SIGN 0x00AC +#define SOFT_HYPHEN 0x00AD +#define REGISTERED_SIGN 0x00AE +#define MACRON 0x00AF +#define DEGREE_SIGN 0x00B0 +#define PLUS_MINUS_SIGN 0x00B1 +#define SUPERSCRIPT_TWO 0x00B2 +#define SUPERSCRIPT_THREE 0x00B3 +#define ACUTE_ACCENT 0x00B4 +#define MICRO_SIGN 0x00B5 +#define PILCROW_SIGN 0x00B6 +#define MIDDLE_DOT 0x00B7 +#define CEDILLA 0x00B8 +#define SUPERSCRIPT_ONE 0x00B9 +#define MASCULINE_ORDINAL_INDICATOR 0x00BA +#define RIGHT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK 0x00BB +#define 
VULGAR_FRACTION_ONE_QUARTER 0x00BC +#define VULGAR_FRACTION_ONE_HALF 0x00BD +#define VULGAR_FRACTION_THREE_QUARTERS 0x00BE +#define INVERTED_QUESTION_MARK 0x00BF +#define LATIN_CAPITAL_LETTER_A_WITH_GRAVE 0x00C0 +#define LATIN_CAPITAL_LETTER_A_WITH_ACUTE 0x00C1 +#define LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX 0x00C2 +#define LATIN_CAPITAL_LETTER_A_WITH_TILDE 0x00C3 +#define LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS 0x00C4 +#define LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE 0x00C5 +#define LATIN_CAPITAL_LETTER_AE 0x00C6 +#define LATIN_CAPITAL_LETTER_C_WITH_CEDILLA 0x00C7 +#define LATIN_CAPITAL_LETTER_E_WITH_GRAVE 0x00C8 +#define LATIN_CAPITAL_LETTER_E_WITH_ACUTE 0x00C9 +#define LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX 0x00CA +#define LATIN_CAPITAL_LETTER_E_WITH_DIAERESIS 0x00CB +#define LATIN_CAPITAL_LETTER_I_WITH_GRAVE 0x00CC +#define LATIN_CAPITAL_LETTER_I_WITH_ACUTE 0x00CD +#define LATIN_CAPITAL_LETTER_I_WITH_CIRCUMFLEX 0x00CE +#define LATIN_CAPITAL_LETTER_I_WITH_DIAERESIS 0x00CF +#define LATIN_CAPITAL_LETTER_ETH 0x00D0 +#define LATIN_CAPITAL_LETTER_N_WITH_TILDE 0x00D1 +#define LATIN_CAPITAL_LETTER_O_WITH_GRAVE 0x00D2 +#define LATIN_CAPITAL_LETTER_O_WITH_ACUTE 0x00D3 +#define LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX 0x00D4 +#define LATIN_CAPITAL_LETTER_O_WITH_TILDE 0x00D5 +#define LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS 0x00D6 +#define MULTIPLICATION_SIGN 0x00D7 +#define LATIN_CAPITAL_LETTER_O_WITH_STROKE 0x00D8 +#define LATIN_CAPITAL_LETTER_U_WITH_GRAVE 0x00D9 +#define LATIN_CAPITAL_LETTER_U_WITH_ACUTE 0x00DA +#define LATIN_CAPITAL_LETTER_U_WITH_CIRCUMFLEX 0x00DB +#define LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS 0x00DC +#define LATIN_CAPITAL_LETTER_Y_WITH_ACUTE 0x00DD +#define LATIN_CAPITAL_LETTER_THORN 0x00DE +#define LATIN_SMALL_LETTER_SHARP_S 0x00DF +#define LATIN_SMALL_LETTER_A_WITH_GRAVE 0x00E0 +#define LATIN_SMALL_LETTER_A_WITH_ACUTE 0x00E1 +#define LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX 0x00E2 +#define LATIN_SMALL_LETTER_A_WITH_TILDE 0x00E3 +#define 
LATIN_SMALL_LETTER_A_WITH_DIAERESIS 0x00E4 +#define LATIN_SMALL_LETTER_A_WITH_RING_ABOVE 0x00E5 +#define LATIN_SMALL_LETTER_AE 0x00E6 +#define LATIN_SMALL_LETTER_C_WITH_CEDILLA 0x00E7 +#define LATIN_SMALL_LETTER_E_WITH_GRAVE 0x00E8 +#define LATIN_SMALL_LETTER_E_WITH_ACUTE 0x00E9 +#define LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX 0x00EA +#define LATIN_SMALL_LETTER_E_WITH_DIAERESIS 0x00EB +#define LATIN_SMALL_LETTER_I_WITH_GRAVE 0x00EC +#define LATIN_SMALL_LETTER_I_WITH_ACUTE 0x00ED +#define LATIN_SMALL_LETTER_I_WITH_CIRCUMFLEX 0x00EE +#define LATIN_SMALL_LETTER_I_WITH_DIAERESIS 0x00EF +#define LATIN_SMALL_LETTER_ETH 0x00F0 +#define LATIN_SMALL_LETTER_N_WITH_TILDE 0x00F1 +#define LATIN_SMALL_LETTER_O_WITH_GRAVE 0x00F2 +#define LATIN_SMALL_LETTER_O_WITH_ACUTE 0x00F3 +#define LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX 0x00F4 +#define LATIN_SMALL_LETTER_O_WITH_TILDE 0x00F5 +#define LATIN_SMALL_LETTER_O_WITH_DIAERESIS 0x00F6 +#define DIVISION_SIGN 0x00F7 +#define LATIN_SMALL_LETTER_O_WITH_STROKE 0x00F8 +#define LATIN_SMALL_LETTER_U_WITH_GRAVE 0x00F9 +#define LATIN_SMALL_LETTER_U_WITH_ACUTE 0x00FA +#define LATIN_SMALL_LETTER_U_WITH_CIRCUMFLEX 0x00FB +#define LATIN_SMALL_LETTER_U_WITH_DIAERESIS 0x00FC +#define LATIN_SMALL_LETTER_Y_WITH_ACUTE 0x00FD +#define LATIN_SMALL_LETTER_THORN 0x00FE +#define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS 0x00FF + +/* latin extended-A */ +#define LATIN_CAPITAL_LETTER_A_WITH_MACRON 0x0100 +#define LATIN_SMALL_LETTER_A_WITH_MACRON 0x0101 +#define LATIN_CAPITAL_LETTER_A_WITH_BREVE 0x0102 +#define LATIN_SMALL_LETTER_A_WITH_BREVE 0x0103 +#define LATIN_CAPITAL_LETTER_A_WITH_OGONEK 0x0104 +#define LATIN_SMALL_LETTER_A_WITH_OGONEK 0x0105 +#define LATIN_CAPITAL_LETTER_C_WITH_ACUTE 0x0106 +#define LATIN_SMALL_LETTER_C_WITH_ACUTE 0x0107 +#define LATIN_CAPITAL_LETTER_C_WITH_CIRCUMFLEX 0x0108 +#define LATIN_SMALL_LETTER_C_WITH_CIRCUMFLEX 0x0109 +#define LATIN_CAPITAL_LETTER_C_WITH_DOT_ABOVE 0x010A +#define LATIN_SMALL_LETTER_C_WITH_DOT_ABOVE 0x010B +#define 
LATIN_CAPITAL_LETTER_C_WITH_CARON 0x010C +#define LATIN_SMALL_LETTER_C_WITH_CARON 0x010D +#define LATIN_CAPITAL_LETTER_D_WITH_CARON 0x010E +#define LATIN_SMALL_LETTER_D_WITH_CARON 0x010F +#define LATIN_CAPITAL_LETTER_D_WITH_STROKE 0x0110 +#define LATIN_SMALL_LETTER_D_WITH_STROKE 0x0111 +#define LATIN_CAPITAL_LETTER_E_WITH_MACRON 0x0112 +#define LATIN_SMALL_LETTER_E_WITH_MACRON 0x0113 +#define LATIN_CAPITAL_LETTER_E_WITH_BREVE 0x0114 +#define LATIN_SMALL_LETTER_E_WITH_BREVE 0x0115 +#define LATIN_CAPITAL_LETTER_E_WITH_DOT_ABOVE 0x0116 +#define LATIN_SMALL_LETTER_E_WITH_DOT_ABOVE 0x0117 +#define LATIN_CAPITAL_LETTER_E_WITH_OGONEK 0x0118 +#define LATIN_SMALL_LETTER_E_WITH_OGONEK 0x0119 +#define LATIN_CAPITAL_LETTER_E_WITH_CARON 0x011A +#define LATIN_SMALL_LETTER_E_WITH_CARON 0x011B +#define LATIN_CAPITAL_LETTER_G_WITH_CIRCUMFLEX 0x011C +#define LATIN_SMALL_LETTER_G_WITH_CIRCUMFLEX 0x011D +#define LATIN_CAPITAL_LETTER_G_WITH_BREVE 0x011E +#define LATIN_SMALL_LETTER_G_WITH_BREVE 0x011F +#define LATIN_CAPITAL_LETTER_G_WITH_DOT_ABOVE 0x0120 +#define LATIN_SMALL_LETTER_G_WITH_DOT_ABOVE 0x0121 +#define LATIN_CAPITAL_LETTER_G_WITH_CEDILLA 0x0122 +#define LATIN_SMALL_LETTER_G_WITH_CEDILLA 0x0123 +#define LATIN_CAPITAL_LETTER_H_WITH_CIRCUMFLEX 0x0124 +#define LATIN_SMALL_LETTER_H_WITH_CIRCUMFLEX 0x0125 +#define LATIN_CAPITAL_LETTER_H_WITH_STROKE 0x0126 +#define LATIN_SMALL_LETTER_H_WITH_STROKE 0x0127 +#define LATIN_CAPITAL_LETTER_I_WITH_TILDE 0x0128 +#define LATIN_SMALL_LETTER_I_WITH_TILDE 0x0129 +#define LATIN_CAPITAL_LETTER_I_WITH_MACRON 0x012A +#define LATIN_SMALL_LETTER_I_WITH_MACRON 0x012B +#define LATIN_CAPITAL_LETTER_I_WITH_BREVE 0x012C +#define LATIN_SMALL_LETTER_I_WITH_BREVE 0x012D +#define LATIN_CAPITAL_LETTER_I_WITH_OGONEK 0x012E +#define LATIN_SMALL_LETTER_I_WITH_OGONEK 0x012F +#define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE 0x0130 +#define LATIN_SMALL_LETTER_DOTLESS_I 0x0131 +#define LATIN_CAPITAL_LIGATURE_IJ 0x0132 +#define LATIN_SMALL_LIGATURE_IJ 0x0133 +#define 
LATIN_CAPITAL_LETTER_J_WITH_CIRCUMFLEX 0x0134 +#define LATIN_SMALL_LETTER_J_WITH_CIRCUMFLEX 0x0135 +#define LATIN_CAPITAL_LETTER_K_WITH_CEDILLA 0x0136 +#define LATIN_SMALL_LETTER_K_WITH_CEDILLA 0x0137 +#define LATIN_SMALL_LETTER_KRA 0x0138 +#define LATIN_CAPITAL_LETTER_L_WITH_ACUTE 0x0139 +#define LATIN_SMALL_LETTER_L_WITH_ACUTE 0x013A +#define LATIN_CAPITAL_LETTER_L_WITH_CEDILLA 0x013B +#define LATIN_SMALL_LETTER_L_WITH_CEDILLA 0x013C +#define LATIN_CAPITAL_LETTER_L_WITH_CARON 0x013D +#define LATIN_SMALL_LETTER_L_WITH_CARON 0x013E +#define LATIN_CAPITAL_LETTER_L_WITH_MIDDLE_DOT 0x013F +#define LATIN_SMALL_LETTER_L_WITH_MIDDLE_DOT 0x0140 +#define LATIN_CAPITAL_LETTER_L_WITH_STROKE 0x0141 +#define LATIN_SMALL_LETTER_L_WITH_STROKE 0x0142 +#define LATIN_CAPITAL_LETTER_N_WITH_ACUTE 0x0143 +#define LATIN_SMALL_LETTER_N_WITH_ACUTE 0x0144 +#define LATIN_CAPITAL_LETTER_N_WITH_CEDILLA 0x0145 +#define LATIN_SMALL_LETTER_N_WITH_CEDILLA 0x0146 +#define LATIN_CAPITAL_LETTER_N_WITH_CARON 0x0147 +#define LATIN_SMALL_LETTER_N_WITH_CARON 0x0148 +#define LATIN_SMALL_LETTER_N_PRECEDED_BY_APOSTROPHE 0x0149 +#define LATIN_CAPITAL_LETTER_ENG 0x014A +#define LATIN_SMALL_LETTER_ENG 0x014B +#define LATIN_CAPITAL_LETTER_O_WITH_MACRON 0x014C +#define LATIN_SMALL_LETTER_O_WITH_MACRON 0x014D +#define LATIN_CAPITAL_LETTER_O_WITH_BREVE 0x014E +#define LATIN_SMALL_LETTER_O_WITH_BREVE 0x014F +#define LATIN_CAPITAL_LETTER_O_WITH_DOUBLE_ACUTE 0x0150 +#define LATIN_SMALL_LETTER_O_WITH_DOUBLE_ACUTE 0x0151 +#define LATIN_CAPITAL_LIGATURE_OE 0x0152 +#define LATIN_SMALL_LIGATURE_OE 0x0153 +#define LATIN_CAPITAL_LETTER_R_WITH_ACUTE 0x0154 +#define LATIN_SMALL_LETTER_R_WITH_ACUTE 0x0155 +#define LATIN_CAPITAL_LETTER_R_WITH_CEDILLA 0x0156 +#define LATIN_SMALL_LETTER_R_WITH_CEDILLA 0x0157 +#define LATIN_CAPITAL_LETTER_R_WITH_CARON 0x0158 +#define LATIN_SMALL_LETTER_R_WITH_CARON 0x0159 +#define LATIN_CAPITAL_LETTER_S_WITH_ACUTE 0x015A +#define LATIN_SMALL_LETTER_S_WITH_ACUTE 0x015B +#define 
LATIN_CAPITAL_LETTER_S_WITH_CIRCUMFLEX 0x015C +#define LATIN_SMALL_LETTER_S_WITH_CIRCUMFLEX 0x015D +#define LATIN_CAPITAL_LETTER_S_WITH_CEDILLA 0x015E +#define LATIN_SMALL_LETTER_S_WITH_CEDILLA 0x015F +#define LATIN_CAPITAL_LETTER_S_WITH_CARON 0x0160 +#define LATIN_SMALL_LETTER_S_WITH_CARON 0x0161 +#define LATIN_CAPITAL_LETTER_T_WITH_CEDILLA 0x0162 +#define LATIN_SMALL_LETTER_T_WITH_CEDILLA 0x0163 +#define LATIN_CAPITAL_LETTER_T_WITH_CARON 0x0164 +#define LATIN_SMALL_LETTER_T_WITH_CARON 0x0165 +#define LATIN_CAPITAL_LETTER_T_WITH_STROKE 0x0166 +#define LATIN_SMALL_LETTER_T_WITH_STROKE 0x0167 +#define LATIN_CAPITAL_LETTER_U_WITH_TILDE 0x0168 +#define LATIN_SMALL_LETTER_U_WITH_TILDE 0x0169 +#define LATIN_CAPITAL_LETTER_U_WITH_MACRON 0x016A +#define LATIN_SMALL_LETTER_U_WITH_MACRON 0x016B +#define LATIN_CAPITAL_LETTER_U_WITH_BREVE 0x016C +#define LATIN_SMALL_LETTER_U_WITH_BREVE 0x016D +#define LATIN_CAPITAL_LETTER_U_WITH_RING_ABOVE 0x016E +#define LATIN_SMALL_LETTER_U_WITH_RING_ABOVE 0x016F +#define LATIN_CAPITAL_LETTER_U_WITH_DOUBLE_ACUTE 0x0170 +#define LATIN_SMALL_LETTER_U_WITH_DOUBLE_ACUTE 0x0171 +#define LATIN_CAPITAL_LETTER_U_WITH_OGONEK 0x0172 +#define LATIN_SMALL_LETTER_U_WITH_OGONEK 0x0173 +#define LATIN_CAPITAL_LETTER_W_WITH_CIRCUMFLEX 0x0174 +#define LATIN_SMALL_LETTER_W_WITH_CIRCUMFLEX 0x0175 +#define LATIN_CAPITAL_LETTER_Y_WITH_CIRCUMFLEX 0x0176 +#define LATIN_SMALL_LETTER_Y_WITH_CIRCUMFLEX 0x0177 +#define LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS 0x0178 +#define LATIN_CAPITAL_LETTER_Z_WITH_ACUTE 0x0179 +#define LATIN_SMALL_LETTER_Z_WITH_ACUTE 0x017A +#define LATIN_CAPITAL_LETTER_Z_WITH_DOT_ABOVE 0x017B +#define LATIN_SMALL_LETTER_Z_WITH_DOT_ABOVE 0x017C +#define LATIN_CAPITAL_LETTER_Z_WITH_CARON 0x017D +#define LATIN_SMALL_LETTER_Z_WITH_CARON 0x017E +#define LATIN_SMALL_LETTER_LONG_S 0x017F + +/* latin extended B */ +#define LATIN_SMALL_LETTER_B_WITH_STROKE 0x0180 +#define LATIN_CAPITAL_LETTER_B_WITH_HOOK 0x0181 +#define LATIN_CAPITAL_LETTER_B_WITH_TOPBAR 
0x0182 +#define LATIN_SMALL_LETTER_B_WITH_TOPBAR 0x0183 +#define LATIN_CAPITAL_LETTER_TONE_SIX 0x0184 +#define LATIN_SMALL_LETTER_TONE_SIX 0x0185 +#define LATIN_CAPITAL_LETTER_OPEN_O 0x0186 +#define LATIN_CAPITAL_LETTER_C_WITH_HOOK 0x0187 +#define LATIN_SMALL_LETTER_C_WITH_HOOK 0x0188 +#define LATIN_CAPITAL_LETTER_AFRICAN_D 0x0189 +#define LATIN_CAPITAL_LETTER_D_WITH_HOOK 0x018A +#define LATIN_CAPITAL_LETTER_D_WITH_TOPBAR 0x018B +#define LATIN_SMALL_LETTER_D_WITH_TOPBAR 0x018C +#define LATIN_SMALL_LETTER_TURNED_DELTA 0x018D +#define LATIN_CAPITAL_LETTER_REVERSED_E 0x018E +#define LATIN_CAPITAL_LETTER_SCHWA 0x018F +#define LATIN_CAPITAL_LETTER_OPEN_E 0x0190 +#define LATIN_CAPITAL_LETTER_F_WITH_HOOK 0x0191 +#define LATIN_SMALL_LETTER_F_WITH_HOOK 0x0192 +#define LATIN_CAPITAL_LETTER_G_WITH_HOOK 0x0193 +#define LATIN_CAPITAL_LETTER_GAMMA 0x0194 +#define LATIN_SMALL_LETTER_HV 0x0195 +#define LATIN_CAPITAL_LETTER_IOTA 0x0196 +#define LATIN_CAPITAL_LETTER_I_WITH_STROKE 0x0197 +#define LATIN_CAPITAL_LETTER_K_WITH_HOOK 0x0198 +#define LATIN_SMALL_LETTER_K_WITH_HOOK 0x0199 +#define LATIN_SMALL_LETTER_L_WITH_BAR 0x019A +#define LATIN_SMALL_LETTER_LAMBDA_WITH_STROKE 0x019B +#define LATIN_CAPITAL_LETTER_TURNED_M 0x019C +#define LATIN_CAPITAL_LETTER_N_WITH_LEFT_HOOK 0x019D +#define LATIN_SMALL_LETTER_N_WITH_LONG_RIGHT_LEG 0x019E +#define LATIN_CAPITAL_LETTER_O_WITH_MIDDLE_TILDE 0x019F +#define LATIN_CAPITAL_LETTER_O_WITH_HORN 0x01A0 +#define LATIN_SMALL_LETTER_O_WITH_HORN 0x01A1 +#define LATIN_CAPITAL_LETTER_OI 0x01A2 +#define LATIN_SMALL_LETTER_OI 0x01A3 +#define LATIN_CAPITAL_LETTER_P_WITH_HOOK 0x01A4 +#define LATIN_SMALL_LETTER_P_WITH_HOOK 0x01A5 +#define LATIN_LETTER_YR 0x01A6 +#define LATIN_CAPITAL_LETTER_TONE_TWO 0x01A7 +#define LATIN_SMALL_LETTER_TONE_TWO 0x01A8 +#define LATIN_CAPITAL_LETTER_ESH 0x01A9 +#define LATIN_LETTER_REVERSED_ESH_LOOP 0x01AA +#define LATIN_SMALL_LETTER_T_WITH_PALATAL_HOOK 0x01AB +#define LATIN_CAPITAL_LETTER_T_WITH_HOOK 0x01AC +#define 
LATIN_SMALL_LETTER_T_WITH_HOOK 0x01AD +#define LATIN_CAPITAL_LETTER_T_WITH_RETROFLEX_HOOK 0x01AE +#define LATIN_CAPITAL_LETTER_U_WITH_HORN 0x01AF +#define LATIN_SMALL_LETTER_U_WITH_HORN 0x01B0 +#define LATIN_CAPITAL_LETTER_UPSILON 0x01B1 +#define LATIN_CAPITAL_LETTER_V_WITH_HOOK 0x01B2 +#define LATIN_CAPITAL_LETTER_Y_WITH_HOOK 0x01B3 +#define LATIN_SMALL_LETTER_Y_WITH_HOOK 0x01B4 +#define LATIN_CAPITAL_LETTER_Z_WITH_STROKE 0x01B5 +#define LATIN_SMALL_LETTER_Z_WITH_STROKE 0x01B6 +#define LATIN_CAPITAL_LETTER_EZH 0x01B7 +#define LATIN_CAPITAL_LETTER_EZH_REVERSED 0x01B8 +#define LATIN_SMALL_LETTER_EZH_REVERSED 0x01B9 +#define LATIN_SMALL_LETTER_EZH_WITH_TAIL 0x01BA +#define LATIN_LETTER_TWO_WITH_STROKE 0x01BB +#define LATIN_CAPITAL_LETTER_TONE_FIVE 0x01BC +#define LATIN_SMALL_LETTER_TONE_FIVE 0x01BD +#define LATIN_LETTER_INVERTED_GLOTTAL_STOP_WITH_STROKE 0x01BE +#define LATIN_LETTER_WYNN 0x01BF +#define LATIN_LETTER_DENTAL_CLICK 0x01C0 +#define LATIN_LETTER_LATERAL_CLICK 0x01C1 +#define LATIN_LETTER_ALVEOLAR_CLICK 0x01C2 +#define LATIN_LETTER_RETROFLEX_CLICK 0x01C3 +#define LATIN_CAPITAL_LETTER_DZ_WITH_CARON 0x01C4 +#define LATIN_CAPITAL_LETTER_D_WITH_SMALL_LETTER_Z_WITH_CARON 0x01C5 +#define LATIN_SMALL_LETTER_DZ_WITH_CARON 0x01C6 +#define LATIN_CAPITAL_LETTER_LJ 0x01C7 +#define LATIN_CAPITAL_LETTER_L_WITH_SMALL_LETTER_J 0x01C8 +#define LATIN_SMALL_LETTER_LJ 0x01C9 +#define LATIN_CAPITAL_LETTER_NJ 0x01CA +#define LATIN_CAPITAL_LETTER_N_WITH_SMALL_LETTER_J 0x01CB +#define LATIN_SMALL_LETTER_NJ 0x01CC +#define LATIN_CAPITAL_LETTER_A_WITH_CARON 0x01CD +#define LATIN_SMALL_LETTER_A_WITH_CARON 0x01CE +#define LATIN_CAPITAL_LETTER_I_WITH_CARON 0x01CF +#define LATIN_SMALL_LETTER_I_WITH_CARON 0x01D0 +#define LATIN_CAPITAL_LETTER_O_WITH_CARON 0x01D1 +#define LATIN_SMALL_LETTER_O_WITH_CARON 0x01D2 +#define LATIN_CAPITAL_LETTER_U_WITH_CARON 0x01D3 +#define LATIN_SMALL_LETTER_U_WITH_CARON 0x01D4 +#define LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS_AND_MACRON 0x01D5 +#define 
LATIN_SMALL_LETTER_U_WITH_DIAERESIS_AND_MACRON 0x01D6 +#define LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS_AND_ACUTE 0x01D7 +#define LATIN_SMALL_LETTER_U_WITH_DIAERESIS_AND_ACUTE 0x01D8 +#define LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS_AND_CARON 0x01D9 +#define LATIN_SMALL_LETTER_U_WITH_DIAERESIS_AND_CARON 0x01DA +#define LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS_AND_GRAVE 0x01DB +#define LATIN_SMALL_LETTER_U_WITH_DIAERESIS_AND_GRAVE 0x01DC +#define LATIN_SMALL_LETTER_TURNED_E 0x01DD +#define LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS_AND_MACRON 0x01DE +#define LATIN_SMALL_LETTER_A_WITH_DIAERESIS_AND_MACRON 0x01DF +#define LATIN_CAPITAL_LETTER_A_WITH_DOT_ABOVE_AND_MACRON 0x01E0 +#define LATIN_SMALL_LETTER_A_WITH_DOT_ABOVE_AND_MACRON 0x01E1 +#define LATIN_CAPITAL_LETTER_AE_WITH_MACRON 0x01E2 +#define LATIN_SMALL_LETTER_AE_WITH_MACRON 0x01E3 +#define LATIN_CAPITAL_LETTER_G_WITH_STROKE 0x01E4 +#define LATIN_SMALL_LETTER_G_WITH_STROKE 0x01E5 +#define LATIN_CAPITAL_LETTER_G_WITH_CARON 0x01E6 +#define LATIN_SMALL_LETTER_G_WITH_CARON 0x01E7 +#define LATIN_CAPITAL_LETTER_K_WITH_CARON 0x01E8 +#define LATIN_SMALL_LETTER_K_WITH_CARON 0x01E9 +#define LATIN_CAPITAL_LETTER_O_WITH_OGONEK 0x01EA +#define LATIN_SMALL_LETTER_O_WITH_OGONEK 0x01EB +#define LATIN_CAPITAL_LETTER_O_WITH_OGONEK_AND_MACRON 0x01EC +#define LATIN_SMALL_LETTER_O_WITH_OGONEK_AND_MACRON 0x01ED +#define LATIN_CAPITAL_LETTER_EZH_WITH_CARON 0x01EE +#define LATIN_SMALL_LETTER_EZH_WITH_CARON 0x01EF +#define LATIN_SMALL_LETTER_J_WITH_CARON 0x01F0 +#define LATIN_CAPITAL_LETTER_DZ 0x01F1 +#define LATIN_CAPITAL_LETTER_D_WITH_SMALL_LETTER_Z 0x01F2 +#define LATIN_SMALL_LETTER_DZ 0x01F3 +#define LATIN_CAPITAL_LETTER_G_WITH_ACUTE 0x01F4 +#define LATIN_SMALL_LETTER_G_WITH_ACUTE 0x01F5 +#define LATIN_CAPITAL_LETTER_HWAIR 0x01F6 +#define LATIN_CAPITAL_LETTER_WYNN 0x01F7 +#define LATIN_CAPITAL_LETTER_N_WITH_GRAVE 0x01F8 +#define LATIN_SMALL_LETTER_N_WITH_GRAVE 0x01F9 +#define LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE_AND_ACUTE 0x01FA +#define 
LATIN_SMALL_LETTER_A_WITH_RING_ABOVE_AND_ACUTE 0x01FB +#define LATIN_CAPITAL_LETTER_AE_WITH_ACUTE 0x01FC +#define LATIN_SMALL_LETTER_AE_WITH_ACUTE 0x01FD +#define LATIN_CAPITAL_LETTER_O_WITH_STROKE_AND_ACUTE 0x01FE +#define LATIN_SMALL_LETTER_O_WITH_STROKE_AND_ACUTE 0x01FF +#define LATIN_CAPITAL_LETTER_A_WITH_DOUBLE_GRAVE 0x0200 +#define LATIN_SMALL_LETTER_A_WITH_DOUBLE_GRAVE 0x0201 +#define LATIN_CAPITAL_LETTER_A_WITH_INVERTED_BREVE 0x0202 +#define LATIN_SMALL_LETTER_A_WITH_INVERTED_BREVE 0x0203 +#define LATIN_CAPITAL_LETTER_E_WITH_DOUBLE_GRAVE 0x0204 +#define LATIN_SMALL_LETTER_E_WITH_DOUBLE_GRAVE 0x0205 +#define LATIN_CAPITAL_LETTER_E_WITH_INVERTED_BREVE 0x0206 +#define LATIN_SMALL_LETTER_E_WITH_INVERTED_BREVE 0x0207 +#define LATIN_CAPITAL_LETTER_I_WITH_DOUBLE_GRAVE 0x0208 +#define LATIN_SMALL_LETTER_I_WITH_DOUBLE_GRAVE 0x0209 +#define LATIN_CAPITAL_LETTER_I_WITH_INVERTED_BREVE 0x020A +#define LATIN_SMALL_LETTER_I_WITH_INVERTED_BREVE 0x020B +#define LATIN_CAPITAL_LETTER_O_WITH_DOUBLE_GRAVE 0x020C +#define LATIN_SMALL_LETTER_O_WITH_DOUBLE_GRAVE 0x020D +#define LATIN_CAPITAL_LETTER_O_WITH_INVERTED_BREVE 0x020E +#define LATIN_SMALL_LETTER_O_WITH_INVERTED_BREVE 0x020F +#define LATIN_CAPITAL_LETTER_R_WITH_DOUBLE_GRAVE 0x0210 +#define LATIN_SMALL_LETTER_R_WITH_DOUBLE_GRAVE 0x0211 +#define LATIN_CAPITAL_LETTER_R_WITH_INVERTED_BREVE 0x0212 +#define LATIN_SMALL_LETTER_R_WITH_INVERTED_BREVE 0x0213 +#define LATIN_CAPITAL_LETTER_U_WITH_DOUBLE_GRAVE 0x0214 +#define LATIN_SMALL_LETTER_U_WITH_DOUBLE_GRAVE 0x0215 +#define LATIN_CAPITAL_LETTER_U_WITH_INVERTED_BREVE 0x0216 +#define LATIN_SMALL_LETTER_U_WITH_INVERTED_BREVE 0x0217 +#define LATIN_CAPITAL_LETTER_S_WITH_COMMA_BELOW 0x0218 +#define LATIN_SMALL_LETTER_S_WITH_COMMA_BELOW 0x0219 +#define LATIN_CAPITAL_LETTER_T_WITH_COMMA_BELOW 0x021A +#define LATIN_SMALL_LETTER_T_WITH_COMMA_BELOW 0x021B +#define LATIN_CAPITAL_LETTER_YOGH 0x021C +#define LATIN_SMALL_LETTER_YOGH 0x021D +#define LATIN_CAPITAL_LETTER_H_WITH_CARON 0x021E 
+#define LATIN_SMALL_LETTER_H_WITH_CARON 0x021F +#define LATIN_CAPITAL_LETTER_OU 0x0222 +#define LATIN_SMALL_LETTER_OU 0x0223 +#define LATIN_CAPITAL_LETTER_Z_WITH_HOOK 0x0224 +#define LATIN_SMALL_LETTER_Z_WITH_HOOK 0x0225 +#define LATIN_CAPITAL_LETTER_A_WITH_DOT_ABOVE 0x0226 +#define LATIN_SMALL_LETTER_A_WITH_DOT_ABOVE 0x0227 +#define LATIN_CAPITAL_LETTER_E_WITH_CEDILLA 0x0228 +#define LATIN_SMALL_LETTER_E_WITH_CEDILLA 0x0229 +#define LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS_AND_MACRON 0x022A +#define LATIN_SMALL_LETTER_O_WITH_DIAERESIS_AND_MACRON 0x022B +#define LATIN_CAPITAL_LETTER_O_WITH_TILDE_AND_MACRON 0x022C +#define LATIN_SMALL_LETTER_O_WITH_TILDE_AND_MACRON 0x022D +#define LATIN_CAPITAL_LETTER_O_WITH_DOT_ABOVE 0x022E +#define LATIN_SMALL_LETTER_O_WITH_DOT_ABOVE 0x022F +#define LATIN_CAPITAL_LETTER_O_WITH_DOT_ABOVE_AND_MACRON 0x0230 +#define LATIN_SMALL_LETTER_O_WITH_DOT_ABOVE_AND_MACRON 0x0231 +#define LATIN_CAPITAL_LETTER_Y_WITH_MACRON 0x0232 +#define LATIN_SMALL_LETTER_Y_WITH_MACRON 0x0233 + +/* IPA extensions */ +#define LATIN_SMALL_LETTER_TURNED_A 0x0250 +#define LATIN_SMALL_LETTER_ALPHA 0x0251 +#define LATIN_SMALL_LETTER_TURNED_ALPHA 0x0252 +#define LATIN_SMALL_LETTER_B_WITH_HOOK 0x0253 +#define LATIN_SMALL_LETTER_OPEN_O 0x0254 +#define LATIN_SMALL_LETTER_C_WITH_CURL 0x0255 +#define LATIN_SMALL_LETTER_D_WITH_TAIL 0x0256 +#define LATIN_SMALL_LETTER_D_WITH_HOOK 0x0257 +#define LATIN_SMALL_LETTER_REVERSED_E 0x0258 +#define LATIN_SMALL_LETTER_SCHWA 0x0259 +#define LATIN_SMALL_LETTER_SCHWA_WITH_HOOK 0x025A +#define LATIN_SMALL_LETTER_OPEN_E 0x025B +#define LATIN_SMALL_LETTER_REVERSED_OPEN_E 0x025C +#define LATIN_SMALL_LETTER_REVERSED_OPEN_E_WITH_HOOK 0x025D +#define LATIN_SMALL_LETTER_CLOSED_REVERSED_OPEN_E 0x025E +#define LATIN_SMALL_LETTER_DOTLESS_J_WITH_STROKE 0x025F +#define LATIN_SMALL_LETTER_G_WITH_HOOK 0x0260 +#define LATIN_SMALL_LETTER_SCRIPT_G 0x0261 +#define LATIN_LETTER_SMALL_CAPITAL_G 0x0262 +#define LATIN_SMALL_LETTER_GAMMA 0x0263 +#define 
LATIN_SMALL_LETTER_RAMS_HORN 0x0264 +#define LATIN_SMALL_LETTER_TURNED_H 0x0265 +#define LATIN_SMALL_LETTER_H_WITH_HOOK 0x0266 +#define LATIN_SMALL_LETTER_HENG_WITH_HOOK 0x0267 +#define LATIN_SMALL_LETTER_I_WITH_STROKE 0x0268 +#define LATIN_SMALL_LETTER_IOTA 0x0269 +#define LATIN_LETTER_SMALL_CAPITAL_I 0x026A +#define LATIN_SMALL_LETTER_L_WITH_MIDDLE_TILDE 0x026B +#define LATIN_SMALL_LETTER_L_WITH_BELT 0x026C +#define LATIN_SMALL_LETTER_L_WITH_RETROFLEX_HOOK 0x026D +#define LATIN_SMALL_LETTER_LEZH 0x026E +#define LATIN_SMALL_LETTER_TURNED_M 0x026F +#define LATIN_SMALL_LETTER_TURNED_M_WITH_LONG_LEG 0x0270 +#define LATIN_SMALL_LETTER_M_WITH_HOOK 0x0271 +#define LATIN_SMALL_LETTER_N_WITH_LEFT_HOOK 0x0272 +#define LATIN_SMALL_LETTER_N_WITH_RETROFLEX_HOOK 0x0273 +#define LATIN_LETTER_SMALL_CAPITAL_N 0x0274 +#define LATIN_SMALL_LETTER_BARRED_O 0x0275 +#define LATIN_LETTER_SMALL_CAPITAL_OE 0x0276 +#define LATIN_SMALL_LETTER_CLOSED_OMEGA 0x0277 +#define LATIN_SMALL_LETTER_PHI 0x0278 +#define LATIN_SMALL_LETTER_TURNED_R 0x0279 +#define LATIN_SMALL_LETTER_TURNED_R_WITH_LONG_LEG 0x027A +#define LATIN_SMALL_LETTER_TURNED_R_WITH_HOOK 0x027B +#define LATIN_SMALL_LETTER_R_WITH_LONG_LEG 0x027C +#define LATIN_SMALL_LETTER_R_WITH_TAIL 0x027D +#define LATIN_SMALL_LETTER_R_WITH_FISHHOOK 0x027E +#define LATIN_SMALL_LETTER_REVERSED_R_WITH_FISHHOOK 0x027F +#define LATIN_LETTER_SMALL_CAPITAL_R 0x0280 +#define LATIN_LETTER_SMALL_CAPITAL_INVERTED_R 0x0281 +#define LATIN_SMALL_LETTER_S_WITH_HOOK 0x0282 +#define LATIN_SMALL_LETTER_ESH 0x0283 +#define LATIN_SMALL_LETTER_DOTLESS_J_WITH_STROKE_AND_HOOK 0x0284 +#define LATIN_SMALL_LETTER_SQUAT_REVERSED_ESH 0x0285 +#define LATIN_SMALL_LETTER_ESH_WITH_CURL 0x0286 +#define LATIN_SMALL_LETTER_TURNED_T 0x0287 +#define LATIN_SMALL_LETTER_T_WITH_RETROFLEX_HOOK 0x0288 +#define LATIN_SMALL_LETTER_U_BAR 0x0289 +#define LATIN_SMALL_LETTER_UPSILON 0x028A +#define LATIN_SMALL_LETTER_V_WITH_HOOK 0x028B +#define LATIN_SMALL_LETTER_TURNED_V 0x028C +#define 
LATIN_SMALL_LETTER_TURNED_W 0x028D +#define LATIN_SMALL_LETTER_TURNED_Y 0x028E +#define LATIN_LETTER_SMALL_CAPITAL_Y 0x028F +#define LATIN_SMALL_LETTER_Z_WITH_RETROFLEX_HOOK 0x0290 +#define LATIN_SMALL_LETTER_Z_WITH_CURL 0x0291 +#define LATIN_SMALL_LETTER_EZH 0x0292 +#define LATIN_SMALL_LETTER_EZH_WITH_CURL 0x0293 +#define LATIN_LETTER_GLOTTAL_STOP 0x0294 +#define LATIN_LETTER_PHARYNGEAL_VOICED_FRICATIVE 0x0295 +#define LATIN_LETTER_INVERTED_GLOTTAL_STOP 0x0296 +#define LATIN_LETTER_STRETCHED_C 0x0297 +#define LATIN_LETTER_BILABIAL_CLICK 0x0298 +#define LATIN_LETTER_SMALL_CAPITAL_B 0x0299 +#define LATIN_SMALL_LETTER_CLOSED_OPEN_E 0x029A +#define LATIN_LETTER_SMALL_CAPITAL_G_WITH_HOOK 0x029B +#define LATIN_LETTER_SMALL_CAPITAL_H 0x029C +#define LATIN_SMALL_LETTER_J_WITH_CROSSED_TAIL 0x029D +#define LATIN_SMALL_LETTER_TURNED_K 0x029E +#define LATIN_LETTER_SMALL_CAPITAL_L 0x029F +#define LATIN_SMALL_LETTER_Q_WITH_HOOK 0x02A0 +#define LATIN_LETTER_GLOTTAL_STOP_WITH_STROKE 0x02A1 +#define LATIN_LETTER_REVERSED_GLOTTAL_STOP_WITH_STROKE 0x02A2 +#define LATIN_SMALL_LETTER_DZ_DIGRAPH 0x02A3 +#define LATIN_SMALL_LETTER_DEZH_DIGRAPH 0x02A4 +#define LATIN_SMALL_LETTER_DZ_DIGRAPH_WITH_CURL 0x02A5 +#define LATIN_SMALL_LETTER_TS_DIGRAPH 0x02A6 +#define LATIN_SMALL_LETTER_TESH_DIGRAPH 0x02A7 +#define LATIN_SMALL_LETTER_TC_DIGRAPH_WITH_CURL 0x02A8 +#define LATIN_SMALL_LETTER_FENG_DIGRAPH 0x02A9 +#define LATIN_SMALL_LETTER_LS_DIGRAPH 0x02AA +#define LATIN_SMALL_LETTER_LZ_DIGRAPH 0x02AB +#define LATIN_LETTER_BILABIAL_PERCUSSIVE 0x02AC +#define LATIN_LETTER_BIDENTAL_PERCUSSIVE 0x02AD + +/* spacing modifier letters */ +#define MODIFIER_LETTER_SMALL_H 0x02B0 +#define MODIFIER_LETTER_SMALL_H_WITH_HOOK 0x02B1 +#define MODIFIER_LETTER_SMALL_J 0x02B2 +#define MODIFIER_LETTER_SMALL_R 0x02B3 +#define MODIFIER_LETTER_SMALL_TURNED_R 0x02B4 +#define MODIFIER_LETTER_SMALL_TURNED_R_WITH_HOOK 0x02B5 +#define MODIFIER_LETTER_SMALL_CAPITAL_INVERTED_R 0x02B6 +#define MODIFIER_LETTER_SMALL_W 0x02B7 
+#define MODIFIER_LETTER_SMALL_Y 0x02B8 +#define MODIFIER_LETTER_PRIME 0x02B9 +#define MODIFIER_LETTER_DOUBLE_PRIME 0x02BA +#define MODIFIER_LETTER_TURNED_COMMA 0x02BB +#define MODIFIER_LETTER_APOSTROPHE 0x02BC +#define MODIFIER_LETTER_REVERSED_COMMA 0x02BD +#define MODIFIER_LETTER_RIGHT_HALF_RING 0x02BE +#define MODIFIER_LETTER_LEFT_HALF_RING 0x02BF +#define MODIFIER_LETTER_GLOTTAL_STOP 0x02C0 +#define MODIFIER_LETTER_REVERSED_GLOTTAL_STOP 0x02C1 +#define MODIFIER_LETTER_LEFT_ARROWHEAD 0x02C2 +#define MODIFIER_LETTER_RIGHT_ARROWHEAD 0x02C3 +#define MODIFIER_LETTER_UP_ARROWHEAD 0x02C4 +#define MODIFIER_LETTER_DOWN_ARROWHEAD 0x02C5 +#define MODIFIER_LETTER_CIRCUMFLEX_ACCENT 0x02C6 +#define CARON 0x02C7 +#define MODIFIER_LETTER_VERTICAL_LINE 0x02C8 +#define MODIFIER_LETTER_MACRON 0x02C9 +#define MODIFIER_LETTER_ACUTE_ACCENT 0x02CA +#define MODIFIER_LETTER_GRAVE_ACCENT 0x02CB +#define MODIFIER_LETTER_LOW_VERTICAL_LINE 0x02CC +#define MODIFIER_LETTER_LOW_MACRON 0x02CD +#define MODIFIER_LETTER_LOW_GRAVE_ACCENT 0x02CE +#define MODIFIER_LETTER_LOW_ACUTE_ACCENT 0x02CF +#define MODIFIER_LETTER_TRIANGULAR_COLON 0x02D0 +#define MODIFIER_LETTER_HALF_TRIANGULAR_COLON 0x02D1 +#define MODIFIER_LETTER_CENTRED_RIGHT_HALF_RING 0x02D2 +#define MODIFIER_LETTER_CENTRED_LEFT_HALF_RING 0x02D3 +#define MODIFIER_LETTER_UP_TACK 0x02D4 +#define MODIFIER_LETTER_DOWN_TACK 0x02D5 +#define MODIFIER_LETTER_PLUS_SIGN 0x02D6 +#define MODIFIER_LETTER_MINUS_SIGN 0x02D7 +#define BREVE 0x02D8 +#define DOT_ABOVE 0x02D9 +#define RING_ABOVE 0x02DA +#define OGONEK 0x02DB +#define SMALL_TILDE 0x02DC +#define DOUBLE_ACUTE_ACCENT 0x02DD +#define MODIFIER_LETTER_RHOTIC_HOOK 0x02DE +#define MODIFIER_LETTER_CROSS_ACCENT 0x02DF +#define MODIFIER_LETTER_SMALL_GAMMA 0x02E0 +#define MODIFIER_LETTER_SMALL_L 0x02E1 +#define MODIFIER_LETTER_SMALL_S 0x02E2 +#define MODIFIER_LETTER_SMALL_X 0x02E3 +#define MODIFIER_LETTER_SMALL_REVERSED_GLOTTAL_STOP 0x02E4 +#define MODIFIER_LETTER_EXTRA_HIGH_TONE_BAR 0x02E5 +#define 
MODIFIER_LETTER_HIGH_TONE_BAR 0x02E6 +#define MODIFIER_LETTER_MID_TONE_BAR 0x02E7 +#define MODIFIER_LETTER_LOW_TONE_BAR 0x02E8 +#define MODIFIER_LETTER_EXTRA_LOW_TONE_BAR 0x02E9 +#define MODIFIER_LETTER_YIN_DEPARTING_TONE_MARK 0x02EA +#define MODIFIER_LETTER_YANG_DEPARTING_TONE_MARK 0x02EB +#define MODIFIER_LETTER_VOICING 0x02EC +#define MODIFIER_LETTER_UNASPIRATED 0x02ED +#define MODIFIER_LETTER_DOUBLE_APOSTROPHE 0x02EE + +/* combining diacritical marks */ +#define COMBINING_GRAVE_ACCENT 0x0300 +#define COMBINING_ACUTE_ACCENT 0x0301 +#define COMBINING_CIRCUMFLEX_ACCENT 0x0302 +#define COMBINING_TILDE 0x0303 +#define COMBINING_MACRON 0x0304 +#define COMBINING_OVERLINE 0x0305 +#define COMBINING_BREVE 0x0306 +#define COMBINING_DOT_ABOVE 0x0307 +#define COMBINING_DIAERESIS 0x0308 +#define COMBINING_HOOK_ABOVE 0x0309 +#define COMBINING_RING_ABOVE 0x030A +#define COMBINING_DOUBLE_ACUTE_ACCENT 0x030B +#define COMBINING_CARON 0x030C +#define COMBINING_VERTICAL_LINE_ABOVE 0x030D +#define COMBINING_DOUBLE_VERTICAL_LINE_ABOVE 0x030E +#define COMBINING_DOUBLE_GRAVE_ACCENT 0x030F +#define COMBINING_CANDRABINDU 0x0310 +#define COMBINING_INVERTED_BREVE 0x0311 +#define COMBINING_TURNED_COMMA_ABOVE 0x0312 +#define COMBINING_COMMA_ABOVE 0x0313 +#define COMBINING_REVERSED_COMMA_ABOVE 0x0314 +#define COMBINING_COMMA_ABOVE_RIGHT 0x0315 +#define COMBINING_GRAVE_ACCENT_BELOW 0x0316 +#define COMBINING_ACUTE_ACCENT_BELOW 0x0317 +#define COMBINING_LEFT_TACK_BELOW 0x0318 +#define COMBINING_RIGHT_TACK_BELOW 0x0319 +#define COMBINING_LEFT_ANGLE_ABOVE 0x031A +#define COMBINING_HORN 0x031B +#define COMBINING_LEFT_HALF_RING_BELOW 0x031C +#define COMBINING_UP_TACK_BELOW 0x031D +#define COMBINING_DOWN_TACK_BELOW 0x031E +#define COMBINING_PLUS_SIGN_BELOW 0x031F +#define COMBINING_MINUS_SIGN_BELOW 0x0320 +#define COMBINING_PALATALIZED_HOOK_BELOW 0x0321 +#define COMBINING_RETROFLEX_HOOK_BELOW 0x0322 +#define COMBINING_DOT_BELOW 0x0323 +#define COMBINING_DIAERESIS_BELOW 0x0324 +#define 
COMBINING_RING_BELOW 0x0325 +#define COMBINING_COMMA_BELOW 0x0326 +#define COMBINING_CEDILLA 0x0327 +#define COMBINING_OGONEK 0x0328 +#define COMBINING_VERTICAL_LINE_BELOW 0x0329 +#define COMBINING_BRIDGE_BELOW 0x032A +#define COMBINING_INVERTED_DOUBLE_ARCH_BELOW 0x032B +#define COMBINING_CARON_BELOW 0x032C +#define COMBINING_CIRCUMFLEX_ACCENT_BELOW 0x032D +#define COMBINING_BREVE_BELOW 0x032E +#define COMBINING_INVERTED_BREVE_BELOW 0x032F +#define COMBINING_TILDE_BELOW 0x0330 +#define COMBINING_MACRON_BELOW 0x0331 +#define COMBINING_LOW_LINE 0x0332 +#define COMBINING_DOUBLE_LOW_LINE 0x0333 +#define COMBINING_TILDE_OVERLAY 0x0334 +#define COMBINING_SHORT_STROKE_OVERLAY 0x0335 +#define COMBINING_LONG_STROKE_OVERLAY 0x0336 +#define COMBINING_SHORT_SOLIDUS_OVERLAY 0x0337 +#define COMBINING_LONG_SOLIDUS_OVERLAY 0x0338 +#define COMBINING_RIGHT_HALF_RING_BELOW 0x0339 +#define COMBINING_INVERTED_BRIDGE_BELOW 0x033A +#define COMBINING_SQUARE_BELOW 0x033B +#define COMBINING_SEAGULL_BELOW 0x033C +#define COMBINING_X_ABOVE 0x033D +#define COMBINING_VERTICAL_TILDE 0x033E +#define COMBINING_DOUBLE_OVERLINE 0x033F +#define COMBINING_GRAVE_TONE_MARK 0x0340 +#define COMBINING_ACUTE_TONE_MARK 0x0341 +#define COMBINING_GREEK_PERISPOMENI 0x0342 +#define COMBINING_GREEK_KORONIS 0x0343 +#define COMBINING_GREEK_DIALYTIKA_TONOS 0x0344 +#define COMBINING_GREEK_YPOGEGRAMMENI 0x0345 +#define COMBINING_BRIDGE_ABOVE 0x0346 +#define COMBINING_EQUALS_SIGN_BELOW 0x0347 +#define COMBINING_DOUBLE_VERTICAL_LINE_BELOW 0x0348 +#define COMBINING_LEFT_ANGLE_BELOW 0x0349 +#define COMBINING_NOT_TILDE_ABOVE 0x034A +#define COMBINING_HOMOTHETIC_ABOVE 0x034B +#define COMBINING_ALMOST_EQUAL_TO_ABOVE 0x034C +#define COMBINING_LEFT_RIGHT_ARROW_BELOW 0x034D +#define COMBINING_UPWARDS_ARROW_BELOW 0x034E +#define COMBINING_DOUBLE_TILDE 0x0360 +#define COMBINING_DOUBLE_INVERTED_BREVE 0x0361 +#define COMBINING_DOUBLE_RIGHTWARDS_ARROW_BELOW 0x0362 + +/* greek letters */ +#define GREEK_NUMERAL_SIGN 0x0374 +#define 
GREEK_LOWER_NUMERAL_SIGN 0x0375 +#define GREEK_YPOGEGRAMMENI 0x037A +#define GREEK_QUESTION_MARK 0x037E +#define GREEK_TONOS 0x0384 +#define GREEK_DIALYTIKA_TONOS 0x0385 +#define GREEK_CAPITAL_LETTER_ALPHA_WITH_TONOS 0x0386 +#define GREEK_ANO_TELEIA 0x0387 +#define GREEK_CAPITAL_LETTER_EPSILON_WITH_TONOS 0x0388 +#define GREEK_CAPITAL_LETTER_ETA_WITH_TONOS 0x0389 +#define GREEK_CAPITAL_LETTER_IOTA_WITH_TONOS 0x038A +#define GREEK_CAPITAL_LETTER_OMICRON_WITH_TONOS 0x038C +#define GREEK_CAPITAL_LETTER_UPSILON_WITH_TONOS 0x038E +#define GREEK_CAPITAL_LETTER_OMEGA_WITH_TONOS 0x038F +#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390 +#define GREEK_CAPITAL_LETTER_ALPHA 0x0391 +#define GREEK_CAPITAL_LETTER_BETA 0x0392 +#define GREEK_CAPITAL_LETTER_GAMMA 0x0393 +#define GREEK_CAPITAL_LETTER_DELTA 0x0394 +#define GREEK_CAPITAL_LETTER_EPSILON 0x0395 +#define GREEK_CAPITAL_LETTER_ZETA 0x0396 +#define GREEK_CAPITAL_LETTER_ETA 0x0397 +#define GREEK_CAPITAL_LETTER_THETA 0x0398 +#define GREEK_CAPITAL_LETTER_IOTA 0x0399 +#define GREEK_CAPITAL_LETTER_KAPPA 0x039A +#define GREEK_CAPITAL_LETTER_LAMDA 0x039B +#define GREEK_CAPITAL_LETTER_MU 0x039C +#define GREEK_CAPITAL_LETTER_NU 0x039D +#define GREEK_CAPITAL_LETTER_XI 0x039E +#define GREEK_CAPITAL_LETTER_OMICRON 0x039F +#define GREEK_CAPITAL_LETTER_PI 0x03A0 +#define GREEK_CAPITAL_LETTER_RHO 0x03A1 +#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 +#define GREEK_CAPITAL_LETTER_TAU 0x03A4 +#define GREEK_CAPITAL_LETTER_UPSILON 0x03A5 +#define GREEK_CAPITAL_LETTER_PHI 0x03A6 +#define GREEK_CAPITAL_LETTER_CHI 0x03A7 +#define GREEK_CAPITAL_LETTER_PSI 0x03A8 +#define GREEK_CAPITAL_LETTER_OMEGA 0x03A9 +#define GREEK_CAPITAL_LETTER_IOTA_WITH_DIALYTIKA 0x03AA +#define GREEK_CAPITAL_LETTER_UPSILON_WITH_DIALYTIKA 0x03AB +#define GREEK_SMALL_LETTER_ALPHA_WITH_TONOS 0x03AC +#define GREEK_SMALL_LETTER_EPSILON_WITH_TONOS 0x03AD +#define GREEK_SMALL_LETTER_ETA_WITH_TONOS 0x03AE +#define GREEK_SMALL_LETTER_IOTA_WITH_TONOS 0x03AF +#define 
GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0 +#define GREEK_SMALL_LETTER_ALPHA 0x03B1 +#define GREEK_SMALL_LETTER_BETA 0x03B2 +#define GREEK_SMALL_LETTER_GAMMA 0x03B3 +#define GREEK_SMALL_LETTER_DELTA 0x03B4 +#define GREEK_SMALL_LETTER_EPSILON 0x03B5 +#define GREEK_SMALL_LETTER_ZETA 0x03B6 +#define GREEK_SMALL_LETTER_ETA 0x03B7 +#define GREEK_SMALL_LETTER_THETA 0x03B8 +#define GREEK_SMALL_LETTER_IOTA 0x03B9 +#define GREEK_SMALL_LETTER_KAPPA 0x03BA +#define GREEK_SMALL_LETTER_LAMDA 0x03BB +#define GREEK_SMALL_LETTER_MU 0x03BC +#define GREEK_SMALL_LETTER_NU 0x03BD +#define GREEK_SMALL_LETTER_XI 0x03BE +#define GREEK_SMALL_LETTER_OMICRON 0x03BF +#define GREEK_SMALL_LETTER_PI 0x03C0 +#define GREEK_SMALL_LETTER_RHO 0x03C1 +#define GREEK_SMALL_LETTER_FINAL_SIGMA 0x03C2 +#define GREEK_SMALL_LETTER_SIGMA 0x03C3 +#define GREEK_SMALL_LETTER_TAU 0x03C4 +#define GREEK_SMALL_LETTER_UPSILON 0x03C5 +#define GREEK_SMALL_LETTER_PHI 0x03C6 +#define GREEK_SMALL_LETTER_CHI 0x03C7 +#define GREEK_SMALL_LETTER_PSI 0x03C8 +#define GREEK_SMALL_LETTER_OMEGA 0x03C9 +#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA 0x03CA +#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA 0x03CB +#define GREEK_SMALL_LETTER_OMICRON_WITH_TONOS 0x03CC +#define GREEK_SMALL_LETTER_UPSILON_WITH_TONOS 0x03CD +#define GREEK_SMALL_LETTER_OMEGA_WITH_TONOS 0x03CE +#define GREEK_BETA_SYMBOL 0x03D0 +#define GREEK_THETA_SYMBOL 0x03D1 +#define GREEK_UPSILON_WITH_HOOK_SYMBOL 0x03D2 +#define GREEK_UPSILON_WITH_ACUTE_AND_HOOK_SYMBOL 0x03D3 +#define GREEK_UPSILON_WITH_DIAERESIS_AND_HOOK_SYMBOL 0x03D4 +#define GREEK_PHI_SYMBOL 0x03D5 +#define GREEK_PI_SYMBOL 0x03D6 +#define GREEK_KAI_SYMBOL 0x03D7 +#define GREEK_LETTER_STIGMA 0x03DA +#define GREEK_SMALL_LETTER_STIGMA 0x03DB +#define GREEK_LETTER_DIGAMMA 0x03DC +#define GREEK_SMALL_LETTER_DIGAMMA 0x03DD +#define GREEK_LETTER_KOPPA 0x03DE +#define GREEK_SMALL_LETTER_KOPPA 0x03DF +#define GREEK_LETTER_SAMPI 0x03E0 +#define GREEK_SMALL_LETTER_SAMPI 0x03E1 + +/* general 
punctuation (partial) */ +#define HYPHEN 0x2010 +#define NON_BREAKING_HYPHEN 0x2011 +#define FIGURE_DASH 0x2012 +#define EN_DASH 0x2013 +#define EM_DASH 0x2014 +#define HORIZONTAL_BAR 0x2015 +#define LEFT_SINGLE_QUOTATION_MARK 0x2018 +#define RIGHT_SINGLE_QUOTATION_MARK 0x2019 +#define SINGLE_LOW_9_QUOTATION_MARK 0x201A +#define SINGLE_HIGH_REVERSED_9_QUOTATION_MARK 0x201B +#define LEFT_DOUBLE_QUOTATION_MARK 0x201C +#define RIGHT_DOUBLE_QUOTATION_MARK 0x201D +#define DOUBLE_LOW_9_QUOTATION_MARK 0x201E +#define DOUBLE_HIGH_REVERSED_9_QUOTATION_MARK 0x201F +#define DAGGER 0x2020 +#define DOUBLE_DAGGER 0x2021 +#define BULLET 0x2022 +#define TRIANGULAR_BULLET 0x2023 +#define HYPHENATION_POINT 0x2027 +#define HORIZONTAL_ELLIPSIS 0x2026 +#define PER_MILLE_SIGN 0x2030 +#define SINGLE_LEFT_POINTING_ANGLE_QUOTATION_MARK 0x2039 +#define SINGLE_RIGHT_POINTING_ANGLE_QUOTATION_MARK 0x203A +#define EURO_CURRENCY_SIGN 0x20AC + +/* mathematical operators */ +#define FOR_ALL 0x2200 +#define COMPLEMENT 0x2201 +#define PARTIAL_DIFFERENTIAL 0x2202 +#define THERE_EXISTS 0x2203 +#define THERE_DOES_NOT_EXIST 0x2204 +#define EMPTY_SET 0x2205 +#define INCREMENT 0x2206 +#define NABLA 0x2207 +#define ELEMENT_OF 0x2208 +#define NOT_AN_ELEMENT_OF 0x2209 +#define SMALL_ELEMENT_OF 0x220A +#define CONTAINS_AS_MEMBER 0x220B +#define DOES_NOT_CONTAIN_AS_MEMBER 0x220C +#define SMALL_CONTAINS_AS_MEMBER 0x220D +#define END_OF_PROOF 0x220E +#define N_ARY_PRODUCT 0x220F +#define N_ARY_COPRODUCT 0x2210 +#define N_ARY_SUMMATION 0x2211 +#define MINUS_SIGN 0x2212 +#define MINUS_OR_PLUS_SIGN 0x2213 +#define DOT_PLUS 0x2214 +#define DIVISION_SLASH 0x2215 +#define SET_MINUS 0x2216 +#define ASTERISK_OPERATOR 0x2217 +#define RING_OPERATOR 0x2218 +#define BULLET_OPERATOR 0x2219 +#define SQUARE_ROOT 0x221A +#define CUBE_ROOT 0x221B +#define FOURTH_ROOT 0x221C +#define PROPORTIONAL_TO 0x221D +/* INFINITY conflicts with math.h */ +#define INFINITY 0x221E +#define RIGHT_ANGLE 0x221F +#define ANGLE 0x2220 +#define 
MEASURED_ANGLE 0x2221 +#define SPHERICAL_ANGLE 0x2222 +#define DIVIDES 0x2223 +#define DOES_NOT_DIVIDE 0x2224 +#define PARALLEL_TO 0x2225 +#define NOT_PARALLEL_TO 0x2226 +#define LOGICAL_AND 0x2227 +#define LOGICAL_OR 0x2228 +#define INTERSECTION 0x2229 +#define UNION 0x222A +#define INTEGRAL 0x222B +#define DOUBLE_INTEGRAL 0x222C +#define TRIPLE_INTEGRAL 0x222D +#define CONTOUR_INTEGRAL 0x222E +#define SURFACE_INTEGRAL 0x222F +#define VOLUME_INTEGRAL 0x2230 +#define CLOCKWISE_INTEGRAL 0x2231 +#define CLOCKWISE_CONTOUR_INTEGRAL 0x2232 +#define ANTICLOCKWISE_CONTOUR_INTEGRAL 0x2233 +#define THEREFORE 0x2234 +#define BECAUSE 0x2235 +#define RATIO 0x2236 +#define PROPORTION 0x2237 +#define DOT_MINUS 0x2238 +#define EXCESS 0x2239 +#define GEOMETRIC_PROPORTION 0x223A +#define HOMOTHETIC 0x223B +#define TILDE_OPERATOR 0x223C +#define REVERSED_TILDE 0x223D +#define INVERTED_LAZY_S 0x223E +#define SINE_WAVE 0x223F +#define WREATH_PRODUCT 0x2240 +#define NOT_TILDE 0x2241 +#define MINUS_TILDE 0x2242 +#define ASYMPTOTICALLY_EQUAL_TO 0x2243 +#define NOT_ASYMPTOTICALLY_EQUAL_TO 0x2244 +#define APPROXIMATELY_EQUAL_TO 0x2245 +#define APPROXIMATELY_BUT_NOT_ACTUALLY_EQUAL_TO 0x2246 +#define NEITHER_APPROXIMATELY_NOR_ACTUALLY_EQUAL_TO 0x2247 +#define ALMOST_EQUAL_TO 0x2248 +#define NOT_ALMOST_EQUAL_TO 0x2249 +#define ALMOST_EQUAL_OR_EQUAL_TO 0x224A +#define TRIPLE_TILDE 0x224B +#define ALL_EQUAL_TO 0x224C +#define EQUIVALENT_TO 0x224D +#define GEOMETRICALLY_EQUIVALENT_TO 0x224E +#define DIFFERENCE_BETWEEN 0x224F +#define APPROACHES_THE_LIMIT 0x2250 +#define GEOMETRICALLY_EQUAL_TO 0x2251 +#define APPROXIMATELY_EQUAL_TO_OR_THE_IMAGE_OF 0x2252 +#define IMAGE_OF_OR_APPROXIMATELY_EQUAL_TO 0x2253 +#define COLON_EQUALS 0x2254 +#define EQUALS_COLON 0x2255 +#define RING_IN_EQUAL_TO 0x2256 +#define RING_EQUAL_TO 0x2257 +#define CORRESPONDS_TO 0x2258 +#define ESTIMATES 0x2259 +#define EQUIANGULAR_TO 0x225A +#define STAR_EQUALS 0x225B +#define DELTA_EQUAL_TO 0x225C +#define 
EQUAL_TO_BY_DEFINITION 0x225D +#define MEASURED_BY 0x225E +#define QUESTIONED_EQUAL_TO 0x225F +#define NOT_EQUAL_TO 0x2260 +#define IDENTICAL_TO 0x2261 +#define NOT_IDENTICAL_TO 0x2262 +#define STRICTLY_EQUIVALENT_TO 0x2263 +#define LESS_THAN_OR_EQUAL_TO 0x2264 +#define GREATER_THAN_OR_EQUAL_TO 0x2265 +#define LESS_THAN_OVER_EQUAL_TO 0x2266 +#define GREATER_THAN_OVER_EQUAL_TO 0x2267 +#define LESS_THAN_BUT_NOT_EQUAL_TO 0x2268 +#define GREATER_THAN_BUT_NOT_EQUAL_TO 0x2269 +#define MUCH_LESS_THAN 0x226A +#define MUCH_GREATER_THAN 0x226B +#define BETWEEN 0x226C +#define NOT_EQUIVALENT_TO 0x226D +#define NOT_LESS_THAN 0x226E +#define NOT_GREATER_THAN 0x226F +#define NEITHER_LESS_THAN_NOR_EQUAL_TO 0x2270 +#define NEITHER_GREATER_THAN_NOR_EQUAL_TO 0x2271 +#define LESS_THAN_OR_EQUIVALENT_TO 0x2272 +#define GREATER_THAN_OR_EQUIVALENT_TO 0x2273 +#define NEITHER_LESS_THAN_NOR_EQUIVALENT_TO 0x2274 +#define NEITHER_GREATER_THAN_NOR_EQUIVALENT_TO 0x2275 +#define LESS_THAN_OR_GREATER_THAN 0x2276 +#define GREATER_THAN_OR_LESS_THAN 0x2277 +#define NEITHER_LESS_THAN_NOR_GREATER_THAN 0x2278 +#define NEITHER_GREATER_THAN_NOR_LESS_THAN 0x2279 +#define PRECEDES 0x227A +#define SUCCEEDS 0x227B +#define PRECEDES_OR_EQUAL_TO 0x227C +#define SUCCEEDS_OR_EQUAL_TO 0x227D +#define PRECEDES_OR_EQUIVALENT_TO 0x227E +#define SUCCEEDS_OR_EQUIVALENT_TO 0x227F +#define DOES_NOT_PRECEDE 0x2280 +#define DOES_NOT_SUCCEED 0x2281 +#define SUBSET_OF 0x2282 +#define SUPERSET_OF 0x2283 +#define NOT_A_SUBSET_OF 0x2284 +#define NOT_A_SUPERSET_OF 0x2285 +#define SUBSET_OF_OR_EQUAL_TO 0x2286 +#define SUPERSET_OF_OR_EQUAL_TO 0x2287 +#define NEITHER_A_SUBSET_OF_NOR_EQUAL_TO 0x2288 +#define NEITHER_A_SUPERSET_OF_NOR_EQUAL_TO 0x2289 +#define SUBSET_OF_WITH_NOT_EQUAL_TO 0x228A +#define SUPERSET_OF_WITH_NOT_EQUAL_TO 0x228B +#define MULTISET 0x228C +#define MULTISET_MULTIPLICATION 0x228D +#define MULTISET_UNION 0x228E +#define SQUARE_IMAGE_OF 0x228F +#define SQUARE_ORIGINAL_OF 0x2290 +#define 
SQUARE_IMAGE_OF_OR_EQUAL_TO 0x2291 +#define SQUARE_ORIGINAL_OF_OR_EQUAL_TO 0x2292 +#define SQUARE_CAP 0x2293 +#define SQUARE_CUP 0x2294 +#define CIRCLED_PLUS 0x2295 +#define CIRCLED_MINUS 0x2296 +#define CIRCLED_TIMES 0x2297 +#define CIRCLED_DIVISION_SLASH 0x2298 +#define CIRCLED_DOT_OPERATOR 0x2299 +#define CIRCLED_RING_OPERATOR 0x229A +#define CIRCLED_ASTERISK_OPERATOR 0x229B +#define CIRCLED_EQUALS 0x229C +#define CIRCLED_DASH 0x229D +#define SQUARED_PLUS 0x229E +#define SQUARED_MINUS 0x229F +#define SQUARED_TIMES 0x22A0 +#define SQUARED_DOT_OPERATOR 0x22A1 +#define RIGHT_TACK 0x22A2 +#define LEFT_TACK 0x22A3 +#define DOWN_TACK 0x22A4 +#define UP_TACK 0x22A5 +#define ASSERTION 0x22A6 +#define MODELS 0x22A7 +#define TRUEx 0x22A8 +#define FORCES 0x22A9 +#define TRIPLE_VERTICAL_BAR_RIGHT_TURNSTILE 0x22AA +#define DOUBLE_VERTICAL_BAR_DOUBLE_RIGHT_TURNSTILE 0x22AB +#define DOES_NOT_PROVE 0x22AC +#define NOT_TRUE 0x22AD +#define DOES_NOT_FORCE 0x22AE +#define NEGATED_DOUBLE_VERTICAL_BAR_DOUBLE_RIGHT_TURNSTILE 0x22AF +#define PRECEDES_UNDER_RELATION 0x22B0 +#define SUCCEEDS_UNDER_RELATION 0x22B1 +#define NORMAL_SUBGROUP_OF 0x22B2 +#define CONTAINS_AS_NORMAL_SUBGROUP 0x22B3 +#define NORMAL_SUBGROUP_OF_OR_EQUAL_TO 0x22B4 +#define CONTAINS_AS_NORMAL_SUBGROUP_OR_EQUAL_TO 0x22B5 +#define ORIGINAL_OF 0x22B6 +#define IMAGE_OF 0x22B7 +#define MULTIMAP 0x22B8 +#define HERMITIAN_CONJUGATE_MATRIX 0x22B9 +#define INTERCALATE 0x22BA +#define XOR 0x22BB +#define NAND 0x22BC +#define NOR 0x22BD +#define RIGHT_ANGLE_WITH_ARC 0x22BE +#define RIGHT_TRIANGLE 0x22BF +#define N_ARY_LOGICAL_AND 0x22C0 +#define N_ARY_LOGICAL_OR 0x22C1 +#define N_ARY_INTERSECTION 0x22C2 +#define N_ARY_UNION 0x22C3 +#define DIAMOND_OPERATOR 0x22C4 +#define DOT_OPERATOR 0x22C5 +#define STAR_OPERATOR 0x22C6 +#define DIVISION_TIMES 0x22C7 +#define BOWTIE 0x22C8 +#define LEFT_NORMAL_FACTOR_SEMIDIRECT_PRODUCT 0x22C9 +#define RIGHT_NORMAL_FACTOR_SEMIDIRECT_PRODUCT 0x22CA +#define LEFT_SEMIDIRECT_PRODUCT 0x22CB 
+#define RIGHT_SEMIDIRECT_PRODUCT 0x22CC +#define REVERSED_TILDE_EQUALS 0x22CD +#define CURLY_LOGICAL_OR 0x22CE +#define CURLY_LOGICAL_AND 0x22CF +#define DOUBLE_SUBSET 0x22D0 +#define DOUBLE_SUPERSET 0x22D1 +#define DOUBLE_INTERSECTION 0x22D2 +#define DOUBLE_UNION 0x22D3 +#define PITCHFORK 0x22D4 +#define EQUAL_AND_PARALLEL_TO 0x22D5 +#define LESS_THAN_WITH_DOT 0x22D6 +#define GREATER_THAN_WITH_DOT 0x22D7 +#define VERY_MUCH_LESS_THAN 0x22D8 +#define VERY_MUCH_GREATER_THAN 0x22D9 +#define LESS_THAN_EQUAL_TO_OR_GREATER_THAN 0x22DA +#define GREATER_THAN_EQUAL_TO_OR_LESS_THAN 0x22DB +#define EQUAL_TO_OR_LESS_THAN 0x22DC +#define EQUAL_TO_OR_GREATER_THAN 0x22DD +#define EQUAL_TO_OR_PRECEDES 0x22DE +#define EQUAL_TO_OR_SUCCEEDS 0x22DF +#define DOES_NOT_PRECEDE_OR_EQUAL 0x22E0 +#define DOES_NOT_SUCCEED_OR_EQUAL 0x22E1 +#define NOT_SQUARE_IMAGE_OF_OR_EQUAL_TO 0x22E2 +#define NOT_SQUARE_ORIGINAL_OF_OR_EQUAL_TO 0x22E3 +#define SQUARE_IMAGE_OF_OR_NOT_EQUAL_TO 0x22E4 +#define SQUARE_ORIGINAL_OF_OR_NOT_EQUAL_TO 0x22E5 +#define LESS_THAN_BUT_NOT_EQUIVALENT_TO 0x22E6 +#define GREATER_THAN_BUT_NOT_EQUIVALENT_TO 0x22E7 +#define PRECEDES_BUT_NOT_EQUIVALENT_TO 0x22E8 +#define SUCCEEDS_BUT_NOT_EQUIVALENT_TO 0x22E9 +#define NOT_NORMAL_SUBGROUP_OF 0x22EA +#define DOES_NOT_CONTAIN_AS_NORMAL_SUBGROUP 0x22EB +#define NOT_NORMAL_SUBGROUP_OF_OR_EQUAL_TO 0x22EC +#define DOES_NOT_CONTAIN_AS_NORMAL_SUBGROUP_OR_EQUAL 0x22ED +#define VERTICAL_ELLIPSIS 0x22EE +#define MIDLINE_HORIZONTAL_ELLIPSIS 0x22EF +#define UP_RIGHT_DIAGONAL_ELLIPSIS 0x22F0 +#define DOWN_RIGHT_DIAGONAL_ELLIPSIS 0x22F1 + +/* Optical Character Recognition in Unicode (see wikipedia/OCR) + * range: 0x2440-0x245F + */ +#define OCR_Hook 0x2440 +#define OCR_Chair 0x2441 +#define OCR_Fork 0x2442 +#define OCR_Inverted_Fork 0x2443 +#define OCR_Belt_Buckle 0x2444 +#define OCR_Bow_Tie 0x2445 + +/* the following 4 MICR chars (magnetic ink character recognition) + * They look like: |: ,|' ||' ||| (known also as MICR-A,B,C,D) + */ +#define 
OCR_Branch_Bank_Identification 0x2446 +#define OCR_Amount_Of_Check 0x2447 +#define OCR_Customer_Account_Number 0x2448 +#define OCR_Dash 0x2449 + +#define OCR_Double_Backslash 0x244A + +/* latin ligatures */ +#define LATIN_SMALL_LIGATURE_FF 0xFB00 +#define LATIN_SMALL_LIGATURE_FI 0xFB01 +#define LATIN_SMALL_LIGATURE_FL 0xFB02 +#define LATIN_SMALL_LIGATURE_FFI 0xFB03 +#define LATIN_SMALL_LIGATURE_FFL 0xFB04 +#define LATIN_SMALL_LIGATURE_LONG_S_T 0xFB05 +#define LATIN_SMALL_LIGATURE_ST 0xFB06 + +#endif |