GOCR v0.50 原理分析

一，简介：

GOCR是一个用C语言编写的开源OCR库，采用GNU General Public License（GPL）许可，作者：Joerg Schulenburg

项目主页：http://jocr.sourceforge.net/index.html

源代码(v0.50) ： http://pan.baidu.com/s/1y1Jj1 （VS2005工程项目）

Update : http://pan.baidu.com/s/1c0b278O （windows下通过liblept支持jpeg/png等格式的OCR）

版本(version.h):
#define version_string "0.50"

#define release_string "20130305"

二，原理分析：

1，GOCR的主要流程如下：

/* Main OCR pipeline for one image held in job->src.p.
 *
 * Steps, in order: threshold selection (otsu/thresholding), scan_boxes,
 * dust/barcode/picture handling, hole gluing, rotation and text-line
 * detection, box splitting/gluing, sorting, pitch measurement, character
 * recognition (repeated once if adjust_text_lines() changed anything),
 * comparison of unknown with known chars, box division, space insertion,
 * optional context correction and store_boxtree_lines().
 *
 * job  must not be NULL (asserted); job->cfg.cs, job->cfg.verbose and
 *      job->cfg.mode are read, job->cfg.cs and job->tmp.ppo are written.
 *
 * Returns 1 if scan_boxes() found no boxes, 0 otherwise.
 *
 * NOTE(review): the numeric literals of this listing had been stripped in
 * the copy it was taken from.  The bit masks &32, 4+8 and mode&32 follow
 * from the commented-out debug lines and the log message below; the
 * progress percentages, verbose&1, verbose&6, mode&64, the debug_img
 * flag sums for out01/out20/out30 and the return codes were restored from
 * GOCR 0.50 src/pgm2asc.c -- confirm against upstream.
 */
int pgm2asc(job_t *job)
{
  pix *pp;
  progress_counter_t *pc;
  static int multi_image_count=0;  /* number of image within multi-image */
  int orig_cs=0;

  assert(job);  /* must precede the first dereference of job */

  if (!multi_image_count) orig_cs = job->cfg.cs; /* save for multi-images */
  multi_image_count++;

  /* FIXME jb: remove pp */
  pp = &(job->src.p);

  pc = open_progress(100,"pgm2asc_main");
  progress(0,pc); /* start progress output 0% 0% */
#if 0 /* dont vast memory */
  /* FIXME jb: malloc */
  if ( job->cfg.verbose & 32 ) {
    // generate 2nd imagebuffer for debugging output
    job->tmp.ppo.p = (unsigned char *)malloc(job->src.p.y * job->src.p.x);
    // buffer
    assert(job->tmp.ppo.p);
    copybox(&job->src.p,
            0, 0, job->src.p.x, job->src.p.y,
            &job->tmp.ppo,
            job->src.p.x * job->src.p.y);
  }
#else
  job->tmp.ppo=job->src.p; /* temporarely, removed later */
#endif
  // if (job->cfg.verbose&32) debug_img("out000.ppm",job,0);

  /* ----- count colors ------ create histogram -------
     - this should be used to create a upper and lower limit for cs
     - cs is the optimum gray value between cs_min and cs_max
     - also inverse scans could be detected here later */
  if (orig_cs==0)
    job->cfg.cs=otsu( pp->p,pp->y,pp->x,0,0,pp->x,pp->y,job->cfg.verbose & 1 );
  else  // dont set cs, output stats + do inversion if needed 2010-10-07
    otsu( pp->p,pp->y,pp->x,0,0,pp->x,pp->y,job->cfg.verbose & 1 );
//  if (job->cfg.verbose&32) debug_img("out001.ppm",job,0);

  /* renormalize the image and set the normalized threshold value */
  job->cfg.cs=thresholding( pp->p,pp->y,pp->x,0,0,pp->x,pp->y, job->cfg.cs );
  if( job->cfg.verbose )
    fprintf(stderr, "# thresholding new_threshold= %d\n", job->cfg.cs);
//  if (job->cfg.verbose&32) debug_img("out002.ppm",job,0);

  progress(5,pc); /* progress is only estimated */

  /* this is first step for reorganize the PG
     ---- look for letters, put rectangular frames around letters
     letter = connected points near color F
     should be used by dust removing (faster) and line detection!
     ---- 0..cs = black letters, last change = Mai99 */

  progress(8,pc); /* progress is only estimated */

//  if (job->cfg.verbose&32) debug_img("out008.ppm",job,8);
  scan_boxes( job, pp );
  if ( !job->res.numC ){
    fprintf( stderr,"# no boxes found - stopped\n" );
    if(job->cfg.verbose&32) debug_img("out01",job,8);
    /***** should free stuff, etc) */
    close_progress(pc);  /* release the progress counter on this early exit too */
    return(1);
  }
  // tmp10/bug100818a.pgm creates artefacts on image
//  if (job->cfg.verbose&32) debug_img("out00",job,4+8);

  progress(10,pc); /* progress is only estimated */

  // if(job->cfg.verbose&32) debug_img("out01",job,4+8);
  // output_list(job);  // for debugging
  // ToDo: matrix printer preprocessing

  remove_dust( job ); /* from the &(job->res.boxlist)! */
// if(job->cfg.verbose&32) debug_img("out02",job,4+8);
// output_list(job);  // for debugging
#if 0 // ToDo 2010-10-15 destroys QR-barcodes
  smooth_borders( job ); /* only for big chars */
#endif
  progress(12,pc); /* progress is only estimated */
// if(job->cfg.verbose&32) debug_img("out03",job,4+8);
// output_list(job);  // for debugging

  detect_barcode( job );  /* mark barcode */
// if(job->cfg.verbose&32) debug_img("out04",job,4+8);
// output_list(job);  // for debugging

  detect_pictures( job ); /* mark pictures */
//  if(job->cfg.verbose&32) debug_img("out05",job,4+8);
// output_list(job);  // for debugging

  remove_pictures( job ); /* do this as early as possible, before layout */
//  if(job->cfg.verbose&32) debug_img("out06",job,4+8);
// output_list(job);  // for debugging

  glue_holes_inside_chars( pp ); /* including count subboxes (holes)  */

  detect_rotation_angle( job );

#if 1         /* Rotate the whole picture! move boxes */
  if( job->res.lines.dy!=0 ){  // move down lowest first, move up highest first
    // in work! ??? (at end set dy=0) think on ppo!
  }
#endif
  detect_text_lines( pp, job->cfg.mode ); /* detect and mark job->tmp.ppo */
// if(job->cfg.verbose&32) debug_img("out07",job,4+8);
  progress(20,pc); /* progress is only estimated */

  add_line_info( job /* , &(job->res.boxlist) */);
  if (job->cfg.verbose&32) debug_img("out10",job,4+8);

  divide_vert_glued_boxes( pp, job->cfg.mode); /* after add_line_info, before list_sort! */
//  if(job->cfg.verbose&32) debug_img("out11",job,0);

  remove_melted_serifs( job, pp ); /* make some corrections on pixmap */
  /* list_ins seems to sort in the boxes on the wrong place ??? */
//  if(job->cfg.verbose&32) debug_img("out12",job,4+8);

  glue_broken_chars( job, pp ); /* 2nd glue */
//  if(job->cfg.verbose&32) debug_img("out14",job,4+8);
// 2010-09-24 overall box size is correct here, but later broken

  remove_rest_of_dust( job );
//  if(job->cfg.verbose&32) debug_img("out15",job,4+8);

  /* better sort after dust is removed (slow for lot of pixels) */
  list_sort(&(job->res.boxlist), sort_box_func);

  measure_pitch( job );

  if(job->cfg.mode&64) find_same_chars( pp );
  progress(30,pc); /* progress is only estimated */
//  if(job->cfg.verbose&32) debug_img("out16",job,4+8);

  char_recognition( pp, job->cfg.mode);
  progress(60,pc); /* progress is only estimated */
//  if(job->cfg.verbose&32) debug_img("out17",job,4+8);

  if ( adjust_text_lines( pp, job->cfg.mode ) ) { /* correct using chars */
    /* may be, characters/pictures have changed line number */
    list_sort(&(job->res.boxlist), sort_box_func);
    // 2nd recognition call if lines are adjusted
    char_recognition( pp, job->cfg.mode);
  }

#define BlownUpDrawing 0     /* german: Explosionszeichnung, temporarly */
#if     BlownUpDrawing == 1  /* german: Explosionszeichnung */
{ /* just for debugging */
  int i,ii,ni; struct box *box2;
  i=ii=ni=0;
  for_each_data(&(job->res.boxlist)) { /* count boxes */
    box2 = (struct box *)list_get_current(&(job->res.boxlist));
    if (box2->c==UNKNOWN)  i++;
    if (box2->c==PICTURE) ii++;
    ni++;
  } end_for_each(&(job->res.boxlist));
  if (job->cfg.verbose)
    fprintf(stderr,"# debug: unknown= %d picts= %d boxes= %d\n",i,ii,ni);
}
#endif
  // ----------- write out20.pgm ----------- mark lines + boxes
  if (job->cfg.verbose&32) debug_img("out20",job,1+4+8);

  compare_unknown_with_known_chars( pp, job->cfg.mode);
  progress(70,pc); /* progress is only estimated */

  try_to_divide_boxes( pp, job->cfg.mode);
  progress(80,pc); /* progress is only estimated */

  /* --- list output ---- for debugging --- */
  if (job->cfg.verbose&6) output_list(job);

  /* ---- insert spaces ---- */
  list_insert_spaces( pp , job );

  // ---- proof difficult chars Il1 by context view ----
  if (job->cfg.verbose)
    fprintf(stderr,"# context correction if !(mode&32)\n");
  if (!(job->cfg.mode&32)) context_correction( job );

  store_boxtree_lines( job, job->cfg.mode );
  progress(90,pc); /* progress is only estimated */

/* 0050002.pgm.gz ca. 109 digits, only 50 recognized (only in lines?)
 * ./gocr -v 39 -m 56 -e - -m 4 -C 0-9 -f XML tmp0406/0050002.pbm.gz
 *  awk 'BEGIN{num=0}/1<\/box>/{num++;}END{print num}' o
 * 15*0 24*1 18*2 19*3 15*4 6*5 6*6 6*7 4*8 8*9 sum=125digits counted boxes
 *  9*0 19*1 14*2 15*3 11*4 6*5 5*6 6*7 4*8 8*9 sum=97digits recognized
 * 1*1 1*7 not recognized (Oct04)
 *  33*SPC 76*NL = 109 spaces + 36*unknown sum=241 * 16 missed
 */
#if     BlownUpDrawing == 1  /* german: Explosionszeichnung */
{ /* just for debugging */
  int i,ii,ni; struct box *box2; const char *testc="0123456789ABCDEFGHIJK";
  i=ii=ni=0;
  for_each_data(&(job->res.boxlist)) { /* count boxes */
    box2 = (struct box *)list_get_current(&(job->res.boxlist));
    if (box2->c==UNKNOWN)  i++;
    if (box2->c==PICTURE) ii++;
    if (box2->c>' ' && box2->c<='z') ni++;
  } end_for_each(&(job->res.boxlist));
  if(job->cfg.verbose)
    fprintf(stderr,"# debug: (_)= %d picts= %d chars= %d",i,ii,ni);
  for (i=0;i<20;i++) {
    ni=0;
    for_each_data(&(job->res.boxlist)) { /* count boxes */
      box2 = (struct box *)list_get_current(&(job->res.boxlist));
      if (box2->c==testc[i]) ni++;
    } end_for_each(&(job->res.boxlist));
    if(job->cfg.verbose && ni>0)
      fprintf(stderr," (%c)=%d",testc[i],ni);
  }
  if(job->cfg.verbose)
    fprintf(stderr,"\n");
}
#endif
  // ---- frame-size-histogram
  // ---- (my own defined) distance between letters
  // ---- write internal picture of textsite
  // ----------- write out30.pgm -----------
  if( job->cfg.verbose&32 ) debug_img("out30",job,2+4);

  progress(100,pc); /* progress is only estimated */

  close_progress(pc);

  return 0;     /* what should I return? error-state? num-of-chars? */
}

2，Scan boxes分析：

流程：从上往下，分别在X,Y轴方向投影，得到box list。

3，去除噪点：

/* ---- remove dust ---------------------------------

   What is dust? I think, this is a very small pixel cluster without

   neighbours. Of course not all dust clusters can be detected correct.

   This feature should be possible to switch off via option.

   -> may be, all clusters should be stored here?

   speed is very slow, I know, but I am happy that it is working well

*/

4，detect barcode and pictures ， remove pictures：

图片：设所有box的平均宽度为avgwidth、平均高度为avgheight；若某个box满足 width > 4 * avgwidth || height > 4 * avgheight，并且与它大小相近的box少于4个，则认为该box是图像box。

5，glue holes inside chars：

/* ---- join holes to chars( before step1 ) v0.42  -----------------------

   join boxes lying inside another box (usually holes, ex: "aeobdg46890")

   Dont add dust to a char!

   lines are not detected yet

*/

6，detect rotation angle：

/*

** Detect rotation angle (one for whole image)

** old: longest text-line and determining the angle of this line.

 *

 * search right nearest neighbour of each box and average vectors

 * to get the text orientation,

 * upside down decision is not made here (I dont know how to do it)

 *  ToDo: set job->res.lines.{dx,dy}

 * pass 1: get mean vector to nearest char

 * pass 2: get mean vector to nearest char without outriders to pass 1

 * extimate direction as (dx,dy,num)[pass]

 * ToDo: estimate an error, boxes only work fine for zero-rotation

 *       for 45 degree use vectors, not boxes to get base line

 */

7，detect text lines：

http://en.wikipedia.org/wiki/Cap_height

8，measure pitch：

估计空格的宽度。

9，识别字符：

GOCR的识别不基于机器学习，没有训练（training）过程，完全依靠先验规则，因此只能识别英文字母、数字、标点等。识别过程主要是一条filter链：每个filter判断box是否为某个字符，若是则跳过后续filter。

a，从box外的某个坐标（x，y）沿某个方向（左、右、上、下）向box内部引一条射线，射线与字符的第一个交点位置必须符合该字符的先验规则；

代码：

/* Move from (x,y) in direction r until a pixel of color col is found
 *   or a maximum of l steps has been done or the image border is passed.
 *
 *   p    image; p->x and p->y are used as the exclusive upper bounds
 *   x,y  start position; if it is outside the image no step is done
 *   l    maximum number of steps
 *   cs   threshold used in the test getpixel(p,x,y) < cs
 *   col  the walk stops at the first pixel for which
 *        (getpixel(p,x,y) < cs) ^ col is non-zero
 *   r    direction: UP (y decreases), DO (y increases),
 *        LE (x decreases), RI (x increases); other values do nothing
 *
 * Returns the number of steps done.
 *
 * NOTE(review): the 0 literals (i=0, x>=0, y>=0) were missing from the
 * listing and have been restored -- confirm against GOCR src/pixel.c.
 */
int loop(pix *p,int x,int y,int l,int cs,int col, DIRECTION r){
  int i=0;

  if (x>=0 && y>=0 && x<p->x && y<p->y) {
    switch (r) {
    case UP:
      for ( ; i<l && y>=0; i++,y--) {
        if ( (getpixel(p,x,y)<cs)^col )
          break;
      }
      break;
    case DO:
      for ( ; i<l && y<p->y; i++,y++) {
        if ( (getpixel(p,x,y)<cs)^col )
          break;
      }
      break;
    case LE:
      for ( ; i<l && x>=0; i++,x--) {
        if ( (getpixel(p,x,y)<cs)^col )
          break;
      }
      break;
    case RI:
      for ( ; i<l && x<p->x; i++,x++) {
        if ( (getpixel(p,x,y)<cs)^col )
          break;
      }
      break;
    default:
      break;
    }
  }
  return i;
}

b，穿过box的一条直线与字符的交点个数必须符合该字符的先验规则。算法：统计直线上由白变黑的点的个数（例如从左向右：Pixel(x,y) == white && Pixel(x+1,y) == black）。

代码：

/* Count the white-to-black transitions met while stepping along the
 * straight line from (x0,y0) to (x1,y1), both end points included.
 *
 *   x0,x1,y0,y1  end points of the line (note the argument order)
 *   p            image read through getpixel()
 *   cs           threshold: a pixel counts as black if getpixel(p,x,y) < cs
 *
 * The line is sampled in d = MAX(|dx|,|dy|) steps.  The previous color
 * starts as white, so a black first pixel counts as one crossing.
 *
 * Returns the number of crossings.
 *
 * NOTE(review): the 0/1 literals were missing from the listing; they are
 * restored as implied by the "0=white 1=black" comments.
 */
int num_cross(int x0, int x1, int y0, int y1, pix *p, int cs) {
  int rc = 0, col = 0, k, x, y, i, d;    // rc=crossings  col=0=white
  int dx = x1 - x0, dy = y1 - y0;

  d = MAX(abs(dx), abs(dy));
  for (i = 0, x = x0, y = y0; i <= d; i++) {
    if (d) {  // d==0: single point, also avoids a division by zero
      x = x0 + i * dx / d;
      y = y0 + i * dy / d;
    }
    k = ((getpixel(p, x, y) < cs) ? 1 : 0);    // 0=white 1=black
    if (col == 0 && k == 1) {  // found a white-black transition
      rc++;
    }
    col = k;        // last color
  }
  return rc;
}

c，孔洞的个数必须符合某个字符的先验规则，比如A有一个洞；这一步只是判断，实际工作在第5步已经完成。

d，如下面识别“}”的代码：

意思是横穿过dy条线，所有线与字符的交点个数均为1；在字符的前半面，竖直穿过dx/2条线，交点个数均为2，即左凸起部分；等等。

//  --------- test {} --------------------------------

   for(ad=d=;dx> && dy> && *dy>*dx;){

      DBG( wchar_t c_ask='}'; )

      if (!hchar) ad=*ad/;

      for(y=;y<dy;y++){

        if( num_cross(,dx-,y,y,bp,cs) !=  ) break;

      } if (y<dy) Break;

      for(x=;x<dx/;x++){

        if( num_cross(x,x,,dy-,bp,cs) !=  ) break;

      } if (y<dx/) Break;

      if ( num_cross(   ,   ,dy/,dy--dy/,bp,cs) !=  ) Break;

      if ( num_cross(dx-,dx-,dy/,dy--dy/,bp,cs) !=  ) Break;

      i1=loop(bp,dx-   ,dy/,dx,cs,,LE);

      i1=loop(bp,dx--i1,dy/,dx,cs,,LE); // thickness1

      for (i2=dx,i3=y=dy/--dy/;y<dy/++dy/;y++)

       { x=loop(bp,dx-   , y,dx,cs,,LE); if (x<i2) {i2=x;i3=y;} }

      i2=  loop(bp,dx--i2,i3,dx,cs,,LE); // thickness2

      if (i2<i1+dx/+) Break;

      if ( loop(bp,dx-,dy-,dx,cs,,LE)>*dx/ ) {ad=*ad/;MSG({})}

      if ( loop(bp,dx-,   ,dx,cs,,LE)>*dx/ ) {ad=*ad/;MSG({})} // >

      if ( loop(bp,dx-,   ,dy,cs,,DO)<dy/-dy/- ) {ad=*ad/;MSG({})}

      if ( loop(bp,dx-,dy-,dy,cs,,UP)<dy/-dy/- ) {ad=*ad/;MSG({})} // )

      if ( loop(bp,dx-,   ,dy,cs,,DO)<=dy/) Break;

      if (dy>=)

      if (   loop(bp,,   ,dx,cs,,RI)

         +   loop(bp,,dy/,dx,cs,,RI)

         - *loop(bp,,dy/,dx,cs,,RI) >=dx/ ) {ad=*ad/;MSG({})} // <

      if ( loop(bp,,dy-,dy,cs,,UP)>dy/ ) Break; // ???

      if ( get_bw(x1,x1,y0,y0+dy/,box1->p,cs,) ==

        || get_bw(x1,x1,y1-dy/,y1,box1->p,cs,) ==  ) Break;

      Setac(box1,(bc='}'),ad);break;

   }

10，compare_unknown_with_known_chars、try_to_divide_boxes等后处理；

11，list_insert_spaces 插入空格；

12，store_boxtree_lines；

13，输出识别结果。