/util.py
http://pytesser.googlecode.com/ · Python · 124 lines · 67 code · 11 blank · 46 comment · 2 complexity · 06deebf1a2e106d3e55e19c3ec5d1a67 MD5 · raw file
- """Utility functions for processing images for delivery to Tesseract"""
-
- import os
- import re
-
# When true, '.txt' is appended to the scratch text name root before the
# OCR output file is read (retrieve_text) or deleted (perform_cleanup).
# NOTE(review): presumably this exists for Tesseract builds that add the
# extension to the output base name themselves -- confirm with the caller.
_add_dot_txt_flag = False
-
-
def image_to_scratch(im, scratch_image_name, dpi=(200, 200)):
    """Save an in-memory image to a scratch file for Tesseract to read.

    The image is saved exactly as given: earlier experiments with
    converting the image mode before saving were disabled, so no
    conversion is applied here.  (Original note: .bmp format will be
    read correctly by Tesseract.)

    im                 -- image object exposing ``save(filename, dpi=...)``
                          (presumably a PIL image -- confirm with caller)
    scratch_image_name -- name of the scratch file to write
    dpi                -- resolution pair handed to ``im.save``; defaults
                          to (200, 200), the value previously hard-coded
    """
    im.save(scratch_image_name, dpi=dpi)
-
def retrieve_text(scratch_text_name_root):
    """Return the contents of the scratch text file, stripped of
    leading and trailing whitespace.

    scratch_text_name_root -- name of the text file; '.txt' is appended
                              first when _add_dot_txt_flag is set

    Raises IOError/OSError if the file cannot be opened.
    """
    if _add_dot_txt_flag:
        scratch_text_name = scratch_text_name_root + '.txt'
    else:
        scratch_text_name = scratch_text_name_root
    # open() replaces the Python-2-only file() builtin; the with-block
    # closes the handle even if read() raises.
    with open(scratch_text_name) as inf:
        return inf.read().strip()
-
- # Eventually this more thorough class should be used:
-
- ##class OCR_character:
- ## """Object exposing internals of Tesseract result for particular characters
- ## (See documentation of EANYCODE_CHAR
- ## http://tesseract-ocr.googlecode.com/svn&cs_f=trunk/ccutil/ocrclass.h
- ## for detailed explanations)
- ## self.letter - OCRed letter guess
- ## self.char_code - Character code of letter
- ## self.x_bounds - (left bound, right bound)
- ## self.y_bounds - (top bound, bottom bound)
- ## self.font_index - Index of character's font
- ## self.confidence - 0 (low conf) to 100 (high)
- ## self.point_size - Estimated size of font (units unclear)
- ## self.formatting - Bit flags for formatting and layout information
- ## """
- ## def __init__(self, line):
- ## data = line.split(' ')
- ## self.letter = data[0]
- ## self.char_code = int(data[1], 16)
- ## self.x_bounds = (data[2], data[4])
- ## self.y_bounds = (data[5], data[3])
- ## self.font_index = data[6]
- ## self.confidence = data[7]
- ## self.point_size = data[8]
- ## self.formatting = data[9]
- ## def __str__(self):
- ## return self.letter
-
- # This simple class is used for now:
-
class OCR_character:
    """Object exposing internals of Tesseract result for particular characters
    (See documentation of EANYCODE_CHAR
    http://tesseract-ocr.googlecode.com/svn&cs_f=trunk/ccutil/ocrclass.h
    for detailed explanations)
    self.letter   - OCRed letter guess
    self.x_bounds - (left bound, right bound)
    self.y_bounds - (top bound, bottom bound)
    """

    # Compiled once for the class instead of on every construction.
    # Match example 'T[54]->[54](35,115)->(56,90)'
    _parse_re = re.compile(r'^(.).*\((.+),(.+)\).*\((.+),(.+)\)')

    def __init__(self, line):
        """Parse one character line of Tesseract output.

        line -- text such as 'T[54]->[54](35,115)->(56,90)': the first
                character is the letter, and the two parenthesised
                pairs supply the bounds.

        Raises IndexError if the line does not have that shape, and
        ValueError if a coordinate is not an integer.
        """
        match = self._parse_re.match(line)
        if match is None:
            # IndexError is kept (rather than ValueError) because that is
            # what the earlier findall(line)[0] lookup raised; only the
            # message is new.
            raise IndexError(
                "unrecognized Tesseract character line: %r" % (line,))
        letter, first_x, first_y, second_x, second_y = match.groups()
        self.letter = letter
        self.x_bounds = (int(first_x), int(second_x))
        self.y_bounds = (int(first_y), int(second_y))
-
-
class OCR_result(str):
    """Parsed results of call to Tesseract; subclass of 'str'.

    The string value itself is the OCR text.
    self.internals is a list (aligned character-for-character with the
    string) of OCR_character internal data objects (for characters) or
    None (for whitespace, since Tesseract provides no internal data for
    whitespace characters).
    """

    def __new__(cls, text):
        """Build the result from raw Tesseract output, one token per line.

        Each line of ``text`` is stripped and then interpreted as:
          '<nl>'    -- a newline character; the line following the
                       marker is skipped as well
          '<para>'  -- end of input; remaining lines are ignored
          ''        -- a space character
          otherwise -- a character line, parsed by OCR_character
        """
        raw_letters = []
        internals = []
        lines = text.split('\n')
        i = 0
        while i < len(lines):
            line = lines[i].strip()
            if line == '<para>':  # End of input
                break
            if line == '<nl>':  # New line
                raw_letters.append('\n')
                internals.append(None)
                i += 1  # step over the line that follows the marker
            elif line == '':  # Space character
                raw_letters.append(' ')
                internals.append(None)
            else:
                character = OCR_character(line)
                raw_letters.append(character.letter)
                internals.append(character)
            i += 1
        result = str.__new__(cls, "".join(raw_letters))
        result.internals = internals
        return result
-
-
def retrieve_result(scratch_text_name_root):
    """Read the scratch text file via retrieve_text and return its
    contents parsed into an OCR_result.

    scratch_text_name_root -- passed unchanged to retrieve_text
    """
    return OCR_result(retrieve_text(scratch_text_name_root))
-
def perform_cleanup(scratch_image_name, scratch_text_name_root):
    """Clean up temporary files from disk.

    Removes the scratch image, the scratch text file ('.txt' is appended
    to scratch_text_name_root when _add_dot_txt_flag is set) and
    "tesseract.log".  Removal is best-effort: a file that cannot be
    removed is silently skipped.
    """
    if _add_dot_txt_flag:
        scratch_text_name = scratch_text_name_root + '.txt'
    else:
        scratch_text_name = scratch_text_name_root
    for name in (scratch_image_name, scratch_text_name, "tesseract.log"):
        try:
            os.remove(name)
        except OSError:
            # Deliberately ignored: cleanup must not fail the caller.
            pass