PageRenderTime 44ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/util.py

http://pytesser.googlecode.com/
Python | 124 lines | 67 code | 11 blank | 46 comment | 2 complexity | 06deebf1a2e106d3e55e19c3ec5d1a67 MD5 | raw file
Possible License(s): Apache-2.0
  1. """Utility functions for processing images for delivery to Tesseract"""
  2. import os
  3. import re
# When true, retrieve_text() and perform_cleanup() append '.txt' to the
# scratch text root name to form the text file's name; when false, the root
# name is used unchanged.
_add_dot_txt_flag = False
  5. def image_to_scratch(im, scratch_image_name):
  6. """Saves image in memory to scratch file. .bmp format will be read correctly by Tesseract"""
  7. ## if im.mode=='RGBA':
  8. ## im=im.convert('RGB')
  9. ## try:
  10. ## im.save(scratch_image_name, dpi=(200,200))
  11. ## except: ### Eventually this should catch only the specific im.save exception
  12. ## im = im.convert('RGB')
  13. ## im.save(scratch_image_name, dpi=(200,200))
  14. #im = im.convert('1')
  15. im.save(scratch_image_name, dpi=(200,200))
  16. def retrieve_text(scratch_text_name_root):
  17. if _add_dot_txt_flag:
  18. inf = file(scratch_text_name_root + '.txt')
  19. else:
  20. inf = file(scratch_text_name_root)
  21. text = inf.read().strip()
  22. inf.close()
  23. return text
  24. # Eventually this more thorough class should be used:
  25. ##class OCR_character:
  26. ## """Object exposing internals of Tesseract result for particular characters
  27. ## (See documentation of EANYCODE_CHAR
  28. ## http://tesseract-ocr.googlecode.com/svn&cs_f=trunk/ccutil/ocrclass.h
  29. ## for detailed explanations)
  30. ## self.letter - OCRed letter guess
  31. ## self.char_code - Character code of letter
  32. ## self.x_bounds - (left bound, right bound)
  33. ## self.y_bounds - (top bound, bottom bound)
  34. ## self.font_index - Index of character's font
  35. ## self.confidence - 0 (low conf) to 100 (high)
  36. ## self.point_size - Estimated size of font (units unclear)
  37. ## self.formatting - Bit flags for formatting and layout information
  38. ## """
  39. ## def __init__(self, line):
  40. ## data = line.split(' ')
  41. ## self.letter = data[0]
  42. ## self.char_code = int(data[1], 16)
  43. ## self.x_bounds = (data[2], data[4])
  44. ## self.y_bounds = (data[5], data[3])
  45. ## self.font_index = data[6]
  46. ## self.confidence = data[7]
  47. ## self.point_size = data[8]
  48. ## self.formatting = data[9]
  49. ## def __str__(self):
  50. ## return self.letter
  51. # This simple class is used for now:
  52. class OCR_character:
  53. """Object exposing internals of Tesseract result for particular characters
  54. (See documentation of EANYCODE_CHAR
  55. http://tesseract-ocr.googlecode.com/svn&cs_f=trunk/ccutil/ocrclass.h
  56. for detailed explanations)
  57. self.letter - OCRed letter guess
  58. self.x_bounds - (left bound, right bound)
  59. self.y_bounds - (top bound, bottom bound)
  60. """
  61. def __init__(self, line):
  62. parse_re = re.compile(r'^(.).*\((.+),(.+)\).*\((.+),(.+)\)') # Match example 'T[54]->[54](35,115)->(56,90)'
  63. data = parse_re.findall(line)[0]
  64. self.letter = data[0]
  65. self.x_bounds = (int(data[1]), int(data[3]))
  66. self.y_bounds = (int(data[2]), int(data[4]))
  67. class OCR_result(str):
  68. """Parsed results of call to Tesseract; subclass of 'str'.
  69. self OCR string.
  70. self.internals is array (aligned with self.text) of OCR_letter
  71. internal data objects (for characters) or None (for whitespace,
  72. since Tesseract provides no internal data for whitespace characters)."""
  73. def __new__(self, text):
  74. raw_letters = []
  75. internals = []
  76. data = text.split('\n')
  77. i = 0
  78. while i<len(data):
  79. line = data[i].strip()
  80. if line=='<nl>': # New line
  81. raw_letters.append('\n')
  82. internals.append(None)
  83. i += 1
  84. elif line=='<para>': # End of input
  85. break
  86. elif line=='': # Space character
  87. raw_letters.append(' ')
  88. internals.append(None)
  89. else:
  90. character = OCR_character(line)
  91. raw_letters.append(character.letter)
  92. internals.append(character)
  93. i += 1
  94. self = str.__new__(self, "".join(raw_letters))
  95. self.internals = internals
  96. return self
  97. def retrieve_result(scratch_text_name_root):
  98. text = retrieve_text(scratch_text_name_root)
  99. return OCR_result(text)
  100. def perform_cleanup(scratch_image_name, scratch_text_name_root):
  101. """Clean up temporary files from disk"""
  102. if _add_dot_txt_flag:
  103. scratch_text_name = scratch_text_name_root + '.txt'
  104. else:
  105. scratch_text_name = scratch_text_name_root
  106. for name in (scratch_image_name, scratch_text_name, "tesseract.log"):
  107. try:
  108. os.remove(name)
  109. except OSError:
  110. pass