
/dev-tools/scripts/create_line_file_docs.py

http://github.com/apache/lucene-solr
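# Builds 'line file' test documents from the Europarl v7 corpus: each chapter
# is flattened into one-line docs of roughly TARGET_DOC_CHARS characters
# (title <TAB> date <TAB> body), shuffled, sampled into 20 MB / 200 MB /
# 2 GB files, and gzip-compressed with seek points for random access.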
import os
import gzip
import time
import random
import re
import urllib.request
import subprocess
import tempfile
import shutil

# DEBUG reuses a fixed tmp dir and skips the untar and cleanup steps:
DEBUG = False

# target size, in characters, of each emitted doc:
TARGET_DOC_CHARS = 1024
def compress_with_seek_points(file_name_in, file_name_out, num_seek_points):

  bytes_per_chunk = os.path.getsize(file_name_in) / num_seek_points

  seek_points = []

  if os.path.exists(file_name_out):
    os.remove(file_name_out)

  with open(file_name_in, 'rb') as f_in:

    f_out = None
    bytes_in_chunk = 0
    chunk_count = 0

    while True:
      if f_out is None:
        # each chunk is appended as its own gzip member; record where it starts:
        if os.path.exists(file_name_out):
          seek_points.append(os.path.getsize(file_name_out))
          print('  create chunk %s at pos=%s' % (chunk_count, seek_points[-1]))
        else:
          print('  create chunk %s at pos=0' % chunk_count)
        f_out = gzip.open(file_name_out, 'ab')
        chunk_count += 1

      line = f_in.readline()
      if len(line) == 0:
        break

      bytes_in_chunk += len(line)
      f_out.write(line)

      # roll over to a new chunk once this one is full:
      if bytes_in_chunk > bytes_per_chunk and chunk_count < num_seek_points:
        f_out.close()
        f_out = None
        bytes_in_chunk = 0

    if f_out is not None:
      # flush and close the final chunk:
      f_out.close()

  # companion .seek file lists the byte offset of every chunk after the first:
  with open(file_name_out[:-3] + '.seek', 'w') as f_out:
    for seek_point in seek_points:
      f_out.write('%d\n' % seek_point)
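# Minimal usage sketch (not part of the original script): because each chunk is
# written as its own gzip member, a reader can seek straight to a recorded
# offset and decompress just that chunk. The helper name read_chunk_lines and
# its behavior are assumptions built on the format above, not an existing API.
def read_chunk_lines(file_name_gz, chunk_index):
  # chunk 0 starts at offset 0 and is not listed in the .seek file:
  with open(file_name_gz[:-3] + '.seek', 'r') as f:
    seek_points = [0] + [int(x) for x in f.read().split()]
  with open(file_name_gz, 'rb') as f:
    f.seek(seek_points[chunk_index])
    if chunk_index + 1 < len(seek_points):
      compressed = f.read(seek_points[chunk_index + 1] - seek_points[chunk_index])
    else:
      # the last chunk runs to end of file:
      compressed = f.read()
  return gzip.decompress(compressed).splitlines()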
re_tag = re.compile(r'<[^>]+?>')
re_newlines = re.compile(r'\n+')
re_space = re.compile(r'\s')

# used to find a word break, for splitting docs into ~1 KB sized smaller docs:
re_next_non_word_character = re.compile(r'\W', re.U)

EUROPARL_V7_URL = 'https://www.statmt.org/europarl/v7/europarl.tgz'
def split_docs(all_out, title_string, date_string, body_string):

  '''
  Splits a body into smallish (~1 KB) docs, repeating the same title and date
  '''

  doc_count = 0

  while len(body_string) > 0:
    # draw the fragment size from N(TARGET_DOC_CHARS, TARGET_DOC_CHARS/4),
    # re-drawing small outliers (a crude truncated normal):
    char_count = int(random.gauss(TARGET_DOC_CHARS, TARGET_DOC_CHARS / 4))
    if char_count < 64:
      continue

    # extend to the next non-word character so we split on a word boundary:
    m = re_next_non_word_character.search(body_string, char_count)
    if m is not None:
      char_count = m.start(0)
    else:
      char_count = len(body_string)

    body_string_fragment = body_string[:char_count].strip()

    #print('write title %d, body %d' % (len(title_string), len(body_string_fragment)))
    all_out.write('%s\t%s\t%s\n' % (title_string, date_string, body_string_fragment))
    body_string = body_string[char_count:]
    doc_count += 1

  return doc_count
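# Quick sanity-check sketch (not part of the original script): split_docs
# emits one 'title<TAB>date<TAB>body' line per ~1 KB fragment, so it can be
# exercised against an in-memory buffer. _demo_split_docs is hypothetical.
def _demo_split_docs():
  import io
  buf = io.StringIO()
  count = split_docs(buf, 'Sample title', '2001-01-15', 'word ' * 2000)
  print('%d fragments; first: %r' % (count, buf.getvalue().splitlines()[0][:60]))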
def sample_europarl():

  # download europarl.tgz v7, if not already here (in cwd):
  file_name = 'europarl.tgz'
  if not os.path.exists(file_name):
    print('Download %s to %s...' % (EUROPARL_V7_URL, file_name))
    urllib.request.urlretrieve(EUROPARL_V7_URL, file_name + '.tmp')
    os.rename(file_name + '.tmp', file_name)
  else:
    print('%s already here; skipping download...' % file_name)

  if not DEBUG:
    tmp_dir_path = tempfile.mkdtemp()
  else:
    # DEBUG mode reuses this fixed directory from a prior run:
    tmp_dir_path = '/tmp/tmp31ekzg75'

  print('Using tmp dir "%s"...' % tmp_dir_path)
  try:
    if not DEBUG:
      cmd = 'tar xzf %s -C %s' % (file_name, tmp_dir_path)
      print('Run: %s' % cmd)
      subprocess.run(cmd, shell=True)

    doc_count = 0
    skip_count = 0
    file_count = 0

    all_txt_file_name = '%s/all.txt' % tmp_dir_path

    print('Extract text...')

    start_time = time.time()
    next_print_time = start_time + 3

    # normalize text a bit and concatenate all lines into a single file, counting total lines/bytes:
    with open(all_txt_file_name, 'w', encoding='utf-8') as all_out:
      for dir_path, dir_names, file_names in os.walk('%s/txt' % tmp_dir_path):
        for file_name in file_names:
          if file_name.endswith('.txt'):
            file_count += 1

            # file names look like ep-00-01-17.txt; pivot the 2-digit year at 50:
            year, month, day = (int(x) for x in file_name[3:-4].split('-')[:3])
            if year >= 50:
              year = 1900 + year
            else:
              year = 2000 + year

            date_string = '%04d-%02d-%02d' % (year, month, day)

            # unfortunately we need errors='ignore' since in Europarl v7, one file (pl/ep-09-10-22-009.txt) has invalid utf-8:
            chapter_count = 0
            with open('%s/%s' % (dir_path, file_name), 'r', encoding='utf-8', errors='ignore') as f_in:
              last_text = []
              last_title = None
              while True:
                line = f_in.readline()
                if line == '':
                  break
                line = line.strip()
                if line.startswith('<CHAPTER '):
                  # flush the previous chapter, if any:
                  if last_title is not None:
                    s = ' '.join(last_text)
                    s = re_tag.sub(' ', s)
                    s = re_newlines.sub(' ', s)
                    s = s.strip()
                    if len(s) > 0:
                      doc_count += split_docs(all_out, last_title, date_string, s)
                    else:
                      skip_count += 1
                    last_text = []
                    chapter_count += 1

                  # the chapter title is the next non-empty, tag-stripped line:
                  while True:
                    last_title = f_in.readline()
                    if last_title == '':
                      last_title = None
                      break
                    last_title = re_tag.sub(' ', last_title).strip()
                    if len(last_title) > 0:
                      break

                  continue
                else:
                  last_text.append(line)

              # flush the final chapter:
              if last_title is not None:
                s = ' '.join(last_text)
                s = re_tag.sub(' ', s)
                s = re_newlines.sub(' ', s)
                s = s.strip()
                if len(s) > 0:
                  doc_count += split_docs(all_out, last_title, date_string, s)
                else:
                  skip_count += 1
                chapter_count += 1
              else:
                skip_count += 1

            if chapter_count > 0:
              #print('%s/%s: %d chapters' % (dir_path, file_name, chapter_count))
              pass

            now = time.time()
            if now > next_print_time:
              print('%4.1fs: keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' %
                    (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
                     100 * (file_count - skip_count) / file_count,
                     doc_count / 1000000, all_out.tell() / 1024 / 1024 / 1024))
              while next_print_time < now:
                next_print_time += 3

    total_mb = os.path.getsize(all_txt_file_name) / 1024 / 1024

    now = time.time()
    print('%4.1fs (done): keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' %
          (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
           100 * (file_count - skip_count) / file_count,
           doc_count / 1000000, os.path.getsize(all_txt_file_name) / 1024 / 1024 / 1024))

    print('Shuffle...')
    subprocess.run('shuf %s > %s.shuffled' % (all_txt_file_name, all_txt_file_name), shell=True)

    for mb in (20, 200, 2000):
      print('Sample %d MB file...' % mb)
      file_name_out = '%dmb.txt' % mb
      with open(file_name_out, 'w', encoding='utf-8') as f_out:

        # keep each (already shuffled) line with probability size_wanted/size_total:
        chance = mb / total_mb

        with open(all_txt_file_name + '.shuffled', 'r', encoding='utf-8') as f:
          while True:
            line = f.readline()
            if len(line) == 0:
              break
            if random.random() <= chance:
              f_out.write(line)

      print('  got %.2f MB' % (os.path.getsize(file_name_out) / 1024 / 1024))

      compress_with_seek_points(file_name_out,
                                file_name_out + '.gz',
                                mb)

  finally:
    print('Removing tmp dir "%s"...' % tmp_dir_path)
    if not DEBUG:
      shutil.rmtree(tmp_dir_path)

  print('\nWARNING: left ./europarl.tgz, which you should delete if you do not want it!\n')

if False:
  # flip to True to re-run only the compression step on an existing line file:
  compress_with_seek_points('/x/tmp/europarl.lines.txt',
                            '/x/tmp/foo.txt.gz',
                            16)
else:
  sample_europarl()
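# Running this module directly (python3 create_line_file_docs.py) performs the
# whole pipeline: download europarl.tgz into the current working directory,
# then write 20mb.txt / 200mb.txt / 2000mb.txt plus their .gz and .seek
# companions alongside it.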