/dev-tools/scripts/create_line_file_docs.py

http://github.com/apache/lucene-solr · Python

# Creates "line file docs" for benchmarking: one document per line, in
# title \t date \t body form, sampled from the Europarl v7 parallel corpus.

import os
import gzip
import time
import random
import re
import urllib.request
import subprocess
import tempfile
import shutil

DEBUG = False

TARGET_DOC_CHARS = 1024
def compress_with_seek_points(file_name_in, file_name_out, num_seek_points):
  '''
  Compresses file_name_in to file_name_out as ~num_seek_points concatenated
  gzip members (one per chunk), recording the byte offset where each chunk
  after the first begins in a parallel .seek file, so readers can seek
  directly to a chunk and decompress only from there.
  '''

  bytes_per_chunk = os.path.getsize(file_name_in) / num_seek_points

  seek_points = []

  if os.path.exists(file_name_out):
    os.remove(file_name_out)

  with open(file_name_in, 'rb') as f_in:

    f_out = None
    bytes_in_chunk = 0
    chunk_count = 0

    while True:
      if f_out is None:
        if os.path.exists(file_name_out):
          seek_points.append(os.path.getsize(file_name_out))
          print('  create chunk %s at pos=%s' % (chunk_count, seek_points[-1]))
        else:
          print('  create chunk %s at pos=0' % chunk_count)
        # appending starts a fresh gzip member at the current end of file:
        f_out = gzip.open(file_name_out, 'ab')
        chunk_count += 1

      line = f_in.readline()
      if len(line) == 0:
        break

      bytes_in_chunk += len(line)
      f_out.write(line)

      if bytes_in_chunk > bytes_per_chunk and chunk_count < num_seek_points:
        # close this gzip member; the next write opens the next chunk:
        f_out.close()
        f_out = None
        bytes_in_chunk = 0

    if f_out is not None:
      # close the final member explicitly so its gzip trailer is flushed:
      f_out.close()

  # chunk 0 always starts at offset 0 and is implicit; the .seek file lists
  # the offsets of chunks 1..n-1:
  with open(file_name_out[:-3] + '.seek', 'w') as f_out:
    for seek_point in seek_points:
      f_out.write('%d\n' % seek_point)
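
# A minimal read-back sketch (illustration only, guarded so it never runs):
# each line of the .seek file is the byte offset where gzip member i+1
# begins (chunk 0 implicitly starts at offset 0), so a reader can seek
# straight to a chunk and decompress from there. The '20mb' file names are
# examples of what sample_europarl() below produces:
if False:
  with open('20mb.txt.seek') as f:
    seek_points = [int(x) for x in f]
  with open('20mb.txt.gz', 'rb') as f:
    f.seek(seek_points[2])                # jump to the start of chunk 3
    with gzip.GzipFile(fileobj=f) as g:   # decompress from that member onward
      print(g.readline())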
re_tag = re.compile(r'<[^>]+?>')
re_newlines = re.compile(r'\n+')
re_space = re.compile(r'\s')

# used to find a word break, for splitting docs into ~1 KB sized smaller docs:
re_next_non_word_character = re.compile(r'\W', re.U)

EUROPARL_V7_URL = 'https://www.statmt.org/europarl/v7/europarl.tgz'
def split_docs(all_out, title_string, date_string, body_string):
  '''
  Splits one document into smallish (~1 KB) docs, writing each as one
  title \t date \t body output line, repeating the same title and date.
  '''

  doc_count = 0
  while len(body_string) > 0:
    char_count = int(random.gauss(TARGET_DOC_CHARS, TARGET_DOC_CHARS / 4))
    if char_count < 64:
      # redraw: rejection-sample a truncated normal so no fragment is
      # shorter than 64 chars
      continue

    # extend to the next non-word character so we split on a word break:
    m = re_next_non_word_character.search(body_string, char_count)
    if m is not None:
      char_count = m.start(0)
    else:
      char_count = len(body_string)

    body_string_fragment = body_string[:char_count].strip()

    #print('write title %d, body %d' % (len(title_string), len(body_string_fragment)))
    all_out.write('%s\t%s\t%s\n' % (title_string, date_string, body_string_fragment))
    body_string = body_string[char_count:]
    doc_count += 1

  return doc_count
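
# A tiny usage sketch (illustration only, guarded so it never runs): split a
# made-up body into ~1 KB tab-separated docs, with io.StringIO standing in
# for the real output file:
if False:
  import io
  buf = io.StringIO()
  n = split_docs(buf, 'Some title', '2001-02-03', 'lorem ipsum ' * 500)
  print('%d docs' % n)
  print(buf.getvalue().splitlines()[0])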
def sample_europarl():

  # download europarl.tgz v7, if not already here (in cwd):
  file_name = 'europarl.tgz'
  if not os.path.exists(file_name):
    print('Download %s to %s...' % (EUROPARL_V7_URL, file_name))
    urllib.request.urlretrieve(EUROPARL_V7_URL, file_name + '.tmp')
    os.rename(file_name + '.tmp', file_name)
  else:
    print('%s already here; skipping download...' % file_name)

  if not DEBUG:
    tmp_dir_path = tempfile.mkdtemp()
  else:
    # reuse a fixed (developer-local) extraction dir while debugging:
    tmp_dir_path = '/tmp/tmp31ekzg75'

  print('Using tmp dir "%s"...' % tmp_dir_path)

  try:
    if not DEBUG:
      cmd = 'tar xzf %s -C %s' % (file_name, tmp_dir_path)
      print('Run: %s' % cmd)
      subprocess.run(cmd, shell=True)

    doc_count = 0
    skip_count = 0
    file_count = 0

    all_txt_file_name = '%s/all.txt' % tmp_dir_path

    print('Extract text...')

    start_time = time.time()
    next_print_time = start_time + 3

    # normalize text a bit and concatenate all lines into a single file,
    # counting total lines/bytes:
    with open(all_txt_file_name, 'w', encoding='utf-8') as all_out:

      for dir_path, dir_names, file_names in os.walk('%s/txt' % tmp_dir_path):
        for file_name in file_names:
          if file_name.endswith('.txt'):
            file_count += 1
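
            # Europarl file names look like 'ep-00-01-17.txt' (some carry a
            # trailing part number); parse YY-MM-DD and pivot the two-digit
            # year (>= 50 means 19xx, else 20xx):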
            year, month, day = (int(x) for x in file_name[3:-4].split('-')[:3])
            if year >= 50:
              year = 1900 + year
            else:
              year = 2000 + year

            date_string = '%04d-%02d-%02d' % (year, month, day)

            # unfortunately we need errors='ignore' since in Europarl v7,
            # one file (pl/ep-09-10-22-009.txt) has invalid UTF-8:
            chapter_count = 0
            with open('%s/%s' % (dir_path, file_name), 'r', encoding='utf-8', errors='ignore') as f_in:
              last_text = []
              last_title = None
              while True:
                line = f_in.readline()
                if line == '':
                  break
                line = line.strip()
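
                # a '<CHAPTER ...>' tag starts a new chapter: flush the text
                # collected for the previous chapter as one doc, then read
                # this chapter's title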
                if line.startswith('<CHAPTER '):
                  if last_title is not None:
                    s = ' '.join(last_text)
                    s = re_tag.sub(' ', s)
                    s = re_newlines.sub(' ', s)
                    s = s.strip()
                    if len(s) > 0:
                      doc_count += split_docs(all_out, last_title, date_string, s)
                    else:
                      skip_count += 1

                    last_text = []
                    chapter_count += 1
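
                  # the chapter title is the next non-empty line, after
                  # stripping tags: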
                  while True:
                    last_title = f_in.readline()
                    if last_title == '':
                      last_title = None
                      break
                    last_title = re_tag.sub(' ', last_title).strip()
                    if len(last_title) > 0:
                      break

                  continue
                else:
                  last_text.append(line)

              # flush the final chapter of this file:
              if last_title is not None:
                s = ' '.join(last_text)
                s = re_tag.sub(' ', s)
                s = re_newlines.sub(' ', s)
                s = s.strip()
                if len(s) > 0:
                  doc_count += split_docs(all_out, last_title, date_string, s)
                else:
                  skip_count += 1
                chapter_count += 1
              else:
                skip_count += 1

            if chapter_count > 0:
              #print('%s/%s: %d chapters' % (dir_path, file_name, chapter_count))
              pass

            now = time.time()
            if now > next_print_time:
              print('%4.1fs: keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' % \
                    (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
                     100 * (file_count - skip_count) / file_count,
                     doc_count / 1000000, all_out.tell() / 1024 / 1024 / 1024))
              while next_print_time < now:
                next_print_time += 3

    total_mb = os.path.getsize(all_txt_file_name) / 1024 / 1024

    now = time.time()
    print('%4.1fs (done): keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' % \
          (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
           100 * (file_count - skip_count) / file_count,
           doc_count / 1000000, os.path.getsize(all_txt_file_name) / 1024 / 1024 / 1024))

    print('Shuffle...')
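    # shuf (GNU coreutils) randomly permutes the lines, so the fixed-size
    # samples below are drawn uniformly from the whole corpus: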
    subprocess.run('shuf %s > %s.shuffled' % (all_txt_file_name, all_txt_file_name), shell=True)

    for mb in (20, 200, 2000):
      print('Sample %d MB file...' % mb)
      file_name_out = '%dmb.txt' % mb
      with open(file_name_out, 'w', encoding='utf-8') as f_out:
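
        # keep each line with probability mb/total_mb, so the expected
        # sample size is ~mb MB: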
        chance = mb / total_mb

        with open(all_txt_file_name + '.shuffled', 'r', encoding='utf-8') as f:
          while True:
            line = f.readline()
            if len(line) == 0:
              break
            if random.random() <= chance:
              f_out.write(line)

      print('  got %.2f MB' % (os.path.getsize(file_name_out) / 1024 / 1024))

      # one seek point per MB:
      compress_with_seek_points(file_name_out,
                                file_name_out + '.gz',
                                mb)
  finally:
    print('Removing tmp dir "%s"...' % tmp_dir_path)
    if not DEBUG:
      shutil.rmtree(tmp_dir_path)

  print('\nWARNING: left ./europarl.tgz, which you should delete if you do not want it!\n')

if False:
  # debugging toggle: flip to True to exercise compress_with_seek_points
  # alone (these paths are developer-local):
  compress_with_seek_points('/x/tmp/europarl.lines.txt',
                            '/x/tmp/foo.txt.gz',
                            16)
else:
  sample_europarl()
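
# When run as above, this leaves 20mb.txt, 200mb.txt and 2000mb.txt in the
# current directory, each alongside a chunked .gz and its .seek offsets file.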