/dev-tools/scripts/create_line_file_docs.py
Python | 247 lines | 203 code | 35 blank | 9 comment | 32 complexity | 28c15d4562b0c3c6481200446918cd51 MD5 | raw file
Possible License(s): LGPL-2.1, CPL-1.0, MPL-2.0-no-copyleft-exception, JSON, Apache-2.0, AGPL-1.0, GPL-2.0, GPL-3.0, MIT, BSD-3-Clause
- import os
- import gzip
- import time
- import random
- import re
- import urllib.request
- import subprocess
- import tempfile
- import shutil
# When True, sample_europarl() works in a fixed temp directory, does not unpack
# the archive into it, and does not delete the directory when it finishes.
DEBUG = False

# Mean size, in characters, of the documents that split_docs() produces.
TARGET_DOC_CHARS = 1024
def compress_with_seek_points(file_name_in, file_name_out, num_seek_points):
    """Gzip a line-oriented file as several concatenated gzip members.

    The input is copied line by line into ``file_name_out``.  Whenever the
    current member has received more than ``size(input) / num_seek_points``
    uncompressed bytes, and fewer than ``num_seek_points`` members exist so
    far, the member is closed and a new one is appended.  Members therefore
    always start on a line boundary.

    The compressed byte offset at which each member after the first begins is
    written, one decimal number per line, to a sidecar file whose name is
    ``file_name_out`` with its last three characters (expected to be ``.gz``)
    replaced by ``.seek``.

    Args:
        file_name_in: path of the uncompressed input file.
        file_name_out: path of the gzip file to create; an existing file is
            removed first.
        num_seek_points: maximum number of gzip members to create.  Must be
            non-zero, because the input size is divided by it.
    """
    bytes_per_chunk = os.path.getsize(file_name_in) / num_seek_points
    seek_points = []

    # Members are appended ('ab'), so a stale output file must not survive.
    if os.path.exists(file_name_out):
        os.remove(file_name_out)

    gz_out = None
    bytes_in_chunk = 0
    chunk_count = 0
    try:
        with open(file_name_in, 'rb') as f_in:
            while True:
                if gz_out is None:
                    if os.path.exists(file_name_out):
                        # The previous member is closed, so the current file
                        # size is exactly where the next member will begin.
                        seek_points.append(os.path.getsize(file_name_out))
                        print(' create chunk %s at pos=%s' % (chunk_count, seek_points[-1]))
                    else:
                        print(' create chunk %s at pos=0' % chunk_count)
                    gz_out = gzip.open(file_name_out, 'ab')
                    chunk_count += 1

                line = f_in.readline()
                if not line:
                    break

                bytes_in_chunk += len(line)
                gz_out.write(line)

                if bytes_in_chunk > bytes_per_chunk and chunk_count < num_seek_points:
                    gz_out.close()
                    gz_out = None
                    bytes_in_chunk = 0
    finally:
        # Close the last (or, on error, the current) member explicitly so its
        # gzip trailer is on disk before we return, instead of relying on the
        # handle being garbage-collected.
        if gz_out is not None:
            gz_out.close()

    with open(file_name_out[:-3] + '.seek', 'w') as seek_out:
        for seek_point in seek_points:
            seek_out.write('%d\n' % seek_point)
# Matches one markup tag such as <P> or <CHAPTER ID=1>.
re_tag = re.compile(r'<[^>]+?>')

# Matches a run of one or more newlines.
re_newlines = re.compile(r'\n+')

# Matches a single whitespace character.  Raw string: '\s' in a normal string
# literal is an invalid escape sequence that newer Pythons warn about.
re_space = re.compile(r'\s')

# used to find word break, for splitting docs into ~1 KB sized smaller docs:
re_next_non_word_character = re.compile(r'\W', re.U)

# Download location of the Europarl v7 archive used by sample_europarl().
EUROPARL_V7_URL = 'https://www.statmt.org/europarl/v7/europarl.tgz'
def split_docs(all_out, title_string, date_string, body_string):
    '''
    Cuts one body into pieces of roughly TARGET_DOC_CHARS characters and writes
    each piece to all_out as a "title<TAB>date<TAB>body" line, re-using the same
    title and date for every piece.

    Returns the number of lines written.
    '''
    written = 0
    remaining = body_string
    while remaining:
        # Piece length is drawn from a normal distribution centred on the target.
        target_len = int(random.gauss(TARGET_DOC_CHARS, TARGET_DOC_CHARS/4))
        if target_len < 64:
            # Draw is too small: reject it and draw again.
            continue

        # Cut at the first non-word character at or after the drawn length so
        # that words are not split; with no such character, take everything.
        boundary = re_next_non_word_character.search(remaining, target_len)
        cut = len(remaining) if boundary is None else boundary.start(0)

        piece = remaining[:cut].strip()
        all_out.write('%s\t%s\t%s\n' % (title_string, date_string, piece))
        remaining = remaining[cut:]
        written += 1

    return written
def _parse_europarl_date(txt_file_name):
    """Return the YYYY-MM-DD date encoded in a name such as ``ep-09-10-22-009.txt``."""
    year, month, day = (int(x) for x in txt_file_name[3:-4].split('-')[:3])
    # Two-digit years: 50 and above are read as 19xx, everything below as 20xx.
    year += 1900 if year >= 50 else 2000
    return '%04d-%02d-%02d' % (year, month, day)


def _read_chapter_title(f_in):
    """Return the next line that still has text after tag removal, or None at end of file."""
    while True:
        raw_line = f_in.readline()
        if raw_line == '':
            return None
        title = re_tag.sub(' ', raw_line).strip()
        if title:
            return title


def _emit_chapter(all_out, title, date_string, text_lines):
    """Normalize one chapter's text and write it via split_docs; return (docs_written, skipped)."""
    body = ' '.join(text_lines)
    body = re_tag.sub(' ', body)
    body = re_newlines.sub(' ', body)
    body = body.strip()
    if not body:
        return 0, 1
    return split_docs(all_out, title, date_string, body), 0


def _convert_europarl_file(path, date_string, all_out):
    """Write every titled chapter of one Europarl text file to all_out; return (docs_written, skipped)."""
    docs_written = 0
    skipped = 0
    title = None
    text_lines = []
    # unfortunately we need errors='ignore' since in Europarl v7, one file
    # (pl/ep-09-10-22-009.txt) has invalid utf-8:
    with open(path, 'r', encoding='utf-8', errors='ignore') as f_in:
        while True:
            line = f_in.readline()
            if line == '':
                break
            line = line.strip()
            if not line.startswith('<CHAPTER '):
                text_lines.append(line)
                continue

            # A new chapter begins: flush the one collected so far.  Text seen
            # before the first titled chapter is dropped.
            if title is not None:
                docs, skips = _emit_chapter(all_out, title, date_string, text_lines)
                docs_written += docs
                skipped += skips
            text_lines = []
            title = _read_chapter_title(f_in)

    if title is None:
        # Either the file had no chapter marker at all, or the last marker was
        # not followed by a title before end of file.
        skipped += 1
    else:
        docs, skips = _emit_chapter(all_out, title, date_string, text_lines)
        docs_written += docs
        skipped += skips
    return docs_written, skipped


def _kept_percentage(file_count, skip_count):
    """Return the percentage shown in the progress line; 0.0 while no file has been counted."""
    if file_count == 0:
        return 0.0
    return 100 * (file_count - skip_count) / file_count


def sample_europarl():
    """Build sampled line-file corpora from the Europarl v7 archive.

    Steps:
      1. Download ``europarl.tgz`` into the current directory unless it is
         already there.
      2. Unpack it into a temporary directory (skipped when DEBUG is set).
      3. Walk ``<tmp>/txt``, and for every ``*.txt`` file write each titled
         chapter, split into small docs by split_docs(), to ``<tmp>/all.txt``.
      4. Shuffle that file with ``shuf``.
      5. For 20, 200 and 2000 MB: keep each shuffled line with probability
         ``mb / total_mb``, write the result to ``<mb>mb.txt`` in the current
         directory and compress it with compress_with_seek_points().

    The temporary directory is removed afterwards unless DEBUG is set; the
    downloaded archive is left in place.

    Raises:
        subprocess.CalledProcessError: if ``tar`` or ``shuf`` exits non-zero.
        RuntimeError: if no document text was extracted.
    """
    # download europarl.tgz v7, if not already here (in cwd):
    archive_name = 'europarl.tgz'
    if not os.path.exists(archive_name):
        print('Download %s to %s...' % (EUROPARL_V7_URL, archive_name))
        # Download under a temporary name so an interrupted transfer is never
        # mistaken for a complete archive on the next run.
        urllib.request.urlretrieve(EUROPARL_V7_URL, archive_name + '.tmp')
        os.rename(archive_name + '.tmp', archive_name)
    else:
        print('%s already here; skipping download...' % archive_name)

    if not DEBUG:
        tmp_dir_path = tempfile.mkdtemp()
    else:
        tmp_dir_path = '/tmp/tmp31ekzg75'
    print('Using tmp dir "%s"...' % tmp_dir_path)

    try:
        if not DEBUG:
            # Argument list instead of a shell string: paths are passed
            # verbatim, and check=True stops here if extraction fails.
            tar_args = ['tar', 'xzf', archive_name, '-C', tmp_dir_path]
            print('Run: %s' % ' '.join(tar_args))
            subprocess.run(tar_args, check=True)

        doc_count = 0
        # NOTE(review): skip_count mixes files without a titled chapter and
        # chapters with empty text, so "keep X of Y files" is only approximate.
        skip_count = 0
        file_count = 0

        all_txt_file_name = '%s/all.txt' % tmp_dir_path

        print('Extract text...')

        start_time = time.time()
        next_print_time = start_time + 3

        # normalize text a bit and concatenate all lines into single file, counting total lines/bytes
        with open(all_txt_file_name, 'w', encoding='utf-8') as all_out:
            for dir_path, _dir_names, file_names in os.walk('%s/txt' % tmp_dir_path):
                for txt_name in file_names:
                    if not txt_name.endswith('.txt'):
                        continue
                    file_count += 1

                    date_string = _parse_europarl_date(txt_name)
                    docs, skips = _convert_europarl_file('%s/%s' % (dir_path, txt_name),
                                                         date_string, all_out)
                    doc_count += docs
                    skip_count += skips

                    # Report progress at most once every three seconds.
                    now = time.time()
                    if now > next_print_time:
                        print('%4.1fs: keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' % \
                              (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
                               _kept_percentage(file_count, skip_count),
                               doc_count / 1000000, all_out.tell() / 1024/1024/1024))
                        while next_print_time < now:
                            next_print_time += 3

        total_bytes = os.path.getsize(all_txt_file_name)
        total_mb = total_bytes/1024/1024
        now = time.time()
        print('%4.1fs (done): keep %.2f K of %.2f K files (%.1f%%), %.2f M docs, %.2f GB...' % \
              (now - start_time, (file_count - skip_count) / 1000, file_count / 1000,
               _kept_percentage(file_count, skip_count),
               doc_count / 1000000, total_bytes / 1024/1024/1024))

        if total_bytes == 0:
            # Without this the sampling below would divide by zero.
            raise RuntimeError('no document text was extracted from %s (searched %s/txt)'
                               % (archive_name, tmp_dir_path))

        print('Shuffle...')
        shuffled_file_name = all_txt_file_name + '.shuffled'
        with open(shuffled_file_name, 'wb') as shuffled_out:
            subprocess.run(['shuf', all_txt_file_name], stdout=shuffled_out, check=True)

        for mb in (20, 200, 2000):
            print('Sample %d MB file...' % mb)
            file_name_out = '%dmb.txt' % mb
            # Keeping each line with this probability yields roughly `mb` MB.
            chance = mb / total_mb
            with open(file_name_out, 'w', encoding='utf-8') as f_out:
                with open(shuffled_file_name, 'r', encoding='utf-8') as f:
                    for line in f:
                        if random.random() <= chance:
                            f_out.write(line)

            print(' got %.2f MB' % (os.path.getsize(file_name_out)/1024/1024))

            compress_with_seek_points(file_name_out,
                                      file_name_out + '.gz',
                                      mb)

    finally:
        print('Removing tmp dir "%s"...' % tmp_dir_path)
        if not DEBUG:
            shutil.rmtree(tmp_dir_path)

    print('\nWARNING: left ./europarl.tgz, which you should delete if you do not want it!\n')
if __name__ == '__main__':
    # Script entry point.  Guarded so that importing this module (for example
    # to call compress_with_seek_points() on an existing lines file) does not
    # start the download and sampling run.
    sample_europarl()