
/wav2vec_cycle_code/fairseq/examples/criss/mining/mine.py

https://gitlab.com/lwd17/enhanced_examplar_ae
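The file below is fairseq's CRISS bitext-mining script (examples/criss/mining/mine.py). It loads pre-computed sentence embeddings stored as raw float32 shards named all_avg_pool.<lang>* (with matching sentences.<lang>* text files), runs a sharded GPU k-nearest-neighbour search with Faiss in both directions, scores candidate pairs with a ratio margin, and writes the mined pairs plus a train/valid split to the output directory.

A minimal invocation sketch, assuming the embedding and sentence shards already exist; the directories and the si/en language codes are placeholders, and the numeric values simply repeat the argparse defaults defined below:

    python examples/criss/mining/mine.py \
        --src-lang si --tgt-lang en \
        --src-dir /path/to/si_embeddings --tgt-dir /path/to/en_embeddings \
        --dim 1024 --neighborhood 4 --threshold 1.06 \
        --min-count 50000 --valid-size 2000 \
        --output /path/to/mined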
#!/usr/bin/env python3 -u
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import glob
from subprocess import check_call

try:
    import faiss

    has_faiss = True
except ImportError:
    has_faiss = False
import numpy as np

GB = 1024 * 1024 * 1024

def call(cmd):
    # Echo and run a shell command, raising on non-zero exit.
    print(cmd)
    check_call(cmd, shell=True)


def get_batches(directory, lang, prefix="all_avg_pool"):
    # Collect the embedding shards for `lang` together with their matching sentence files.
    print(f"Finding in {directory}/{prefix}.{lang}*")
    files = glob.glob(f"{directory}/{prefix}.{lang}*")
    emb_files = []
    txt_files = []
    for emb_fi in files:
        emb_files.append(emb_fi)
        txt_fi = emb_fi.replace(prefix, "sentences")
        txt_files.append(txt_fi)
    return emb_files, txt_files


def load_batch(emb_file, dim):
    # Embeddings are stored as raw float32; reshape to (num_rows, dim) and
    # L2-normalize so that inner product equals cosine similarity.
    embeddings = np.fromfile(emb_file, dtype=np.float32)
    num_rows = int(embeddings.shape[0] / dim)
    embeddings = embeddings.reshape((num_rows, dim))
    faiss.normalize_L2(embeddings)
    return embeddings

def knnGPU_sharded(x_batches_f, y_batches_f, dim, k, direction="x2y"):
    # Sharded k-NN search: for every x shard, build a GPU IndexFlatIP over each
    # y shard in turn, search it, then merge the per-shard results and keep the
    # overall top-k neighbors (offsets are added so indices are global).
    if not has_faiss:
        raise ImportError("Please install Faiss")
    sims = []
    inds = []
    xfrom = 0
    xto = 0
    for x_batch_f in x_batches_f:
        yfrom = 0
        yto = 0
        x_batch = load_batch(x_batch_f, dim)
        xto = xfrom + x_batch.shape[0]
        bsims, binds = [], []
        for y_batch_f in y_batches_f:
            y_batch = load_batch(y_batch_f, dim)
            neighbor_size = min(k, y_batch.shape[0])
            yto = yfrom + y_batch.shape[0]
            print("{}-{} -> {}-{}".format(xfrom, xto, yfrom, yto))
            idx = faiss.IndexFlatIP(dim)
            idx = faiss.index_cpu_to_all_gpus(idx)
            idx.add(y_batch)
            bsim, bind = idx.search(x_batch, neighbor_size)
            bsims.append(bsim)
            binds.append(bind + yfrom)
            yfrom += y_batch.shape[0]
            del idx
            del y_batch
        bsims = np.concatenate(bsims, axis=1)
        binds = np.concatenate(binds, axis=1)
        aux = np.argsort(-bsims, axis=1)
        sim_batch = np.zeros((x_batch.shape[0], k), dtype=np.float32)
        ind_batch = np.zeros((x_batch.shape[0], k), dtype=np.int64)
        for i in range(x_batch.shape[0]):
            for j in range(k):
                sim_batch[i, j] = bsims[i, aux[i, j]]
                ind_batch[i, j] = binds[i, aux[i, j]]
        sims.append(sim_batch)
        inds.append(ind_batch)
        xfrom += x_batch.shape[0]
        del x_batch
    sim = np.concatenate(sims, axis=0)
    ind = np.concatenate(inds, axis=0)
    return sim, ind

def score(sim, fwd_mean, bwd_mean, margin):
    # Margin scoring: compare the pair similarity against the average of the
    # forward and backward k-NN mean similarities.
    return margin(sim, (fwd_mean + bwd_mean) / 2)


def score_candidates(
    sim_mat, candidate_inds, fwd_mean, bwd_mean, margin, verbose=False
):
    print(" - scoring {:d} candidates".format(sim_mat.shape[0]))
    scores = np.zeros(candidate_inds.shape)
    for i in range(scores.shape[0]):
        for j in range(scores.shape[1]):
            k = int(candidate_inds[i, j])
            scores[i, j] = score(sim_mat[i, j], fwd_mean[i], bwd_mean[k], margin)
    return scores


def load_text(files):
    all_sentences = []
    for fi in files:
        with open(fi) as sentence_fi:
            for line in sentence_fi:
                all_sentences.append(line.strip())
    print(f"Read {len(all_sentences)} sentences")
    return all_sentences

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Mine bitext")
    parser.add_argument("--src-lang", help="Source language")
    parser.add_argument("--tgt-lang", help="Target language")
    parser.add_argument(
        "--dict-path", help="Path to dictionary file", default="dict.txt"
    )
    parser.add_argument(
        "--spm-path", help="Path to SPM model file", default="sentence.bpe.model"
    )
    parser.add_argument("--dim", type=int, default=1024, help="Embedding dimension")
    parser.add_argument("--mem", type=int, default=5, help="Memory in GB")
    parser.add_argument("--src-dir", help="Source directory")
    parser.add_argument("--tgt-dir", help="Target directory")
    parser.add_argument("--output", help="Output path")
    parser.add_argument(
        "--neighborhood", type=int, default=4, help="Number of nearest neighbors (k)"
    )
    parser.add_argument(
        "--threshold", type=float, default=1.06, help="Threshold on mined bitext"
    )
    parser.add_argument(
        "--valid-size",
        type=int,
        default=2000,
        help="Number of sentences used for validation set",
    )
    parser.add_argument(
        "--min-count",
        type=int,
        default=50000,
        help="Min num sentences used for each language",
    )
    args = parser.parse_args()

    x_batches_f, x_sents_f = get_batches(args.src_dir, args.src_lang)
    y_batches_f, y_sents_f = get_batches(args.tgt_dir, args.tgt_lang)
    # Ratio margin: candidate similarity divided by the average neighborhood similarity.
    margin = lambda a, b: a / b
    y2x_sim, y2x_ind = knnGPU_sharded(
        y_batches_f, x_batches_f, args.dim, args.neighborhood, direction="y2x"
    )
    x2y_sim, x2y_ind = knnGPU_sharded(
        x_batches_f, y_batches_f, args.dim, args.neighborhood, direction="x2y"
    )
    x2y_mean = x2y_sim.mean(axis=1)
    y2x_mean = y2x_sim.mean(axis=1)
    fwd_scores = score_candidates(x2y_sim, x2y_ind, x2y_mean, y2x_mean, margin)
    bwd_scores = score_candidates(y2x_sim, y2x_ind, y2x_mean, x2y_mean, margin)
    # Best candidate in each direction, then the union of forward and backward pairs.
    fwd_best = x2y_ind[np.arange(x2y_sim.shape[0]), fwd_scores.argmax(axis=1)]
    bwd_best = y2x_ind[np.arange(y2x_sim.shape[0]), bwd_scores.argmax(axis=1)]
    indices = np.stack(
        (
            np.concatenate((np.arange(x2y_ind.shape[0]), bwd_best)),
            np.concatenate((fwd_best, np.arange(y2x_ind.shape[0]))),
        ),
        axis=1,
    )
    scores = np.concatenate((fwd_scores.max(axis=1), bwd_scores.max(axis=1)))

    x_sentences = load_text(x_sents_f)
    y_sentences = load_text(y_sents_f)
    threshold = args.threshold
    min_count = args.min_count
    seen_src, seen_trg = set(), set()
    directory = args.output
    call(f"mkdir -p {directory}")
    src_out = open(
        f"{directory}/all.{args.src_lang}",
        mode="w",
        encoding="utf-8",
        errors="surrogateescape",
    )
    tgt_out = open(
        f"{directory}/all.{args.tgt_lang}",
        mode="w",
        encoding="utf-8",
        errors="surrogateescape",
    )
    scores_out = open(
        f"{directory}/all.scores", mode="w", encoding="utf-8", errors="surrogateescape"
    )
    count = 0
    # Greedily accept pairs in descending score order, keeping each source and
    # target sentence at most once; below-threshold pairs are accepted only while
    # the mined set is still smaller than --min-count.
    for i in np.argsort(-scores):
        src_ind, trg_ind = indices[i]
        if src_ind not in seen_src and trg_ind not in seen_trg:
            seen_src.add(src_ind)
            seen_trg.add(trg_ind)
            if scores[i] > threshold or count < min_count:
                if x_sentences[src_ind]:
                    print(scores[i], file=scores_out)
                    print(x_sentences[src_ind], file=src_out)
                    print(y_sentences[trg_ind], file=tgt_out)
                    count += 1
                else:
                    print(f"Ignoring sentence: {x_sentences[src_ind]}")
    src_out.close()
    tgt_out.close()
    scores_out.close()
    print(f"Found {count} pairs for threshold={threshold}")

    with open(f"{directory}/all.{args.src_lang}") as all_s, open(
        f"{directory}/all.{args.tgt_lang}"
    ) as all_t, open(f"{directory}/valid.{args.src_lang}", "w") as valid_s, open(
        f"{directory}/valid.{args.tgt_lang}", "w"
    ) as valid_t, open(
        f"{directory}/train.{args.src_lang}", "w"
    ) as train_s, open(
        f"{directory}/train.{args.tgt_lang}", "w"
    ) as train_t:
        count = 0
        for s_line, t_line in zip(all_s, all_t):
            s_line = s_line.split("\t")[1]
            t_line = t_line.split("\t")[1]
            if count >= args.valid_size:
                train_s.write(s_line)
                train_t.write(t_line)
            else:
                valid_s.write(s_line)
                valid_t.write(t_line)
            count += 1
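
For reference, the scoring above is a plain ratio margin: a candidate pair's cosine similarity divided by the average of its forward and backward k-NN mean similarities (margin = lambda a, b: a / b, applied inside score()). A tiny standalone sketch with made-up numbers, kept deliberately separate from the script:

    # Illustrative values only; they are not taken from any real run.
    sim = 0.82        # cosine similarity of a candidate pair (x, y)
    fwd_mean = 0.70   # mean similarity of x to its k nearest y neighbors
    bwd_mean = 0.66   # mean similarity of y to its k nearest x neighbors
    margin = lambda a, b: a / b
    print(margin(sim, (fwd_mean + bwd_mean) / 2))  # ~1.206, above the default 1.06 threshold, so the pair would be kept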