File size: 3,796 Bytes
c9bb3f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Use this script in order to build symmetric alignments for your translation
dataset.
This script depends on fast_align and mosesdecoder tools. You will need to
build those before running the script.
fast_align:
    github: http://github.com/clab/fast_align
    instructions: follow the instructions in README.md
mosesdecoder:
    github: http://github.com/moses-smt/mosesdecoder
    instructions: http://www.statmt.org/moses/?n=Development.GetStarted
The script produces the following files under --output_dir:
    text.joined - concatenation of lines from the source_file and the
    target_file.
    align.forward - forward pass of fast_align.
    align.backward - backward pass of fast_align.
    aligned.sym_heuristic - symmetrized alignment.
"""

import argparse
import os
import subprocess
from itertools import zip_longest


def _write_joined_file(source_file, target_file, joined_file):
    """Write one ``source ||| target`` line per sentence pair to *joined_file*.

    Each line of *source_file* is paired with the line at the same position
    in *target_file*; surrounding whitespace is stripped from both sides.

    Raises:
        ValueError: if the two input files do not have the same number of
            lines, since the pairs past that point would be meaningless.
    """
    with open(source_file, "r", encoding="utf-8") as src, open(
        target_file, "r", encoding="utf-8"
    ) as tgt, open(joined_file, "w", encoding="utf-8") as joined:
        # zip_longest pads the shorter file with None, which lets us detect
        # a length mismatch instead of silently truncating like zip().
        for line_no, (s, t) in enumerate(zip_longest(src, tgt), start=1):
            if s is None or t is None:
                shorter = source_file if s is None else target_file
                raise ValueError(
                    "source and target files differ in length: "
                    "{} has no line {}".format(shorter, line_no)
                )
            print("{} ||| {}".format(s.strip(), t.strip()), file=joined)


def _run_to_file(cmd, out_path):
    """Run *cmd* (an argument list) with its stdout redirected to *out_path*.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
    """
    with open(out_path, "wb") as out:
        subprocess.run(cmd, stdout=out, check=True)


def main():
    """Build forward, backward and symmetrized alignments for a parallel corpus.

    Reads its configuration from the command line, then:

    1. writes ``text.joined`` (``source ||| target`` pairs) to the output dir,
    2. runs fast_align on it to produce ``align.forward``,
    3. runs fast_align with ``-r`` to produce ``align.backward``,
    4. runs ``symmetrize-fast-align.perl`` from mosesdecoder on both
       alignments, using ``<output_dir>/aligned`` as the output prefix.

    Raises:
        ValueError: if the source and target files differ in line count.
        subprocess.CalledProcessError: if any external tool exits non-zero.
    """
    parser = argparse.ArgumentParser(description="symmetric alignment builer")
    # fmt: off
    parser.add_argument('--fast_align_dir', required=True,
                        help='path to fast_align build directory')
    parser.add_argument('--mosesdecoder_dir', required=True,
                        help='path to mosesdecoder root directory')
    parser.add_argument('--sym_heuristic',
                        help='heuristic to use for symmetrization',
                        default='grow-diag-final-and')
    parser.add_argument('--source_file', required=True,
                        help='path to a file with sentences '
                             'in the source language')
    parser.add_argument('--target_file', required=True,
                        help='path to a file with sentences '
                             'in the target language')
    parser.add_argument('--output_dir', required=True,
                        help='output directory')
    # fmt: on
    args = parser.parse_args()

    fast_align_bin = os.path.join(args.fast_align_dir, "fast_align")
    symal_bin = os.path.join(args.mosesdecoder_dir, "bin", "symal")
    sym_fast_align_bin = os.path.join(
        args.mosesdecoder_dir, "scripts", "ems", "support", "symmetrize-fast-align.perl"
    )

    # Make sure there is somewhere to write before producing any output.
    os.makedirs(args.output_dir, exist_ok=True)

    # create joined file
    joined_file = os.path.join(args.output_dir, "text.joined")
    _write_joined_file(args.source_file, args.target_file, joined_file)

    fwd_align_file = os.path.join(args.output_dir, "align.forward")
    bwd_align_file = os.path.join(args.output_dir, "align.backward")

    # Commands are passed as argument lists (no shell) so that paths with
    # spaces or shell metacharacters are handled correctly, and failures are
    # raised explicitly rather than via `assert`, which `python -O` strips.
    fast_align_cmd = [fast_align_bin, "-i", joined_file, "-d", "-o", "-v"]

    # run forward alignment
    _run_to_file(fast_align_cmd, fwd_align_file)

    # run backward alignment (-r reverses the alignment direction)
    _run_to_file(fast_align_cmd + ["-r"], bwd_align_file)

    # run symmetrization
    sym_out_file = os.path.join(args.output_dir, "aligned")
    subprocess.run(
        [
            sym_fast_align_bin,
            fwd_align_file,
            bwd_align_file,
            args.source_file,
            args.target_file,
            sym_out_file,
            args.sym_heuristic,
            symal_bin,
        ],
        check=True,
    )


# Run the alignment pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()