# Tool locations, all resolved relative to ${path}.
# NOTE(review): ${path} is not assigned in this section; it is assumed to be
# set earlier in the file or by the caller -- confirm.
MOSES_SCRIPT="${path}/mosesdecoder-RELEASE-4.0/scripts"
SCRIPT_DIR="${path}/script.converter.distribution"
BPE_PATH="${path}/subword-nmt.wo-codec"

# All BPE outputs are written to ./corpus.bpe; inputs are read from the
# sibling directory ./corpus.tok. -p keeps a re-run from failing when the
# directory already exists.
mkdir -p corpus.bpe
# Stop if the directory cannot be entered; otherwise every output below would
# be written into the wrong directory.
cd corpus.bpe || exit 1

# Building a BPE model
# One merge table (codes_file) is learned from the concatenated source and
# target training data.
cat ../corpus.tok/train.src ../corpus.tok/train.tgt \
  | "${BPE_PATH}/learn_bpe.py" -s 100000 -o codes_file

# Per-side vocabularies of the BPE-segmented training data; they feed the
# --vocabulary filter used when applying the model below.
"${BPE_PATH}/apply_bpe.py" -c codes_file < ../corpus.tok/train.src \
  | "${BPE_PATH}/get_vocab.py" > vocab.src
"${BPE_PATH}/apply_bpe.py" -c codes_file < ../corpus.tok/train.tgt \
  | "${BPE_PATH}/get_vocab.py" > vocab.tgt

# Applying the BPE model
# The merge table read here must be the one written by learn_bpe.py above
# (codes_file).
for name in train dev test; do
  "${BPE_PATH}/apply_bpe.py" -c codes_file \
    --vocabulary vocab.src --vocabulary-threshold 10 \
    < "../corpus.tok/${name}.src" > "${name}.src"
  "${BPE_PATH}/apply_bpe.py" -c codes_file \
    --vocabulary vocab.tgt --vocabulary-threshold 10 \
    < "../corpus.tok/${name}.tgt" > "${name}.tgt"
done

# Return to the directory the section started in.
cd ..
# NICT (National Institute of Information and Communications Technology)
# Kyoto University
# Last Modified: 2019-07-22