# Tool locations used by the tokenization steps.
# NOTE(review): ${path} is not defined in the visible part of this file;
# it is presumably set by the caller to the directory holding both tools.
MOSES_SCRIPT="${path}/mosesdecoder-RELEASE-4.0/scripts"
SCRIPT_DIR="${path}/script.converter.distribution"
# Assemble the raw (untokenized) corpus in corpus.org/.
# The ALT dev/test/train files are copied as <split>.<lang>.txt, then the
# UCSY data is appended to the training split only.
# -p makes the step re-runnable: the cp below recreates train.*.txt before
# the append, so a second run does not duplicate the UCSY lines.
mkdir -p corpus.org
# Abort if the directory cannot be entered; otherwise the copies would land
# in the current directory and the final "cd .." would climb one level too far.
cd corpus.org/ || exit 1
for name in dev test train; do
  cp -p "../wat2020.my-en/alt/${name}.en" "${name}.en.txt"
  cp -p "../wat2020.my-en/alt/${name}.my" "${name}.my.txt"
done
# Append the UCSY corpus to the training data (note the ".my.new" source name).
cat ../wat2020.my-en/ucsy/ucsy.en >> train.en.txt
cat ../wat2020.my-en/ucsy/ucsy.my.new >> train.my.txt
cd .. || exit 1
# Tokenize every split from corpus.org/ into corpus.tok/.
# Reads ../corpus.org/<split>.<lang>.txt and writes <split>.<lang>.
# Uses SCRIPT_DIR and MOSES_SCRIPT, which must already be set.
mkdir -p corpus.tok
# Abort if the directory cannot be entered so output is never written elsewhere.
cd corpus.tok || exit 1

# Tokenizing sentences in Myanmar
for name in train dev test; do
  python ../wat2020.my-en/myseg.py \
    < "../corpus.org/${name}.my.txt" \
    > "${name}.my"
done

# Tokenizing sentences in English
# z2h-utf8.pl runs first (presumably full-width -> half-width normalization;
# confirm against the converter distribution), then the Moses tokenizer is
# run for English with -no-escape.
for name in train dev test; do
  perl "${SCRIPT_DIR}/z2h-utf8.pl" < "../corpus.org/${name}.en.txt" \
    | perl "${MOSES_SCRIPT}/tokenizer/tokenizer.perl" -l en -no-escape \
    > "${name}.en"
done
cd .. || exit 1
# Next step: applying BPE (see "Data preparation by BPE for my-en and km-en").
# NICT (National Institute of Information and Communications Technology)
# Kyoto University
# Last Modified: 2020-07-08