# Root of the Moses scripts tree. ${path} must already be set to the directory
# that contains mosesdecoder-RELEASE-2.1.1.
MOSES_SCRIPT=${path}/mosesdecoder-RELEASE-2.1.1/scripts

cd corpus.org/ || exit 1

# Extracting sentences
# Each line of ${name}.txt is split on " ||| "; zero-based field 3 is written
# to ${name}.ko.txt and zero-based field 4 to ${name}.ja.txt.
for name in train dev test; do
  perl -ne 'chomp; @a=split/ \|\|\| /; print $a[3], "\n";' < "${name}.txt" > "${name}.ko.txt"
  perl -ne 'chomp; @a=split/ \|\|\| /; print $a[4], "\n";' < "${name}.txt" > "${name}.ja.txt"
done

# Go back to the parent directory: the tokenization steps read their input as
# ../corpus.org/<file> from inside corpus.tok, so corpus.tok has to be created
# beside corpus.org rather than inside it.
cd .. || exit 1
# -p keeps a re-run from failing when the directory already exists.
mkdir -p corpus.tok
cd corpus.tok || exit 1

# Tokenizing sentences in Korean
# Pipeline per split: mecab (wakati output) -> strip the trailing space ->
# replace the ASCII characters | [ ] by their full-width forms.
#   \x{FF5C} = full-width vertical bar, \x{FF3B}/\x{FF3D} = full-width brackets.
# NOTE(review): the text this was taken from showed the same ASCII characters
# on both sides of tr (a no-op), presumably after width normalization of the
# page; the full-width targets are reconstructed - confirm against the
# upstream recipe.
# -CS makes STDIN/STDOUT UTF-8; it replaces -Mencoding=utf8, which is
# deprecated and rejected by Perl 5.26 and later.
for file in train dev test; do
  mecab -O wakati < "../corpus.org/${file}.ko.txt" \
    | perl -CS -pe 's/ $//; tr/\|[]/\x{FF5C}\x{FF3B}\x{FF3D}/;' \
    > "${file}.ko"
done
# Notice: mecab for Korean is different from mecab for Japanese.

# Tokenizing sentences in Japanese
# Pipeline per split:
#   1. replace every ASCII space by an ideographic space (\x{3000}) before
#      running juman - presumably so that spaces in the text are not confused
#      with juman's space-separated output fields (TODO confirm);
#   2. juman -b analyses the text; its output ends each sentence with "EOS";
#   3. keep the first space-separated field of every analysis line and join
#      the fields of one sentence with single spaces;
#   4. trim leading/trailing spaces and squeeze runs of spaces;
#   5. replace the ASCII characters | [ ] by their full-width forms
#      (\x{FF5C}, \x{FF3B}, \x{FF3D}).
# NOTE(review): in the text this was taken from, steps 1 and 5 had identical
# ASCII characters on both sides (no-ops), presumably after width
# normalization of the page; the full-width targets are reconstructed -
# confirm against the upstream recipe.
# -CS makes STDIN/STDOUT UTF-8; it replaces -Mencoding=utf8, which is
# deprecated and rejected by Perl 5.26 and later.
for file in train dev test; do
  perl -CS -pe 's/ /\x{3000}/g;' < "../corpus.org/${file}.ja.txt" \
    | juman -b \
    | perl -ne 'chomp; if($_ eq "EOS"){print join(" ",@b),"\n"; @b=();} else {@a=split/ /; push @b, $a[0];}' \
    | perl -pe 's/^ +//; s/ +$//; s/ +/ /g;' \
    | perl -CS -pe 'tr/\|[]/\x{FF5C}\x{FF3B}\x{FF3D}/;' \
    > "${file}.ja"
done
# Cleaning training data for translation models
# Reads train.ko / train.ja and writes train-clean.ko / train-clean.ja.
# The trailing "1 40" are the minimum and maximum sentence length passed to
# clean-corpus-n.perl (see the Moses documentation for the exact filtering).
perl "${MOSES_SCRIPT}/training/clean-corpus-n.perl" train ko ja train-clean 1 40
cd .. || exit 1
# JST (Japan Science and Technology Agency)
# NICT (National Institute of Information and Communications Technology)
# Kyoto University
# Last Modified: 2017-07-21