# Locations of the external tool directories used by every step below.
# Both are resolved relative to ${path}, which is expected to be set already.
#   MOSES_SCRIPT - scripts/ directory of the Moses decoder release
#   SCRIPT_DIR   - directory holding the character-conversion helper scripts
MOSES_SCRIPT="${path}/mosesdecoder-RELEASE-2.1.1/scripts"
SCRIPT_DIR="${path}/script.converter.distribution"
# Extracting sentences
# Every line of corpus.org/{dev,test,train}.txt is split on " ||| ".
# Field index 4 is written to <name>.ja.txt and field index 3 to <name>.en.txt,
# one sentence per line.
cd corpus.org/ || exit 1
for name in dev test train; do
  perl -ne 'chomp; @a=split/ \|\|\| /; print $a[4], "\n";' < "${name}.txt" > "${name}.ja.txt"
  perl -ne 'chomp; @a=split/ \|\|\| /; print $a[3], "\n";' < "${name}.txt" > "${name}.en.txt"
done
cd .. || exit 1
# Tokenizing sentences in Japanese
# For each split, pipe corpus.org/<split>.ja.txt through JUMAN and write one
# space-separated token sequence per sentence to corpus.tok/<split>.ja.
#
# Pipeline stages, in order:
#   1. perl 's/ / /g'   NOTE(review): as written this replaces a space with a
#                       space (a no-op). Presumably the pattern was a
#                       full-width space (U+3000) before the text was
#                       re-encoded - confirm against the upstream script.
#   2. juman -b         external JUMAN analyser.
#   3. perl -ne ...     keeps the first space-separated field of every JUMAN
#                       output line and prints the collected fields joined by
#                       single spaces each time an "EOS" line is read.
#   4. perl -pe ...     strips leading/trailing spaces and squeezes space runs.
#   5. perl 'tr/...'    NOTE(review): maps |, [ and ] onto themselves (a
#                       no-op). Presumably the replacement list held the
#                       full-width forms of these characters - confirm.
mkdir -p corpus.tok
cd corpus.tok || exit 1
for file in train dev test; do
  perl -Mencoding=utf8 -pe 's/ / /g;' < "../corpus.org/${file}.ja.txt" \
    | juman -b \
    | perl -ne 'chomp; if($_ eq "EOS"){print join(" ",@b),"\n"; @b=();} else {@a=split/ /; push @b, $a[0];}' \
    | perl -pe 's/^ +//; s/ +$//; s/ +/ /g;' \
    | perl -Mencoding=utf8 -pe 'tr/\|[]/|[]/; ' \
    > "${file}.ja"
done
# Tokenizing sentences in English
# Each extracted English file is passed through the z2h-utf8.pl converter and
# then the Moses tokenizer (-l en); the result is written to <split>.tok.en in
# the current directory (corpus.tok).
for file in train dev test; do
  perl "${SCRIPT_DIR}/z2h-utf8.pl" < "../corpus.org/${file}.en.txt" \
    | perl "${MOSES_SCRIPT}/tokenizer/tokenizer.perl" -l en \
    > "${file}.tok.en"
done
# Training truecaser for English
# The truecasing model is trained on the concatenation of the tokenized train
# and dev sets and stored as truecase-model.en.
cat train.tok.en dev.tok.en > train_dev.tok.en
"${MOSES_SCRIPT}/recaser/train-truecaser.perl" --model truecase-model.en --corpus train_dev.tok.en
# Truecasing English sentences
# Apply truecase-model.en to every tokenized split: <split>.tok.en -> <split>.en
for file in train dev test; do
  "${MOSES_SCRIPT}/recaser/truecase.perl" --model truecase-model.en < "${file}.tok.en" > "${file}.en"
done
# Cleaning training data for translation models
# clean-corpus-n.perl reads train.ja / train.en and writes train-clean.ja /
# train-clean.en, using 1 and 40 as the minimum and maximum sentence length.
perl "${MOSES_SCRIPT}/training/clean-corpus-n.perl" train ja en train-clean 1 40
# Leave corpus.tok and return to the parent directory.
cd .. || exit 1
# Create the corpus.tree working directory and link the tokenized Japanese
# files from corpus.tok into it as <split>.ja.
# mkdir -p keeps a re-run from failing on an existing directory, and the cd is
# checked so the links are never created in the wrong place.
mkdir -p corpus.tree
cd corpus.tree || exit 1
for file in train dev test; do
  ln -s "../corpus.tok/${file}.ja" "${file}.ja"
done
# Tokenizing sentences in English
# Same as the corpus.tok English tokenization, but the Moses tokenizer is run
# with -penn; output goes to <split>.tok.en in the current directory
# (corpus.tree).
for file in train dev test; do
  perl "${SCRIPT_DIR}/z2h-utf8.pl" < "../corpus.org/${file}.en.txt" \
    | perl "${MOSES_SCRIPT}/tokenizer/tokenizer.perl" -l en -penn \
    > "${file}.tok.en"
done
# Training truecaser for English
# The truecasing model is trained on the concatenation of the tokenized train
# and dev sets and stored as truecase-model.en.
cat train.tok.en dev.tok.en > train_dev.tok.en
"${MOSES_SCRIPT}/recaser/train-truecaser.perl" --model truecase-model.en --corpus train_dev.tok.en
# Truecasing English sentences
# Only the training split is truecased here: train.tok.en -> train.parenthesis.en
for file in train; do
  "${MOSES_SCRIPT}/recaser/truecase.perl" --model truecase-model.en < "${file}.tok.en" > "${file}.parenthesis.en"
done
# Building training data for a language model
# Turn the -LRB- / -RRB- bracket tokens back into literal "(" and ")" and
# write the result to train.en.
perl -pe 's/\-LRB\-/\(/g; s/\-RRB\-/\)/g;' < train.parenthesis.en > train.en
# Parsing English sentences
# 1. Expose train.ja under the name train.tok.ja so that the corpus prefix
#    "train.tok" has both a .ja and a .en side.
# 2. clean-corpus-n.perl reads train.tok.{ja,en} and writes
#    train.reduced.tok.{ja,en}, using 1 and 40 as the minimum and maximum
#    sentence length.
# 3. Run the Moses Berkeley-parser wrapper on each English file, producing
#    <file>.tok.xml.en.
ln -s train.ja train.tok.ja
perl "${MOSES_SCRIPT}/training/clean-corpus-n.perl" train.tok ja en train.reduced.tok 1 40
for file in train.reduced dev test; do
  # The jar option is spelled out as -jar (the name in the wrapper's usage
  # text) instead of relying on option-abbreviation matching for "-ja".
  perl "${MOSES_SCRIPT}/training/wrappers/parse-de-berkeley.perl" \
    -binarize \
    -jar "${path}/BerkeleyParser-1.7/BerkeleyParser-1.7.jar" \
    -gr "${path}/BerkeleyParser-1.7/eng_sm6.gr" \
    < "${file}.tok.en" \
    > "${file}.tok.xml.en"
done
# Cleaning training data for translation models
# Expose the reduced Japanese side under the ".tok.xml" prefix, then run
# clean-corpus-n.perl on train.reduced.tok.xml.{ja,en} with --ignore-xml,
# writing train-clean.tok.xml.{ja,en} (length limits 1 and 40).
ln -s train.reduced.tok.ja train.reduced.tok.xml.ja
perl "${MOSES_SCRIPT}/training/clean-corpus-n.perl" \
  train.reduced.tok.xml ja en \
  train-clean.tok.xml 1 40 \
  --ignore-xml
# Truecasing English sentences
# Apply truecase-model.en to the parsed files: <file>.tok.xml.en -> <file>.en
for file in train-clean dev test; do
  "${MOSES_SCRIPT}/recaser/truecase.perl" --model truecase-model.en < "${file}.tok.xml.en" > "${file}.en"
done
# Give the cleaned Japanese side the matching short name, then leave
# corpus.tree and return to the parent directory.
ln -s train-clean.tok.xml.ja train-clean.ja
cd .. || exit 1
# JST (Japan Science and Technology Agency)
# NICT (National Institute of Information and Communications Technology)
# Kyoto University
# Last Modified: 2017-07-21