# Location of the Moses helper scripts.
# NOTE(review): ${path} is not assigned anywhere in this part of the script;
# it is assumed to be set beforehand to the directory holding the Moses
# release — confirm.
MOSES_SCRIPT="${path}/mosesdecoder-RELEASE-2.1.1/scripts"

# Stop here rather than write output files into the wrong directory.
cd corpus.org/ || exit 1

# Extracting sentences
# Each line of ${name}.txt is split on " ||| "; field index 2 is written to
# the English file and field index 1 to the Indonesian file.
for name in train dev test; do
  perl -ne 'chomp; @a=split/ \|\|\| /; print $a[2], "\n";' < "${name}.txt" > "${name}.en.txt"
  perl -ne 'chomp; @a=split/ \|\|\| /; print $a[1], "\n";' < "${name}.txt" > "${name}.id.txt"
done

cd ..
# Build the tokenized + truecased corpus in corpus.tok/.
# -p keeps a re-run from failing just because the directory already exists.
mkdir -p corpus.tok
# Stop here rather than write output files into the wrong directory.
cd corpus.tok || exit 1

# Tokenizing sentences in Indonesian
# NOTE(review): the tokenizer is called with "-l en" for the Indonesian side
# too — presumably intentional; confirm.
for file in train dev test; do
  perl "${MOSES_SCRIPT}/tokenizer/tokenizer.perl" -l en \
    < "../corpus.org/${file}.id.txt" \
    > "${file}.tok.id"
done

# Tokenizing sentences in English
for file in train dev test; do
  perl "${MOSES_SCRIPT}/tokenizer/tokenizer.perl" -l en \
    < "../corpus.org/${file}.en.txt" \
    > "${file}.tok.en"
done

# Training truecaser for Indonesian (on train + dev combined)
cat train.tok.id dev.tok.id > train_dev.tok.id
"${MOSES_SCRIPT}/recaser/train-truecaser.perl" --model truecase-model.id --corpus train_dev.tok.id

# Training truecaser for English (on train + dev combined)
cat train.tok.en dev.tok.en > train_dev.tok.en
"${MOSES_SCRIPT}/recaser/train-truecaser.perl" --model truecase-model.en --corpus train_dev.tok.en

# Truecasing Indonesian sentences
for file in train dev test; do
  "${MOSES_SCRIPT}/recaser/truecase.perl" --model truecase-model.id < "${file}.tok.id" > "${file}.id"
done

# Truecasing English sentences
for file in train dev test; do
  "${MOSES_SCRIPT}/recaser/truecase.perl" --model truecase-model.en < "${file}.tok.en" > "${file}.en"
done

# Cleaning training data for translation models
# Reads train.id / train.en and writes train-clean.id / train-clean.en;
# 1 and 40 are the min/max sentence lengths passed to clean-corpus-n.perl.
perl "${MOSES_SCRIPT}/training/clean-corpus-n.perl" train id en train-clean 1 40

cd ..
# Build the corpus with parsed English in corpus.tree/.
# -p keeps a re-run from failing just because the directory already exists.
mkdir -p corpus.tree
# Stop here rather than write output files into the wrong directory.
cd corpus.tree || exit 1

# Reuse the truecased Indonesian files produced in corpus.tok/.
for file in train dev test; do
  ln -s "../corpus.tok/${file}.id"
done

# Tokenizing sentences in English (with the tokenizer's -penn option)
for file in train dev test; do
  perl "${MOSES_SCRIPT}/tokenizer/tokenizer.perl" -l en -penn \
    < "../corpus.org/${file}.en.txt" \
    > "${file}.tok.en"
done

# Training truecaser for English (on train + dev combined)
cat train.tok.en dev.tok.en > train_dev.tok.en
"${MOSES_SCRIPT}/recaser/train-truecaser.perl" --model truecase-model.en --corpus train_dev.tok.en

# Truecasing English sentences (training set only; used for the language model)
for file in train; do
  "${MOSES_SCRIPT}/recaser/truecase.perl" --model truecase-model.en < "${file}.tok.en" > "${file}.parenthesis.en"
done

# Building training data for a language model
# Turn the -LRB- / -RRB- tokens back into literal parentheses.
perl -pe 's/\-LRB\-/\(/g; s/\-RRB\-/\)/g;' < train.parenthesis.en > train.en

# Parsing English sentences
# clean-corpus-n.perl needs both sides under one stem, so expose train.id as
# train.tok.id next to train.tok.en before length-filtering (1..40).
ln -s train.id train.tok.id
perl "${MOSES_SCRIPT}/training/clean-corpus-n.perl" train.tok id en train.reduced.tok 1 40
# NOTE(review): the wrapper is named parse-de-berkeley.perl but is run here
# with the English grammar eng_sm6.gr — presumably intentional; confirm.
for file in train.reduced dev test; do
  perl "${MOSES_SCRIPT}/training/wrappers/parse-de-berkeley.perl" \
    -binarize \
    -ja "${path}/BerkeleyParser-1.7/BerkeleyParser-1.7.jar" \
    -gr "${path}/BerkeleyParser-1.7/eng_sm6.gr" \
    < "${file}.tok.en" \
    > "${file}.tok.xml.en"
done

# Cleaning training data for translation models
# The source side must be visible under the ".tok.xml" stem for the cleaning
# step below. The language suffix is "id" (the previous ".ja" suffix pointed
# at a file this pipeline never creates, leaving the cleaner without input).
ln -s train.reduced.tok.id train.reduced.tok.xml.id
perl "${MOSES_SCRIPT}/training/clean-corpus-n.perl" \
  train.reduced.tok.xml id en \
  train-clean.tok.xml 1 40 \
  --ignore-xml

# Truecasing English sentences (parsed XML output)
for file in train-clean dev test; do
  "${MOSES_SCRIPT}/recaser/truecase.perl" --model truecase-model.en < "${file}.tok.xml.en" > "${file}.en"
done

# Give the cleaned Indonesian side the same stem as train-clean.en.
ln -s train-clean.tok.xml.id train-clean.id

cd ..
# JST (Japan Science and Technology Agency)
# NICT (National Institute of Information and Communications Technology)
# Kyoto University
# Last Modified: 2015-06-24