# Corpus preprocessing for the English-Hindi data set.
#
# Steps:
#   1. Tokenize the English side with the Moses tokenizer.
#   2. Normalize, then tokenize, the Hindi side with the Indic NLP library.
#   3. Clean the tokenized training pair for translation-model training.
#
# Input : corpus.org/{train,dev,test}.{en,hi}
# Output: corpus.tok/{train,dev,test}.{en,hi}, the intermediate
#         corpus.tok/*.normalized.hi files, and the train-clean output of
#         clean-corpus-n.perl.
#
# NOTE(review): `path` is not defined in this snippet; it is presumably set
# by the caller to the directory holding the Moses release -- confirm.
MOSES_SCRIPT=${path}/mosesdecoder-RELEASE-2.1.1/scripts
SCRIPT_DIR=${path}/script.converter.distribution
# Placeholder: point this at your local clone of the Indic NLP library.
INDIC_LIBRARY=/path/to/indic/library/cloned/from/bitbucket

# -p keeps the script re-runnable when corpus.tok already exists.
mkdir -p corpus.tok
# Abort rather than write tokenized files into the wrong directory.
cd corpus.tok || exit 1

# Tokenizing sentences in English
for file in train dev test; do
  "${MOSES_SCRIPT}/tokenizer/tokenizer.perl" -l en \
    < "../corpus.org/${file}.en" > "${file}.en"
done

# Tokenizing/Normalizing sentences in Hindi
# We recommend that the participants also try out unsupervised morphological
# analysis and transliteration. (Available in the Indic NLP library)
# The loop variable must be `file`: the body expands ${file}.
for file in dev test train; do
  python "${INDIC_LIBRARY}/src/indicnlp/normalize/indic_normalize.py" \
    "../corpus.org/${file}.hi" "${file}.normalized.hi" hi
  python "${INDIC_LIBRARY}/src/indicnlp/tokenize/indic_tokenize.py" \
    "${file}.normalized.hi" "${file}.hi" hi
done

# Cleaning training data for translation models
# Arguments: corpus stem, the two language suffixes, output stem, then the
# minimum and maximum sentence length (per the Moses clean-corpus-n.perl usage).
perl "${MOSES_SCRIPT}/training/clean-corpus-n.perl" train en hi train-clean 1 40

cd .. || exit 1
# JST (Japan Science and Technology Agency)
# NICT (National Institute of Information and Communications Technology)
# Kyoto University
# Last Modified: 2016-06-11