# ---------------------------------------------------------------------------
# Experiment configuration.
# All relative paths below are used after the script has changed into its
# working directory, so "../" refers to the directory the script started in.
# ---------------------------------------------------------------------------

# Language pair: LANG_F is passed to training as --f, LANG_E as --e.
LANG_F="en"
LANG_E="ja"

# Corpus prefixes; the language code is appended as a suffix where needed.
CORPUS_LM="../corpus.tok/train-all"      # input for language-model training
CORPUS="../corpus.tree/train-clean"      # input for translation-model training

# Tuning (dev) and evaluation (test) files.
DEV_F="../corpus.tree/dev.${LANG_F}"
DEV_E="../corpus.tok/dev.${LANG_E}"
TEST="../corpus.tree/test.${LANG_F}"
REF="../corpus.tok/test.${LANG_E}"

# Language-model order and the core/thread count handed to the tools.
LM_ORDER="5"
JOBS="16"

# Tool locations.
# NOTE(review): ${path} is not assigned anywhere in the visible script; it
# presumably has to be provided by the caller's environment - confirm.
MOSES_SCRIPT="${path}/mosesdecoder-RELEASE-2.1.1/scripts"
MOSES_BIN_DIR="${path}/mosesdecoder-RELEASE-2.1.1/bin"
EXT_BIN_DIR="${path}/giza-pp/bin"

# Output layout: work.<src>-<tgt>/training[/model]
WORK_DIR="work.${LANG_F}-${LANG_E}"
TRAINING_DIR="${WORK_DIR}/training"
MODEL_DIR="${TRAINING_DIR}/model"
# Create the experiment directory and enter it.
# -p keeps reruns from failing when the directory already exists, and the
# script stops if the directory cannot be created or entered, because every
# later step uses paths relative to it.
mkdir -p tree2stringModel || exit 1
cd tree2stringModel/ || exit 1
# Train the language model: feed the LANG_E side of CORPUS_LM to lmplz with
# order LM_ORDER and store the result gzip-compressed at LM_FILE.
# LM_FILE is an absolute path because it is later embedded in the --lm
# argument of the translation-model training step.
mkdir -p "${TRAINING_DIR}/lm"
LM_FILE="$(pwd)/${TRAINING_DIR}/lm/lm.${LANG_E}.arpa.gz"

# pipefail is enabled only inside the subshell so that a failing lmplz is not
# hidden behind gzip's successful exit status; the rest of the script keeps
# its default pipeline semantics.
if ! (
  set -o pipefail
  "${MOSES_BIN_DIR}/lmplz" --order "${LM_ORDER}" -S 80% -T /tmp \
    < "${CORPUS_LM}.${LANG_E}" \
    | gzip > "${LM_FILE}"
); then
  echo "error: language model training failed (${LM_FILE})" >&2
  exit 1
fi
# Train the translation model with train-model.perl.
#  - root/model directories are passed as absolute paths;
#  - the --lm argument is "0:<order>:<LM file>:8" built from LM_ORDER/LM_FILE;
#  - --parallel is given once (the original command line repeated it);
#  - stdout and stderr both go to training_TM.log inside TRAINING_DIR.
"${MOSES_SCRIPT}/training/train-model.perl" \
  --root-dir "$(pwd)/${TRAINING_DIR}" \
  --model-dir "$(pwd)/${MODEL_DIR}" \
  --corpus "${CORPUS}" \
  --external-bin-dir "${EXT_BIN_DIR}" \
  --f "${LANG_F}" \
  --e "${LANG_E}" \
  --parallel \
  --alignment grow-diag-final-and \
  --score-options "--GoodTuring" \
  --hierarchical \
  --glue-grammar \
  --lm "0:${LM_ORDER}:${LM_FILE}:8" \
  --source-syntax \
  --extract-options "--MaxSpan 1000 --MinHoleSource 1 --MinWords 0 --NonTermConsecSource --AllowOnlyUnalignedWords" \
  --cores "${JOBS}" \
  --sort-buffer-size 10G \
  > "${TRAINING_DIR}/training_TM.log" 2>&1
# Tune the model weights with mert-moses.pl.
# Positional arguments, in order: dev source (DEV_F), dev reference (DEV_E),
# the moses_chart decoder binary, and the moses.ini written by training.
# The MERT working directory is WORK_DIR/tuning/mert (absolute path); stdout
# and stderr are both captured in WORK_DIR/tuning/mert.log.
mkdir -p "${WORK_DIR}/tuning"
"${MOSES_SCRIPT}/training/mert-moses.pl" \
  "${DEV_F}" \
  "${DEV_E}" \
  "${MOSES_BIN_DIR}/moses_chart" \
  "$(pwd)/${MODEL_DIR}/moses.ini" \
  --mertdir "${MOSES_BIN_DIR}" \
  --working-dir "$(pwd)/${WORK_DIR}/tuning/mert" \
  --threads "${JOBS}" \
  --no-filter-phrase-table \
  --decoder-flags "-threads ${JOBS} -max-chart-span 1000" \
  --inputtype 3 \
  --predictable-seeds \
  > "${WORK_DIR}/tuning/mert.log" 2>&1
# Insert weights into the configuration file.
# Arguments, in order: the moses.ini from training, the moses.ini produced in
# the MERT working directory, and the path of the resulting moses-tuned.ini.
perl "${MOSES_SCRIPT}/ems/support/substitute-weights.perl" \
  "${MODEL_DIR}/moses.ini" \
  "${WORK_DIR}/tuning/mert/moses.ini" \
  "${MODEL_DIR}/moses-tuned.ini"
# Translate the test set with the tuned configuration.
# Translations are written to ${outfile}; the decoder's stderr goes to
# ${outfile}.log. -p makes the output-directory creation rerun-safe, matching
# the other mkdir calls in this script.
OUTPUT_DIR="${WORK_DIR}/output"
mkdir -p "${OUTPUT_DIR}"
outfile="${OUTPUT_DIR}/test.out"
"${MOSES_BIN_DIR}/moses_chart" \
  -config "${MODEL_DIR}/moses-tuned.ini" \
  -max-chart-span 1000 \
  -threads "${JOBS}" \
  -inputtype 3 \
  < "${TEST}" \
  > "${outfile}" \
  2> "${outfile}.log"
# For Japanese
# Detokenize the decoder output: remove runs of spaces that directly follow or
# precede any character other than an ASCII letter, writing the result to
# ${outfile}.detok.
# -CS makes STDIN/STDOUT/STDERR UTF-8; it replaces "-Mencoding=utf8", because
# the `encoding` pragma is deprecated and rejected by newer Perl releases.
# NOTE(review): the character class lists A-Za-z twice; the second pair may
# originally have been full-width Latin letters lost in transcription -
# confirm against the upstream script before changing the pattern.
perl -CS -pe 's/([^A-Za-zA-Za-z]) +/${1}/g; s/ +([^A-Za-zA-Za-z])/${1}/g; ' \
  < "${outfile}" \
  > "${outfile}.detok"
# JST (Japan Science and Technology Agency)
# NICT (National Institute of Information and Communications Technology)
# Kyoto University
# Last Modified: 2014-07-07