
WAT 2016

The 3rd Workshop on Asian Translation
Baseline Systems
Phrase-based SMT
for the JE, EJ, JC, and CJ subtasks


Setup

(Here, ${LANG_F} is the source language and ${LANG_E} is the target language; "ja" and "en" below are examples.)
LANG_F=ja
LANG_E=en
CORPUS_LM=../corpus.tok/train
(For the JE and EJ subtasks of ASPEC, use CORPUS_LM=../corpus.tok/train-all.)
CORPUS=../corpus.tok/train-clean
DEV_F=../corpus.tok/dev.${LANG_F}
DEV_E=../corpus.tok/dev.${LANG_E}
TEST=../corpus.tok/test.${LANG_F}
REF=../corpus.tok/test.${LANG_E}
LM_ORDER=5
JOBS=16

MOSES_SCRIPT=${path}/mosesdecoder-RELEASE-2.1.1/scripts
MOSES_BIN_DIR=${path}/mosesdecoder-RELEASE-2.1.1/bin
EXT_BIN_DIR=${path}/giza-pp/bin

WORK_DIR=work.${LANG_F}-${LANG_E}
TRAINING_DIR=${WORK_DIR}/training
MODEL_DIR=${WORK_DIR}/training/model

mkdir phraseModel
cd phraseModel/
mkdir -p ${TRAINING_DIR}/lm
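
Before building any models, it can help to confirm that the corpus files referenced above actually exist. A minimal check, run from inside phraseModel/ like the rest of the commands (this loop is a convenience, not part of the official recipe):

for f in ${CORPUS}.${LANG_F} ${CORPUS}.${LANG_E} ${CORPUS_LM}.${LANG_E} \
         ${DEV_F} ${DEV_E} ${TEST} ${REF}; do
  [ -f ${f} ] || echo "missing: ${f}"   # report any path that does not resolve
done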


Training Language Model

LM_FILE=`pwd`/${TRAINING_DIR}/lm/lm.${LANG_E}.arpa.gz

${MOSES_BIN_DIR}/lmplz --order ${LM_ORDER} -S 80% -T /tmp < ${CORPUS_LM}.${LANG_E} | gzip > ${LM_FILE}
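
Optionally, the ARPA model can be converted to KenLM's binary format, which loads faster at decoding time. A sketch (the .blm file name is just a convention used here; if you use the binary model, point the language-model path in moses.ini at it instead of ${LM_FILE}):

BLM_FILE=`pwd`/${TRAINING_DIR}/lm/lm.${LANG_E}.blm
${MOSES_BIN_DIR}/build_binary ${LM_FILE} ${BLM_FILE}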

Training Translation Model

${MOSES_SCRIPT}/training/train-model.perl \
  --root-dir `pwd`/${TRAINING_DIR} \
  --model-dir `pwd`/${MODEL_DIR} \
  --corpus ${CORPUS} \
  --external-bin-dir ${EXT_BIN_DIR} \
  --f ${LANG_F} \
  --e ${LANG_E} \
  --parallel \
  --alignment grow-diag-final-and \
  --reordering msd-bidirectional-fe \
  --score-options "--GoodTuring" \
  --lm 0:${LM_ORDER}:${LM_FILE}:8 \
  --cores ${JOBS} \
  --sort-buffer-size 10G \
  >& ${TRAINING_DIR}/training_TM.log
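
When training finishes, a quick look at the phrase table confirms that the run completed and scores were estimated (phrase-table.gz is the default file name written by train-model.perl):

zcat ${MODEL_DIR}/phrase-table.gz | head -n 3   # entries: source ||| target ||| scores ||| alignment ||| counts
tail -n 5 ${TRAINING_DIR}/training_TM.log       # check the end of the log for errors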

Tuning

${MOSES_SCRIPT}/training/filter-model-given-input.pl \
  ${MODEL_DIR}.filtered/dev \
  ${MODEL_DIR}/moses.ini \
  ${DEV_F}

mkdir -p ${WORK_DIR}/tuning

${MOSES_SCRIPT}/training/mert-moses.pl \
  ${DEV_F} \
  ${DEV_E} \
  ${MOSES_BIN_DIR}/moses \
  `pwd`/${MODEL_DIR}.filtered/dev/moses.ini \
  --mertdir ${MOSES_BIN_DIR} \
  --working-dir `pwd`/${WORK_DIR}/tuning/mert \
  --threads ${JOBS} \
  --no-filter-phrase-table \
  --decoder-flags "-threads ${JOBS} -distortion-limit 20" \
  --predictable-seeds \
  >& ${WORK_DIR}/tuning/mert.log
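
MERT writes one configuration per iteration into its working directory, so progress can be followed while tuning runs (a convenience check, not part of the recipe; file names assume the usual mert-moses.pl layout):

ls ${WORK_DIR}/tuning/mert/run*.moses.ini   # one configuration per finished iteration
tail -n 5 ${WORK_DIR}/tuning/mert.log       # latest tuning output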


  • Insert the tuned weights into the configuration file (a quick check of the result follows below).
  • perl ${MOSES_SCRIPT}/ems/support/substitute-weights.perl \
      ${MODEL_DIR}/moses.ini \
      ${WORK_DIR}/tuning/mert/moses.ini \
      ${MODEL_DIR}/moses-tuned.ini
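
To confirm that the tuned weights actually replaced the defaults, the weight sections of the two configurations can be compared (this sketch assumes bash and the moses.ini format of release 2.1.1, where weights appear under a [weight] section):

diff <(grep -A 30 '^\[weight\]' ${MODEL_DIR}/moses.ini) \
     <(grep -A 30 '^\[weight\]' ${MODEL_DIR}/moses-tuned.ini)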


Translating

OUTPUT_DIR=${WORK_DIR}/output
mkdir ${OUTPUT_DIR}

${MOSES_SCRIPT}/training/filter-model-given-input.pl \
  ${MODEL_DIR}.filtered/test \
  ${MODEL_DIR}/moses-tuned.ini \
  ${TEST}

outfile=${OUTPUT_DIR}/test.out

${MOSES_BIN_DIR}/moses -config ${MODEL_DIR}.filtered/test/moses.ini -distortion-limit 20 -threads ${JOBS} < ${TEST} > ${outfile} 2> ${outfile}.log
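
A simple sanity check after decoding is that there is exactly one output line per test sentence:

wc -l ${TEST} ${outfile}   # the two line counts should match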


Recase the output

  • For English
  • ${MOSES_SCRIPT}/recaser/detruecase.perl < ${outfile} > ${outfile}.tok

Detokenize the output

  • For English
  • ${MOSES_SCRIPT}/tokenizer/detokenizer.perl -l en < ${outfile}.tok > ${outfile}.detok

  • For Japanese and Chinese
  • cat ${outfile} | \
      perl -Mencoding=utf8 -pe 's/([^A-Za-zＡ-Ｚａ-ｚ]) +/${1}/g; s/ +([^A-Za-zＡ-Ｚａ-ｚ])/${1}/g; ' \
      > ${outfile}.detok
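
For a rough local score before submission, the tokenized output can be compared against the tokenized reference with the multi-bleu script shipped with Moses. This is only a quick check; the official WAT evaluation is performed by the organizers on the submitted output, and case handling may differ:

${MOSES_SCRIPT}/generic/multi-bleu.perl ${REF} < ${outfile}.tok   # English output (after detruecasing)
${MOSES_SCRIPT}/generic/multi-bleu.perl ${REF} < ${outfile}       # Japanese or Chinese output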

