# ---------------------------------------------------------------------------
# Configuration for the OpenNMT (Lua/Torch) translation pipeline below.
#
# Required from the caller's environment:
#   path - directory that contains OpenNMT-0.9.7/ and
#          mosesdecoder-RELEASE-4.0/ (it is not defined in this file).
# ---------------------------------------------------------------------------

# Language codes: LANG_F is the source side, LANG_E the target side
# (they are passed as -train_src / -train_tgt further down).
LANG_F="ja"
LANG_E="en"

# Tokenised corpus files. The paths are relative, and the script changes
# into NMTModel/ before using them, so they resolve from inside NMTModel/.
TRAIN_F="../corpus.tok/train.${LANG_F}"
TRAIN_E="../corpus.tok/train.${LANG_E}"
DEV_F="../corpus.tok/dev.${LANG_F}"
DEV_E="../corpus.tok/dev.${LANG_E}"
TEST="../corpus.tok/test.${LANG_F}"

# Without `path` both tool roots silently collapse to "/OpenNMT-0.9.7" and
# "/mosesdecoder-RELEASE-4.0/scripts"; say so up front instead of leaving the
# user to decode a later "file not found" from th or perl.
if [ -z "${path:-}" ]; then
  printf '%s\n' "warning: 'path' is not set; NMT_PATH and MOSES_SCRIPT will not point at a real installation" >&2
fi
NMT_PATH="${path:-}/OpenNMT-0.9.7"
MOSES_SCRIPT="${path:-}/mosesdecoder-RELEASE-4.0/scripts"

# Quoted so the '?' placeholder reaches Lua literally; some /bin/sh
# implementations glob-expand unquoted arguments of `export`.
export LUA_PATH="${NMT_PATH}/?.lua"

# Output locations, relative to NMTModel/.
DATA_DIR="datadir"
MODEL_DIR="model"

# Vocabulary size, used for both -src_vocab_size and -tgt_vocab_size.
SIZE=100000
# Sequence-length limits passed as -src_seq_length / -tgt_seq_length.
SL=150
TL=150
# Value passed to -gpuid for training and translation.
GPUID=1
# ---------------------------------------------------------------------------
# Preprocess the corpus, train the model, and translate the test set.
# Everything is written below NMTModel/.
# ---------------------------------------------------------------------------

# -p makes a rerun safe; the cd must succeed because every corpus path is
# relative ("../corpus.tok/...") and all outputs are written relative to
# the current directory.
mkdir -p NMTModel
cd NMTModel/ || exit 1
mkdir -p "${DATA_DIR}"

# Training checkpoints go to MODEL_DIR. A caller-supplied `modeldir`
# (the name this script used before) still takes precedence if it is set.
model_dir="${modeldir:-${MODEL_DIR}}"
mkdir -p "${model_dir}"

# Step 1: preprocess.lua writes its output under the ${DATA_DIR}/data prefix.
th "${NMT_PATH}/preprocess.lua" \
  -train_src "${TRAIN_F}" -train_tgt "${TRAIN_E}" \
  -valid_src "${DEV_F}" -valid_tgt "${DEV_E}" \
  -save_data "${DATA_DIR}/data" \
  -src_vocab_size "${SIZE}" -tgt_vocab_size "${SIZE}" \
  -src_seq_length "${SL}" -tgt_seq_length "${TL}" || exit 1

# Step 2: train on the preprocessed data; stop here if training fails so
# translation is never attempted without a model.
th "${NMT_PATH}/train.lua" \
  -data "${DATA_DIR}/data-train.t7" \
  -save_model "${model_dir}/model" \
  -encoder_type brnn -brnn_merge concat \
  -src_words_min_frequency 1 -tgt_words_min_frequency 1 \
  -src_seq_length "${SL}" -tgt_seq_length "${TL}" \
  -gpuid "${GPUID}" || exit 1

# Step 3: pick the epoch-13 checkpoint. The part of the file name after the
# epoch number is not known in advance, so it is matched with a glob and the
# first existing match is used.
# NOTE(review): epoch 13 is hard-coded; presumably the last epoch train.lua
# runs by default - confirm.
modelfile=
for candidate in "${model_dir}"/model_epoch13_*.t7; do
  if [ -e "${candidate}" ]; then
    modelfile="${candidate}"
    break
  fi
done
if [ -z "${modelfile}" ]; then
  printf '%s\n' "error: no checkpoint matching ${model_dir}/model_epoch13_*.t7" >&2
  exit 1
fi

# Step 4: translate the test set.
OUTPUT_DIR="output"
mkdir -p "${OUTPUT_DIR}"
outfile="${OUTPUT_DIR}/test.bpe.out"
th "${NMT_PATH}/translate.lua" \
  -model "${modelfile}" \
  -src "${TEST}" \
  -output "${outfile}" \
  -gpuid "${GPUID}" || exit 1
# ---------------------------------------------------------------------------
# Post-process the translation output.
# ---------------------------------------------------------------------------

# Remove the "@@ " sub-word continuation markers. The result is written to
# ${outfile}.tok, which is the file the detokenizer below reads.
perl -pe 's/@@ //g;' < "${outfile}" > "${outfile}.tok"

# Detokenise according to the target language; both variants write
# ${outfile}.detok, so only one of them must run.
case "${LANG_E:-en}" in
  ja | zh)
    # For Japanese and Chinese: delete runs of spaces that directly follow
    # or precede any character other than an ASCII letter.
    # NOTE(review): this reads ${outfile}, not ${outfile}.tok, exactly as the
    # original command did - confirm whether the marker-free file was meant.
    # NOTE(review): the duplicated "A-Za-z" range suggests the pattern once
    # also listed full-width Latin letters - confirm against upstream.
    perl -Mencoding=utf8 -pe 's/([^A-Za-zA-Za-z]) +/${1}/g; s/ +([^A-Za-zA-Za-z])/${1}/g; ' \
      < "${outfile}" \
      > "${outfile}.detok"
    ;;
  *)
    # For English (and other space-delimited targets): Moses detokenizer.
    "${MOSES_SCRIPT}/tokenizer/detokenizer.perl" -l "${LANG_E:-en}" \
      < "${outfile}.tok" \
      > "${outfile}.detok"
    ;;
esac
# NICT (National Institute of Information and Communications Technology)
# Kyoto University
# Last Modified: 2018-07-30