JST_LOGO.JPG NICT_LOGO.JPG KYOTO-U_LOGO.JPG

WAT

The Workshop on Asian Translation
Automatic Evaluation Procedures
for Japanese Translation Results

[AUTOMATIC EVALUATION PROCEDURES TOP] | [SETUP] | [SEGMENTATION] | [EVALUATION]

Setup

Here, ${TASK} represents the source and target language pair; "en-ja", "zh-ja" and "ko-ja" can be used. "en-ja" is used as the example below.
We assume that there are ASPEC reference files (${TASK}.txt) in tests.org/, translation result files (${TASK}.txt) in results.org/ and all tools in ${path}.
# Evaluation settings shared by the segmentation and evaluation steps.
# ${path} is not set here: it must already name the directory that holds
# all of the tools referenced below.
TASK=en-ja    # language pair to evaluate: en-ja, zh-ja or ko-ja
SCRIPT="${path}/script.segmentation.distribution"
MOSES_SCRIPT="${path}/mosesdecoder-RELEASE-2.1.1/scripts"
KYTEA_MODEL="${path}"
RIBES="${path}/RIBES-1.02.4"
# Output directories for the tokenized references and results.
# -p keeps a re-run from failing when the directories already exist.
mkdir -p tests.tok results.tok
Back to top

Segmentation

  • JUMAN
  • for file in tests results; do
      # Tokenize ${file}.org/${TASK}.txt with JUMAN and write the result to
      # ${file}.tok/${TASK}.juman.txt. Pipeline stages, in order:
      #   1. strip a trailing bracketed number such as "[0.5]" (digits and
      #      dots between literal square brackets), keeping the character
      #      before it. The brackets must be backslash-escaped: unescaped,
      #      "[[0-9.]+]" is a character class that also eats digits that
      #      precede the bracket;
      #   2. run the distribution's remove-space.sh and
      #      h2z-utf8-without-space.pl filters;
      #   3. analyse with "juman -b", keep the first space-separated field of
      #      every output line, and print the collected fields joined by
      #      single spaces each time an "EOS" line is reached;
      #   4. trim leading/trailing spaces and collapse runs of spaces;
      #   5. re-join ASCII letters that were split apart (an uppercase letter
      #      followed by any letter, or two lowercase letters).
      cat "${file}.org/${TASK}.txt" | \
       perl -Mencoding=utf8 -pe 's/(.)\[[0-9.]+\]$/${1}/;' | \
       sh "${SCRIPT}/remove-space.sh" | \
       perl "${SCRIPT}/h2z-utf8-without-space.pl" | \
       juman -b | \
       perl -ne 'chomp; if($_ eq "EOS"){print join(" ",@b),"\n"; @b=();} else {@a=split/ /; push @b, $a[0];}' | \
       perl -pe 's/^ +//; s/ +$//; s/ +/ /g;' | \
       perl -Mencoding=utf8 -pe 'while(s/([A-Z]) ([A-Za-z])/$1$2/g){} while(s/([a-z]) ([a-z])/$1$2/g){}' \
       > "${file}.tok/${TASK}.juman.txt"
    done

  • KyTea
  • for file in tests results; do
      # Tokenize ${file}.org/${TASK}.txt with KyTea and write the result to
      # ${file}.tok/${TASK}.kytea.txt. Pipeline stages, in order:
      #   1. strip a trailing bracketed number such as "[0.5]" (digits and
      #      dots between literal square brackets), keeping the character
      #      before it. The brackets must be backslash-escaped: unescaped,
      #      "[[0-9.]+]" is a character class that also eats digits that
      #      precede the bracket;
      #   2. run the distribution's remove-space.sh and
      #      h2z-utf8-without-space.pl filters;
      #   3. segment with kytea using the jp-0.4.2-utf8-1.mod model and
      #      "-out tok" output;
      #   4. trim leading/trailing spaces and collapse runs of spaces;
      #   5. re-join digits that were split apart, including a digit, one
      #      arbitrary character and a digit (e.g. "1 . 5" -> "1.5"), then
      #      re-join ASCII letters (an uppercase letter followed by any
      #      letter, or two lowercase letters).
      cat "${file}.org/${TASK}.txt" | \
       perl -Mencoding=utf8 -pe 's/(.)\[[0-9.]+\]$/${1}/;' | \
       sh "${SCRIPT}/remove-space.sh" | \
       perl "${SCRIPT}/h2z-utf8-without-space.pl" | \
       kytea -model "${KYTEA_MODEL}/jp-0.4.2-utf8-1.mod" -out tok | \
       perl -pe 's/^ +//; s/ +$//; s/ +/ /g;' | \
       perl -Mencoding=utf8 -pe 'while(s/([0-9]) ([0-9])/$1$2/g){} s/([0-9]) (.) ([0-9])/$1$2$3/g; while(s/([A-Z]) ([A-Za-z])/$1$2/g){} while(s/([a-z]) ([a-z])/$1$2/g){}' \
       > "${file}.tok/${TASK}.kytea.txt"
    done

  • MeCab
  • for file in tests results; do
      # Tokenize ${file}.org/${TASK}.txt with MeCab and write the result to
      # ${file}.tok/${TASK}.mecab.txt. Pipeline stages, in order:
      #   1. strip a trailing bracketed number such as "[0.5]" (digits and
      #      dots between literal square brackets), keeping the character
      #      before it. The brackets must be backslash-escaped: unescaped,
      #      "[[0-9.]+]" is a character class that also eats digits that
      #      precede the bracket;
      #   2. run the distribution's remove-space.sh and
      #      h2z-utf8-without-space.pl filters;
      #   3. segment with "mecab -O wakati";
      #   4. re-join digits that were split apart, including a digit, one
      #      arbitrary character and a digit (e.g. "1 . 5" -> "1.5"), re-join
      #      ASCII letters (an uppercase letter followed by any letter, or
      #      two lowercase letters), and drop a single trailing space.
      cat "${file}.org/${TASK}.txt" | \
       perl -Mencoding=utf8 -pe 's/(.)\[[0-9.]+\]$/${1}/;' | \
       sh "${SCRIPT}/remove-space.sh" | \
       perl "${SCRIPT}/h2z-utf8-without-space.pl" | \
       mecab -O wakati | \
       perl -Mencoding=utf8 -pe 'while(s/([0-9]) ([0-9])/$1$2/g){} s/([0-9]) (.) ([0-9])/$1$2$3/g; while(s/([A-Z]) ([A-Za-z])/$1$2/g){} while(s/([a-z]) ([a-z])/$1$2/g){} s/ $//;' \
       > "${file}.tok/${TASK}.mecab.txt"
    done

    Back to top

    Evaluation

  • BLEU
  • for segmenter in juman kytea mecab; do
      # Run multi-bleu.perl once per segmenter: the tokenized reference file
      # is passed as the argument and the tokenized results are fed on stdin.
      # Paths are quoted so a ${MOSES_SCRIPT} containing spaces still works.
      perl "${MOSES_SCRIPT}/generic/multi-bleu.perl" "tests.tok/${TASK}.${segmenter}.txt" < "results.tok/${TASK}.${segmenter}.txt"
    done

  • RIBES
  • for segmenter in juman kytea mecab; do
      # Run RIBES.py once per segmenter: -r names the tokenized reference
      # file and the tokenized results file follows as the positional
      # argument.
      # NOTE(review): -c presumably enables case-sensitive scoring; confirm
      # against the RIBES.py usage text.
      # Paths are quoted so a ${RIBES} containing spaces still works.
      python3 "${RIBES}/RIBES.py" -c -r "tests.tok/${TASK}.${segmenter}.txt" "results.tok/${TASK}.${segmenter}.txt"
    done

    Back to top

    JST (Japan Science and Technology Agency)
    NICT (National Institute of Information and Communications Technology)
    Kyoto University
    Last Modified: 2015-03-20