# Evaluation settings.
# NOTE: `path` is not defined in this script; it must be set (environment or
# edited in here) to the directory that holds the tools referenced below.
[ -n "${path:-}" ] || printf '%s\n' 'warning: $path is empty; tool paths will resolve from /' >&2

# Language pair; used as the base name of every input and output file.
TASK=ja-zh
# Directory with remove-space.sh and h2z-utf8-without-space.pl.
SCRIPT="${path}/script.segmentation.distribution"
# Moses scripts directory (provides generic/multi-bleu.perl).
MOSES_SCRIPT="${path}/mosesdecoder-RELEASE-2.1.1/scripts"
# Stanford Word Segmenter installation (provides segment.sh).
STANFORD_SEGMENTER="${path}/stanford-segmenter-2014-06-16"
# Directory containing the KyTea model file msr-0.4.0-1.mod.
KYTEA_MODEL="${path}"
# RIBES installation (provides RIBES.py).
RIBES="${path}/RIBES-1.02.4"

# Output directories for the segmented files; -p keeps re-runs from failing
# when they already exist.
mkdir -p tests.tok results.tok
# KyTea
# Segment tests.org/${TASK}.txt and results.org/${TASK}.txt with KyTea and
# write the result to <tests|results>.tok/${TASK}.kytea.txt.
for file in tests results; do
  sh "${SCRIPT}/remove-space.sh" < "${file}.org/${TASK}.txt" \
    | perl "${SCRIPT}/h2z-utf8-without-space.pl" \
    | kytea -model "${KYTEA_MODEL}/msr-0.4.0-1.mod" -out tok \
    | perl -pe 's/^ +//; s/ +$//; s/ +/ /g;' \
    > "${file}.tok/${TASK}.kytea.txt"
  # The final perl stage strips leading/trailing spaces and collapses runs of
  # spaces. Its output must be redirected directly: a trailing pipe before the
  # redirect would send the text to an empty command and leave the file empty.
done
# Stanford Word Segmenter (Using ctb model)
# Segment tests.org/${TASK}.txt and results.org/${TASK}.txt and write the
# result to <tests|results>.tok/${TASK}.stanford-ctb.txt.
for file in tests results; do
  # Empty (or spaces-only) lines are replaced by the placeholder "_____"
  # before segmentation and turned back into empty lines afterwards --
  # presumably so the segmenter keeps the line count aligned; confirm.
  sh "${SCRIPT}/remove-space.sh" < "${file}.org/${TASK}.txt" \
    | perl "${SCRIPT}/h2z-utf8-without-space.pl" \
    | perl -pe 's/^ *$/_____/;' \
    | "${STANFORD_SEGMENTER}/segment.sh" -k ctb /dev/stdin UTF-8 0 \
    | perl -pe 's/^_____$//;' \
    > "${file}.tok/${TASK}.stanford-ctb.txt"
done
# Stanford Word Segmenter (Using pku model)
# Segment tests.org/${TASK}.txt and results.org/${TASK}.txt and write the
# result to <tests|results>.tok/${TASK}.stanford-pku.txt.
for file in tests results; do
  # Empty (or spaces-only) lines are replaced by the placeholder "_____"
  # before segmentation and turned back into empty lines afterwards --
  # presumably so the segmenter keeps the line count aligned; confirm.
  sh "${SCRIPT}/remove-space.sh" < "${file}.org/${TASK}.txt" \
    | perl "${SCRIPT}/h2z-utf8-without-space.pl" \
    | perl -pe 's/^ *$/_____/;' \
    | "${STANFORD_SEGMENTER}/segment.sh" -k pku /dev/stdin UTF-8 0 \
    | perl -pe 's/^_____$//;' \
    > "${file}.tok/${TASK}.stanford-pku.txt"
done
# BLEU
# Score each segmentation with Moses multi-bleu.perl: the tests.tok file is
# passed as the reference argument and the results.tok file is fed on stdin.
for segmenter in kytea stanford-ctb stanford-pku; do
  perl "${MOSES_SCRIPT}/generic/multi-bleu.perl" \
    "tests.tok/${TASK}.${segmenter}.txt" \
    < "results.tok/${TASK}.${segmenter}.txt"
done
# RIBES
# Score each segmentation with RIBES.py: -r names the tests.tok file as the
# reference, and the results.tok file is the output being evaluated.
for segmenter in kytea stanford-ctb stanford-pku; do
  python3 "${RIBES}/RIBES.py" -c \
    -r "tests.tok/${TASK}.${segmenter}.txt" \
    "results.tok/${TASK}.${segmenter}.txt"
done
# JST (Japan Science and Technology Agency)
# NICT (National Institute of Information and Communications Technology)
# Kyoto University
# Last Modified: 2015-03-20