# Locations of the external tools used by the steps below, all resolved
# relative to ${path}. `path` is not assigned in this section and must be
# set beforehand.
# NOTE(review): in zsh, `path` is the special array tied to PATH - confirm
# which shell runs this and how `path` is meant to be provided.
MOSES_SCRIPT="${path}/mosesdecoder-RELEASE-2.1.1/scripts"
SCRIPT_DIR="${path}/script.converter.distribution"
SCRIPT_DIR2="${path}/script.recipe.distribution"

# Work inside corpus.org; the following steps write their output files here.
cd corpus.org/
# Extracting sentences
# For every split (dev, train) and every recipe field, feed the split's JSON
# file to json2txt_for_recipe.py and store its output as
# <split>_<field>.ja-en.txt in the current directory.
for data in dev train; do
  for data_type in all title ingredients steps; do
    # Progress line. `lang` is not assigned anywhere in this section; the
    # expansion is left unquoted so an empty value adds no extra space.
    echo $data $lang $data_type
    python3 "${SCRIPT_DIR2}/json2txt_for_recipe.py" --mode 1 --data_type "$data_type" \
      < "../../../data/Cookpad_Recipe/data/${data}.json" \
      > "${data}_${data_type}.ja-en.txt"
  done
done
# Split every two-column *.ja-en.txt file into one file per language:
# column 1 goes to <stem>.ja.txt and column 2 to <stem>.en.txt.
for f in *.ja-en.txt; do
  # If nothing matches, the pattern may be left as a literal word; skip it.
  [ -e "$f" ] || continue
  # ${f%.ja-en.txt} strips the double extension portably. The zsh-only
  # ${f:r:r} modifier expands to an empty string under bash, which would
  # send the output to ".ja.txt" / ".en.txt".
  cut -f1 "$f" > "${f%.ja-en.txt}.ja.txt"
  cut -f2 "$f" > "${f%.ja-en.txt}.en.txt"
done
# Drop the "_all" marker from the full-recipe files, so that
# <split>_all.<lang>.txt becomes <split>.<lang>.txt. Files that do not exist
# are silently skipped.
for name in dev train; do
  for l in ja en; do
    [ ! -e "${name}_all.${l}.txt" ] || mv "${name}_all.${l}.txt" "${name}.${l}.txt"
  done
done
# Copy the test files (names matching test*ja / test*en) from the data
# directory into the current directory, appending ".txt" to each name.
for l in ja en; do
  for f in ../../../data/Cookpad_Recipe/data/test*"${l}"; do
    # If nothing matches, the pattern may be left as a literal word; skip it.
    [ -e "$f" ] || continue
    # ${f##*/} is the portable basename. The zsh-only ${f:t} modifier expands
    # to the whole path under bash, which would place the copy next to the
    # source file instead of in the current directory.
    cp -p "$f" "${f##*/}.txt"
  done
done
# Leave corpus.org and switch to a sibling directory for tokenized output.
cd ..
# -p keeps a rerun quiet when corpus.tok already exists.
mkdir -p corpus.tok
cd corpus.tok
# Tokenizing sentences in Japanese
# Each file is run through `juman -b`. From every JUMAN output line the first
# space-separated field is kept as a token; at each "EOS" line the collected
# tokens are printed as one space-joined sentence. Leading, trailing and
# repeated spaces are then collapsed.
# NOTE(review): as written, s/ / /g replaces each space with a space and the
# final tr maps |, [ and ] to themselves, so neither filter changes the text.
# Given -Mencoding=utf8, the patterns presumably contained full-width
# characters that were lost - confirm against the upstream script.
for file in train dev test_all test_ingredient test_step test_title; do
  perl -Mencoding=utf8 -pe 's/ / /g;' < "../corpus.org/${file}.ja.txt" | \
    juman -b | \
    perl -ne 'chomp; if($_ eq "EOS"){print join(" ",@b),"\n"; @b=();} else {@a=split/ /; push @b, $a[0];}' | \
    perl -pe 's/^ +//; s/ +$//; s/ +/ /g;' | \
    perl -Mencoding=utf8 -pe 'tr/\|[]/|[]/; ' \
    > "${file}.ja"
done
# Tokenizing sentences in English
# Each file is filtered through z2h-utf8.pl and then through the Moses
# tokenizer with the English language setting; the result is written to
# <file>.tok.en in the current directory.
for file in train dev test_all test_ingredient test_step test_title; do
  perl "${SCRIPT_DIR}/z2h-utf8.pl" < "../corpus.org/${file}.en.txt" | \
    perl "${MOSES_SCRIPT}/tokenizer/tokenizer.perl" -l en \
    > "${file}.tok.en"
done
# Training truecaser for English
# The truecasing model is trained on the concatenation of the tokenized
# train and dev files and saved as truecase-model.en.
cat train.tok.en dev.tok.en > train_dev.tok.en
"${MOSES_SCRIPT}/recaser/train-truecaser.perl" --model truecase-model.en --corpus train_dev.tok.en
# Truecasing English sentences
# Apply truecase-model.en to every tokenized English file, writing <file>.en.
# The list matches the stems written by the English tokenization step: that
# step produces test_all/test_ingredient/test_step/test_title .tok.en files
# and no plain test.tok.en, so iterating over "test" would fail on a missing
# input and leave the test sets untruecased.
for file in train dev test_all test_ingredient test_step test_title; do
  "${MOSES_SCRIPT}/recaser/truecase.perl" --model truecase-model.en < "${file}.tok.en" > "${file}.en"
done
# Cleaning training data for translation models
# Arguments follow the Moses clean-corpus-n.perl convention: corpus stem
# (train), the two language suffixes (ja, en), output stem (train-clean),
# then the minimum (1) and maximum (40) sentence length.
perl "${MOSES_SCRIPT}/training/clean-corpus-n.perl" train ja en train-clean 1 40

# Return to the parent of corpus.tok.
cd ..
# JST (Japan Science and Technology Agency)
# NICT (National Institute of Information and Communications Technology)
# Kyoto University
# Last Modified: 2017-08-17