# Locations of the external tools used below. Both are derived from ${path},
# which is not defined in this snippet and must already be set; if it is
# empty, the paths resolve relative to the filesystem root.
MOSES_SCRIPT="${path}/mosesdecoder-RELEASE-4.0/scripts"
SCRIPT_DIR="${path}/script.converter.distribution"
# Extracting sentences
#
# Every line of the input *.txt files is a record whose fields are separated
# by " ||| ". Field indices below are 0-based:
#   dev / test : field 2 -> <name>.ja.txt, field 3 -> <name>.en.txt
#   train-*    : field 3 -> <name>.ja.txt, field 4 -> <name>.en.txt
# Abort if the corpus directory is missing so that nothing is written into
# the wrong place.
cd corpus.org/ || exit 1

for name in dev test; do
  perl -ne 'chomp; @a=split/ \|\|\| /; print $a[2], "\n";' \
    < "${name}.txt" > "${name}.ja.txt"
  perl -ne 'chomp; @a=split/ \|\|\| /; print $a[3], "\n";' \
    < "${name}.txt" > "${name}.en.txt"
done

for name in train-1 train-2 train-3; do
  perl -ne 'chomp; @a=split/ \|\|\| /; print $a[3], "\n";' \
    < "${name}.txt" > "${name}.ja.txt"
  perl -ne 'chomp; @a=split/ \|\|\| /; print $a[4], "\n";' \
    < "${name}.txt" > "${name}.en.txt"
done
# Removing date expressions at EOS in Japanese in the training and
# development data to reduce noise.
#
# For each file the original is kept as <file>.ja.txt.org and a cleaned copy
# is written back to <file>.ja.txt. The substitution drops a bracketed run of
# digits/dots at the very end of a line (e.g. "...X[1990.12]" -> "...X"); the
# leading (.) keeps the character before the bracket so a line is never
# emptied.
#
# NOTE(review): the bracket and digit literals look like they were
# ASCII-normalised from full-width forms. Both the ASCII and the full-width
# variants (U+FF3B/U+FF3D brackets, U+FF10-U+FF19 digits, U+FF0E dot) are
# accepted here -- confirm against the corpus.
for file in train-1 train-2 train-3 dev; do
  mv "${file}.ja.txt" "${file}.ja.txt.org"
  # Input redirection comes first: if the .org file is missing, the command
  # is not run and the output file is not truncated.
  perl -CSD -Mutf8 -pe 's/(.)[\[\x{FF3B}][0-9\x{FF10}-\x{FF19}.\x{FF0E}]+[\]\x{FF3D}]$/${1}/;' \
    < "${file}.ja.txt.org" > "${file}.ja.txt"
done
cd .. || exit 1
# Create the working directory for tokenized output and enter it.
# -p keeps a re-run from failing when the directory already exists; the
# guarded cd stops the script instead of writing into the wrong directory.
mkdir -p corpus.tok
cd corpus.tok || exit 1
# NOTE(review): these links are created inside corpus.tok with relative
# targets, but the train-1.*.txt files are read from ../corpus.org elsewhere
# in this script, so the links dangle unless those files also exist here --
# confirm the intended location.
ln -s train-1.ja.txt train.ja.txt
ln -s train-1.en.txt train.en.txt
# Tokenizing sentences in Japanese
#
# Per input file ../corpus.org/<file>.ja.txt, writing <file>.ja:
#   1. replace every ASCII space with U+3000 (ideographic space) --
#      presumably so in-sentence spaces cannot be confused with the
#      space-separated columns of JUMAN's output; TODO confirm
#   2. run `juman -b`
#   3. keep the first space-separated column of each output line and join
#      them with single spaces, emitting one line per "EOS" marker
#   4. trim leading/trailing spaces and squeeze repeated spaces
#   5. map ASCII | [ ] to full-width U+FF5C U+FF3B U+FF3D
#
# NOTE(review): steps 1 and 5 were no-ops as found (ASCII mapped to the same
# ASCII), which suggests the full-width literals were lost in transit. They
# are restored here with \x{...} escapes so the file stays pure ASCII --
# confirm against the upstream instructions.
for file in train-1 dev test; do
  perl -CSD -Mutf8 -pe 's/ /\x{3000}/g;' < "../corpus.org/${file}.ja.txt" \
    | juman -b \
    | perl -ne 'chomp; if($_ eq "EOS"){print join(" ",@b),"\n"; @b=();} else {@a=split/ /; push @b, $a[0];}' \
    | perl -pe 's/^ +//; s/ +$//; s/ +/ /g;' \
    | perl -CSD -Mutf8 -pe 'tr/|[]/\x{FF5C}\x{FF3B}\x{FF3D}/;' \
    > "${file}.ja"
done
# Tokenizing sentences in English
#
# Per input file ../corpus.org/<file>.en.txt, writing <file>.en:
#   1. filter through ${SCRIPT_DIR}/z2h-utf8.pl (presumably full-width to
#      half-width conversion, judging by the name -- TODO confirm)
#   2. run the Moses tokenizer for English with -no-escape
# Both tool paths are quoted so a ${path} containing spaces still works.
for file in train-1 dev test; do
  perl "${SCRIPT_DIR}/z2h-utf8.pl" < "../corpus.org/${file}.en.txt" \
    | perl "${MOSES_SCRIPT}/tokenizer/tokenizer.perl" -l en -no-escape \
    > "${file}.en"
done
# Make train-1.{ja,en} reachable under the generic name train.{ja,en} via
# relative symlinks, then step back up to the parent directory.
for ext in ja en; do
  ln -s "train-1.${ext}" "train.${ext}"
done
cd ..
# Applying BPE (see "Data preparation by BPE")
# NICT (National Institute of Information and Communications Technology)
# Kyoto University
# Last Modified: 2020-07-08