#!/bin/bash


# ==============================================================================
# generate jawiki-articles index
# ==============================================================================

# Build a full-text index of the Japanese Wikipedia articles dump.
# Each stage consumes the output of the previous one and the later stages run
# for a long time, so abort as soon as a stage fails instead of spending hours
# processing missing or partial data.
echo "get jawiki-latest-pages-articles.xml..."
wget https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-pages-articles.xml.bz2 \
  || { echo "failed to download the jawiki dump" >&2; exit 1; }
7z x jawiki-latest-pages-articles.xml.bz2 \
  || { echo "failed to extract the jawiki dump" >&2; exit 1; }
wget https://raw.githubusercontent.com/attardi/wikiextractor/master/WikiExtractor.py \
  || { echo "failed to download WikiExtractor.py" >&2; exit 1; }

echo "convert jawiki xml to text (30 minutes)..."
# -s, --sections        preserve sections
# --lists               preserve lists
# --keep_tables         Preserve tables in the output article text
python3 WikiExtractor.py -s --lists --keep_tables jawiki-latest-pages-articles.xml -o jawiki_text \
  || { echo "failed to convert the jawiki dump to text" >&2; exit 1; }
echo "split jawiki text files (5 minutes)..."
# NOTE(review): jawiki-split-files.rb presumably writes into jawiki_splitted/,
# which the gather step below reads -- confirm against that script.
ruby jawiki-split-files.rb jawiki_text/*/wiki_* \
  || { echo "failed to split the jawiki text files" >&2; exit 1; }

echo "generate jawiki index (80 minutes)..."
# -lt specifies the text size limitation by kilobytes.
# By default, it is 128KB. If it is negative, the size is unlimited.
# If -xl is specified, the index is tuned to register more than 300000 documents.
# If -ft is specified, target files are treated as plain text.
# -ic specifies the input encoding. By default, it is detected automatically.
estcmd gather -lt -1 -xl -ft -ic UTF-8 jawiki_index jawiki_splitted/ \
  || { echo "failed to generate the jawiki index" >&2; exit 1; }
echo "optimize jawiki index (16 minutes)..."
estcmd optimize jawiki_index \
  || { echo "failed to optimize the jawiki index" >&2; exit 1; }


# ==============================================================================
# get mozc dictionary's entries
# ==============================================================================

# Shallow, single-branch clone of mozc (submodules are not fetched).
# To fetch the submodules as well, clone with --recursive instead:
# git clone https://github.com/google/mozc.git -b master --single-branch --recursive --depth=1 mozc-git
mozc_checkout=mozc-git
mozc_dict_dir="${mozc_checkout}/src/data/dictionary_oss"

git clone https://github.com/google/mozc.git -b master --single-branch --depth=1 "${mozc_checkout}"

# Pass the OSS dictionary files to get-mozcdic-entries.rb, then collect the
# dictionary*.txt.mozc files (presumably written by that script next to its
# inputs) into ./dictionary_oss/.
ruby get-mozcdic-entries.rb "${mozc_dict_dir}"/dictionary*.txt
mkdir -p dictionary_oss
mv "${mozc_dict_dir}"/dictionary*.txt.mozc dictionary_oss/


# ==============================================================================
# get file numbers that include the target word
# ==============================================================================

# Drop any *.hits* files left in the first-level subdirectories.
rm -f */*.hits*

# Create <name>/<name>.hits for each dictionary listed below, using a single
# touch invocation.
hits_files=()
for dict_name in alt-cannadic edict hatena jinmei niconico skk-jisyo; do
  hits_files+=("${dict_name}/${dict_name}.hits")
done
touch "${hits_files[@]}"

#######################################
# Run a generator script from inside its dictionary directory, then return to
# the directory the caller started in.
# Aborts the whole run when the directory cannot be entered; otherwise the
# generator would run in the wrong place and a following "cd ../" would leave
# the project root, misplacing every later step.
# The generator's own exit status is not checked, so a failing generator does
# not stop the remaining ones.
# Arguments:
#   $1 - directory to enter
#   $2 - progress message printed before the generator starts
#   $3 - generator script (run with sh) located in that directory
#######################################
run_generator() {
  local dir=$1
  local message=$2
  local script=$3
  local start_dir=$PWD

  cd "$dir" || { echo "cannot enter directory: ${dir}" >&2; exit 1; }
  echo "$message"
  sh "$script"
  cd "$start_dir" || { echo "cannot return to directory: ${start_dir}" >&2; exit 1; }
}

run_generator chimei/ "generate chimei entries..." generate-chimei-hits.sh
run_generator ekimei/ "generate ekimei entries..." generate-ekimei-hits.sh

run_generator neologd/ "get hit numbers of neologd entries..." generate-dictionary.sh
run_generator alt-cannadic/ "get hit numbers of alt-cannadic entries..." generate-dictionary.sh
run_generator edict/ "get hit numbers of edict entries..." generate-dictionary.sh
run_generator hatena/ "get hit numbers of hatena entries..." generate-dictionary.sh
run_generator jinmei/ "get hit numbers of jinmei entries..." generate-dictionary.sh
run_generator niconico/ "get hit numbers of niconico entries..." generate-dictionary.sh
run_generator skk-jisyo "get hit numbers of skk-jisyo entries..." generate-dictionary.sh

# Assign a fixed, modest cost to the katakana English dictionary entries.
run_generator edict-katakana-english/ "generate katakana english entries..." generate-dictionary.sh

# Create the release package.
# Abort if src/ cannot be entered, so generate-release.sh is never looked up
# or run from the wrong directory.
cd src/ || { echo "cannot enter directory: src/" >&2; exit 1; }
sh generate-release.sh
