#!/usr/bin/env bash
# set -xe
usage="$(basename "$0") [-h|--help] [-o|--order <int>] [-l|--language {'en'|'de'|'fr'|'it'|...}] [-d|--data_structure {'probing'|'trie'}] [-t|--target_dir <string> ] [-r|--remove_artifacts remove_artifacts]
Create n-gram Language Model on ~2.2M Wikipedia articles using KenLM.
Parameters:
-h|--help show this help text
-o|--order set the order of the model, i.e. the n in n-gram (default: 5)
-l|--language ISO 639-1 code of the language to train on (default: de)
-d|--data_structure data structure to use (use 'trie' or 'probing'). See https://kheafield.com/code/kenlm/structures/ for details. (default: trie)
-t|--target_dir target directory to write to (default: ./lm)
-r|--remove_artifacts remove intermediate artifacts after training. Only set this flag if you really don't want to train another model because creating intermediate artifacts can take a long time. (default: false)
EXAMPLE USAGE: create a 5-gram model for German, using probing as the data structure and removing everything but the trained model afterwards:
./create_lm.sh -l de -o 5 -d probing -r
Make sure the target directory specified by -t has enough free space (around 20-30G). KenLM binaries (lmplz and build_binary) need to be on the path. See https://kheafield.com/code/kenlm/ on how to build those.
The following intermediate artifacts are created and may be removed after training by setting the -r flag:
- {target_dir}/tmp/[language]wiki-latest-pages-articles.xml.bz2: Downloaded wikipedia dump
- {target_dir}/tmp/wiki_[language]_clean: directory containing preprocessed Wikipedia articles
- {target_dir}/tmp/wiki_[language].txt.bz2: compressed file containing the Wikipedia corpus used to train the LM (raw text contents of the Wikipedia articles one sentence per line)
- {target_dir}/tmp/wiki_[language].counts: file containing the full vocabulary of the corpus and the number of occurrences of each word (sorted descending by number of occurrences)
- {target_dir}/tmp/wiki_[language].vocab: file containing the most frequent words of the corpus used for training, in the format expected by KenLM (words separated by spaces)
- {target_dir}/tmp/wiki_[language]_[order]_gram.arpa: ARPA file used to create the KenLM binary model
The following result files are created and will not be removed:
- {target_dir}/wiki_[language]_[order]_gram.klm: final KenLM n-gram LM in binary format.
"
# Defaults
order=5
language='de'
data_structure=trie
target_dir='./lm'
remove_artifacts=false
POSITIONAL=()
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
-h|--help)
echo "${usage}"
shift
exit
;;
-o|--order)
order="$2"
shift
shift
;;
-l|--language)
language="$2"
shift
shift
;;
-d|--data_structure)
data_structure="$2"
shift
shift
;;
-t|--target_dir)
target_dir="$2"
shift
shift
;;
-r|--remove_artifacts)
remove_artifacts=true
shift
;;
*) # unknown option
POSITIONAL+=("$1") # save it in an array for later
shift
;;
esac
done
set -- "${POSITIONAL[@]}" # restore positional parameters
# #################################
# Paths and filenames
# #################################
corpus_name="wiki_${language}"
lm_basename="${corpus_name}_${order}_gram"
target_dir="${target_dir}/${corpus_name}" # directory to store model
tmp_dir="${target_dir}/tmp" # directory for intermediate artifacts
cleaned_dir="${tmp_dir}/${corpus_name}_clean" # directory for WikiExtractor
corpus_file="${tmp_dir}/${corpus_name}.txt" # uncompressed corpus
lm_counts="${tmp_dir}/${corpus_name}.counts" # corpus vocabulary with counts (all words)
lm_arpa="${tmp_dir}/${lm_basename}.arpa" # ARPA file
lm_binary="${target_dir}/${lm_basename}.klm" # KenLM binary file (this is the result of the script)
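# With the defaults (-l de, -o 5, -t ./lm) this layout works out to roughly:
#   ./lm/wiki_de/tmp/                 ... intermediate artifacts (dump, cleaned articles, corpus, ARPA file)
#   ./lm/wiki_de/wiki_de_5_gram.klm   ... final binary model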
# create target directories if they don't exist yet
mkdir -p ${target_dir}
mkdir -p ${tmp_dir}
# #################################
echo "creating $order-gram model from Wikipedia dump"
echo "time indications are based upon personal experience when training on my personal laptop (i7, 4 cores, 8GB RAM, SSD)"
# #################################
# STEP 1: Download the Wikipedia dump in the given language if necessary
# For some statistics about the various Wikipedias see https://meta.wikimedia.org/wiki/List_of_Wikipedias
# #################################
download_url="http://download.wikimedia.org/${language}wiki/latest/${language}wiki-latest-pages-articles.xml.bz2"
target_file=${tmp_dir}/$(basename ${download_url}) # get corpus file name from url and corpus name
if [[ ! -f ${target_file} ]]; then
echo "downloading corpus ${corpus_name} from ${download_url} and saving in ${target_file}"
echo "This can take up to an hour (Wiki servers are slow). Have lunch or something..."
wget -O ${target_file} ${download_url}
fi
# #################################
# STEP 2: Create corpus from dump if necessary
# Use WikiExtractor (see https://github.com/attardi/wikiextractor for details)
# #################################
if [[ ! -f "${corpus_file}" ]] ; then
cd ./src/
if [[ ! -d ${cleaned_dir} ]] ; then
echo "Extracting/cleaning text from Wikipedia data base dump at ${target_file} using WikiExtractor."
echo "Cleaned articles are saved to ${cleaned_dir}"
echo "This will take 2-3 hours. Have a walk or something..."
mkdir -p ${cleaned_dir}
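# WikiExtractor flags: -c compresses each output shard with bzip2, -b 25M limits each
# shard to ~25 MB, -o sets the output directory.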
python3 ./lm/WikiExtractor.py -c -b 25M -o ${cleaned_dir} ${target_file}
fi
echo "Uncompressing and preprocessing cleaned articles from $cleaned_dir"
echo "All articles will be written to $corpus_file (1 sentence per line, without dot at the end)."
echo "All XML tags will be removed. Numeric word tokens will be replaced by the <num> token."
echo "Non-ASCII characters will be replaced with their closest ASCII equivalent (if possible), but umlauts will be preserved!"
echo "This will take some time (~4h). Go to sleep or something..."
export PYTHONPATH=$(pwd)
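# The pipeline below streams all compressed shards through pv (progress indicator);
# tee feeds a copy into the preprocessing branch (strip XML tags and quote characters,
# then create_corpus.py writes one sentence per line to the corpus file), while the
# main branch counts "<doc" tags to report the number of processed articles.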
result=$(find ${cleaned_dir} -name '*bz2' -exec bzcat {} \+ \
| pv \
| tee >( sed 's/<[^>]*>//g' \
| sed 's|["'\''„“‚‘]||g' \
| python3 ./lm/create_corpus.py ${language} > ${corpus_file} \
) \
| grep -e "<doc" \
| wc -l)
echo "Processed ${result} articles and saved raw text in $corpus_file"
echo "Processed $(cat ${corpus_file} | wc -l) sentences"
echo "Processed $(cat ${corpus_file} | wc -w) words"
echo "Processed $(cat ${corpus_file} | xargs -n1 | sort | uniq -c) unique words"
echo "(re-)creating vocabulary of $corpus_file because corpus file has changed"
echo "This usually takes around half an hour. Get a coffee or something..."
./create_corpus_vocab.sh ${corpus_file} --target_dir ${target_dir}
fi
echo "compressing $corpus_file. File size before:"
du -h ${corpus_file}
bzip2 -k ${corpus_file}
echo "done! Compressed file size:"
du -h ${corpus_file}.bz2
if [[ ! -f ${lm_arpa} ]]; then
echo "Training $order-gram KenLM model with data from $corpus_file.bz2 and saving ARPA file to $lm_arpa"
echo "This can take several hours, depending on the order of the model"
lmplz --order ${order} \
--temp_prefix ${tmp_dir} \
--memory 40% \
--arpa ${lm_arpa} \
--prune 0 0 0 1 <${corpus_file}.bz2
fi
if [[ ! -f ${lm_binary} ]]; then
echo "Building binary file from $lm_arpa and saving to $lm_binary"
echo "This should usually not take too much time even for high-order models"
build_binary -a 255 \
-q 8 \
${data_structure} \
${lm_arpa} \
${lm_binary}
fi
if ${remove_artifacts}; then
echo "removing intermediate artifacts in ${tmp_dir}"
rm -rf ${tmp_dir}
fi
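# Quick sanity check of the finished model (assumes the kenlm Python package is
# installed, e.g. via `pip install kenlm`; adjust the test sentence to your language):
#   python3 -c "import kenlm; m = kenlm.Model('${lm_binary}'); print(m.score('das ist ein test'))"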