-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathfilter-lm.sh
executable file
·93 lines (77 loc) · 2.38 KB
/
filter-lm.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env bash
#
# filter-lm.sh -- filter a dARPA language model down to a given vocabulary
# using the KenLM `filter` binary that lives next to this script.
#
# Usage:
#   filter-lm.sh dArpaModelHdfsIn filterVocabHdfsIn filteredModelHdfsOut [phrase]
#
# When a `hadoop` executable is on PATH the filtering runs as a Hadoop
# streaming job; otherwise the part files are filtered locally.

# Abort on the first failing command, including failures inside pipelines.
set -e
set -o pipefail

# Directory holding this script (and the filter binary). "$0" is quoted so a
# path containing whitespace is not split into several dirname arguments.
scriptDir=$(dirname -- "$0")
# Print the command-line synopsis and terminate the script with status 1.
# The message is a diagnostic, so it is written to stderr rather than stdout.
function usage {
  echo >&2 "Usage: $0 dArpaModelHdfsIn filterVocabHdfsIn filteredModelHdfsOut [phrase]"
  exit 1
}
# Validate the command line: exactly three positional arguments, optionally
# followed by the literal word "phrase".
if [[ $# != 3 && $# != 4 ]]; then
  usage
fi
if [[ $# == 4 && "$4" != "phrase" ]]; then
  usage
fi

# Trace every command from here on.
set -x

dArpaModelHdfs=$1   # input
filterVocabHdfs=$2  # file to filter to
filterModelHdfs=$3  # output
usePhrase=${4:-}    # either "phrase" or empty

# TODO: Grab task count from external config?
numTasks=400

# Scratch directory for generated files. It is still created under ./tmp, but
# mktemp guarantees a fresh, non-colliding directory (a $RANDOM suffix can
# repeat across concurrent runs).
mkdir -p tmp
localTmpDir=$(mktemp -d tmp/BigFatLM-XXXXXX)

# Remove the scratch directory on every exit path. A function is used so the
# path is expanded (and quoted) when the trap fires, not when it is installed.
function cleanupTmpDir {
  rm -rf -- "${localTmpDir:?}"
}
trap cleanupTmpDir EXIT

# The KenLM filter binary is expected to sit next to this script.
filterBin=$scriptDir/filter
if [[ ! -e "$filterBin" ]]; then
  echo >&2 "Could not find KenLM filter binary: $filterBin"
  exit 1
fi

# haveHadoop is "1" when a hadoop executable is on PATH and empty otherwise.
# `command -v` is the portable lookup; `which` is an external tool whose exit
# status is not reliable everywhere.
if command -v hadoop >/dev/null 2>&1; then
  haveHadoop=1
else
  haveHadoop=""
fi
if [[ -n "$haveHadoop" ]]; then
  # --- Hadoop streaming path ---------------------------------------------

  # Clear any previous output and any stale uploaded copy of the binary.
  # Removal is best-effort: the paths usually do not exist on a first run,
  # and a failing delete must not abort the script under `set -e`.
  hadoop dfs -rmr "$filterModelHdfs" || true
  hadoop dfs -rmr "$filterBin" || true
  hadoop dfs -put "$filterBin" "$filterBin"

  # Generate the mapper script. The here-doc delimiter is unquoted on purpose:
  # $usePhrase and $filterVocabHdfs are baked in now, while \$0 and \$PPID are
  # escaped so they are evaluated when the mapper itself runs.
  #  - tee >(...) sends a copy of the filter output to the counter pipeline
  #    while the original goes to stdout.
  #  - awk emits one "reporter:counter:" line per entry on stderr.
  #  - If the counter pipeline fails, the mapper's parent process is killed.
  #    NOTE(review): presumably this is meant to fail the Hadoop task -- confirm.
  cat >"$localTmpDir/filter-mapper.sh" <<EOF
#!/usr/bin/env bash
set -e
set -o pipefail
set -x
find . >&2
# Hadoop will give us input on stdin
# The vocab file is provided via a distributed cache
./filter union $usePhrase raw vocab:$filterVocabHdfs /dev/stdout | \
tee >(cut -f1 | awk '{if(NF>0){print "reporter:counter:BigFatLM,"NF"-gram dARPA entries written,1"}; if(NF>5){print "WARNING: "\$0}}' >/dev/stderr || kill -9 \$PPID)
EOF

  # Common streaming invocation, kept as an array so each option stays one
  # word regardless of the characters in the expanded values.
  streamingCmd=(
    hadoop jar "$HADOOP_HOME/hadoop-streaming.jar"
    -Dmapred.job.queue.name=m45
    "-Dmapred.map.tasks=$numTasks"
    "-Dmapred.reduce.tasks=$numTasks"
    -Dmapred.map.tasks.speculative.execution=True
    -Dmapred.reduce.tasks.speculative.execution=True
    -Dmapred.output.compress=true
    -Dmapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec
  )

  # NOTE(review): testOut is not assigned anywhere in this script, so the job
  # name ends with an empty value unless the caller's environment provides it.
  "${streamingCmd[@]}" \
    "-Dmapred.job.name=BigFatLM -- Filter dARPA to test set: ${testOut:-}" \
    -files "$filterBin,$filterVocabHdfs" \
    -mapper "$localTmpDir/filter-mapper.sh" \
    -file "$localTmpDir/filter-mapper.sh" \
    -numReduceTasks 0 \
    -input "$dArpaModelHdfs" \
    -output "$filterModelHdfs"
else
  # --- Local path ----------------------------------------------------------
  # We're running locally
  rm -rf -- "$filterModelHdfs"

  # Collect the part files with a glob instead of parsing `ls` output.
  partFiles=("$dArpaModelHdfs"/part*)
  if [[ ! -e "${partFiles[0]}" ]]; then
    echo >&2 "No part files found under: $dArpaModelHdfs"
    exit 1
  fi

  # Pick the decompressor from the part-file suffixes: any .bz2 part selects
  # bzcat, otherwise any .gz part selects zcat, otherwise plain cat.
  haveBz2=""
  haveGz=""
  for partFile in "${partFiles[@]}"; do
    case "$partFile" in
      *.bz2) haveBz2=1 ;;
      *.gz) haveGz=1 ;;
    esac
  done
  if [[ -n "$haveBz2" ]]; then
    catCmd=(bzcat)
  elif [[ -n "$haveGz" ]]; then
    catCmd=(zcat)
  else
    catCmd=(cat)
  fi

  # A real pipeline (rather than a process substitution) lets `pipefail`
  # surface a failing decompressor. $usePhrase is passed only when non-empty.
  "${catCmd[@]}" "${partFiles[@]}" \
    | "$filterBin" union ${usePhrase:+"$usePhrase"} raw \
      "vocab:$filterVocabHdfs" "$filterModelHdfs"
fi