-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathenamdict.rake
67 lines (57 loc) · 1.86 KB
/
enamdict.rake
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# frozen_string_literal: true
namespace :enamdict do
# Path, relative to the directory rake is run from, that the download task
# writes the decompressed ENAMDICT file to and the minify task reads from.
# No explicit .freeze is needed: the frozen_string_literal magic comment at the
# top of this file already freezes string literals.
TMP_FILE = 'tmp/enamdict'

desc 'Downloads and compacts the ENAMDICT file'
task refresh: %w[download minify]
desc 'Downloads and extracts ENAMDICT file from ftp.monash.edu to tmp/enamdict'
task :download do
  # Required here rather than at file level so they are only loaded when the
  # task actually runs.
  require 'fileutils'
  require 'open-uri'
  require 'zlib'

  puts 'Downloading ENAMDICT...'
  FileUtils.mkdir_p File.dirname(TMP_FILE)
  uri = 'http://ftp.monash.edu/pub/nihongo/enamdict.gz'

  # URI#open (provided by open-uri) is used instead of Kernel#open, which no
  # longer opens URLs on Ruby 3.0+. GzipReader.wrap accepts any IO-like object,
  # whereas GzipReader.open expects a filename. The block forms close the
  # download, the gzip reader and the output file even when an error is raised.
  URI.parse(uri).open do |remote|
    Zlib::GzipReader.wrap(remote) do |gz|
      # Binary mode: the payload is written out byte-for-byte; the minify task
      # is the one that interprets it as EUC-JP.
      File.open(TMP_FILE, 'wb') { |out| out.write(gz.read) }
    end
  end

  # The original interpolated an undefined local here, which raised NameError
  # after the download had finished.
  puts "Downloaded to #{TMP_FILE}"
end
# Minification steps:
# - Encode file as UTF-8 (increases size by ~25%)
# - Filter out non-human name dictionary entries
# - Remove Romaji from string (redundant with kana)
# - Use the pipe '|' character as delimiter
# - Set kana value to kanji value if kana is not specified (increases size by ~10%)
# - Output file is approx 40% of original filesize
#
# Format of minified file:
#
# kanji|kana|flag1(,flag2,...)
#
desc 'Compacts ENAMDICT file at bin/enamdict.min'
task :minify do
  puts 'Minifying ENAMDICT...'

  # TODO: load this from main library
  name_types = %w[s p u g f m]
  skip_types = %w[h]

  # Captures, in order: the headword, the optional bracketed reading, and the
  # contents of the first parenthesised flag list after the slash.
  entry_pattern = %r{^(.+?) (?:\[(.+?)\] )?/\((.+?)\).+/$}

  parsed = 0 # lines that matched entry_pattern (unparseable lines are not counted)
  kept = 0   # lines written to the minified file

  # Block forms guarantee both files are closed even if reading or transcoding
  # raises part-way through. The output is opened first, as before.
  File.open('bin/enamdict.min', 'w:utf-8') do |out|
    File.open(TMP_FILE, 'r:euc-jp') do |f|
      f.gets # skip header
      f.each_line do |line|
        entry = entry_pattern.match(line)
        next unless entry

        kanji, kana, flags = entry.captures
        kana ||= kanji
        data_types = flags.split(',')
        parsed += 1

        # Keep an entry when it has at least one name type and no skip type,
        # unless its flag list is exactly "p" or "u" and the headword contains
        # katakana. The headword is transcoded because the \p{Katakana}
        # pattern cannot be applied to the EUC-JP string directly. Short-circuit
        # order is preserved so the transcode only runs when it has to.
        keep = (data_types & name_types).any? &&
               (data_types & skip_types).none? &&
               !(%w[p u].include?(flags) && kanji.encode('utf-8') =~ /\p{Katakana}/)
        next unless keep

        out.puts([kanji, kana, flags].join('|'))
        kept += 1
      end
    end
  end

  puts "Minified! (#{kept} out of #{parsed} lines kept)"
end
end