forked from everypolitician/everypolitician-data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrakefile_common.rb
154 lines (126 loc) · 4.82 KB
/
rakefile_common.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# frozen_string_literal: true
# We take various steps to convert all the incoming data into the output
# formats. Each of these steps uses a different rake_helper:
#
# Step 1: fetch any missing sources
# Any recreatable file that is missing on disk (e.g. after running
# `rake clobber`) is fetched from remote.
# Step 2: merge_members
# This takes all the incoming data about People and Memberships (mostly as CSVs)
# and joins them together into 'sources/merged.csv'
# Step 3: verify_source_data
# Make sure that merged.csv has everything we need and is well-formed
# Step 4: turn_csv_to_popolo
# This turns the 'merged.csv' into a Popolo-formatted 'merged.json'
# Step 5: generate_ep_popolo
# This combines data from other sources with 'merged.json' to make
# 'ep-popolo.json'
# Step 6: generate_final_csvs
# Generates term-by-term CSVs from the ep-popolo
# Step 7: generate_stats
# Generates statistics about the data we have
require 'colorize'
require 'csv'
require 'csv_to_popolo'
require 'erb'
require 'fileutils'
require 'fuzzy_match'
require 'json'
require 'open-uri'
require 'pathname'
require 'pry'
require 'rake/clean'
require 'require_all'
require 'set'
require 'yajl/json_gem'
require_rel 'lib'
# Files within each Legislature directory
# (All paths here are relative; presumably they are resolved against the
# legislature directory rake is run from — confirm against the calling Rakefile.)
# Intermediate build products: the merged CSV and its Popolo JSON conversion.
MERGED_JSON = Pathname.new('sources/merged.json')
MERGED_CSV = Pathname.new('sources/merged.csv')
# Position-related inputs and outputs.
POSITION_FILTER = Pathname.new('sources/manual/position-filter.json')
POSITION_FILTER_CSV = Pathname.new('sources/manual/position-filter.csv')
CABINET_FILTER = Pathname.new('sources/morph/cabinet-positions.csv')
POSITION_HTML = Pathname.new('sources/manual/.position-filter.html')
POSITION_RAW = Pathname.new('sources/wikidata/positions.json')
POSITION_CSV = Pathname.new('unstable/positions.csv')
# Final outputs and metadata files.
POPOLO_JSON = Pathname.new('ep-popolo-v1.0.json')
NAMES_CSV = Pathname.new('names.csv')
LEGISLATURE_META = Pathname.new('meta.json')
# Lives one directory above the legislature's own meta.json.
COUNTRY_META = Pathname.new('../meta.json')
# Register the generated files with rake/clean's CLEAN file list.
CLEAN.include(MERGED_CSV, MERGED_JSON, NAMES_CSV)
# Files at project level
# PROJECT points three directories above the current one.
PROJECT = Pathname.new('../../..')
POSITION_LEARNER = PROJECT + 'bin/learn_position.rb'
# Polyfill Hash#compact for Rubies that do not provide it (it is built in from
# Ruby 2.4 onwards).
#
# The method itself is probed instead of comparing RUBY_VERSION strings:
# string comparison is lexical, so e.g. '10.0' < '2.4' would be true.
#
# NOTE: unlike the built-in Hash#compact, which only drops nil values, this
# fallback drops every entry whose value has an empty string form (nil and '').
unless Hash.method_defined?(:compact)
  Hash.class_eval do
    # @return [Hash] a new hash without the entries whose value stringifies to ''
    def compact
      reject { |_, v| v.to_s.empty? }
    end
  end
end
# Adds an #empty? predicate to every Numeric; a number is never empty.
# (Presumably so that callers can ask `value.empty?` without first checking
# whether the value is a String or a number — confirm against the call sites.)
class Numeric
  # @return [false] always
  def empty?
    false
  end
end
# Recursively rebuilds a nested structure so that every Hash has its keys in
# sorted order. Arrays keep their element order (only their contents are
# processed); any other value is returned untouched.
#
# @param element [Hash, Array, Object] the structure to normalise
# @return [Hash, Array, Object] a key-sorted copy for Hash/Array input,
#   otherwise +element+ itself
def deep_sort(element)
  case element
  when Hash
    sorted_pairs = element.keys.sort.map { |key| [key, deep_sort(element[key])] }
    Hash[sorted_pairs]
  when Array
    element.map { |item| deep_sort(item) }
  else
    element
  end
end
# Reads a JSON file from disk and parses it with symbolized keys.
#
# @param file [String, Pathname] path of the JSON file to read
# @return [Object] the parsed document; JSON object keys become Symbols
# @raise [RuntimeError] "No such file ..." when +file+ does not exist
def json_load(file)
  if File.exist?(file)
    contents = File.read(file)
    JSON.parse(contents, symbolize_names: true)
  else
    raise "No such file #{file}"
  end
end
# Reads the Popolo file at POPOLO_JSON through EveryPolitician::Popolo.
#
# @return whatever EveryPolitician::Popolo.read returns for that path
#   (presumably a parsed Popolo document object — confirm against the
#   library that defines EveryPolitician::Popolo)
def ep_popolo
EveryPolitician::Popolo.read(POPOLO_JSON)
end
# Serialises +json+ as pretty-printed JSON and writes it to +file+, replacing
# any existing contents.
#
# @param file [String, Pathname] destination path
# @param json [Object] data to serialise with JSON.pretty_generate
# @return [Integer] the value returned by File.write (number of bytes written)
def json_write(file, json)
  pretty = JSON.pretty_generate(json)
  File.write(file, pretty)
end
# Reports a progress message on standard error, prefixed with the current
# local time as HH:MM:SS and coloured green.
#
# Time is used rather than DateTime: Time is part of Ruby core, whereas
# DateTime only exists once 'date' has been loaded, and '%T' renders the same
# HH:MM:SS text for both.
#
# @param message [String] text to report; must respond to #green (String gets
#   it from the colorize gem required at the top of this file)
# @return [nil]
def source_warn(message)
  warn "#{Time.now.strftime('%T')} #{message.green}"
end
module Enumerable
  # A sort_by replacement whose output order is the same on every platform.
  #
  # Native sort_by was observed to order equal-keyed elements differently on
  # OS X and Linux. Here the elements are first grouped by their sort key, the
  # groups are ordered by key, and the groups are then concatenated, so that
  # elements sharing a key keep their original relative order.
  # @see https://bugs.ruby-lang.org/issues/11379
  #
  # @yield [element] computes the sort key for an element
  # @return [Array] the elements ordered by key
  def portable_sort_by(&block)
    grouped = group_by(&block)
    ordered_groups = grouped.sort_by(&:first)
    ordered_groups.flat_map(&:last)
  end
end
# Writes Popolo data to +pathname+ as pretty-printed JSON in a deterministic
# order, so that regenerating unchanged data produces an identical file.
#
# The top-level collections and the nested lists on each person and area are
# sorted in place on +json+ (the argument is modified). Every hash is then
# key-sorted via deep_sort, and the top-level keys are emitted in reverse
# alphabetical order.
#
# :persons, :organizations and :memberships must be present; :events and
# :areas are optional and are skipped when absent.
#
# @param pathname [Pathname] destination; must respond to #write
# @param json [Hash] Popolo document with Symbol keys
# @return [Integer] the value returned by pathname.write
def popolo_write(pathname, json)
  json[:persons] = json[:persons].portable_sort_by { |p| p[:id] }
  json[:persons].each do |p|
    p[:identifiers] &&= p[:identifiers].portable_sort_by { |i| [i[:scheme], i[:identifier]] }
    p[:contact_details] &&= p[:contact_details].portable_sort_by { |d| [d[:type]] }
    p[:links] &&= p[:links].portable_sort_by { |l| l[:note] }
    p[:other_names] &&= p[:other_names].portable_sort_by { |n| [n[:lang].to_s, n[:name]] }
  end

  json[:organizations] = json[:organizations].portable_sort_by { |o| [o[:name].to_s.downcase, o[:id]] }

  json[:memberships] = json[:memberships].portable_sort_by do |m|
    [
      m[:person_id],
      m[:organization_id],
      m[:legislative_period_id],
      # Optional fields are stringified so that nil compares as ''.
      m[:start_date].to_s,
      m[:on_behalf_of_id].to_s,
      m[:area_id].to_s,
    ]
  end

  json[:events] &&= json[:events].portable_sort_by { |e| [e[:start_date].to_s, e[:id].to_s] }

  json[:areas] &&= json[:areas].portable_sort_by { |a| [a[:id]] }
  # Array() turns a missing :areas into an empty list, so documents without
  # areas are written instead of failing with NoMethodError on nil.
  Array(json[:areas]).each do |area|
    area[:identifiers] &&= area[:identifiers].portable_sort_by { |i| [i[:scheme], i[:identifier]] }
    area[:other_names] &&= area[:other_names].portable_sort_by { |name| [name[:lang].to_s, name[:name]] }
  end

  final = Hash[deep_sort(json).sort_by { |k, _| k }.reverse]
  pathname.write(JSON.pretty_generate(final))
end
# Build instructions file, given as a relative path.
# NOTE: despite the constant-style names, these are instance variables.
@INSTRUCTIONS_FILE = Pathname.new('sources/instructions.json')
# Abort immediately if the instructions file is missing.
raise("Can't read #{@INSTRUCTIONS_FILE}") unless @INSTRUCTIONS_FILE.exist?
# Instructions is not defined in this file — presumably it is loaded by
# `require_rel 'lib'`; confirm there.
@INSTRUCTIONS = Instructions.new(@INSTRUCTIONS_FILE)
@SOURCES = @INSTRUCTIONS.sources
desc 'Rebuild from source data'
# rebuild depends on :clobber and on the POPOLO_JSON file.
task rebuild: [:clobber, POPOLO_JSON]
# default depends on CSV validation, the :csvs task and stats regeneration.
task default: ['csvlint:validate', :csvs, 'stats:regenerate']
# Require every Ruby file found in the rake_* directories next to this file.
Dir[File.dirname(__FILE__) + '/rake_*/*.rb'].each { |file| require file }