Skip to content

Commit

Permalink
Extract MemberPage
Browse files Browse the repository at this point in the history
  • Loading branch information
Oliver Denman committed Apr 25, 2017
1 parent 46943cd commit 3d661c5
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 99 deletions.
105 changes: 105 additions & 0 deletions lib/member_page.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
require 'scraped'
require 'pry'

# Decorates a single LegCo member biography page, exposing one field per
# datum the scraper persists (see scraper.rb).
class MemberPage < Scraped::HTML
  decorator Scraped::Response::Decorator::CleanUrls

  # Stable id derived from the page URL: the path after "members/" with the
  # ".htm" extension removed.
  field :id do
    url.gsub('.htm', '').split('members/').last
  end

  # Member name with honorific titles and any '.' stripped out.
  # (`titles` returns a fresh array each call, so the `<< '.'` mutation is safe.)
  field :name do
    name_parts.first.to_s.gsub(Regexp.union(titles << '.'), '').tidy
  end

  # Space-joined list of the known titles appearing in the name heading.
  field :honorific_prefix do
    titles.select { |prefix| name_parts.first.to_s.include? prefix }.join(' ')
  end

  # Everything after the first comma in the heading is qualification letters.
  field :honorific_suffix do
    name_parts[1..-1].map(&:tidy).join(', ')
  end

  # Only 'Mrs' is detectable from the page; all other members yield nil.
  field :gender do
    return 'female' if honorific_prefix.include?('Mrs')
  end

  field :faction do
    f = bio.xpath('//p[contains(.,"Political affiliation")]/'\
      'following-sibling::ul[not(position() > 1)]/li/text()')
    return 'Independent' if f.empty?

    # Some member pages list more than one group affiliation for that member
    # Here, we remove affiliations with known non-party groups
    f.map(&:to_s).map(&:tidy).find do |party|
      !non_party_groups.to_s.include? party
    end
  end

  field :email do
    bio.xpath('//table/tr/td/a[contains(@href, "mailto")]/text()').to_s.tidy
  end

  field :website do
    bio.xpath('//table/tr/td[contains(.,"Homepage")]/following-sibling::'\
      'td/a/text()').to_s.tidy
  end

  field :phone do
    bio.xpath('//table/tr/td[contains(.,"telephone")]/following-sibling::'\
      'td[position() = 2]/text()').to_s.tidy
  end

  field :fax do
    bio.xpath('//table/tr/td[contains(.,"fax")]/following-sibling::'\
      'td[position() = 2]/text()').to_s.tidy
  end

  field :img do
    # Take the FIRST image in the bio block (the member portrait).
    # `bio.css('img/@src').last` previously captured the wrong image for
    # 'WONG Ting-kwong' — this applies the fix noted in the original TODO.
    bio.at_css('img/@src').to_s
  end

  field :area do
    # splitting here by en-dash (not hyphen)
    area_parts.last.split('–').last.tidy
  end

  field :area_type do
    return 'functional' if area_parts.first.include?('Functional')
    return 'geographical' if area_parts.first.include?('Geographical')
    area_parts.first
  end

  field :source do
    url
  end

  private

  # Constituency text split on hyphen: the type comes first, the area last.
  def area_parts
    bio.xpath('//p[contains(.,"Constituency")]/following-sibling'\
      '::ul[not(position() > 1)]/li/text()').to_s.split('-')
  end

  # The <h2> heading split on commas: name first, then qualification letters.
  def name_parts
    bio.css('h2').text.split(',')
  end

  # Honorific titles that may prefix a member's name.
  def titles
    %w(Ir Dr Prof Hon Mrs)
  end

  # The page region containing all biography markup.
  def bio
    noko.css('div#container div')
  end

  # Affiliations that are not political parties and must be ignored when
  # determining a member's faction.
  def non_party_groups
    [
      'Kowloon West New Dynamic',
      'New Territories Association of Societies',
      'April Fifth Action'
    ]
  end
end
106 changes: 7 additions & 99 deletions scraper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,112 +10,20 @@
require 'date'
require 'scraped'

require 'open-uri/cached'
OpenURI::Cache.cache_path = '.cache'
# require 'scraped_page_archive/open-uri'
# require 'open-uri/cached'
# OpenURI::Cache.cache_path = '.cache'
require 'scraped_page_archive/open-uri'

require_rel 'lib'

class String
  # Collapse every run of whitespace (spaces, tabs, newlines, etc.) to a
  # single space and strip leading/trailing whitespace.
  # Idiom fix: the explicit `self.` receiver on gsub was redundant.
  def tidy
    gsub(/[[:space:]]+/, ' ').strip
  end
end

# Fetches +url+ and returns the parsed Nokogiri HTML document.
# NOTE(review): `open` on a URL relies on open-uri's Kernel#open patch, which
# `scraped_page_archive/open-uri` (required above) wraps for caching; do not
# switch to URI.open without confirming the archive hook still intercepts it.
def noko_for(url)
  Nokogiri::HTML(open(url).read)
end

# Builds a decorated page object from a one-entry Hash of URL => page class,
# e.g. `scrape url => MemberPage`. Fetches the URL via Scraped::Request and
# wraps the response in the given class.
def scrape(mapping)
  url, page_class = mapping.first
  page_class.new(response: Scraped::Request.new(url: url).response)
end

def process_area(area)
  # Classify the constituency type; when both words somehow appear,
  # 'Geographical' wins (matching the original assignment order). When
  # neither appears, no :area_type key is emitted at all.
  kind = if area.index('Geographical')
           'geographical'
         elsif area.index('Functional')
           'functional'
         end

  info = {}
  info[:area_type] = kind if kind
  # Drop the "... Geographical/Functional Constituency – " prefix (en-dash or
  # hyphen), leaving just the area name.
  info[:area] = area.gsub(/.*(?:Geographical|Functional)\s+Constituency\s+[–-]\s+/, '').tidy
  info
end

# if they have two affiliations listed then pick the sensible one where we
# mean the one listed in the breakdown at https://en.wikipedia.org/wiki/Legislative_Council_of_Hong_Kong
def fix_parties(parties)
  # Parties to prefer when a member lists several affiliations, in priority
  # order (matching the Wikipedia breakdown referenced above).
  preferred = [
    'Labour Party',
    'Democratic Alliance for the Betterment and Progress of Hong Kong',
    'Business and Professionals Alliance for Hong Kong',
    'People Power',
    'League of Social Democrats',
  ]
  combined = parties.to_s
  known = preferred.find { |party| combined.include?(party) }

  # fall back to the first one in the list
  known || parties[0].to_s
end

# Legacy per-member scraper, removed in this commit in favour of MemberPage.
# NOTE(review): this is the diff's deleted side; its closing `end` is lost in
# the rendering (the new driver code begins immediately after), so this
# definition is syntactically incomplete as shown.
def scrape_person(url)
  noko = noko_for(url)
  bio = noko.css('div#container div')
  # everything after the comma is qualification letters

  # e.g. ".../yr16-20/foo.htm" -> "yr16-20/foo"
  id = url.to_s.gsub(/.*(yr\d\d.*)\.htm/, '\1')

  name_parts = bio.css('h2').text.to_s.split(',')
  name = name_parts.shift.to_s
  honorific_prefix = ''
  # gsub's block captures the title run and the remainder as side effects;
  # the substitution result itself is discarded.
  name.gsub(/^((?:(?:Hon|Prof|Dr|Ir|Mrs)\s+)+)(.*)$/) do
    name = Regexp.last_match(2)
    honorific_prefix = Regexp.last_match(1)
  end
  name = name.tidy
  honorific_prefix = honorific_prefix.tidy if honorific_prefix

  # Only 'Mrs' is detectable from the heading; everyone else gets ''.
  gender = ''
  gender = 'female' if honorific_prefix.index('Mrs')

  name_suffix = name_parts.join(', ').tidy

  img = URI.join(url, bio.css('img/@src').to_s).to_s

  area = bio.xpath('//p[contains(.,"Constituency")]/following-sibling::ul[not(position() > 1)]/li/text()').to_s
  area_info = process_area(area)

  faction = bio.xpath('//p[contains(.,"Political affiliation")]/following-sibling::ul[not(position() > 1)]/li/text()')
  if faction.size > 1
    # multiple affiliations: pick the one from the known-party list
    faction = fix_parties(faction)
  else
    faction = faction.to_s.tidy
    faction = 'Independent' if faction.empty?
  end

  email = bio.xpath('//table/tr/td/a[contains(@href, "mailto")]/text()').to_s.tidy

  website = bio.xpath('//table/tr/td[contains(.,"Homepage")]/following-sibling::td/a/text()').to_s.tidy
  phone = bio.xpath('//table/tr/td[contains(.,"telephone")]/following-sibling::td[position() = 2]/text()').to_s.tidy
  fax = bio.xpath('//table/tr/td[contains(.,"fax")]/following-sibling::td[position() = 2]/text()').to_s.tidy

  data = {
    id: id,
    term: 6,
    name: name,
    honorific_suffix: name_suffix,
    honorific_prefix: honorific_prefix,
    img: img,
    faction: faction,
    email: email,
    website: website,
    phone: phone,
    fax: fax,
    gender: gender,
    source: url.to_s,
  }

  data = data.merge(area_info)

  # debug output only — this version never persisted to the database
  puts data.reject { |k, v| v.to_s.empty? }.sort_by { |k, v| k }.to_h
# Entry point: walk the member listing page, decorate each member URL with
# MemberPage, and persist each record keyed on :id.
list_url = 'http://www.legco.gov.hk/general/english/members/yr16-20/biographies.htm'
(scrape list_url => MembersPage).member_urls.each do |url|
  # term is constant for this listing (6th LegCo term)
  data = (scrape url => MemberPage).to_h.merge(term: 6)
  ScraperWiki.save_sqlite([:id], data)
  # puts data.reject { |k, v| v.to_s.empty? }.sort_by { |k, v| k }.to_h
end

# NOTE(review): superseded entry point from before the MemberPage extraction;
# the diff shows these lines as removed in this commit.
list_url = "http://www.legco.gov.hk/general/english/members/yr16-20/biographies.htm"
(scrape list_url => MembersPage).member_urls.each { |url| scrape_person(url) }

0 comments on commit 3d661c5

Please sign in to comment.