Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use scraped #9

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 0 additions & 22 deletions .rubocop_todo.yml
Original file line number Diff line number Diff line change
@@ -1,22 +0,0 @@
# This configuration was generated by
# `rubocop --auto-gen-config`
# on 2017-04-25 08:39:37 +0100 using RuboCop version 0.46.0.
# The point is for the user to remove these configuration records
# one by one as the offenses are removed from the code base.
# Note that changes in the inspected code, or installation of new
# versions of RuboCop, may require this file to be generated again.

# Offense count: 1
Metrics/AbcSize:
Max: 54

# Offense count: 2
# Configuration parameters: AllowHeredoc, AllowURI, URISchemes, IgnoreCopDirectives, IgnoredPatterns.
# URISchemes: http, https
Metrics/LineLength:
Max: 165

# Offense count: 1
# Configuration parameters: CountComments.
Metrics/MethodLength:
Max: 46
121 changes: 121 additions & 0 deletions lib/member_page.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# frozen_string_literal: true

require 'scraped'
require 'set'

# This class represents the profile page of a given member.
# Each `field` below extracts one attribute of the member from the bio page.
class MemberPage < Scraped::HTML
  decorator Scraped::Response::Decorator::CleanUrls

  # Unique id: "<term>/<page basename>", e.g. "yr16-20/wong-ting-kwong"
  field :id do
    [term, File.basename(url, '.htm')].join('/')
  end

  # Term directory taken from the URL path (e.g. "yr16-20")
  field :term do
    File.dirname(url).split('/').last
  end

  # Display name with honorific titles stripped out
  field :name do
    (name_without_suffixes.split - titles).join(' ')
  end

  # Honorific titles (Ir/Dr/Prof/Hon/Mrs) present in the displayed name
  field :honorific_prefix do
    (name_without_suffixes.split & titles).join(' ')
  end

  # Post-nominal qualification letters (everything after the first comma)
  field :honorific_suffix do
    suffixes.join(', ')
  end

  # 'Mrs' is the only gender signal on the page; nil when unknown
  field :gender do
    return 'female' if honorific_prefix.include?('Mrs')
  end

  field :faction do
    return 'Independent' if political_affiliation.empty?
    # Some member pages list more than one group affiliation for that member
    # Here, we remove affiliations with known non-party groups
    (political_affiliation - non_party_groups).first
  end

  field :email do
    bio.xpath('//table/tr/td/a[contains(@href, "mailto")]/text()').to_s.tidy
  end

  field :website do
    bio.xpath('//table/tr/td[contains(.,"Homepage")]/following-sibling::'\
      'td/a/text()').to_s.tidy
  end

  field :phone do
    bio.xpath('//table/tr/td[contains(.,"telephone")]/following-sibling::'\
      'td[position() = 2]/text()').to_s.tidy
  end

  field :fax do
    bio.xpath('//table/tr/td[contains(.,"fax")]/following-sibling::'\
      'td[position() = 2]/text()').to_s.tidy
  end

  field :img do
    # Use the first image on the page: taking the last one picked up an
    # unrelated image for some members (e.g. 'WONG Ting-kwong').
    bio.at_css('img/@src').to_s
  end

  # Constituency name: the text after the en dash in the constituency line
  field :area do
    area_parts.last.split("\u{2013}").last.tidy
  end

  # 'functional' / 'geographical', falling back to the raw text when the
  # page uses an unrecognised constituency type
  field :area_type do
    return 'functional' if area_parts.first.include?('Functional')
    return 'geographical' if area_parts.first.include?('Geographical')
    area_parts.first
  end

  field :source do
    url
  end

  private

  # The constituency line split on '-' into [type, name] parts
  def area_parts
    bio.xpath('//p[contains(.,"Constituency")]/following-sibling'\
      '::ul[not(position() > 1)]/li/text()').to_s.split('-')
  end

  def name_text
    bio.css('h2').text
  end

  # Everything before the first comma; commas introduce qualification letters
  def name_without_suffixes
    name_text.split(',').first
  end

  def suffixes
    name_text.split(',').drop(1).map(&:tidy)
  end

  def titles
    %w[Ir Dr Prof Hon Mrs]
  end

  def bio
    noko.css('div#container div')
  end

  # Known group affiliations that are not political parties
  def non_party_groups
    Set[
      'Kowloon West New Dynamic',
      'New Territories Association of Societies',
      'April Fifth Action',
    ]
  end

  def political_affiliation
    bio.xpath('//p[contains(.,"Political affiliation")]/'\
      'following-sibling::ul[not(position() > 1)]/li/text()')
       .map(&:to_s)
       .map(&:tidy)
       .to_set
  end
end
12 changes: 12 additions & 0 deletions lib/members_page.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# frozen_string_literal: true

require 'scraped'

# This class represents the page listing all members of the legislature.
# Its only job is to expose the URL of each member's biography page.
class MembersPage < Scraped::HTML
  decorator Scraped::Response::Decorator::CleanUrls

  # Absolute (decorator-cleaned) URLs of every linked member profile
  field :member_urls do
    noko.css('.bio-member-detail-1 a/@href').map(&:value)
  end
end
102 changes: 9 additions & 93 deletions scraper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,102 +10,18 @@
# OpenURI::Cache.cache_path = '.cache'
require 'scraped_page_archive/open-uri'

def noko_for(url)
Nokogiri::HTML(open(url).read)
end

def scrape_list(url)
noko = noko_for(url)
noko.css('.bio-member-detail-1 a/@href').each do |link|
bio = URI.join(url, link.to_s)
scrape_person(bio)
end
end

def process_area(area)
area_info = {}

area_info[:area_type] = 'functional' if area.index('Functional')
area_info[:area_type] = 'geographical' if area.index('Geographical')
area_info[:area] = area.gsub(/.*(?:Geographical|Functional)\s+Constituency\s+[–-]\s+/, '').tidy
require_rel 'lib'

area_info
# Fetch a URL and wrap the response in the given scraper class.
# Takes a single-pair Hash of the form { url => ScraperClass } so call
# sites read naturally: `scrape list_url => MembersPage`.
def scrape(h)
  url, klass = h.first
  klass.new(response: Scraped::Request.new(url: url).response)
end

# if they have two affiliations listed then pick the sensible one where we
# mean the one listed in the breakdown at https://en.wikipedia.org/wiki/Legislative_Council_of_Hong_Kong
def fix_parties(parties)
return 'Labour Party' if parties.to_s.index('Labour Party')
return 'Democratic Alliance for the Betterment and Progress of Hong Kong' if parties.to_s.index('Democratic Alliance for the Betterment and Progress of Hong Kong')
return 'Business and Professionals Alliance for Hong Kong' if parties.to_s.index('Business and Professionals Alliance for Hong Kong')
return 'People Power' if parties.to_s.index('People Power')
return 'League of Social Democrats' if parties.to_s.index('League of Social Democrats')

# fall back to the first one in the list
parties[0].to_s
end

def scrape_person(url)
noko = noko_for(url)
bio = noko.css('div#container div')
# everything after the comma is qualification letters

id = url.to_s.gsub(/.*(yr\d\d.*)\.htm/, '\1')

name_parts = bio.css('h2').text.to_s.split(',')
name = name_parts.shift.to_s
honorific_prefix = ''
name.gsub(/^((?:(?:Hon|Prof|Dr|Ir|Mrs)\s+)+)(.*)$/) do
name = Regexp.last_match(2)
honorific_prefix = Regexp.last_match(1)
end
name = name.tidy
honorific_prefix = honorific_prefix.tidy if honorific_prefix

gender = ''
gender = 'female' if honorific_prefix.index('Mrs')

name_suffix = name_parts.join(', ').tidy

img = URI.join(url, bio.css('img/@src').to_s).to_s

area = bio.xpath('//p[contains(.,"Constituency")]/following-sibling::ul[not(position() > 1)]/li/text()').to_s
area_info = process_area(area)

faction = bio.xpath('//p[contains(.,"Political affiliation")]/following-sibling::ul[not(position() > 1)]/li/text()')
if faction.size > 1
faction = fix_parties(faction)
else
faction = faction.to_s.tidy
faction = 'Independent' if faction.empty?
end

email = bio.xpath('//table/tr/td/a[contains(@href, "mailto")]/text()').to_s.tidy

website = bio.xpath('//table/tr/td[contains(.,"Homepage")]/following-sibling::td/a/text()').to_s.tidy
phone = bio.xpath('//table/tr/td[contains(.,"telephone")]/following-sibling::td[position() = 2]/text()').to_s.tidy
fax = bio.xpath('//table/tr/td[contains(.,"fax")]/following-sibling::td[position() = 2]/text()').to_s.tidy

data = {
id: id,
term: 6,
name: name,
honorific_suffix: name_suffix,
honorific_prefix: honorific_prefix,
img: img,
faction: faction,
email: email,
website: website,
phone: phone,
fax: fax,
gender: gender,
source: url.to_s,
}

data = data.merge(area_info)

ScraperWiki.save_sqlite([:id], data)
list_url = 'http://www.legco.gov.hk/general/english/members/yr16-20/biographies.htm'
data = (scrape list_url => MembersPage).member_urls.map do |url|
(scrape url => MemberPage).to_h.merge(term: 6)
# puts data.reject { |k, v| v.to_s.empty? }.sort_by { |k, v| k }.to_h
end

ScraperWiki.sqliteexecute('DROP TABLE data') rescue nil
scrape_list('http://www.legco.gov.hk/general/english/members/yr16-20/biographies.htm')
ScraperWiki.save_sqlite([:id], data)
121 changes: 121 additions & 0 deletions test/cassettes/chd_htm.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading