Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Use scraped #3

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 0 additions & 22 deletions .rubocop_todo.yml
Original file line number Diff line number Diff line change
@@ -1,22 +0,0 @@
# This configuration was generated by
# `rubocop --auto-gen-config`
# on 2017-04-25 08:39:37 +0100 using RuboCop version 0.46.0.
# The point is for the user to remove these configuration records
# one by one as the offenses are removed from the code base.
# Note that changes in the inspected code, or installation of new
# versions of RuboCop, may require this file to be generated again.

# Offense count: 1
Metrics/AbcSize:
Max: 54

# Offense count: 2
# Configuration parameters: AllowHeredoc, AllowURI, URISchemes, IgnoreCopDirectives, IgnoredPatterns.
# URISchemes: http, https
Metrics/LineLength:
Max: 165

# Offense count: 1
# Configuration parameters: CountComments.
Metrics/MethodLength:
Max: 46
3 changes: 3 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@ rvm:
- 2.3.3
sudo: false
cache: bundler
script:
- bash <(curl -fsSL https://github.com/everypolitician/ensure-regression-tests/raw/v0.1.0/ensure-regression-tests)
- bundle exec rake
7 changes: 6 additions & 1 deletion Rakefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
# frozen_string_literal: true

require 'rubocop/rake_task'
require 'rake/testtask'

RuboCop::RakeTask.new

task default: %w[rubocop]
require 'scraper_test'
ScraperTest::RakeTask.new.install_tasks

task test: 'test:data'
task default: %w[rubocop test]
108 changes: 108 additions & 0 deletions lib/member_page.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# frozen_string_literal: true

require 'scraped'
require 'pry'

# This class represents the profile page of a given member
# Represents one member's biography page. Each `field` declares a scraped
# datum; `url` and `noko` are provided by Scraped::HTML. `tidy` is
# presumably the scraped gem's whitespace-normalising String helper —
# NOTE(review): confirm where `tidy` is defined.
class MemberPage < Scraped::HTML
  # Rewrite relative hrefs/srcs in the response to absolute URLs.
  decorator Scraped::Response::Decorator::CleanUrls

  # Member id derived from the page URL: the path after "members/" with
  # the ".htm" extension stripped.
  field :id do
    url.gsub('.htm', '').split('members/').last
  end

  # Plain name: the part of the <h2> heading before the first comma, with
  # honorific titles and '.' removed. `titles` returns a fresh array on
  # every call, so appending '.' here does not accumulate across calls.
  field :name do
    name_parts.first.to_s.gsub(Regexp.union(titles << '.'), '').tidy
  end

  # All known titles that appear in the name portion, space-joined
  # (e.g. "Hon Dr").
  field :honorific_prefix do
    titles.select { |prefix| name_parts.first.to_s.include? prefix }.join(' ')
  end

  # Post-nominal letters: every comma-separated part of the heading after
  # the name itself (e.g. "GBS, JP").
  field :honorific_suffix do
    name_parts[1..-1].map(&:tidy).join(', ')
  end

  # Gender is only derivable from the "Mrs" title; nil otherwise.
  field :gender do
    return 'female' if honorific_prefix.include?('Mrs')
  end

  # Political party/group from the "Political affiliation" list.
  # Members with no listed affiliation are recorded as 'Independent'.
  field :faction do
    f = bio.xpath('//p[contains(.,"Political affiliation")]/'\
        'following-sibling::ul[not(position() > 1)]/li/text()')
    return 'Independent' if f.empty?

    # Some member pages list more than one group affiliation for that member
    # Here, we remove affiliations with known non-party groups
    f.map(&:to_s).map(&:tidy).find do |party|
      !non_party_groups.to_s.include? party
    end
  end

  # Contact email: text of the first mailto: link in the contact table.
  field :email do
    bio.xpath('//table/tr/td/a[contains(@href, "mailto")]/text()').to_s.tidy
  end

  # Homepage URL from the "Homepage" row of the contact table.
  field :website do
    bio.xpath('//table/tr/td[contains(.,"Homepage")]/following-sibling::'\
        'td/a/text()').to_s.tidy
  end

  # Office telephone: second cell following the "telephone" label.
  field :phone do
    bio.xpath('//table/tr/td[contains(.,"telephone")]/following-sibling::'\
        'td[position() = 2]/text()').to_s.tidy
  end

  # Office fax: second cell following the "fax" label.
  field :fax do
    bio.xpath('//table/tr/td[contains(.,"fax")]/following-sibling::'\
        'td[position() = 2]/text()').to_s.tidy
  end

  # Portrait photo URL (made absolute by the CleanUrls decorator).
  field :img do
    # TODO: incorrect image being captured for 'WONG Ting-kwong'
    # Change line to: bio.at_css('img/@src').to_s
    bio.css('img/@src').last.to_s
  end

  # Constituency name: the text after the en-dash in the last
  # hyphen-separated part of the constituency line.
  field :area do
    # splitting here by en-dash (not hyphen)
    area_parts.last.split('–').last.tidy
  end

  # 'functional' or 'geographical'; falls back to the raw first part of
  # the constituency text if neither word is present.
  field :area_type do
    return 'functional' if area_parts.first.include?('Functional')
    return 'geographical' if area_parts.first.include?('Geographical')
    area_parts.first
  end

  # Provenance: the URL this record was scraped from.
  field :source do
    url
  end

  private

  # Constituency text (first <ul> after the "Constituency" heading),
  # split on ASCII hyphen; see :area for the en-dash split of the tail.
  def area_parts
    bio.xpath('//p[contains(.,"Constituency")]/following-sibling'\
        '::ul[not(position() > 1)]/li/text()').to_s.split('-')
  end

  # Comma-separated parts of the page's <h2> heading:
  # [name-with-titles, post-nominal, post-nominal, ...].
  def name_parts
    bio.css('h2').text.split(',')
  end

  # Honorific titles that may prefix a member's name. Returns a new array
  # each call — callers (:name) mutate the result.
  def titles
    %w[Ir Dr Prof Hon Mrs]
  end

  # The main biography container of the page.
  def bio
    noko.css('div#container div')
  end

  # Affiliations that are NOT political parties; filtered out when a
  # member lists several affiliations (see :faction).
  def non_party_groups
    [
      'Kowloon West New Dynamic',
      'New Territories Association of Societies',
      'April Fifth Action',
    ]
  end
end
12 changes: 12 additions & 0 deletions lib/members_page.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# frozen_string_literal: true

require 'scraped'

# This class represents a page listing members of the given legislature
# Represents the page that lists every member of the legislature.
class MembersPage < Scraped::HTML
  # Make every extracted href absolute relative to the page URL.
  decorator Scraped::Response::Decorator::CleanUrls

  # URLs of the individual member biography pages, taken from the href
  # attribute of each link inside the member-detail cells.
  field :member_urls do
    hrefs = noko.css('.bio-member-detail-1 a/@href')
    hrefs.map { |href| href.to_s }
  end
end
106 changes: 12 additions & 94 deletions scraper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,107 +5,25 @@
require 'pry'
require 'scraped'
require 'scraperwiki'
require 'nokogiri'
require 'scraped_page_archive/open-uri'
require 'date'
require 'scraped'

# require 'open-uri/cached'
# OpenURI::Cache.cache_path = '.cache'
require 'scraped_page_archive/open-uri'

def noko_for(url)
Nokogiri::HTML(open(url).read)
end

def scrape_list(url)
noko = noko_for(url)
noko.css('.bio-member-detail-1 a/@href').each do |link|
bio = URI.join(url, link.to_s)
scrape_person(bio)
end
end

def process_area(area)
area_info = {}
require_rel 'lib'

area_info[:area_type] = 'functional' if area.index('Functional')
area_info[:area_type] = 'geographical' if area.index('Geographical')
area_info[:area] = area.gsub(/.*(?:Geographical|Functional)\s+Constituency\s+[–-]\s+/, '').tidy

area_info
end

# if they have two affiliations listed then pick the sensible one where we
# mean the one listed in the breakdown at https://en.wikipedia.org/wiki/Legislative_Council_of_Hong_Kong
def fix_parties(parties)
return 'Labour Party' if parties.to_s.index('Labour Party')
return 'Democratic Alliance for the Betterment and Progress of Hong Kong' if parties.to_s.index('Democratic Alliance for the Betterment and Progress of Hong Kong')
return 'Business and Professionals Alliance for Hong Kong' if parties.to_s.index('Business and Professionals Alliance for Hong Kong')
return 'People Power' if parties.to_s.index('People Power')
return 'League of Social Democrats' if parties.to_s.index('League of Social Democrats')

# fall back to the first one in the list
parties[0].to_s
# Fetch a page and wrap it in the given decorator class.
#
# Takes a one-pair hash { url => PageClass } purely for the readable
# call-site syntax `scrape url => MemberPage`; returns an instance of
# PageClass built from the fetched response.
def scrape(h)
  target_url, page_class = h.first
  response = Scraped::Request.new(url: target_url).response
  page_class.new(response: response)
end

def scrape_person(url)
noko = noko_for(url)
bio = noko.css('div#container div')
# everything after the comma is qualification letters

id = url.to_s.gsub(/.*(yr\d\d.*)\.htm/, '\1')

name_parts = bio.css('h2').text.to_s.split(',')
name = name_parts.shift.to_s
honorific_prefix = ''
name.gsub(/^((?:(?:Hon|Prof|Dr|Ir|Mrs)\s+)+)(.*)$/) do
name = Regexp.last_match(2)
honorific_prefix = Regexp.last_match(1)
end
name = name.tidy
honorific_prefix = honorific_prefix.tidy if honorific_prefix

gender = ''
gender = 'female' if honorific_prefix.index('Mrs')

name_suffix = name_parts.join(', ').tidy

img = URI.join(url, bio.css('img/@src').to_s).to_s

area = bio.xpath('//p[contains(.,"Constituency")]/following-sibling::ul[not(position() > 1)]/li/text()').to_s
area_info = process_area(area)

faction = bio.xpath('//p[contains(.,"Political affiliation")]/following-sibling::ul[not(position() > 1)]/li/text()')
if faction.size > 1
faction = fix_parties(faction)
else
faction = faction.to_s.tidy
faction = 'Independent' if faction.empty?
end

email = bio.xpath('//table/tr/td/a[contains(@href, "mailto")]/text()').to_s.tidy

website = bio.xpath('//table/tr/td[contains(.,"Homepage")]/following-sibling::td/a/text()').to_s.tidy
phone = bio.xpath('//table/tr/td[contains(.,"telephone")]/following-sibling::td[position() = 2]/text()').to_s.tidy
fax = bio.xpath('//table/tr/td[contains(.,"fax")]/following-sibling::td[position() = 2]/text()').to_s.tidy

data = {
id: id,
term: 6,
name: name,
honorific_suffix: name_suffix,
honorific_prefix: honorific_prefix,
img: img,
faction: faction,
email: email,
website: website,
phone: phone,
fax: fax,
gender: gender,
source: url.to_s,
}

data = data.merge(area_info)

# Landing page listing all current legislators.
list_url = 'http://www.legco.gov.hk/general/english/members/yr16-20/biographies.htm'
# Scrape each member's profile page and upsert one row per member,
# keyed on :id. term 6 is the 2016-2020 Legislative Council.
(scrape list_url => MembersPage).member_urls.each do |url|
  data = (scrape url => MemberPage).to_h.merge(term: 6)
  ScraperWiki.save_sqlite([:id], data)
  # puts data.reject { |k, v| v.to_s.empty? }.sort_by { |k, v| k }.to_h
end

ScraperWiki.sqliteexecute('DROP TABLE data') rescue nil
scrape_list('http://www.legco.gov.hk/general/english/members/yr16-20/biographies.htm')
Loading