Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Use scraped #3

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 0 additions & 22 deletions .rubocop_todo.yml
Original file line number Diff line number Diff line change
@@ -1,22 +0,0 @@
# This configuration was generated by
# `rubocop --auto-gen-config`
# on 2017-04-25 08:39:37 +0100 using RuboCop version 0.46.0.
# The point is for the user to remove these configuration records
# one by one as the offenses are removed from the code base.
# Note that changes in the inspected code, or installation of new
# versions of RuboCop, may require this file to be generated again.

# Offense count: 1
Metrics/AbcSize:
Max: 54

# Offense count: 2
# Configuration parameters: AllowHeredoc, AllowURI, URISchemes, IgnoreCopDirectives, IgnoredPatterns.
# URISchemes: http, https
Metrics/LineLength:
Max: 165

# Offense count: 1
# Configuration parameters: CountComments.
Metrics/MethodLength:
Max: 46
3 changes: 3 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@ rvm:
- 2.3.3
sudo: false
cache: bundler
script:
- bash <(curl -fsSL https://github.com/everypolitician/ensure-regression-tests/raw/v0.1.0/ensure-regression-tests)
- bundle exec rake
7 changes: 6 additions & 1 deletion Rakefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
# frozen_string_literal: true

require 'rubocop/rake_task'
require 'rake/testtask'

RuboCop::RakeTask.new

task default: %w[rubocop]
require 'scraper_test'
ScraperTest::RakeTask.new.install_tasks

task test: 'test:data'
task default: %w[rubocop test]
108 changes: 108 additions & 0 deletions lib/member_page.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# frozen_string_literal: true

require 'scraped'
require 'pry'

# This class represents the profile page of a given member
# Represents one member's biography page. Each `field` declares a scraped
# datum; `url` and `noko` are provided by Scraped::HTML. `tidy` is
# presumably the scraped gem's whitespace-normalising String helper —
# NOTE(review): confirm where `tidy` is defined.
class MemberPage < Scraped::HTML
  # Rewrite relative hrefs/srcs in the response to absolute URLs.
  decorator Scraped::Response::Decorator::CleanUrls

  # Member id derived from the page URL: the path after "members/" with
  # the ".htm" extension stripped.
  field :id do
    url.gsub('.htm', '').split('members/').last
  end

  # Plain name: the part of the <h2> heading before the first comma, with
  # honorific titles and '.' removed. `titles` returns a fresh array on
  # every call, so appending '.' here does not accumulate across calls.
  field :name do
    name_parts.first.to_s.gsub(Regexp.union(titles << '.'), '').tidy
  end

  # All known titles that appear in the name portion, space-joined
  # (e.g. "Hon Dr").
  field :honorific_prefix do
    titles.select { |prefix| name_parts.first.to_s.include? prefix }.join(' ')
  end

  # Post-nominal letters: every comma-separated part of the heading after
  # the name itself (e.g. "GBS, JP").
  field :honorific_suffix do
    name_parts[1..-1].map(&:tidy).join(', ')
  end

  # Gender is only derivable from the "Mrs" title; nil otherwise.
  field :gender do
    return 'female' if honorific_prefix.include?('Mrs')
  end

  # Political party/group from the "Political affiliation" list.
  # Members with no listed affiliation are recorded as 'Independent'.
  field :faction do
    f = bio.xpath('//p[contains(.,"Political affiliation")]/'\
        'following-sibling::ul[not(position() > 1)]/li/text()')
    return 'Independent' if f.empty?

    # Some member pages list more than one group affiliation for that member
    # Here, we remove affiliations with known non-party groups
    f.map(&:to_s).map(&:tidy).find do |party|
      !non_party_groups.to_s.include? party
    end
  end

  # Contact email: text of the first mailto: link in the contact table.
  field :email do
    bio.xpath('//table/tr/td/a[contains(@href, "mailto")]/text()').to_s.tidy
  end

  # Homepage URL from the "Homepage" row of the contact table.
  field :website do
    bio.xpath('//table/tr/td[contains(.,"Homepage")]/following-sibling::'\
        'td/a/text()').to_s.tidy
  end

  # Office telephone: second cell following the "telephone" label.
  field :phone do
    bio.xpath('//table/tr/td[contains(.,"telephone")]/following-sibling::'\
        'td[position() = 2]/text()').to_s.tidy
  end

  # Office fax: second cell following the "fax" label.
  field :fax do
    bio.xpath('//table/tr/td[contains(.,"fax")]/following-sibling::'\
        'td[position() = 2]/text()').to_s.tidy
  end

  # Portrait photo URL (made absolute by the CleanUrls decorator).
  field :img do
    # TODO: incorrect image being captured for 'WONG Ting-kwong'
    # Change line to: bio.at_css('img/@src').to_s
    bio.css('img/@src').last.to_s
  end

  # Constituency name: the text after the en-dash in the last
  # hyphen-separated part of the constituency line.
  field :area do
    # splitting here by en-dash (not hyphen)
    area_parts.last.split('–').last.tidy
  end

  # 'functional' or 'geographical'; falls back to the raw first part of
  # the constituency text if neither word is present.
  field :area_type do
    return 'functional' if area_parts.first.include?('Functional')
    return 'geographical' if area_parts.first.include?('Geographical')
    area_parts.first
  end

  # Provenance: the URL this record was scraped from.
  field :source do
    url
  end

  private

  # Constituency text (first <ul> after the "Constituency" heading),
  # split on ASCII hyphen; see :area for the en-dash split of the tail.
  def area_parts
    bio.xpath('//p[contains(.,"Constituency")]/following-sibling'\
        '::ul[not(position() > 1)]/li/text()').to_s.split('-')
  end

  # Comma-separated parts of the page's <h2> heading:
  # [name-with-titles, post-nominal, post-nominal, ...].
  def name_parts
    bio.css('h2').text.split(',')
  end

  # Honorific titles that may prefix a member's name. Returns a new array
  # each call — callers (:name) mutate the result.
  def titles
    %w[Ir Dr Prof Hon Mrs]
  end

  # The main biography container of the page.
  def bio
    noko.css('div#container div')
  end

  # Affiliations that are NOT political parties; filtered out when a
  # member lists several affiliations (see :faction).
  def non_party_groups
    [
      'Kowloon West New Dynamic',
      'New Territories Association of Societies',
      'April Fifth Action',
    ]
  end
end
12 changes: 12 additions & 0 deletions lib/members_page.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# frozen_string_literal: true

require 'scraped'

# This class represents a page listing members of the given legislature
# Represents the page that lists every member of the legislature.
class MembersPage < Scraped::HTML
  # Make every extracted href absolute relative to the page URL.
  decorator Scraped::Response::Decorator::CleanUrls

  # URLs of the individual member biography pages, taken from the href
  # attribute of each link inside the member-detail cells.
  field :member_urls do
    hrefs = noko.css('.bio-member-detail-1 a/@href')
    hrefs.map { |href| href.to_s }
  end
end
106 changes: 12 additions & 94 deletions scraper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,107 +5,25 @@
require 'pry'
require 'scraped'
require 'scraperwiki'
require 'nokogiri'
require 'scraped_page_archive/open-uri'
require 'date'
require 'scraped'

# require 'open-uri/cached'
# OpenURI::Cache.cache_path = '.cache'
require 'scraped_page_archive/open-uri'

def noko_for(url)
Nokogiri::HTML(open(url).read)
end

def scrape_list(url)
noko = noko_for(url)
noko.css('.bio-member-detail-1 a/@href').each do |link|
bio = URI.join(url, link.to_s)
scrape_person(bio)
end
end

def process_area(area)
area_info = {}
require_rel 'lib'

area_info[:area_type] = 'functional' if area.index('Functional')
area_info[:area_type] = 'geographical' if area.index('Geographical')
area_info[:area] = area.gsub(/.*(?:Geographical|Functional)\s+Constituency\s+[–-]\s+/, '').tidy

area_info
end

# if they have two affiliations listed then pick the sensible one where we
# mean the one listed in the breakdown at https://en.wikipedia.org/wiki/Legislative_Council_of_Hong_Kong
def fix_parties(parties)
return 'Labour Party' if parties.to_s.index('Labour Party')
return 'Democratic Alliance for the Betterment and Progress of Hong Kong' if parties.to_s.index('Democratic Alliance for the Betterment and Progress of Hong Kong')
return 'Business and Professionals Alliance for Hong Kong' if parties.to_s.index('Business and Professionals Alliance for Hong Kong')
return 'People Power' if parties.to_s.index('People Power')
return 'League of Social Democrats' if parties.to_s.index('League of Social Democrats')

# fall back to the first one in the list
parties[0].to_s
# Fetch a page and wrap it in the given decorator class.
#
# Takes a one-pair hash { url => PageClass } purely for the readable
# call-site syntax `scrape url => MemberPage`; returns an instance of
# PageClass built from the fetched response.
def scrape(h)
  target_url, page_class = h.first
  response = Scraped::Request.new(url: target_url).response
  page_class.new(response: response)
end

def scrape_person(url)
noko = noko_for(url)
bio = noko.css('div#container div')
# everything after the comma is qualification letters

id = url.to_s.gsub(/.*(yr\d\d.*)\.htm/, '\1')

name_parts = bio.css('h2').text.to_s.split(',')
name = name_parts.shift.to_s
honorific_prefix = ''
name.gsub(/^((?:(?:Hon|Prof|Dr|Ir|Mrs)\s+)+)(.*)$/) do
name = Regexp.last_match(2)
honorific_prefix = Regexp.last_match(1)
end
name = name.tidy
honorific_prefix = honorific_prefix.tidy if honorific_prefix

gender = ''
gender = 'female' if honorific_prefix.index('Mrs')

name_suffix = name_parts.join(', ').tidy

img = URI.join(url, bio.css('img/@src').to_s).to_s

area = bio.xpath('//p[contains(.,"Constituency")]/following-sibling::ul[not(position() > 1)]/li/text()').to_s
area_info = process_area(area)

faction = bio.xpath('//p[contains(.,"Political affiliation")]/following-sibling::ul[not(position() > 1)]/li/text()')
if faction.size > 1
faction = fix_parties(faction)
else
faction = faction.to_s.tidy
faction = 'Independent' if faction.empty?
end

email = bio.xpath('//table/tr/td/a[contains(@href, "mailto")]/text()').to_s.tidy

website = bio.xpath('//table/tr/td[contains(.,"Homepage")]/following-sibling::td/a/text()').to_s.tidy
phone = bio.xpath('//table/tr/td[contains(.,"telephone")]/following-sibling::td[position() = 2]/text()').to_s.tidy
fax = bio.xpath('//table/tr/td[contains(.,"fax")]/following-sibling::td[position() = 2]/text()').to_s.tidy

data = {
id: id,
term: 6,
name: name,
honorific_suffix: name_suffix,
honorific_prefix: honorific_prefix,
img: img,
faction: faction,
email: email,
website: website,
phone: phone,
fax: fax,
gender: gender,
source: url.to_s,
}

data = data.merge(area_info)

# Landing page listing all current legislators.
list_url = 'http://www.legco.gov.hk/general/english/members/yr16-20/biographies.htm'
# Scrape each member's profile page and upsert one row per member,
# keyed on :id. term 6 is the 2016-2020 Legislative Council.
(scrape list_url => MembersPage).member_urls.each do |url|
  data = (scrape url => MemberPage).to_h.merge(term: 6)
  ScraperWiki.save_sqlite([:id], data)
  # puts data.reject { |k, v| v.to_s.empty? }.sort_by { |k, v| k }.to_h
end

ScraperWiki.sqliteexecute('DROP TABLE data') rescue nil
scrape_list('http://www.legco.gov.hk/general/english/members/yr16-20/biographies.htm')
Loading