Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use scraped #9

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 0 additions & 22 deletions .rubocop_todo.yml
Original file line number Diff line number Diff line change
@@ -1,22 +0,0 @@
# This configuration was generated by
# `rubocop --auto-gen-config`
# on 2017-04-25 08:39:37 +0100 using RuboCop version 0.46.0.
# The point is for the user to remove these configuration records
# one by one as the offenses are removed from the code base.
# Note that changes in the inspected code, or installation of new
# versions of RuboCop, may require this file to be generated again.

# Offense count: 1
Metrics/AbcSize:
Max: 54

# Offense count: 2
# Configuration parameters: AllowHeredoc, AllowURI, URISchemes, IgnoreCopDirectives, IgnoredPatterns.
# URISchemes: http, https
Metrics/LineLength:
Max: 165

# Offense count: 1
# Configuration parameters: CountComments.
Metrics/MethodLength:
Max: 46
121 changes: 121 additions & 0 deletions lib/member_page.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# frozen_string_literal: true

require 'scraped'
require 'set'

# This class represents the profile page of a given member.
# Each `field` below extracts one attribute of the member from the bio page.
class MemberPage < Scraped::HTML
  decorator Scraped::Response::Decorator::CleanUrls

  # Unique id: "<term>/<page basename>", e.g. "yr16-20/wong-ting-kwong"
  field :id do
    [term, File.basename(url, '.htm')].join('/')
  end

  # Term directory taken from the URL path (e.g. "yr16-20")
  field :term do
    File.dirname(url).split('/').last
  end

  # Display name with honorific titles stripped out
  field :name do
    (name_without_suffixes.split - titles).join(' ')
  end

  # Honorific titles (Ir/Dr/Prof/Hon/Mrs) present in the displayed name
  field :honorific_prefix do
    (name_without_suffixes.split & titles).join(' ')
  end

  # Post-nominal qualification letters (everything after the first comma)
  field :honorific_suffix do
    suffixes.join(', ')
  end

  # 'Mrs' is the only gender signal on the page; nil when unknown
  field :gender do
    return 'female' if honorific_prefix.include?('Mrs')
  end

  field :faction do
    return 'Independent' if political_affiliation.empty?
    # Some member pages list more than one group affiliation for that member
    # Here, we remove affiliations with known non-party groups
    (political_affiliation - non_party_groups).first
  end

  field :email do
    bio.xpath('//table/tr/td/a[contains(@href, "mailto")]/text()').to_s.tidy
  end

  field :website do
    bio.xpath('//table/tr/td[contains(.,"Homepage")]/following-sibling::'\
      'td/a/text()').to_s.tidy
  end

  field :phone do
    bio.xpath('//table/tr/td[contains(.,"telephone")]/following-sibling::'\
      'td[position() = 2]/text()').to_s.tidy
  end

  field :fax do
    bio.xpath('//table/tr/td[contains(.,"fax")]/following-sibling::'\
      'td[position() = 2]/text()').to_s.tidy
  end

  field :img do
    # Use the first image on the page: taking the last one picked up an
    # unrelated image for some members (e.g. 'WONG Ting-kwong').
    bio.at_css('img/@src').to_s
  end

  # Constituency name: the text after the en dash in the constituency line
  field :area do
    area_parts.last.split("\u{2013}").last.tidy
  end

  # 'functional' / 'geographical', falling back to the raw text when the
  # page uses an unrecognised constituency type
  field :area_type do
    return 'functional' if area_parts.first.include?('Functional')
    return 'geographical' if area_parts.first.include?('Geographical')
    area_parts.first
  end

  field :source do
    url
  end

  private

  # The constituency line split on '-' into [type, name] parts
  def area_parts
    bio.xpath('//p[contains(.,"Constituency")]/following-sibling'\
      '::ul[not(position() > 1)]/li/text()').to_s.split('-')
  end

  def name_text
    bio.css('h2').text
  end

  # Everything before the first comma; commas introduce qualification letters
  def name_without_suffixes
    name_text.split(',').first
  end

  def suffixes
    name_text.split(',').drop(1).map(&:tidy)
  end

  def titles
    %w[Ir Dr Prof Hon Mrs]
  end

  def bio
    noko.css('div#container div')
  end

  # Known group affiliations that are not political parties
  def non_party_groups
    Set[
      'Kowloon West New Dynamic',
      'New Territories Association of Societies',
      'April Fifth Action',
    ]
  end

  def political_affiliation
    bio.xpath('//p[contains(.,"Political affiliation")]/'\
      'following-sibling::ul[not(position() > 1)]/li/text()')
       .map(&:to_s)
       .map(&:tidy)
       .to_set
  end
end
12 changes: 12 additions & 0 deletions lib/members_page.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# frozen_string_literal: true

require 'scraped'

# This class represents the page listing all members of the legislature.
# Its only job is to expose the URL of each member's biography page.
class MembersPage < Scraped::HTML
  decorator Scraped::Response::Decorator::CleanUrls

  # Absolute (decorator-cleaned) URLs of every linked member profile
  field :member_urls do
    noko.css('.bio-member-detail-1 a/@href').map(&:value)
  end
end
102 changes: 9 additions & 93 deletions scraper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,102 +10,18 @@
# OpenURI::Cache.cache_path = '.cache'
require 'scraped_page_archive/open-uri'

def noko_for(url)
Nokogiri::HTML(open(url).read)
end

def scrape_list(url)
noko = noko_for(url)
noko.css('.bio-member-detail-1 a/@href').each do |link|
bio = URI.join(url, link.to_s)
scrape_person(bio)
end
end

def process_area(area)
area_info = {}

area_info[:area_type] = 'functional' if area.index('Functional')
area_info[:area_type] = 'geographical' if area.index('Geographical')
area_info[:area] = area.gsub(/.*(?:Geographical|Functional)\s+Constituency\s+[–-]\s+/, '').tidy
require_rel 'lib'

area_info
# Fetch a URL and wrap the response in the given scraper class.
# Takes a single-pair Hash of the form { url => ScraperClass } so call
# sites read naturally: `scrape list_url => MembersPage`.
def scrape(h)
  url, klass = h.first
  klass.new(response: Scraped::Request.new(url: url).response)
end

# if they have two affiliations listed then pick the sensible one where we
# mean the one listed in the breakdown at https://en.wikipedia.org/wiki/Legislative_Council_of_Hong_Kong
def fix_parties(parties)
return 'Labour Party' if parties.to_s.index('Labour Party')
return 'Democratic Alliance for the Betterment and Progress of Hong Kong' if parties.to_s.index('Democratic Alliance for the Betterment and Progress of Hong Kong')
return 'Business and Professionals Alliance for Hong Kong' if parties.to_s.index('Business and Professionals Alliance for Hong Kong')
return 'People Power' if parties.to_s.index('People Power')
return 'League of Social Democrats' if parties.to_s.index('League of Social Democrats')

# fall back to the first one in the list
parties[0].to_s
end

def scrape_person(url)
noko = noko_for(url)
bio = noko.css('div#container div')
# everything after the comma is qualification letters

id = url.to_s.gsub(/.*(yr\d\d.*)\.htm/, '\1')

name_parts = bio.css('h2').text.to_s.split(',')
name = name_parts.shift.to_s
honorific_prefix = ''
name.gsub(/^((?:(?:Hon|Prof|Dr|Ir|Mrs)\s+)+)(.*)$/) do
name = Regexp.last_match(2)
honorific_prefix = Regexp.last_match(1)
end
name = name.tidy
honorific_prefix = honorific_prefix.tidy if honorific_prefix

gender = ''
gender = 'female' if honorific_prefix.index('Mrs')

name_suffix = name_parts.join(', ').tidy

img = URI.join(url, bio.css('img/@src').to_s).to_s

area = bio.xpath('//p[contains(.,"Constituency")]/following-sibling::ul[not(position() > 1)]/li/text()').to_s
area_info = process_area(area)

faction = bio.xpath('//p[contains(.,"Political affiliation")]/following-sibling::ul[not(position() > 1)]/li/text()')
if faction.size > 1
faction = fix_parties(faction)
else
faction = faction.to_s.tidy
faction = 'Independent' if faction.empty?
end

email = bio.xpath('//table/tr/td/a[contains(@href, "mailto")]/text()').to_s.tidy

website = bio.xpath('//table/tr/td[contains(.,"Homepage")]/following-sibling::td/a/text()').to_s.tidy
phone = bio.xpath('//table/tr/td[contains(.,"telephone")]/following-sibling::td[position() = 2]/text()').to_s.tidy
fax = bio.xpath('//table/tr/td[contains(.,"fax")]/following-sibling::td[position() = 2]/text()').to_s.tidy

data = {
id: id,
term: 6,
name: name,
honorific_suffix: name_suffix,
honorific_prefix: honorific_prefix,
img: img,
faction: faction,
email: email,
website: website,
phone: phone,
fax: fax,
gender: gender,
source: url.to_s,
}

data = data.merge(area_info)

ScraperWiki.save_sqlite([:id], data)
list_url = 'http://www.legco.gov.hk/general/english/members/yr16-20/biographies.htm'
data = (scrape list_url => MembersPage).member_urls.map do |url|
(scrape url => MemberPage).to_h.merge(term: 6)
# puts data.reject { |k, v| v.to_s.empty? }.sort_by { |k, v| k }.to_h
end

ScraperWiki.sqliteexecute('DROP TABLE data') rescue nil
scrape_list('http://www.legco.gov.hk/general/english/members/yr16-20/biographies.htm')
ScraperWiki.save_sqlite([:id], data)
121 changes: 121 additions & 0 deletions test/cassettes/chd_htm.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading