From 3d661c5769b1ae6ef31b4e88bd0dc23e25395e5f Mon Sep 17 00:00:00 2001 From: Oliver Denman Date: Mon, 24 Apr 2017 14:34:33 +0100 Subject: [PATCH] Extract MemberPage --- lib/member_page.rb | 105 ++++++++++++++++++++++++++++++++++++++++++++ scraper.rb | 106 +++------------------------------------------ 2 files changed, 112 insertions(+), 99 deletions(-) create mode 100644 lib/member_page.rb diff --git a/lib/member_page.rb b/lib/member_page.rb new file mode 100644 index 000000000..429d1595d --- /dev/null +++ b/lib/member_page.rb @@ -0,0 +1,105 @@ +require 'scraped' +require 'pry' + +class MemberPage < Scraped::HTML + decorator Scraped::Response::Decorator::CleanUrls + + field :id do + url.gsub('.htm', '').split('members/').last + end + + field :name do + name_parts.first.to_s.gsub(Regexp.union(titles << '.'), '').tidy + end + + field :honorific_prefix do + titles.select { |prefix| name_parts.first.to_s.include? prefix }.join(' ') + end + + field :honorific_suffix do + name_parts[1..-1].map(&:tidy).join(', ') + end + + field :gender do + return 'female' if honorific_prefix.include?('Mrs') + end + + field :faction do + f = bio.xpath('//p[contains(.,"Political affiliation")]/'\ + 'following-sibling::ul[not(position() > 1)]/li/text()') + return 'Independent' if f.empty? + + # Some member pages list more than one group affiliation for that member + # Here, we remove affiliations with known non-party groups + f.map(&:to_s).map(&:tidy).find do |party| + !non_party_groups.to_s.include? party + end + end + + field :email do + bio.xpath('//table/tr/td/a[contains(@href, "mailto")]/text()').to_s.tidy + end + + field :website do + bio.xpath('//table/tr/td[contains(.,"Homepage")]/following-sibling::'\ + 'td/a/text()').to_s.tidy + end + + field :phone do + bio.xpath('//table/tr/td[contains(.,"telephone")]/following-sibling::'\ + 'td[position() = 2]/text()').to_s.tidy + end + + field :fax do + bio.xpath('//table/tr/td[contains(.,"fax")]/following-sibling::'\ + 'td[position() = 2]/text()').to_s.tidy + end + + field :img do + # TODO: incorrect image being captured for 'WONG Ting-kwong' + # Change line to: bio.at_css('img/@src').to_s + bio.css('img/@src').last.to_s + end + + field :area do + # splitting here by en-dash (not hyphen) + area_parts.last.split('–').last.tidy + end + + field :area_type do + return 'functional' if area_parts.first.include?('Functional') + return 'geographical' if area_parts.first.include?('Geographical') + area_parts.first + end + + field :source do + url + end + + private + + def area_parts + bio.xpath('//p[contains(.,"Constituency")]/following-sibling'\ + '::ul[not(position() > 1)]/li/text()').to_s.split('-') + end + + def name_parts + bio.css('h2').text.split(',') + end + + def titles + %w(Ir Dr Prof Hon Mrs) + end + + def bio + noko.css('div#container div') + end + + def non_party_groups + [ + 'Kowloon West New Dynamic', + 'New Territories Association of Societies', + 'April Fifth Action' + ] + end +end diff --git a/scraper.rb b/scraper.rb index e33906b31..cbb5f9ca3 100644 --- a/scraper.rb +++ b/scraper.rb @@ -10,112 +10,20 @@ require 'date' require 'scraped' -require 'open-uri/cached' -OpenURI::Cache.cache_path = '.cache' -# require 'scraped_page_archive/open-uri' +# require 'open-uri/cached' +# OpenURI::Cache.cache_path = '.cache' +require 'scraped_page_archive/open-uri' require_rel 'lib' -class String - def tidy - self.gsub(/[[:space:]]+/, ' ').strip - end -end - -def noko_for(url) - Nokogiri::HTML(open(url).read) -end - def scrape(h) url, klass = h.to_a.first klass.new(response: Scraped::Request.new(url: url).response) end -def process_area(area) - area_info = {} - - area_info[:area_type] = 'functional' if area.index('Functional') - area_info[:area_type] = 'geographical' if area.index('Geographical') - area_info[:area] = area.gsub(/.*(?:Geographical|Functional)\s+Constituency\s+[–-]\s+/, '').tidy - - area_info -end - -# if they have two affiliations listed then pick the sensible one where we -# mean the one listed in the breakdown at https://en.wikipedia.org/wiki/Legislative_Council_of_Hong_Kong -def fix_parties(parties) - return 'Labour Party' if parties.to_s.index('Labour Party') - return 'Democratic Alliance for the Betterment and Progress of Hong Kong' if parties.to_s.index('Democratic Alliance for the Betterment and Progress of Hong Kong') - return 'Business and Professionals Alliance for Hong Kong' if parties.to_s.index('Business and Professionals Alliance for Hong Kong') - return 'People Power' if parties.to_s.index('People Power') - return 'League of Social Democrats' if parties.to_s.index('League of Social Democrats') - - # fall back to the first one in the list - parties[0].to_s -end - -def scrape_person(url) - noko = noko_for(url) - bio = noko.css('div#container div') - # everything after the comma is qualification letters - - id = url.to_s.gsub(/.*(yr\d\d.*)\.htm/, '\1') - - name_parts = bio.css('h2').text.to_s.split(',') - name = name_parts.shift.to_s - honorific_prefix = '' - name.gsub(/^((?:(?:Hon|Prof|Dr|Ir|Mrs)\s+)+)(.*)$/) do - name = Regexp.last_match(2) - honorific_prefix = Regexp.last_match(1) - end - name = name.tidy - honorific_prefix = honorific_prefix.tidy if honorific_prefix - - gender = '' - gender = 'female' if honorific_prefix.index('Mrs') - - name_suffix = name_parts.join(', ').tidy - - img = URI.join(url, bio.css('img/@src').to_s).to_s - - area = bio.xpath('//p[contains(.,"Constituency")]/following-sibling::ul[not(position() > 1)]/li/text()').to_s - area_info = process_area(area) - - faction = bio.xpath('//p[contains(.,"Political affiliation")]/following-sibling::ul[not(position() > 1)]/li/text()') - if faction.size > 1 - faction = fix_parties(faction) - else - faction = faction.to_s.tidy - faction = 'Independent' if faction.empty? - end - - email = bio.xpath('//table/tr/td/a[contains(@href, "mailto")]/text()').to_s.tidy - - website = bio.xpath('//table/tr/td[contains(.,"Homepage")]/following-sibling::td/a/text()').to_s.tidy - phone = bio.xpath('//table/tr/td[contains(.,"telephone")]/following-sibling::td[position() = 2]/text()').to_s.tidy - fax = bio.xpath('//table/tr/td[contains(.,"fax")]/following-sibling::td[position() = 2]/text()').to_s.tidy - - data = { - id: id, - term: 6, - name: name, - honorific_suffix: name_suffix, - honorific_prefix: honorific_prefix, - img: img, - faction: faction, - email: email, - website: website, - phone: phone, - fax: fax, - gender: gender, - source: url.to_s, - } - - data = data.merge(area_info) - - puts data.reject { |k, v| v.to_s.empty? }.sort_by { |k, v| k }.to_h +list_url = 'http://www.legco.gov.hk/general/english/members/yr16-20/biographies.htm' +(scrape list_url => MembersPage).member_urls.each do |url| + data = (scrape url => MemberPage).to_h.merge(term: 6) ScraperWiki.save_sqlite([:id], data) + # puts data.reject { |k, v| v.to_s.empty? }.sort_by { |k, v| k }.to_h end - -list_url = "http://www.legco.gov.hk/general/english/members/yr16-20/biographies.htm" -(scrape list_url => MembersPage).member_urls.each { |url| scrape_person(url) }