Skip to content

Commit

Permalink
Extract MembersPage
Browse files Browse the repository at this point in the history
This class represents a document listing members of the legislature.
  • Loading branch information
Oliver Denman committed Apr 25, 2017
1 parent 64138c0 commit 46943cd
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 11 deletions.
9 changes: 9 additions & 0 deletions lib/members_page.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
require 'scraped'

# Represents a document listing members of the legislature.
class MembersPage < Scraped::HTML
  # NOTE(review): CleanUrls presumably rewrites relative links in the
  # response to absolute URLs before the fields are read — confirm against
  # the scraped gem's documentation.
  decorator Scraped::Response::Decorator::CleanUrls

  # String values of the href attributes matched by the member-detail
  # selector, in document order.
  field :member_urls do
    hrefs = noko.css('.bio-member-detail-1 a/@href')
    hrefs.map { |href| href.to_s }
  end
end
26 changes: 15 additions & 11 deletions scraper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,27 @@
require 'nokogiri'
require 'scraped_page_archive/open-uri'
require 'date'
require 'scraped'

require 'open-uri/cached'
OpenURI::Cache.cache_path = '.cache'
# require 'scraped_page_archive/open-uri'

# require 'open-uri/cached'
# OpenURI::Cache.cache_path = '.cache'
require 'scraped_page_archive/open-uri'
require_rel 'lib'

class String
  # Returns a copy of the string with every run of whitespace (as matched by
  # the POSIX [[:space:]] class) collapsed to a single space and the ends
  # stripped. The receiver is not modified.
  def tidy
    collapsed = gsub(/[[:space:]]+/, ' ')
    collapsed.strip
  end
end

# Reads the document at +url+ and returns it parsed by Nokogiri::HTML.
#
# NOTE(review): relies on the bare `open` accepting URLs, presumably via the
# open-uri based libraries required by this script — confirm.
def noko_for(url)
  markup = open(url).read
  Nokogiri::HTML(markup)
end

def scrape_list(url)
noko = noko_for(url)
noko.css('.bio-member-detail-1 a/@href').each do |link|
bio = URI.join(url, link.to_s)
scrape_person(bio)
end
# Requests a URL and wraps the response in a page object.
#
# +h+ is expected to be a one-entry mapping of URL => page class, e.g.
# `scrape list_url => MembersPage`; only the first pair is used.
# Returns a new instance of that class built with `response:` set to the
# result of Scraped::Request for the URL.
def scrape(h)
  url, klass = h.to_a.first
  response = Scraped::Request.new(url: url).response
  klass.new(response: response)
end

def process_area(area)
Expand Down Expand Up @@ -110,8 +113,9 @@ def scrape_person(url)

data = data.merge(area_info)

puts data.reject { |k, v| v.to_s.empty? }.sort_by { |k, v| k }.to_h
ScraperWiki.save_sqlite([:id], data)
end

# Drop the existing data table before scraping; the inline `rescue nil`
# discards any error raised by the DROP (presumably the table not existing
# yet on a first run — TODO confirm).
ScraperWiki.sqliteexecute('DROP TABLE data') rescue nil
# NOTE(review): this text looks like a rendered diff with old and new lines
# interleaved — the scrape_list call below appears to be the pre-refactor
# entry point, superseded by the MembersPage-based lines that follow it.
# Confirm against the actual scraper.rb before running.
scrape_list('http://www.legco.gov.hk/general/english/members/yr16-20/biographies.htm')
# Fetch the member listing page, then scrape each member URL it exposes.
list_url = "http://www.legco.gov.hk/general/english/members/yr16-20/biographies.htm"
(scrape list_url => MembersPage).member_urls.each { |url| scrape_person(url) }

0 comments on commit 46943cd

Please sign in to comment.