-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfixweb.rb
61 lines (49 loc) · 2.9 KB
/
fixweb.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# require 'open-uri'
require 'nokogiri'
# Produces a doctored copy of a saved heraldo.es page: root-relative URLs are
# made absolute, the first article's texts, link and image are swapped for
# replacement texts and a meetup.com link, and an attribution notice is added
# to the footer. Reads heraldo-origin.html and writes heraldo-final.html, both
# in the current working directory.

# Replaces every literal occurrence of old_text in html with new_text.
# Returns html unchanged when old_text is nil or empty: String#gsub raises
# TypeError for a nil pattern and matches at every position for an empty one,
# which would scatter new_text across the whole page.
def replace_literal(html, old_text, new_text)
  return html if old_text.nil? || old_text.empty?

  html.gsub(old_text, new_text)
end

# Fix relative with absolute routes.
# File.read closes the file on its own; File.open(...).read left the handle open.
my_heraldo = File.read('heraldo-origin.html')
# Order matters: the attribute prefixes are rewritten before the bare paths.
[
  ['src="/', 'src="http://www.heraldo.es/'],
  ['href="/', 'href="http://www.heraldo.es/'],
  ['data="/', 'data="http://www.heraldo.es/'], # videos
  ['"/uploads', '"http://www.heraldo.es/uploads'],
  ['"/MODULOS', '"http://www.heraldo.es/MODULOS']
].each do |relative, absolute|
  my_heraldo = my_heraldo.gsub(relative, absolute)
end

# Get data from the first news item (first div whose id starts with "Noticia").
my_noked_heraldo = Nokogiri::HTML(my_heraldo)
first_article = my_noked_heraldo.at_xpath('//div[starts-with(@id, "Noticia")]')
title_to_change = first_article.at_xpath('.//h2/a').text
# NOTE(review): this lookup starts at the document, not at first_article, so it
# yields the first h2 link of the whole page -- confirm that is intended.
link_attr = my_noked_heraldo.at_xpath('.//h2/a/@href')
link_to_change = link_attr && link_attr.value
pre_title = first_article.at_xpath('.//strong').text
author = first_article.at_xpath('.//p/span').text
subtext = first_article.at_xpath('.//div').text

# Sometimes there is an image; both lookups are nil when the page has none.
# NOTE(review): these are document-wide too, not limited to first_article.
image_to_change = my_noked_heraldo.at_xpath('.//img[@data-original]')
link_image = my_noked_heraldo.at_xpath('.//img[@data-original]/..')

# Policy: third paragraph inside the div with id "pie".
policy_to_change = my_noked_heraldo.xpath('//div[@id="pie"]//p')[2]

# Switch with my own data
meetup_url = 'http://www.meetup.com/Zaragoza-Ruby-Jam-Sessions/events/215246072/'
my_heraldo = replace_literal(my_heraldo, title_to_change, 'Sesión de web scraping este jueves en #zaragozarb')
my_heraldo = replace_literal(my_heraldo, link_to_change, meetup_url)
my_heraldo = replace_literal(my_heraldo, pre_title, 'Evento de programación en Zaragoza')
my_heraldo = replace_literal(my_heraldo, author, 'Efe. Zaragoza')
my_heraldo = replace_literal(my_heraldo, subtext, 'Esta tarde se explicará el uso de la gema Nokogiri, en La Jamonería')

# Image: skipped entirely when the page has no lazy-loaded image.
if image_to_change
  my_heraldo = replace_literal(my_heraldo, image_to_change['src'],
                               'http://photos4.meetupstatic.com/photos/event/4/9/5/e/600_226578782.jpeg')
end
my_heraldo = replace_literal(my_heraldo, link_image['href'], meetup_url) if link_image

# Policy: attribution notice added after the footer paragraph.
attribution = "<p>Extraído de Heraldo de Aragón (2014).
Esta página ha sido replicada con fines docentes a los efectos del artículo 32.1, párrafo primero, de la Ley de Propiedad Intelectual.
Recuperado en octubre de 2014 de <a href=\"http://www.heraldo.es\" style=\"color:#f9ffff\">www.heraldo.es</a></p>"
if policy_to_change
  # The paragraph's plain text is the search pattern; it is replaced by the
  # paragraph's serialized markup followed by the notice.
  # NOTE(review): this nests the markup inside the existing <p> and matches
  # nothing if the paragraph contains inline tags -- confirm this is acceptable.
  my_heraldo = replace_literal(my_heraldo, policy_to_change.text, policy_to_change.to_s + attribution)
else
  warn 'Footer paragraph not found: attribution notice was not inserted'
end

# Rewrite with changes in another file
File.write('heraldo-final.html', my_heraldo)

puts image_to_change['data-original'].to_s if image_to_change