From d6d7ec5ecc59ac21109b06841284d8081af61ccd Mon Sep 17 00:00:00 2001 From: Alexandre Harano Date: Sun, 6 Nov 2022 20:38:40 -0300 Subject: [PATCH 1/3] sp_sao_paulo: remove explicit scraped_at --- data_collection/gazette/spiders/sp/sp_sao_paulo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/data_collection/gazette/spiders/sp/sp_sao_paulo.py b/data_collection/gazette/spiders/sp/sp_sao_paulo.py index 104104635..95bfb84ca 100644 --- a/data_collection/gazette/spiders/sp/sp_sao_paulo.py +++ b/data_collection/gazette/spiders/sp/sp_sao_paulo.py @@ -1,6 +1,6 @@ import locale import re -from datetime import date, datetime +from datetime import date import scrapy from dateutil.rrule import DAILY, rrule @@ -48,5 +48,4 @@ def parse(self, response, day): is_extra_edition=False, territory_id=self.TERRITORY_ID, power="executive", - scraped_at=datetime.utcnow(), ) From 1eed4ab389c9009945bc5c64072fada5179c8bcf Mon Sep 17 00:00:00 2001 From: Alexandre Harano Date: Sun, 6 Nov 2022 20:54:59 -0300 Subject: [PATCH 2/3] sp_sao_paulo: replace locale use with a dict --- .../gazette/spiders/sp/sp_sao_paulo.py | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/data_collection/gazette/spiders/sp/sp_sao_paulo.py b/data_collection/gazette/spiders/sp/sp_sao_paulo.py index 95bfb84ca..bcc03f633 100644 --- a/data_collection/gazette/spiders/sp/sp_sao_paulo.py +++ b/data_collection/gazette/spiders/sp/sp_sao_paulo.py @@ -1,4 +1,3 @@ -import locale import re from datetime import date @@ -9,6 +8,20 @@ from gazette.spiders.base import BaseGazetteSpider RE_MAX_PAGE_NUM = re.compile(r"\d+ de (\d+)") +FULL_MONTH_NAME = { + 1: "Janeiro", + 2: "Fevereiro", + 3: "Março", + 4: "Abril", + 5: "Maio", + 6: "Junho", + 7: "Julho", + 8: "Agosto", + 9: "Setembro", + 10: "Outubro", + 11: "Novembro", + 12: "Dezembro", +} class SpSaoPauloSpider(BaseGazetteSpider): @@ -19,8 +32,6 @@ class SpSaoPauloSpider(BaseGazetteSpider): start_date = date(2017, 6, 1) def start_requests(self): - # Need to have the month's name in portuguese for the pdf url - locale.setlocale(locale.LC_TIME, "pt_BR.UTF-8") for day in rrule(freq=DAILY, dtstart=self.start_date, until=date.today()): url = f"{self.BASE_URL}/nav_v6/header.asp?txtData={day.strftime('%d/%m/%Y')}&cad=1" yield scrapy.Request(url, cb_kwargs=dict(day=day.date())) @@ -39,7 +50,14 @@ def parse(self, response, day): max_page = self.get_max_page(response) if not max_page: return - day_url = f"{self.BASE_URL}/doflash/prototipo/{day.strftime('%Y')}/{day.strftime('%B')}/{day.strftime('%d')}/cidade/pdf" + day_url = ( + f"{self.BASE_URL}" + "/doflash/prototipo" + f"/{day.strftime('%Y')}" + f"/{FULL_MONTH_NAME[day.month]}" + f"/{day.strftime('%d')}" + "/cidade/pdf" + ) urls = [f"{day_url}/pg_{page:04}.pdf" for page in range(1, max_page + 1)] yield Gazette( From 9f5f0b3862a72c8c29f6df8191b143a3b8f6dbb5 Mon Sep 17 00:00:00 2001 From: Alexandre Harano Date: Sun, 6 Nov 2022 21:06:37 -0300 Subject: [PATCH 3/3] sp_sao_paulo: use end_date parameter --- data_collection/gazette/spiders/sp/sp_sao_paulo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_collection/gazette/spiders/sp/sp_sao_paulo.py b/data_collection/gazette/spiders/sp/sp_sao_paulo.py index bcc03f633..c17079f30 100644 --- a/data_collection/gazette/spiders/sp/sp_sao_paulo.py +++ b/data_collection/gazette/spiders/sp/sp_sao_paulo.py @@ -32,7 +32,7 @@ class SpSaoPauloSpider(BaseGazetteSpider): start_date = date(2017, 6, 1) def start_requests(self): - for day in rrule(freq=DAILY, dtstart=self.start_date, until=date.today()): + for day in rrule(freq=DAILY, dtstart=self.start_date, until=self.end_date): url = f"{self.BASE_URL}/nav_v6/header.asp?txtData={day.strftime('%d/%m/%Y')}&cad=1" yield scrapy.Request(url, cb_kwargs=dict(day=day.date()))