From 651c56734b5df9803421cc809d3fded995e9aa0c Mon Sep 17 00:00:00 2001
From: Tales Mota
Date: Mon, 10 Jun 2024 11:15:43 -0300
Subject: [PATCH 1/4] feat: Replicable system for br transparencia
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../gazette/spiders/ba/ba_candeias.py        | 13 ++++++
 .../gazette/spiders/ba/ba_ibicoara.py        | 13 ++++++
 .../gazette/spiders/base/brtransparencia.py  | 42 +++++++++++++++++++
 3 files changed, 68 insertions(+)
 create mode 100644 data_collection/gazette/spiders/ba/ba_candeias.py
 create mode 100644 data_collection/gazette/spiders/ba/ba_ibicoara.py
 create mode 100644 data_collection/gazette/spiders/base/brtransparencia.py

diff --git a/data_collection/gazette/spiders/ba/ba_candeias.py b/data_collection/gazette/spiders/ba/ba_candeias.py
new file mode 100644
index 000000000..d65ff9573
--- /dev/null
+++ b/data_collection/gazette/spiders/ba/ba_candeias.py
@@ -0,0 +1,13 @@
+from datetime import date
+
+from gazette.spiders.base.brtransparencia import BaseBrTransparenciaSpider
+
+
+class BaCandeiasSpider(BaseBrTransparenciaSpider):
+    name = "ba_candeias"
+    TERRITORY_ID = "2906501"
+    allowed_domains = ["www.camaraibicoara.ba.gov.br"]
+    start_urls = ["https://www.camaraibicoara.ba.gov.br/diario.html"]
+    start_date = date(2022, 12, 29)
+    br_tranparencia_entity = "63147391-dcb2-4d6c-9c5a-c4483a9d8306"
+    br_tranparencia_code = "CODE_ENT_CM207"
diff --git a/data_collection/gazette/spiders/ba/ba_ibicoara.py b/data_collection/gazette/spiders/ba/ba_ibicoara.py
new file mode 100644
index 000000000..72a720564
--- /dev/null
+++ b/data_collection/gazette/spiders/ba/ba_ibicoara.py
@@ -0,0 +1,13 @@
+from datetime import date
+
+from gazette.spiders.base.brtransparencia import BaseBrTransparenciaSpider
+
+
+class BaIbicoaraSpider(BaseBrTransparenciaSpider):
+    name = "ba_ibicoara"
+    TERRITORY_ID = "2912202"
+    allowed_domains = ["www.camaraibicoara.ba.gov.br"]
+    start_urls = ["https://www.camaraibicoara.ba.gov.br/diario.html"]
+    start_date = date(2020, 2, 1)
+    br_tranparencia_entity = "691bea32-9b9f-40f8-ab18-31e079080a1a"
+    br_tranparencia_code = "CODE_ENT_CM204"
diff --git a/data_collection/gazette/spiders/base/brtransparencia.py b/data_collection/gazette/spiders/base/brtransparencia.py
new file mode 100644
index 000000000..90faba8d1
--- /dev/null
+++ b/data_collection/gazette/spiders/base/brtransparencia.py
@@ -0,0 +1,42 @@
+from datetime import datetime
+
+from scrapy import Request
+from scrapy.selector import Selector
+
+from gazette.items import Gazette
+from gazette.spiders.base import BaseGazetteSpider
+
+
+class BaseBrTransparenciaSpider(BaseGazetteSpider):
+    name = ""
+    TERRITORY_ID = ""
+    allowed_domains = [""]
+    start_urls = [""]
+    br_tranparencia_entity = ""
+    br_tranparencia_code = ""
+
+    def start_requests(self):
+        api_url = f"https://api.brtransparencia.com.br/api/diariooficial/filtro/{self.br_tranparencia_entity}/{self.br_tranparencia_code}/{self.start_date}/{self.end_date}/-1/-1"
+
+        yield Request(api_url)
+
+    def parse(self, response):
+        for entry in response.json():
+            edition_date = datetime.strptime(
+                entry["dat_publicacao_dio"], "%Y-%m-%dT%H:%M:%S"
+            ).date()
+            extra_edition = entry["des_extra_dio"] is not None
+            edition_number = int(entry["num_diario_oficial_dio"])
+            gazettes = Selector(text=entry["des_resumo_dio"]).css("a")
+            urls = []
+            for item in gazettes:
+                link = item.css("a::attr(href)").get()
+                urls.append(link)
+
+            yield Gazette(
+                edition_number=edition_number,
+                date=edition_date,
+                file_urls=urls,
+                is_extra_edition=extra_edition,
+                power="executive_legislative",
+            )

From 0730ce546e953e2096544751c24184e8afff30b2 Mon Sep 17 00:00:00 2001
From: Tales Mota
Date: Mon, 10 Jun 2024 15:30:24 -0300
Subject: [PATCH 2/4] feat: adding other sources

---
 data_collection/gazette/spiders/ba/ba_candeias.py | 1 +
 .../spiders/ba/ba_conceicao_do_almeida_2024.py    | 13 +++++++++++++
 .../gazette/spiders/ba/ba_itaquara_2024.py        | 13 +++++++++++++
 .../gazette/spiders/ba/ba_porto_seguro.py         | 14 ++++++++++++++
 data_collection/gazette/spiders/ba/ba_rio_real.py | 14 ++++++++++++++
 .../gazette/spiders/ba/ba_saude_2024.py           | 14 ++++++++++++++
 .../gazette/spiders/base/brtransparencia.py       | 3 ++-
 7 files changed, 71 insertions(+), 1 deletion(-)
 create mode 100644 data_collection/gazette/spiders/ba/ba_conceicao_do_almeida_2024.py
 create mode 100644 data_collection/gazette/spiders/ba/ba_itaquara_2024.py
 create mode 100644 data_collection/gazette/spiders/ba/ba_porto_seguro.py
 create mode 100644 data_collection/gazette/spiders/ba/ba_rio_real.py
 create mode 100644 data_collection/gazette/spiders/ba/ba_saude_2024.py

diff --git a/data_collection/gazette/spiders/ba/ba_candeias.py b/data_collection/gazette/spiders/ba/ba_candeias.py
index d65ff9573..12f202913 100644
--- a/data_collection/gazette/spiders/ba/ba_candeias.py
+++ b/data_collection/gazette/spiders/ba/ba_candeias.py
@@ -11,3 +11,4 @@ class BaCandeiasSpider(BaseBrTransparenciaSpider):
     start_date = date(2022, 12, 29)
     br_tranparencia_entity = "63147391-dcb2-4d6c-9c5a-c4483a9d8306"
     br_tranparencia_code = "CODE_ENT_CM207"
+    power = "legislative"
diff --git a/data_collection/gazette/spiders/ba/ba_conceicao_do_almeida_2024.py b/data_collection/gazette/spiders/ba/ba_conceicao_do_almeida_2024.py
new file mode 100644
index 000000000..97ef80718
--- /dev/null
+++ b/data_collection/gazette/spiders/ba/ba_conceicao_do_almeida_2024.py
@@ -0,0 +1,13 @@
+from datetime import date
+
+from gazette.spiders.base.brtransparencia import BaseBrTransparenciaSpider
+
+
+class BaConceicaoDoAlmeidaSpider(BaseBrTransparenciaSpider):
+    name = "ba_conceicao_do_almeida_2024"
+    TERRITORY_ID = "2908309"
+    allowed_domains = ["www.conceicaodoalmeida.ba.gov.br"]
+    start_urls = ["https://www.conceicaodoalmeida.ba.gov.br/diario.html"]
+    start_date = date(2019, 5, 3)
+    br_tranparencia_entity = "EF1662F7-9A2A-4FDB-ABAD-346211F97734"
+    br_tranparencia_code = "CODE_ENT_001"
diff --git a/data_collection/gazette/spiders/ba/ba_itaquara_2024.py b/data_collection/gazette/spiders/ba/ba_itaquara_2024.py
new file mode 100644
index 000000000..227d0961f
--- /dev/null
+++ b/data_collection/gazette/spiders/ba/ba_itaquara_2024.py
@@ -0,0 +1,13 @@
+from datetime import date
+
+from gazette.spiders.base.brtransparencia import BaseBrTransparenciaSpider
+
+
+class BaItaquaraSpider(BaseBrTransparenciaSpider):
+    name = "ba_itaquara_2024"
+    TERRITORY_ID = "2916708"
+    allowed_domains = ["www.itaquara.ba.gov.br"]
+    start_urls = ["https://www.itaquara.ba.gov.br/diario.html"]
+    start_date = date(2019, 1, 1)
+    br_tranparencia_entity = "1557447a-9381-44ad-9c0f-016868769479"
+    br_tranparencia_code = "CODE_ENT_PM003"
diff --git a/data_collection/gazette/spiders/ba/ba_porto_seguro.py b/data_collection/gazette/spiders/ba/ba_porto_seguro.py
new file mode 100644
index 000000000..59e060635
--- /dev/null
+++ b/data_collection/gazette/spiders/ba/ba_porto_seguro.py
@@ -0,0 +1,14 @@
+from datetime import date
+
+from gazette.spiders.base.brtransparencia import BaseBrTransparenciaSpider
+
+
+class BaPortoSeguroSpider(BaseBrTransparenciaSpider):
+    name = "ba_porto_seguro"
+    TERRITORY_ID = "2925303"
+    allowed_domains = ["cmportoseguroba.brtransparencia.com.br"]
+    start_urls = ["https://cmportoseguroba.brtransparencia.com.br/diario.html"]
+    start_date = date(2022, 12, 19)
+    br_tranparencia_entity = "4557886f-5713-4999-b2c5-c54d9ee11b44"
+    br_tranparencia_code = "COD_ENT_CM210"
+    power = "legislative"
diff --git a/data_collection/gazette/spiders/ba/ba_rio_real.py b/data_collection/gazette/spiders/ba/ba_rio_real.py
new file mode 100644
index 000000000..e1de3e210
--- /dev/null
+++ b/data_collection/gazette/spiders/ba/ba_rio_real.py
@@ -0,0 +1,14 @@
+from datetime import date
+
+from gazette.spiders.base.brtransparencia import BaseBrTransparenciaSpider
+
+
+class BaRioRealSpider(BaseBrTransparenciaSpider):
+    name = "ba_rio_real"
+    TERRITORY_ID = "2927002"
+    allowed_domains = ["cmriorealba.brtransparencia.com.br"]
+    start_urls = ["https://cmriorealba.brtransparencia.com.br/diario.html"]
+    start_date = date(2022, 12, 29)
+    br_tranparencia_entity = "45ae0af7-71a7-436e-9a8e-d41a68215062"
+    br_tranparencia_code = "COD_ENT_CM208"
+    power = "legislative"
diff --git a/data_collection/gazette/spiders/ba/ba_saude_2024.py b/data_collection/gazette/spiders/ba/ba_saude_2024.py
new file mode 100644
index 000000000..663301153
--- /dev/null
+++ b/data_collection/gazette/spiders/ba/ba_saude_2024.py
@@ -0,0 +1,14 @@
+from datetime import date
+
+from gazette.spiders.base.brtransparencia import BaseBrTransparenciaSpider
+
+
+class BaSaudeSpider(BaseBrTransparenciaSpider):
+    name = "ba_saude_2024"
+    TERRITORY_ID = "2929800"
+    allowed_domains = ["pmsaudeba.brtransparencia.com.br"]
+    start_urls = ["https://pmsaudeba.brtransparencia.com.br/diario.html"]
+    start_date = date(2024, 1, 31)
+    br_tranparencia_entity = "46366dbc-7780-433d-a689-f287561a8a7a"
+    br_tranparencia_code = "COD_ENT_PM005"
+    power = "executive"
diff --git a/data_collection/gazette/spiders/base/brtransparencia.py b/data_collection/gazette/spiders/base/brtransparencia.py
index 90faba8d1..026abcc5a 100644
--- a/data_collection/gazette/spiders/base/brtransparencia.py
+++ b/data_collection/gazette/spiders/base/brtransparencia.py
@@ -14,6 +14,7 @@ class BaseBrTransparenciaSpider(BaseGazetteSpider):
     start_urls = [""]
     br_tranparencia_entity = ""
     br_tranparencia_code = ""
+    power = "executive"
 
     def start_requests(self):
         api_url = f"https://api.brtransparencia.com.br/api/diariooficial/filtro/{self.br_tranparencia_entity}/{self.br_tranparencia_code}/{self.start_date}/{self.end_date}/-1/-1"
@@ -38,5 +39,5 @@ def parse(self, response):
                 date=edition_date,
                 file_urls=urls,
                 is_extra_edition=extra_edition,
-                power="executive_legislative",
+                power=self.power,
             )

From c4ce4ce2249d591eea307933552aa09e7591ad5a Mon Sep 17 00:00:00 2001
From: Tales Mota
Date: Sun, 6 Oct 2024 16:32:01 -0300
Subject: [PATCH 3/4] refactor: extract entity and code from content.js

---
 .../gazette/spiders/ba/ba_candeias.py         |  4 +-
 .../ba/ba_conceicao_do_almeida_2024.py        |  4 +-
 .../gazette/spiders/ba/ba_ibicoara.py         |  4 +-
 .../gazette/spiders/ba/ba_itaquara_2024.py    |  4 +-
 .../gazette/spiders/ba/ba_porto_seguro.py     |  7 +--
 .../gazette/spiders/ba/ba_rio_real.py         |  7 +--
 .../gazette/spiders/ba/ba_saude_2024.py       |  4 +-
 .../gazette/spiders/base/brtransparencia.py   | 45 ++++++++++++++++++++----
 8 files changed, 53 insertions(+), 26 deletions(-)

diff --git a/data_collection/gazette/spiders/ba/ba_candeias.py b/data_collection/gazette/spiders/ba/ba_candeias.py
index 12f202913..373a3e635 100644
--- a/data_collection/gazette/spiders/ba/ba_candeias.py
+++ b/data_collection/gazette/spiders/ba/ba_candeias.py
@@ -6,9 +6,7 @@
 class BaCandeiasSpider(BaseBrTransparenciaSpider):
     name = "ba_candeias"
     TERRITORY_ID = "2906501"
-    allowed_domains = ["www.camaraibicoara.ba.gov.br"]
+    allowed_domains = ["www.camaraibicoara.ba.gov.br", "api.brtransparencia.com.br"]
     start_urls = ["https://www.camaraibicoara.ba.gov.br/diario.html"]
     start_date = date(2022, 12, 29)
-    br_tranparencia_entity = "63147391-dcb2-4d6c-9c5a-c4483a9d8306"
-    br_tranparencia_code = "CODE_ENT_CM207"
     power = "legislative"
diff --git a/data_collection/gazette/spiders/ba/ba_conceicao_do_almeida_2024.py b/data_collection/gazette/spiders/ba/ba_conceicao_do_almeida_2024.py
index 97ef80718..016607bd3 100644
--- a/data_collection/gazette/spiders/ba/ba_conceicao_do_almeida_2024.py
+++ b/data_collection/gazette/spiders/ba/ba_conceicao_do_almeida_2024.py
@@ -6,8 +6,6 @@
 class BaConceicaoDoAlmeidaSpider(BaseBrTransparenciaSpider):
     name = "ba_conceicao_do_almeida_2024"
     TERRITORY_ID = "2908309"
-    allowed_domains = ["www.conceicaodoalmeida.ba.gov.br"]
+    allowed_domains = ["www.conceicaodoalmeida.ba.gov.br", "api.brtransparencia.com.br"]
     start_urls = ["https://www.conceicaodoalmeida.ba.gov.br/diario.html"]
     start_date = date(2019, 5, 3)
-    br_tranparencia_entity = "EF1662F7-9A2A-4FDB-ABAD-346211F97734"
-    br_tranparencia_code = "CODE_ENT_001"
diff --git a/data_collection/gazette/spiders/ba/ba_ibicoara.py b/data_collection/gazette/spiders/ba/ba_ibicoara.py
index 72a720564..cc3f9de38 100644
--- a/data_collection/gazette/spiders/ba/ba_ibicoara.py
+++ b/data_collection/gazette/spiders/ba/ba_ibicoara.py
@@ -6,8 +6,6 @@
 class BaIbicoaraSpider(BaseBrTransparenciaSpider):
     name = "ba_ibicoara"
     TERRITORY_ID = "2912202"
-    allowed_domains = ["www.camaraibicoara.ba.gov.br"]
+    allowed_domains = ["www.camaraibicoara.ba.gov.br", "api.brtransparencia.com.br"]
     start_urls = ["https://www.camaraibicoara.ba.gov.br/diario.html"]
     start_date = date(2020, 2, 1)
-    br_tranparencia_entity = "691bea32-9b9f-40f8-ab18-31e079080a1a"
-    br_tranparencia_code = "CODE_ENT_CM204"
diff --git a/data_collection/gazette/spiders/ba/ba_itaquara_2024.py b/data_collection/gazette/spiders/ba/ba_itaquara_2024.py
index 227d0961f..a1e743769 100644
--- a/data_collection/gazette/spiders/ba/ba_itaquara_2024.py
+++ b/data_collection/gazette/spiders/ba/ba_itaquara_2024.py
@@ -6,8 +6,6 @@
 class BaItaquaraSpider(BaseBrTransparenciaSpider):
     name = "ba_itaquara_2024"
     TERRITORY_ID = "2916708"
-    allowed_domains = ["www.itaquara.ba.gov.br"]
+    allowed_domains = ["www.itaquara.ba.gov.br", "api.brtransparencia.com.br"]
     start_urls = ["https://www.itaquara.ba.gov.br/diario.html"]
     start_date = date(2019, 1, 1)
-    br_tranparencia_entity = "1557447a-9381-44ad-9c0f-016868769479"
-    br_tranparencia_code = "CODE_ENT_PM003"
diff --git a/data_collection/gazette/spiders/ba/ba_porto_seguro.py b/data_collection/gazette/spiders/ba/ba_porto_seguro.py
index 59e060635..a313eae36 100644
--- a/data_collection/gazette/spiders/ba/ba_porto_seguro.py
+++ b/data_collection/gazette/spiders/ba/ba_porto_seguro.py
@@ -6,9 +6,10 @@
 class BaPortoSeguroSpider(BaseBrTransparenciaSpider):
     name = "ba_porto_seguro"
     TERRITORY_ID = "2925303"
-    allowed_domains = ["cmportoseguroba.brtransparencia.com.br"]
+    allowed_domains = [
+        "cmportoseguroba.brtransparencia.com.br",
+        "api.brtransparencia.com.br",
+    ]
     start_urls = ["https://cmportoseguroba.brtransparencia.com.br/diario.html"]
     start_date = date(2022, 12, 19)
br_tranparencia_entity = "4557886f-5713-4999-b2c5-c54d9ee11b44" - br_tranparencia_code = "COD_ENT_CM210" power = "legislative" diff --git a/data_collection/gazette/spiders/ba/ba_rio_real.py b/data_collection/gazette/spiders/ba/ba_rio_real.py index e1de3e210..429061459 100644 --- a/data_collection/gazette/spiders/ba/ba_rio_real.py +++ b/data_collection/gazette/spiders/ba/ba_rio_real.py @@ -6,9 +6,10 @@ class BaRioRealSpider(BaseBrTransparenciaSpider): name = "ba_rio_real" TERRITORY_ID = "2927002" - allowed_domains = ["cmriorealba.brtransparencia.com.br"] + allowed_domains = [ + "cmriorealba.brtransparencia.com.br", + "api.brtransparencia.com.br", + ] start_urls = ["https://http://cmriorealba.brtransparencia.com.br/diario.html"] start_date = date(2022, 12, 29) - br_tranparencia_entity = "45ae0af7-71a7-436e-9a8e-d41a68215062" - br_tranparencia_code = "COD_ENT_CM208" power = "legislative" diff --git a/data_collection/gazette/spiders/ba/ba_saude_2024.py b/data_collection/gazette/spiders/ba/ba_saude_2024.py index 663301153..6ced12942 100644 --- a/data_collection/gazette/spiders/ba/ba_saude_2024.py +++ b/data_collection/gazette/spiders/ba/ba_saude_2024.py @@ -6,9 +6,7 @@ class BaSaudeSpider(BaseBrTransparenciaSpider): name = "ba_saude_2024" TERRITORY_ID = "2929800" - allowed_domains = ["pmsaudeba.brtransparencia.com.br"] + allowed_domains = ["pmsaudeba.brtransparencia.com.br", "api.brtransparencia.com.br"] start_urls = ["https://pmsaudeba.brtransparencia.com.br/diario.html"] start_date = date(2024, 1, 31) - br_tranparencia_entity = "46366dbc-7780-433d-a689-f287561a8a7a" - br_tranparencia_code = "COD_ENT_PM005" power = "executive" diff --git a/data_collection/gazette/spiders/base/brtransparencia.py b/data_collection/gazette/spiders/base/brtransparencia.py index 026abcc5a..46b0ce33e 100644 --- a/data_collection/gazette/spiders/base/brtransparencia.py +++ b/data_collection/gazette/spiders/base/brtransparencia.py @@ -1,3 +1,4 @@ +import re from datetime import datetime from scrapy import Request @@ -10,17 +11,51 @@ class BaseBrTransparenciaSpider(BaseGazetteSpider): name = "" TERRITORY_ID = "" - allowed_domains = [""] + allowed_domains = [] start_urls = [""] - br_tranparencia_entity = "" - br_tranparencia_code = "" power = "executive" - def start_requests(self): - api_url = f"https://api.brtransparencia.com.br/api/diariooficial/filtro/{self.br_tranparencia_entity}/{self.br_tranparencia_code}/{self.start_date}/{self.end_date}/-1/-1" + def _extract_entity_code(self, response): + response_text = response.text + try: + intermediate_response_entity = re.search( + r'var entity="[\d\w\-]+"', + response_text, + re.IGNORECASE, + ).group() + response_entity = ( + re.search( + r'"[\d\w\-]+"', + intermediate_response_entity, + ) + .group() + .replace('"', "") + ) + except AttributeError as exc: + raise AttributeError("Was not possible to extract the entity code") from exc + try: + intermediate_response_code = re.search( + r'var code="[\d\w\-]+"', + response_text, + re.IGNORECASE, + ).group() + response_code = ( + re.search(r'"[\d\w\-]+"', intermediate_response_code) + .group() + .replace('"', "") + ) + except AttributeError as exc: + raise AttributeError("Was not possible to extract the code") from exc + api_url = f"https://api.brtransparencia.com.br/api/diariooficial/filtro/{response_entity}/{response_code}/{self.start_date}/{self.end_date}/-1/-1" yield Request(api_url) + def start_requests(self): + # getting the entity and code from inner JS Content file + url = self.start_urls[0].replace("/diario.html", 
"/js/content.js") + + yield Request(url, callback=self._extract_entity_code) + def parse(self, response): for entry in response.json(): edition_date = datetime.strptime( From 420ebd549595a725d2e68ccdf9ce97d7ed78c167 Mon Sep 17 00:00:00 2001 From: Tales Mota Date: Sun, 6 Oct 2024 16:55:15 -0300 Subject: [PATCH 4/4] refactor: extract code and entity from content.js --- .../gazette/spiders/base/brtransparencia.py | 32 +++++++------------ 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/data_collection/gazette/spiders/base/brtransparencia.py b/data_collection/gazette/spiders/base/brtransparencia.py index 46b0ce33e..2bc08aed1 100644 --- a/data_collection/gazette/spiders/base/brtransparencia.py +++ b/data_collection/gazette/spiders/base/brtransparencia.py @@ -15,34 +15,24 @@ class BaseBrTransparenciaSpider(BaseGazetteSpider): start_urls = [""] power = "executive" + def _extract_code_from_response_text(self, response_text, field="entity"): + return re.search( + rf'var {field}(\ )*=(\ )*["|\'](.+?)["|\']', + response_text, + re.IGNORECASE, + ).groups()[2] + def _extract_entity_code(self, response): response_text = response.text try: - intermediate_response_entity = re.search( - r'var entity="[\d\w\-]+"', - response_text, - re.IGNORECASE, - ).group() - response_entity = ( - re.search( - r'"[\d\w\-]+"', - intermediate_response_entity, - ) - .group() - .replace('"', "") + response_entity = self._extract_code_from_response_text( + response_text, field="entity" ) except AttributeError as exc: raise AttributeError("Was not possible to extract the entity code") from exc try: - intermediate_response_code = re.search( - r'var code="[\d\w\-]+"', - response_text, - re.IGNORECASE, - ).group() - response_code = ( - re.search(r'"[\d\w\-]+"', intermediate_response_code) - .group() - .replace('"', "") + response_code = self._extract_code_from_response_text( + response_text, field="code" ) except AttributeError as exc: raise AttributeError("Was not possible to extract the code") from exc