diff --git a/data_collection/gazette/spiders/ba/ba_candeias.py b/data_collection/gazette/spiders/ba/ba_candeias.py
new file mode 100644
index 000000000..373a3e635
--- /dev/null
+++ b/data_collection/gazette/spiders/ba/ba_candeias.py
@@ -0,0 +1,14 @@
+from datetime import date
+
+from gazette.spiders.base.brtransparencia import BaseBrTransparenciaSpider
+
+
+class BaCandeiasSpider(BaseBrTransparenciaSpider):
+    name = "ba_candeias"
+    TERRITORY_ID = "2906501"
+    # NOTE(review): these URLs point at Ibicoara's council site — looks like a
+    # copy-paste from ba_ibicoara.py; confirm Candeias' own domain before merge.
+    allowed_domains = ["www.camaraibicoara.ba.gov.br", "api.brtransparencia.com.br"]
+    start_urls = ["https://www.camaraibicoara.ba.gov.br/diario.html"]
+    start_date = date(2022, 12, 29)
+    power = "legislative"
diff --git a/data_collection/gazette/spiders/ba/ba_conceicao_do_almeida_2024.py b/data_collection/gazette/spiders/ba/ba_conceicao_do_almeida_2024.py
new file mode 100644
index 000000000..016607bd3
--- /dev/null
+++ b/data_collection/gazette/spiders/ba/ba_conceicao_do_almeida_2024.py
@@ -0,0 +1,11 @@
+from datetime import date
+
+from gazette.spiders.base.brtransparencia import BaseBrTransparenciaSpider
+
+
+class BaConceicaoDoAlmeidaSpider(BaseBrTransparenciaSpider):
+    name = "ba_conceicao_do_almeida_2024"
+    TERRITORY_ID = "2908309"
+    allowed_domains = ["www.conceicaodoalmeida.ba.gov.br", "api.brtransparencia.com.br"]
+    start_urls = ["https://www.conceicaodoalmeida.ba.gov.br/diario.html"]
+    start_date = date(2019, 5, 3)
diff --git a/data_collection/gazette/spiders/ba/ba_ibicoara.py b/data_collection/gazette/spiders/ba/ba_ibicoara.py
new file mode 100644
index 000000000..cc3f9de38
--- /dev/null
+++ b/data_collection/gazette/spiders/ba/ba_ibicoara.py
@@ -0,0 +1,11 @@
+from datetime import date
+
+from gazette.spiders.base.brtransparencia import BaseBrTransparenciaSpider
+
+
+class BaIbicoaraSpider(BaseBrTransparenciaSpider):
+    name = "ba_ibicoara"
+    TERRITORY_ID = "2912202"
+    allowed_domains = ["www.camaraibicoara.ba.gov.br", "api.brtransparencia.com.br"]
+    start_urls = ["https://www.camaraibicoara.ba.gov.br/diario.html"]
+    start_date = date(2020, 2, 1)
diff --git a/data_collection/gazette/spiders/ba/ba_itaquara_2024.py b/data_collection/gazette/spiders/ba/ba_itaquara_2024.py
new file mode 100644
index 000000000..a1e743769
--- /dev/null
+++ b/data_collection/gazette/spiders/ba/ba_itaquara_2024.py
@@ -0,0 +1,11 @@
+from datetime import date
+
+from gazette.spiders.base.brtransparencia import BaseBrTransparenciaSpider
+
+
+class BaItaquaraSpider(BaseBrTransparenciaSpider):
+    name = "ba_itaquara_2024"
+    TERRITORY_ID = "2916708"
+    allowed_domains = ["www.itaquara.ba.gov.br", "api.brtransparencia.com.br"]
+    start_urls = ["https://www.itaquara.ba.gov.br/diario.html"]
+    start_date = date(2019, 1, 1)
diff --git a/data_collection/gazette/spiders/ba/ba_porto_seguro.py b/data_collection/gazette/spiders/ba/ba_porto_seguro.py
new file mode 100644
index 000000000..a313eae36
--- /dev/null
+++ b/data_collection/gazette/spiders/ba/ba_porto_seguro.py
@@ -0,0 +1,15 @@
+from datetime import date
+
+from gazette.spiders.base.brtransparencia import BaseBrTransparenciaSpider
+
+
+class BaPortoSeguroSpider(BaseBrTransparenciaSpider):
+    name = "ba_porto_seguro"
+    TERRITORY_ID = "2925303"
+    allowed_domains = [
+        "cmportoseguroba.brtransparencia.com.br",
+        "api.brtransparencia.com.br",
+    ]
+    start_urls = ["https://cmportoseguroba.brtransparencia.com.br/diario.html"]
+    start_date = date(2022, 12, 19)
+    power = "legislative"
diff --git a/data_collection/gazette/spiders/ba/ba_rio_real.py b/data_collection/gazette/spiders/ba/ba_rio_real.py
new file mode 100644
index 000000000..429061459
--- /dev/null
+++ b/data_collection/gazette/spiders/ba/ba_rio_real.py
@@ -0,0 +1,15 @@
+from datetime import date
+
+from gazette.spiders.base.brtransparencia import BaseBrTransparenciaSpider
+
+
+class BaRioRealSpider(BaseBrTransparenciaSpider):
+    name = "ba_rio_real"
+    TERRITORY_ID = "2927002"
+    allowed_domains = [
+        "cmriorealba.brtransparencia.com.br",
+        "api.brtransparencia.com.br",
+    ]
+    start_urls = ["https://cmriorealba.brtransparencia.com.br/diario.html"]
+    start_date = date(2022, 12, 29)
+    power = "legislative"
diff --git a/data_collection/gazette/spiders/ba/ba_saude_2024.py b/data_collection/gazette/spiders/ba/ba_saude_2024.py
new file mode 100644
index 000000000..6ced12942
--- /dev/null
+++ b/data_collection/gazette/spiders/ba/ba_saude_2024.py
@@ -0,0 +1,12 @@
+from datetime import date
+
+from gazette.spiders.base.brtransparencia import BaseBrTransparenciaSpider
+
+
+class BaSaudeSpider(BaseBrTransparenciaSpider):
+    name = "ba_saude_2024"
+    TERRITORY_ID = "2929800"
+    allowed_domains = ["pmsaudeba.brtransparencia.com.br", "api.brtransparencia.com.br"]
+    start_urls = ["https://pmsaudeba.brtransparencia.com.br/diario.html"]
+    start_date = date(2024, 1, 31)
+    power = "executive"
diff --git a/data_collection/gazette/spiders/base/brtransparencia.py b/data_collection/gazette/spiders/base/brtransparencia.py
new file mode 100644
index 000000000..2bc08aed1
--- /dev/null
+++ b/data_collection/gazette/spiders/base/brtransparencia.py
@@ -0,0 +1,75 @@
+import re
+from datetime import datetime
+
+from scrapy import Request
+from scrapy.selector import Selector
+
+from gazette.items import Gazette
+from gazette.spiders.base import BaseGazetteSpider
+
+
+class BaseBrTransparenciaSpider(BaseGazetteSpider):
+    """Base spider for municipalities publishing gazettes on BR Transparência.
+
+    Subclasses must set name, TERRITORY_ID, allowed_domains, start_urls
+    (the /diario.html page) and start_date; power defaults to "executive".
+    """
+
+    name = ""
+    TERRITORY_ID = ""
+    allowed_domains = []
+    start_urls = [""]
+    power = "executive"
+
+    def _extract_code_from_response_text(self, response_text, field="entity"):
+        # Captures the quoted value of `var <field> = "..."` in the JS content.
+        return re.search(
+            rf'var {field} *= *["\'](.+?)["\']',
+            response_text,
+            re.IGNORECASE,
+        ).group(1)
+
+    def _extract_entity_code(self, response):
+        response_text = response.text
+        try:
+            response_entity = self._extract_code_from_response_text(
+                response_text, field="entity"
+            )
+        except AttributeError as exc:
+            raise AttributeError("Was not possible to extract the entity code") from exc
+        try:
+            response_code = self._extract_code_from_response_text(
+                response_text, field="code"
+            )
+        except AttributeError as exc:
+            raise AttributeError("Was not possible to extract the code") from exc
+
+        api_url = f"https://api.brtransparencia.com.br/api/diariooficial/filtro/{response_entity}/{response_code}/{self.start_date}/{self.end_date}/-1/-1"
+        yield Request(api_url)
+
+    def start_requests(self):
+        # getting the entity and code from inner JS Content file
+        url = self.start_urls[0].replace("/diario.html", "/js/content.js")
+
+        yield Request(url, callback=self._extract_entity_code)
+
+    def parse(self, response):
+        for entry in response.json():
+            edition_date = datetime.strptime(
+                entry["dat_publicacao_dio"], "%Y-%m-%dT%H:%M:%S"
+            ).date()
+            extra_edition = entry["des_extra_dio"] is not None
+            edition_number = int(entry["num_diario_oficial_dio"])
+            gazettes = Selector(text=entry["des_resumo_dio"]).css("a")
+            urls = []
+            for item in gazettes:
+                link = item.css("a::attr(href)").get()
+                urls.append(link)
+
+            yield Gazette(
+                edition_number=edition_number,
+                date=edition_date,
+                file_urls=urls,
+                is_extra_edition=extra_edition,
+                power=self.power,
+            )