Listing and downloading files from Drive #1

Open: wants to merge 4 commits into main
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
credentials.json
venv
__pycache__
lista_planilhas_baixadas.csv
.env
20 changes: 20 additions & 0 deletions Dockerfile
@@ -0,0 +1,20 @@
# set base image (host OS)
FROM python:3.8-slim-buster

# set the working directory in the container
WORKDIR /code

# copy the dependencies file to the working directory
COPY requirements.txt .

# install dependencies
RUN pip install -r requirements.txt

# copy the application source into the working directory
COPY src/ .

# copy the service account credentials into the working directory
COPY credentials.json .

# command to run on container start
CMD [ "python", "./main.py" ]
6 changes: 6 additions & 0 deletions requirements.txt
@@ -0,0 +1,6 @@
google-auth==2.35.0
google-auth-oauthlib==1.2.1
google-auth-httplib2==0.2.0
google-api-python-client==2.146.0
pandas>=2.0.3
python-dotenv>=0.20.0
88 changes: 88 additions & 0 deletions src/crawler.py
@@ -0,0 +1,88 @@
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
import io
import pandas as pd
import pathlib
import sys

STATUS_DATA_UNAVAILABLE = 4

# Path to the service account JSON file
SERVICE_ACCOUNT_FILE = "credentials.json"

Where is this file going to live?

Asking because, instead of a local file, we could turn it into a secret passed in via an environment variable. If the API requires an actual file, we can persist the variable's contents to one.

That keeps credential files from being stored on local machines.
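
A minimal sketch of this suggestion, assuming the secret is exposed as a GOOGLE_CREDENTIALS environment variable (hypothetical name) holding the raw JSON of the service account key; from_service_account_info accepts the parsed dict directly, so nothing needs to be written to disk:

import json
import os

from google.oauth2 import service_account

SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]

# GOOGLE_CREDENTIALS is a hypothetical variable name: it holds the raw JSON
# content of the service account key, injected as a secret at deploy time.
info = json.loads(os.environ["GOOGLE_CREDENTIALS"])

# Build the credentials from the parsed dict; no credentials file ever
# touches the local disk.
creds = service_account.Credentials.from_service_account_info(info, scopes=SCOPES)

If the API ever does require a real file, the same variable can be dumped to a temporary file at startup instead.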

I talked to @joellensilva and she suggested creating an authorization/token on GitHub itself. I can't say which is the best way to use these credentials, but I stored the credentials.json and .env files in the TB vault in case a backup is ever needed; that data will be kept safe and can be recovered at any time.


# Scopes required to access Google Drive
SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]

# Authenticate using the service account credentials
creds = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE, scopes=SCOPES
)


def download_list(file_id):
    # Connect to the Google Drive API
    service = build("drive", "v3", credentials=creds)

    file_name = "lista_planilhas_baixadas.csv"

    request = service.files().get_media(fileId=file_id, supportsAllDrives=True)

    # Download in chunks, closing the file handle when finished
    with io.FileIO(file_name, "wb") as fh:
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()


def consult_list(orgao, mes, ano):
    sheets_list = pd.read_csv("lista_planilhas_baixadas.csv")
    filter_list = sheets_list[
        (sheets_list.orgao == orgao)
        & (sheets_list.mes == mes)
        & (sheets_list.ano == ano)
    ]

    # If no files exist for this agency/month/year, exit with status 4
    if filter_list.empty:
        sys.stderr.write(f"Não existem planilhas para {orgao}/{mes}/{ano}.\n")
        sys.exit(STATUS_DATA_UNAVAILABLE)

    return filter_list


def download_files(output_path, filter_list):
    # Timestamp at which the first file for this agency/month/year was stored
    timestamp = filter_list.data.min()
    ts_files = [timestamp]

    # Create the output directory if it does not exist
    pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

    # Connect to the Google Drive API
    service = build("drive", "v3", credentials=creds)

    for row in filter_list.to_numpy():
        # Column 4 holds the file ID, column 3 the file name (see the CSV header)
        file_id = row[4]

        # Local path to save the file under
        file_name = output_path + "/" + row[3]

        ts_files.append(file_name)

        # Request the file from the Google Drive API
        request = service.files().get_media(fileId=file_id, supportsAllDrives=True)

        # Download the file in chunks
        with io.FileIO(file_name, "wb") as fh:
            downloader = MediaIoBaseDownload(fh, request)
            done = False
            while not done:
                status, done = downloader.next_chunk()

    return ts_files
147 changes: 147 additions & 0 deletions src/list_drive_files.py
@@ -0,0 +1,147 @@
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
import os
import csv
import sys
from dotenv import load_dotenv

load_dotenv()


# ID of the folder containing all the spreadsheets to be listed
if "DATA_FOLDER_ID" in os.environ:
    DATA_FOLDER_ID = os.environ["DATA_FOLDER_ID"]
else:
    sys.stderr.write("Invalid arguments, missing parameter: 'DATA_FOLDER_ID'.\n")
    sys.exit(1)

# ID of the list file to be updated on Drive
if "FILE_ID" in os.environ:
    FILE_ID = os.environ["FILE_ID"]
else:
    sys.stderr.write("Invalid arguments, missing parameter: 'FILE_ID'.\n")
    sys.exit(1)

# Path to the service account JSON file
SERVICE_ACCOUNT_FILE = "credentials.json"

# Scopes required to access Google Drive
SCOPES = ["https://www.googleapis.com/auth/drive"]

# Authenticate using the service account credentials
creds = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE, scopes=SCOPES
)

# Connect to the Google Drive API
service = build("drive", "v3", credentials=creds)


def list_folders():
    folders = []
    page_token = None

    # List the IDs of the (per-agency) folders inside the specified folder,
    # paging through the results
    while True:
        results = (
            service.files()
            .list(
                q=f"'{DATA_FOLDER_ID}' in parents",
                pageSize=100,
                fields="nextPageToken, files(id)",
                supportsAllDrives=True,
                includeItemsFromAllDrives=True,
                pageToken=page_token,
            )
            .execute()
        )

        folders.extend(results.get("files", []))
        page_token = results.get("nextPageToken", None)

        if not page_token:
            break

    if not folders:
        sys.stderr.write("Pasta não encontrada.\n")
        sys.exit(1)

    return folders


def list_files(folders):
    # Collect every file inside each agency folder, paging through the results
    files = []
    for folder in folders:
        page_token = None

        while True:
            results = (
                service.files()
                .list(
                    q=f"'{folder['id']}' in parents",
                    pageSize=100,
                    fields="nextPageToken, files(id, name, createdTime)",
                    supportsAllDrives=True,
                    includeItemsFromAllDrives=True,
                    pageToken=page_token,
                )
                .execute()
            )

            files.extend(results.get("files", []))
            page_token = results.get("nextPageToken", None)

            if not page_token:
                break

    return files


def create_csv(files):
    list_path = "lista_planilhas_baixadas.csv"
    with open(list_path, mode="w", newline="", encoding="utf-8") as csv_list:
        csv_writer = csv.writer(csv_list)

        # Write the header
        csv_writer.writerow(["orgao", "mes", "ano", "arquivo", "id_arquivo", "data"])

        for file in files:
            # @old is the folder created to hold "stale" spreadsheets,
            # i.e. ones that were downloaded but turned out broken/wrong
            # and were replaced by new ones
            if file["name"] != "@old":
                # Strip the extension
                filename = os.path.splitext(file["name"])[0]

                # Split on the '-' delimiter; filenames are expected to follow
                # the pattern <orgao>-<...>-<mes>-<ano>
                parts = filename.split("-")

                orgao = parts[0].lower()
                mes = parts[2]
                ano = parts[3]

                csv_writer.writerow(
                    [orgao, mes, ano, file["name"], file["id"], file["createdTime"]]
                )

    return list_path


def upload_list(list_path):
    # Upload the CSV back to Drive, replacing the existing list file
    media = MediaFileUpload(list_path, mimetype="text/csv")
    service.files().update(
        fileId=FILE_ID, media_body=media, supportsAllDrives=True
    ).execute()


if __name__ == "__main__":
    folders = list_folders()
    files = list_files(folders)
    list_path = create_csv(files)
    upload_list(list_path)
46 changes: 46 additions & 0 deletions src/main.py
@@ -0,0 +1,46 @@
import sys
import os
import crawler


if "COURT" in os.environ:
court = os.environ["COURT"].casefold()
else:
sys.stderr.write("Invalid arguments, missing parameter: 'COURT'.\n")
os._exit(1)

if "YEAR" in os.environ:
year = int(os.environ["YEAR"])
else:
sys.stderr.write("Invalid arguments, missing parameter: 'YEAR'.\n")
os._exit(1)

if "MONTH" in os.environ:
month = int(os.environ["MONTH"])
else:
sys.stderr.write("Invalid arguments, missing parameter: 'MONTH'.\n")
os._exit(1)

if "OUTPUT_FOLDER" in os.environ:
output_path = os.environ["OUTPUT_FOLDER"]
else:
output_path = "./output"

# ID da lista no drive, referente às planilhas baixadas manualmente
if "FILE_ID" in os.environ:
file_id = os.environ["FILE_ID"]
else:
sys.stderr.write("Invalid arguments, missing parameter: 'FILE_ID'.\n")
os._exit(1)

# Baixamos a lista de arquivos
crawler.download_list(file_id)

# Consultamos se os arquivos existem
result = crawler.consult_list(court, month, year)

# Baixamos os arquivos
stdout = crawler.download_files(output_path, result)

# Retornamos o timestamp e o caminho dos arquivos
print('\n'.join(stdout))