From 08c601bcd528d123f8650793c7020e1b3938d513 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Guilherme=20Vanz?= Date: Thu, 21 Mar 2024 18:44:53 -0300 Subject: [PATCH 1/2] =?UTF-8?q?fix:=20atualiza=20vers=C3=A3o=20do=20Apache?= =?UTF-8?q?=20Tika=20em=20uso.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ao tentar rodar o Apache Tika versão 1.24.1, atualmente em uso no container para rodar o servidor, o comando falha. Parece que o binário está corrompido. Por isso, esse commit atualiza o Apache Tika em uso para a versão 2.9.1. Signed-off-by: José Guilherme Vanz --- scripts/Dockerfile_apache_tika | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Dockerfile_apache_tika b/scripts/Dockerfile_apache_tika index 150e67a..4084622 100644 --- a/scripts/Dockerfile_apache_tika +++ b/scripts/Dockerfile_apache_tika @@ -6,7 +6,7 @@ RUN adduser --system gazette && \ apt-get clean # install Apache Tika -RUN curl -o /tika-server.jar http://archive.apache.org/dist/tika/tika-server-1.24.1.jar && \ +RUN curl -o /tika-server.jar https://dlcdn.apache.org/tika/2.9.1/tika-server-standard-2.9.1.jar && \ chmod 755 /tika-server.jar USER gazette From bbb703879beae54c19b2c76770403e06f038d256 Mon Sep 17 00:00:00 2001 From: Giulio Date: Sat, 14 Sep 2024 10:59:44 -0300 Subject: [PATCH 2/2] Atualiza Apache Tika e corrige build corrompido MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Versões mais novas do Tika exigem o "Accept: text/plain" para retornar apenas o conteúdo textual, pois o padrão é retornar HTML. 
--- data_extraction/text_extraction.py | 5 ++++- scripts/Dockerfile_apache_tika | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/data_extraction/text_extraction.py b/data_extraction/text_extraction.py index 8595aaf..723ca24 100644 --- a/data_extraction/text_extraction.py +++ b/data_extraction/text_extraction.py @@ -25,7 +25,10 @@ def _try_extract_text(self, filepath: str) -> str: if self.is_txt(filepath): return self._return_file_content(filepath) with open(filepath, "rb") as file: - headers = {"Content-Type": self._get_file_type(filepath)} + headers = { + "Content-Type": self._get_file_type(filepath), + "Accept": "text/plain", + } response = requests.put(f"{self._url}/tika", data=file, headers=headers) response.encoding = "UTF-8" return response.text diff --git a/scripts/Dockerfile_apache_tika b/scripts/Dockerfile_apache_tika index 4084622..f0da38b 100644 --- a/scripts/Dockerfile_apache_tika +++ b/scripts/Dockerfile_apache_tika @@ -6,7 +6,7 @@ RUN adduser --system gazette && \ apt-get clean # install Apache Tika -RUN curl -o /tika-server.jar https://dlcdn.apache.org/tika/2.9.1/tika-server-standard-2.9.1.jar && \ +RUN curl -o /tika-server.jar https://dlcdn.apache.org/tika/2.9.2/tika-server-standard-2.9.2.jar && \ chmod 755 /tika-server.jar USER gazette