From 1cb96a7ae23d24265dd5e6c6e3b9a8125f2a0f53 Mon Sep 17 00:00:00 2001
From: kapiwow <gavrikovantonkapi@gmail.com>
Date: Tue, 5 Nov 2024 12:18:29 +0100
Subject: [PATCH 01/11] feat: add parse_bytes, parse_url

---
 README.md                                     | 36 +++++--
 bindings/extractous-python/README.md          | 13 ++-
 bindings/extractous-python/src/extractor.rs   | 33 +++++++
 .../tests/test_extract_url.py                 | 10 ++
 bindings/extractous-python/tests/test_pdf.py  | 22 +++--
 bindings/extractous-python/tests/utils.py     | 12 ++-
 extractous-core/README.md                     | 30 +++---
 extractous-core/examples/extract_to_stream.rs |  9 ++
 extractous-core/src/extractor.rs              | 79 +++++++++++++---
 extractous-core/src/tika/parse.rs             | 94 ++++++++++++++++---
 extractous-core/tika-native/build.gradle      |  5 +-
 11 files changed, 283 insertions(+), 60 deletions(-)
 create mode 100644 bindings/extractous-python/tests/test_extract_url.py
diff --git a/README.md b/README.md
index f439854..297db83 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@
 <div align="center">
 
 _Extractous offers a fast and efficient solution for extracting content and metadata from various documents types such as PDF, Word, HTML, and [many other formats](#supported-file-formats).
-Our goal is to deliver a fast and efficient comprehensive solution in Rust with bindings for many programming 
+Our goal is to deliver a fast and efficient comprehensive solution in Rust with bindings for many programming
 languages._
 
 </div>
@@ -27,7 +27,7 @@ languages._
 ---
 
 **Demo**: showing that [Extractous 🚀](https://github.com/yobix-ai/extractous) is **25x faster** than the popular
-[unstructured-io](https://github.com/Unstructured-IO/unstructured) library ($65m in funding and 8.5k GitHub stars). 
+[unstructured-io](https://github.com/Unstructured-IO/unstructured) library ($65m in funding and 8.5k GitHub stars).
 For complete benchmarking details please consult our [benchmarking repository](https://github.com/yobix-ai/extractous-benchmarks)
 
 ![unstructured_vs_extractous](https://github.com/yobix-ai/extractous-benchmarks/raw/main/docs/extractous_vs_unstructured.gif)
@@ -55,7 +55,7 @@ With Extractous, the need for external services or APIs is eliminated, making da
 * High-performance unstructured data extraction optimized for speed and low memory usage.
 * Clear and simple API for extracting text and metadata content.
 * Automatically identifies document types and extracts content accordingly
-* Supports [many file formats](#supported-file-formats) (most formats supported by Apache Tika). 
+* Supports [many file formats](#supported-file-formats) (most formats supported by Apache Tika).
 * Extracts text from images and scanned documents with OCR through [tesseract-ocr](https://github.com/tesseract-ocr/tesseract).
 * Core engine written in Rust with bindings for [Python](https://pypi.org/project/extractous/) and upcoming support for JavaScript/TypeScript.
 * Detailed documentation and examples to help you get started quickly and efficiently.
@@ -77,13 +77,20 @@ extractor.set_extract_string_max_length(1000)
 result = extractor.extract_file_to_string("README.md")
 print(result)
 ```
-* Extracting a file to a buffered stream:
+* Extracting a file(URL / bytearray) to a buffered stream:
 
 ```python
 from extractous import Extractor
 
 extractor = Extractor()
+# for file
 reader = extractor.extract_file("tests/quarkus.pdf")
+# for url
+# reader = extractor.extract_url("https://www.google.com")
+# for bytearray
+# with open("tests/quarkus.pdf", "rb") as file:
+#     buffer = bytearray(file.read())
+# reader = extractor.extract_bytes(buffer)
 
 result = ""
 buffer = reader.read(4096)
@@ -122,9 +129,10 @@ fn main() {
 }
 ```
 
-* Extract a content of a file to a `StreamReader` and perform buffered reading
+* Extract a content of a file(URL/ bytes) to a `StreamReader` and perform buffered reading
 ```rust
-use std::io::Read;
+use std::io::{BufReader, Read};
+// use std::fs::File; use for bytes
 use extractous::Extractor;
 
 fn main() {
@@ -135,17 +143,25 @@ fn main() {
     // Extract the provided file content to a string
     let extractor = Extractor::new();
     let stream = extractor.extract_file(file_path).unwrap();
+    // Extract url
+    // let stream = extractor.extract_url("https://www.google.com/").unwrap();
+    // Extract bytes
+    // let mut file = File::open(file_path)?;
+    // let mut buffer = Vec::new();
+    // file.read_to_end(&mut buffer)?;
+    // let stream= extractor.extract_bytes(&file_bytes);
 
     // Because stream implements std::io::Read trait we can perform buffered reading
     // For example we can use it to create a BufReader
+    let mut reader = BufReader::new(stream);
     let mut buffer = Vec::new();
-    stream.read_to_end(&mut buffer).unwrap();
+    reader.read_to_end(&mut buffer).unwrap();
 
     println!("{}", String::from_utf8(buffer).unwrap())
 }
 ```
 
-* Extract content of PDF with OCR. 
+* Extract content of PDF with OCR.
 
 You need to have Tesseract installed with the language pack. For example on debian `sudo apt install tesseract-ocr tesseract-ocr-deu`
 
@@ -154,7 +170,7 @@ use extractous::Extractor;
 
 fn main() {
   let file_path = "../test_files/documents/deu-ocr.pdf";
-  
+
     let extractor = Extractor::new()
           .set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
           .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY));
@@ -204,4 +220,4 @@ fn main() {
 Contributions are welcome! Please open an issue or submit a pull request if you have any improvements or new features to propose.
 
 ## 🕮 License
-This project is licensed under the Apache License 2.0. See the LICENSE file for details.
\ No newline at end of file
+This project is licensed under the Apache License 2.0. See the LICENSE file for details.
diff --git a/bindings/extractous-python/README.md b/bindings/extractous-python/README.md
index 9aa83ee..998bbae 100644
--- a/bindings/extractous-python/README.md
+++ b/bindings/extractous-python/README.md
@@ -1,6 +1,6 @@
 # Extractous Python Bindings
 
-This project provides Python bindings for the Extractous library, allowing you to use extractous functionality in 
+This project provides Python bindings for the Extractous library, allowing you to use extractous functionality in
 your Python applications.
 
 ## Installation
@@ -25,13 +25,20 @@ result = extractor.extract_file_to_string("README.md")
 print(result)
 ```
 
-Extracting a file to a buffered stream:
+Extracting a file(URL / bytearray) to a buffered stream:
 
 ```python
 from extractous import Extractor
 
 extractor = Extractor()
+# for file
 reader = extractor.extract_file("tests/quarkus.pdf")
+# for url
+# reader = extractor.extract_url("https://www.google.com")
+# for bytearray
+# with open("tests/quarkus.pdf", "rb") as file:
+#     buffer = bytearray(file.read())
+# reader = extractor.extract_bytes(buffer)
 
 result = ""
 buffer = reader.read(4096)
@@ -51,4 +58,4 @@ extractor = Extractor().set_ocr_config(TesseractOcrConfig().set_language("deu"))
 result = extractor.extract_file_to_string("../../test_files/documents/eng-ocr.pdf")
 
 print(result)
-```
\ No newline at end of file
+```
diff --git a/bindings/extractous-python/src/extractor.rs b/bindings/extractous-python/src/extractor.rs
index 7376cca..4c6f5ba 100644
--- a/bindings/extractous-python/src/extractor.rs
+++ b/bindings/extractous-python/src/extractor.rs
@@ -147,6 +147,39 @@ impl Extractor {
             .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))
     }
 
+    /// Extracts text from a bytearray. Returns a stream of the extracted text
+    /// the stream is decoded using the extractor's `encoding`
+    pub fn extract_bytes(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult<StreamReader> {
+        let slice = buffer.to_vec();
+        let reader = self
+            .0
+            .extract_bytes(&slice)
+            .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
+
+        // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
+        Ok(StreamReader {
+            reader,
+            buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
+            py_bytes: None,
+        })
+    }
+
+    /// Extracts text from a url. Returns a string that is of maximum length
+    /// of the extractor's `extract_string_max_length`
+    pub fn extract_url(&self, url: &str) -> PyResult<StreamReader> {
+        let reader = self
+            .0
+            .extract_url(&url)
+            .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
+
+        // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
+        Ok(StreamReader {
+            reader,
+            buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
+            py_bytes: None,
+        })
+    }
+
     fn __repr__(&self) -> String {
         format!("{:?}", self.0)
     }
diff --git a/bindings/extractous-python/tests/test_extract_url.py b/bindings/extractous-python/tests/test_extract_url.py
new file mode 100644
index 0000000..b6f4158
--- /dev/null
+++ b/bindings/extractous-python/tests/test_extract_url.py
@@ -0,0 +1,10 @@
+from extractous import Extractor
+from utils import read_to_string
+
+def test_extract_url():
+    extractor = Extractor()
+
+    reader = extractor.extract_url("https://www.google.com")
+    result = read_to_string(reader)
+
+    assert "Google" in result
diff --git a/bindings/extractous-python/tests/test_pdf.py b/bindings/extractous-python/tests/test_pdf.py
index 5e85f3c..a14d9ed 100644
--- a/bindings/extractous-python/tests/test_pdf.py
+++ b/bindings/extractous-python/tests/test_pdf.py
@@ -1,4 +1,5 @@
 from extractous import Extractor
+from utils import read_to_string
 
 
 def expected_result():
@@ -12,16 +13,23 @@ def test_extract_file_to_string():
     #print(result)
     assert result == expected_result()
 
-
 def test_extract_file():
     extractor = Extractor()
     reader = extractor.extract_file("tests/quarkus.pdf")
 
-    result = ""
-    b = reader.read(4096)
-    while len(b) > 0:
-        result += b.decode("utf-8")
-        b = reader.read(4096)
+    result = read_to_string(reader)
 
     #print(result)
-    assert result == expected_result()
\ No newline at end of file
+    assert result == expected_result()
+
+def test_extract_bytes():
+    extractor = Extractor()
+
+    with open("tests/quarkus.pdf", "rb") as file:
+        buffer = bytearray(file.read())
+    reader = extractor.extract_bytes(buffer)
+
+    result = read_to_string(reader)
+
+    #print(result)
+    assert result == expected_result()
diff --git a/bindings/extractous-python/tests/utils.py b/bindings/extractous-python/tests/utils.py
index 30c3944..fb0e28b 100644
--- a/bindings/extractous-python/tests/utils.py
+++ b/bindings/extractous-python/tests/utils.py
@@ -10,4 +10,14 @@ def cosine_similarity(text1, text2):
 
     # Calculate cosine similarity between the two vectors
     cos_sim = cosine_sim(vectors)
-    return cos_sim[0][1]
\ No newline at end of file
+    return cos_sim[0][1]
+
+def read_to_string(reader):
+    """Read from stream to string."""
+    result = ""
+    b = reader.read(4096)
+    while len(b) > 0:
+        result += b.decode("utf-8")
+        b = reader.read(4096)
+    return result
+
diff --git a/extractous-core/README.md b/extractous-core/README.md
index 4e04bbb..0328db6 100644
--- a/extractous-core/README.md
+++ b/extractous-core/README.md
@@ -49,8 +49,9 @@ fn main() {
 }
 ```
 
-* Extract a content of a file to a `StreamReader` and perform buffered reading
+* Extract a content of a file(URL/ bytes) to a `StreamReader` and perform buffered reading
 ```rust
+// use std::fs::File; use for bytes
 use std::io::{BufReader, Read};
 use extractous::Extractor;
 
@@ -62,6 +63,13 @@ fn main() {
     // Extract the provided file content to a string
     let extractor = Extractor::new();
     let stream = extractor.extract_file(file_path).unwrap();
+    // Extract url
+    // let stream = extractor.extract_url("https://www.google.com/").unwrap();
+    // Extract bytes
+    // let mut file = File::open(file_path)?;
+    // let mut buffer = Vec::new();
+    // file.read_to_end(&mut buffer)?;
+    // let stream= extractor.extract_bytes(&file_bytes);
 
     // Because stream implements std::io::Read trait we can perform buffered reading
     // For example we can use it to create a BufReader
@@ -80,7 +88,7 @@ use extractous::Extractor;
 
 fn main() {
   let file_path = "../test_files/documents/deu-ocr.pdf";
-  
+
     let extractor = Extractor::new()
           .set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
           .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY));
@@ -94,11 +102,11 @@ fn main() {
 ## Building
 
 ### Requirements
-* Extractous uses [Apache Tika](https://tika.apache.org/) for file formats that are not natively supported in Rust. 
-  However, to achieve one of Extractous goals, which is speed and efficiency, we do not set up any Tika as a servers or 
-  run any Java code. We instead, compile [Apache Tika](https://tika.apache.org/) as native shared libraries and use 
-  them on our Rust core as ffi. [GraalVm](https://www.graalvm.org/) is required to build Tika as native libs. 
-* The provided build script already takes care of installing the required GraalVM JDK. However, if you want to use a 
+* Extractous uses [Apache Tika](https://tika.apache.org/) for file formats that are not natively supported in Rust.
+  However, to achieve one of Extractous goals, which is speed and efficiency, we do not set up any Tika as a servers or
+  run any Java code. We instead, compile [Apache Tika](https://tika.apache.org/) as native shared libraries and use
+  them on our Rust core as ffi. [GraalVm](https://www.graalvm.org/) is required to build Tika as native libs.
+* The provided build script already takes care of installing the required GraalVM JDK. However, if you want to use a
   specific local version, you can do so by setting the GRAALVM_HOME environment variable
 * We recommend using [sdkman](https://sdkman.io/install) to install GraalVM JDKs
 * `sdk install java 22.0.1-graalce`
@@ -112,16 +120,16 @@ OpenJDK 64-Bit Server VM Liberica-NIK-24.0.1-1 (build 22.0.1+10, mixed mode, sha
 * On macOS the official GraalVM JDKs fail to work with code that use java awt. On macOS, we recommend using
   Bellsoft Liberica NIK
 * `sdk install java 24.0.1.r22-nik`
-* Extractous supports OCR through [tesseract](https://github.com/tesseract-ocr/tesseract), make sure tesseract is 
+* Extractous supports OCR through [tesseract](https://github.com/tesseract-ocr/tesseract), make sure tesseract is
 installed on your system because some of the OCR tests will fail if no tesseract is found.
 * `sudo apt install tesseract-ocr`
-* Install any language extensions you want. for example to install German and Arabic: 
+* Install any language extensions you want. for example to install German and Arabic:
 * `sudo apt install tesseract-ocr-deu tesseract-ocr-ara`
 
 ### Building Extractous
-* To build Extractous, just run: 
+* To build Extractous, just run:
 * `cargo build`
 
 ### Running Tests
 * To run tests, just run:
-* `cargo test`
\ No newline at end of file
+* `cargo test`
diff --git a/extractous-core/examples/extract_to_stream.rs b/extractous-core/examples/extract_to_stream.rs
index 7c99f85..9bbb142 100644
--- a/extractous-core/examples/extract_to_stream.rs
+++ b/extractous-core/examples/extract_to_stream.rs
@@ -1,4 +1,5 @@
 use extractous::Extractor;
+// use std::fs::File; use for bytes
 use std::io::{BufReader, Read};
 
 fn main() {
@@ -9,6 +10,14 @@ fn main() {
     // Extract the provided file content to a string
     let extractor = Extractor::new();
     let stream = extractor.extract_file(file_path).unwrap();
+    // Extract url
+    // let stream = extractor.extract_url("https://www.google.com/").unwrap();
+    // Extract bytes
+    // let mut file = File::open(file_path)?;
+    // let mut buffer = Vec::new();
+    // file.read_to_end(&mut buffer)?;
+    // let stream= extractor.extract_bytes(&file_bytes).unwrap();
+
     // Because stream implements std::io::Read trait we can perform buffered reading
     // For example we can use it to create a BufReader
     let mut reader = BufReader::new(stream);
diff --git a/extractous-core/src/extractor.rs b/extractous-core/src/extractor.rs
index 113e303..ea586d1 100644
--- a/extractous-core/src/extractor.rs
+++ b/extractous-core/src/extractor.rs
@@ -124,6 +124,31 @@ impl Extractor {
         )
     }
 
+    /// Extracts text from a byte buffer. Returns a stream of the extracted text
+    /// the stream is decoded using the extractor's `encoding`
+    pub fn extract_bytes(&self, buffer: &Vec<u8>) -> ExtractResult<StreamReader> {
+        tika::parse_bytes(
+            buffer,
+            &self.encoding,
+            &self.pdf_config,
+            &self.office_config,
+            &self.ocr_config,
+        )
+    }
+
+    /// Extracts text from a url. Returns a stream of the extracted text
+    /// the stream is decoded using the extractor's `encoding`
+    pub fn extract_url(&self, url: &str) -> ExtractResult<StreamReader> {
+        tika::parse_url(
+            url,
+            &self.encoding,
+            &self.pdf_config,
+            &self.office_config,
+            &self.ocr_config,
+        )
+    }
+
+
     /// Extracts text from a file path. Returns a string that is of maximum length
     /// of the extractor's `extract_string_max_length`
     pub fn extract_file_to_string(&self, file_path: &str) -> ExtractResult<String> {
@@ -141,10 +166,13 @@ impl Extractor {
 mod tests {
     use crate::Extractor;
     use std::fs::File;
-    use std::io::prelude::*;
+    use std::io::{self, Read};
     use std::io::BufReader;
 
+    use super::StreamReader;
+
     const TEST_FILE: &str = "README.md";
+    const TEST_URL: &str = "https://www.google.com/";
 
     fn expected_content() -> String {
         let mut file = File::open(TEST_FILE).unwrap();
@@ -153,6 +181,15 @@ mod tests {
         content
     }
 
+    fn read_content_from_stream(stream: StreamReader) -> String {
+        let mut reader = BufReader::new(stream);
+        let mut buffer = Vec::new();
+        reader.read_to_end(&mut buffer).unwrap();
+
+        let content = String::from_utf8(buffer).unwrap();
+        content
+    }
+
     #[test]
     fn extract_file_test() {
         // Prepare expected_content
@@ -161,17 +198,8 @@ mod tests {
         // Parse the files using extractous
         let extractor = Extractor::new();
         let result = extractor.extract_file(TEST_FILE);
-        let mut reader = BufReader::new(result.unwrap());
-        let mut buffer = Vec::new();
-        reader.read_to_end(&mut buffer).unwrap();
-
-        let content = String::from_utf8(buffer).unwrap();
+        let content = read_content_from_stream(result.unwrap());
         assert_eq!(content.trim(), expected_content.trim());
-
-        // let mut reader = BufReader::new(result.unwrap());
-        // let mut line = String::new();
-        // let _len = reader.read_line(&mut line).unwrap();
-        //assert_eq!("# Extractous", line.trim());
     }
 
     #[test]
@@ -185,4 +213,33 @@ mod tests {
         let content = result.unwrap();
         assert_eq!(content.trim(), expected_content.trim());
     }
+
+	fn read_file_as_bytes(path: &str) -> io::Result<Vec<u8>> {
+		let mut file = File::open(path)?;
+		let mut buffer = Vec::new();
+		file.read_to_end(&mut buffer)?;
+		Ok(buffer)
+	}
+
+	#[test]
+	fn extract_bytes_test() {
+        // Prepare expected_content
+        let expected_content = expected_content();
+
+        // Parse the bytes using extractous
+        let file_bytes = read_file_as_bytes(TEST_FILE).unwrap();
+        let extractor = Extractor::new();
+        let result = extractor.extract_bytes(&file_bytes);
+        let content = read_content_from_stream(result.unwrap());
+        assert_eq!(content.trim(), expected_content.trim());
+	}
+
+	#[test]
+	fn extract_url_test() {
+        // Parse url by extractous
+        let extractor = Extractor::new();
+        let result = extractor.extract_url(&TEST_URL);
+        let content = read_content_from_stream(result.unwrap());
+        assert!(content.contains("Google"));
+	}
 }
diff --git a/extractous-core/src/tika/parse.rs b/extractous-core/src/tika/parse.rs
index a019e9b..0a02bef 100644
--- a/extractous-core/src/tika/parse.rs
+++ b/extractous-core/src/tika/parse.rs
@@ -1,7 +1,7 @@
 use std::sync::OnceLock;
 
 use jni::objects::JValue;
-use jni::JavaVM;
+use jni::{AttachGuard, JavaVM};
 
 use crate::errors::ExtractResult;
 use crate::tika::jni_utils::*;
@@ -17,18 +17,24 @@ pub(crate) fn vm() -> &'static JavaVM {
     GRAAL_VM.get_or_init(create_vm_isolate)
 }
 
-pub fn parse_file(
-    file_path: &str,
+fn env<'local>() -> ExtractResult<AttachGuard<'local>> {
+    // Attaching a thead that is already attached is a no-op. Good to have this in case this method
+    // is called from another thread
+    let env = vm().attach_current_thread()?;
+    Ok(env)
+}
+
+fn parse_to_stream(
+    mut env: AttachGuard,
+    data_source_val: JValue,
     char_set: &CharSet,
     pdf_conf: &PdfParserConfig,
     office_conf: &OfficeParserConfig,
     ocr_conf: &TesseractOcrConfig,
+    method_name: &str,
+    signature: &str,
 ) -> ExtractResult<StreamReader> {
-    // Attaching a thead that is already attached is a no-op. Good to have this in case this method
-    // is called from another thread
-    let mut env = vm().attach_current_thread()?;
 
-    let file_path_val = jni_new_string_as_jvalue(&mut env, file_path)?;
     let charset_name_val = jni_new_string_as_jvalue(&mut env, &char_set.to_string())?;
     let j_pdf_conf = JPDFParserConfig::new(&mut env, pdf_conf)?;
     let j_office_conf = JOfficeParserConfig::new(&mut env, office_conf)?;
@@ -38,15 +44,10 @@ pub fn parse_file(
     let call_result = jni_call_static_method(
         &mut env,
         "ai/yobix/TikaNativeMain",
-        "parseFile",
-        "(Ljava/lang/String;\
-        Ljava/lang/String;\
-        Lorg/apache/tika/parser/pdf/PDFParserConfig;\
-        Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
-        Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\
-        )Lai/yobix/ReaderResult;",
+        method_name,
+        signature,
         &[
-            (&file_path_val).into(),
+            data_source_val,
             (&charset_name_val).into(),
             (&j_pdf_conf.internal).into(),
             (&j_office_conf.internal).into(),
@@ -62,6 +63,27 @@ pub fn parse_file(
     Ok(StreamReader { inner: j_reader })
 }
 
+pub fn parse_file(
+    file_path: &str,
+    char_set: &CharSet,
+    pdf_conf: &PdfParserConfig,
+    office_conf: &OfficeParserConfig,
+    ocr_conf: &TesseractOcrConfig,
+) -> ExtractResult<StreamReader> {
+    let mut env = env()?;
+
+    let file_path_val = jni_new_string_as_jvalue(&mut env, file_path)?;
+    return parse_to_stream(env, (&file_path_val).into(), char_set, pdf_conf, office_conf, ocr_conf,
+        "parseFile",
+        "(Ljava/lang/String;\
+        Ljava/lang/String;\
+        Lorg/apache/tika/parser/pdf/PDFParserConfig;\
+        Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
+        Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\
+        )Lai/yobix/ReaderResult;",
+        )
+}
+
 /// Parses a file to a string using the Apache Tika library.
 pub fn parse_file_to_string(
     file_path: &str,
@@ -102,3 +124,45 @@ pub fn parse_file_to_string(
 
     Ok(result.content)
 }
+
+pub fn parse_bytes(
+    buffer: &Vec<u8>,
+    char_set: &CharSet,
+    pdf_conf: &PdfParserConfig,
+    office_conf: &OfficeParserConfig,
+    ocr_conf: &TesseractOcrConfig,
+) -> ExtractResult<StreamReader> {
+    let env = env()?;
+
+    let buffer_val = env.byte_array_from_slice(&buffer).expect("Couldn't create byte array");
+    return parse_to_stream(env, (&buffer_val).into(), char_set, pdf_conf, office_conf, ocr_conf,
+        "parseBytes",
+        "([B\
+        Ljava/lang/String;\
+        Lorg/apache/tika/parser/pdf/PDFParserConfig;\
+        Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
+        Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\
+        )Lai/yobix/ReaderResult;",
+        )
+}
+
+pub fn parse_url(
+    url: &str,
+    char_set: &CharSet,
+    pdf_conf: &PdfParserConfig,
+    office_conf: &OfficeParserConfig,
+    ocr_conf: &TesseractOcrConfig,
+) -> ExtractResult<StreamReader> {
+    let mut env = env()?;
+
+    let url_val = jni_new_string_as_jvalue(&mut env, url)?;
+    return parse_to_stream(env, (&url_val).into(), char_set, pdf_conf, office_conf, ocr_conf,
+        "parseUrl",
+        "(Ljava/lang/String;\
+        Ljava/lang/String;\
+        Lorg/apache/tika/parser/pdf/PDFParserConfig;\
+        Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
+        Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\
+        )Lai/yobix/ReaderResult;",
+        )
+}
diff --git a/extractous-core/tika-native/build.gradle b/extractous-core/tika-native/build.gradle
index d153548..9ce61e1 100644
--- a/extractous-core/tika-native/build.gradle
+++ b/extractous-core/tika-native/build.gradle
@@ -68,10 +68,11 @@ graalvmNative {
                     "-H:+AddAllCharsets", // Very important to get UTF8 working
                     "-O3",
                     "--parallelism=$numThreads",
-                    "-march=compatibility" // VERY IMPORTANT to use compatibility flag. If not the libs will use the cpu arch of the build machine and will notwork on other CPUs if distributed
+                    "-march=compatibility", // VERY IMPORTANT to use compatibility flag. If not the libs will use the cpu arch of the build machine and will notwork on other CPUs if distributed
+                    "--enable-url-protocols=https",
             )
             jvmArgs.add('-Djava.awt.headless=true')
             requiredVersion = '22' // The minimal GraalVM version, can be `MAJOR`, `MAJOR.MINOR` or `MAJOR.MINOR.PATCH`
         }
     }
-}
\ No newline at end of file
+}

From ca5170a0a354e88e8cc988530772d2044c1df598 Mon Sep 17 00:00:00 2001
From: nmammeri <nmammeri@gmail.com>
Date: Fri, 8 Nov 2024 15:28:15 +0100
Subject: [PATCH 02/11] feat: use direct byte buffer for zero copy bytes
 reading

---
 extractous-core/src/tika/jni_utils.rs         | 15 +++-
 extractous-core/src/tika/parse.rs             | 25 +++---
 .../tests/extract_to_stream_tests.rs          | 45 ++++++++++
 extractous-core/tika-native/build.gradle      |  6 +-
 .../java/ai/yobix/ByteBufferInputStream.java  | 90 +++++++++++++++++++
 .../main/java/ai/yobix/TikaNativeMain.java    | 41 +++++----
 .../META-INF/native-image/jni-config.json     |  2 +-
 7 files changed, 189 insertions(+), 35 deletions(-)
 create mode 100644 extractous-core/tests/extract_to_stream_tests.rs
 create mode 100644 extractous-core/tika-native/src/main/java/ai/yobix/ByteBufferInputStream.java

diff --git a/extractous-core/src/tika/jni_utils.rs b/extractous-core/src/tika/jni_utils.rs
index 3eb9de6..7b3919e 100644
--- a/extractous-core/src/tika/jni_utils.rs
+++ b/extractous-core/src/tika/jni_utils.rs
@@ -1,11 +1,24 @@
 use std::os::raw::{c_char, c_void};
 
 use jni::errors::jni_error_code_to_result;
-use jni::objects::{JObject, JString, JValue, JValueOwned};
+use jni::objects::{JByteBuffer, JObject, JString, JValue, JValueOwned};
 use jni::{sys, JNIEnv, JavaVM};
 
 use crate::errors::{Error, ExtractResult};
 
+/// Calls a static method and prints any thrown exceptions to stderr
+pub fn jni_new_direct_buffer<'local>(
+    env: &mut JNIEnv<'local>,
+    data: *mut u8,
+    len: usize
+) -> ExtractResult<JByteBuffer<'local>> {
+    let direct_byte_buffer = unsafe {
+        env.new_direct_byte_buffer(data, len)
+    }.map_err(|_e| Error::JniEnvCall("Failed to create direct byte buffer"))?;
+
+    Ok(direct_byte_buffer)
+}
+
 /// Calls a static method and prints any thrown exceptions to stderr
 pub fn jni_call_static_method<'local>(
     env: &mut JNIEnv<'local>,
diff --git a/extractous-core/src/tika/parse.rs b/extractous-core/src/tika/parse.rs
index 0a02bef..e355e59 100644
--- a/extractous-core/src/tika/parse.rs
+++ b/extractous-core/src/tika/parse.rs
@@ -17,7 +17,7 @@ pub(crate) fn vm() -> &'static JavaVM {
     GRAAL_VM.get_or_init(create_vm_isolate)
 }
 
-fn env<'local>() -> ExtractResult<AttachGuard<'local>> {
+fn get_vm_attach_current_thread<'local>() -> ExtractResult<AttachGuard<'local>> {
     // Attaching a thead that is already attached is a no-op. Good to have this in case this method
     // is called from another thread
     let env = vm().attach_current_thread()?;
@@ -70,7 +70,7 @@ pub fn parse_file(
     office_conf: &OfficeParserConfig,
     ocr_conf: &TesseractOcrConfig,
 ) -> ExtractResult<StreamReader> {
-    let mut env = env()?;
+    let mut env = get_vm_attach_current_thread()?;
 
     let file_path_val = jni_new_string_as_jvalue(&mut env, file_path)?;
     return parse_to_stream(env, (&file_path_val).into(), char_set, pdf_conf, office_conf, ocr_conf,
@@ -92,9 +92,7 @@ pub fn parse_file_to_string(
     office_conf: &OfficeParserConfig,
     ocr_conf: &TesseractOcrConfig,
 ) -> ExtractResult<String> {
-    // Attaching a thead that is already attached is a no-op. Good to have this in case this method
-    // is called from another thread
-    let mut env = vm().attach_current_thread()?;
+    let mut env = get_vm_attach_current_thread()?;
 
     // Create a new Java string from the Rust string
     let file_path_val = jni_new_string_as_jvalue(&mut env, file_path)?;
@@ -126,18 +124,23 @@ pub fn parse_file_to_string(
 }
 
 pub fn parse_bytes(
-    buffer: &Vec<u8>,
+    buffer: &[u8],
     char_set: &CharSet,
     pdf_conf: &PdfParserConfig,
     office_conf: &OfficeParserConfig,
     ocr_conf: &TesseractOcrConfig,
 ) -> ExtractResult<StreamReader> {
-    let env = env()?;
+    let mut env = get_vm_attach_current_thread()?;
+
+    // Because we know the buffer is used for reading only, cast it to *mut u8 to satisfy the
+    // jni_new_direct_buffer call, which requires a mutable pointer
+    let mut_ptr: *mut u8 = buffer.as_ptr() as *mut u8;
+
+    let byte_buffer = jni_new_direct_buffer(&mut env, mut_ptr, buffer.len())?;
 
-    let buffer_val = env.byte_array_from_slice(&buffer).expect("Couldn't create byte array");
-    return parse_to_stream(env, (&buffer_val).into(), char_set, pdf_conf, office_conf, ocr_conf,
+    return parse_to_stream(env, (&byte_buffer).into(), char_set, pdf_conf, office_conf, ocr_conf,
         "parseBytes",
-        "([B\
+        "(Ljava/nio/ByteBuffer;\
         Ljava/lang/String;\
         Lorg/apache/tika/parser/pdf/PDFParserConfig;\
         Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
@@ -153,7 +156,7 @@ pub fn parse_url(
     office_conf: &OfficeParserConfig,
     ocr_conf: &TesseractOcrConfig,
 ) -> ExtractResult<StreamReader> {
-    let mut env = env()?;
+    let mut env = get_vm_attach_current_thread()?;
 
     let url_val = jni_new_string_as_jvalue(&mut env, url)?;
     return parse_to_stream(env, (&url_val).into(), char_set, pdf_conf, office_conf, ocr_conf,
diff --git a/extractous-core/tests/extract_to_stream_tests.rs b/extractous-core/tests/extract_to_stream_tests.rs
new file mode 100644
index 0000000..c77ff7d
--- /dev/null
+++ b/extractous-core/tests/extract_to_stream_tests.rs
@@ -0,0 +1,45 @@
+extern crate test_case;
+extern crate textdistance;
+
+use extractous::{Extractor};
+use std::fs;
+use test_case::test_case;
+use textdistance::nstr::cosine;
+use std::io::Read;
+
+#[test_case("2022_Q3_AAPL.pdf", 0.9; "Test PDF file")]
+#[test_case("science-exploration-1p.pptx", 0.9; "Test PPTX file")]
+#[test_case("simple.odt", 0.8; "Test ODT file")]
+#[test_case("table-multi-row-column-cells-actual.csv", 0.8; "Test CSV file")]
+#[test_case("vodafone.xlsx", 0.4; "Test XLSX file")]
+#[test_case("category-level.docx", 0.9; "Test DOCX file")]
+#[test_case("simple.doc", 0.9; "Test DOC file")]
+#[test_case("simple.pptx", 0.9; "Test another PPTX file")]
+#[test_case("table-multi-row-column-cells.png", -1.0; "Test PNG file")]
+#[test_case("winter-sports.epub", 0.9; "Test EPUB file")]
+#[test_case("bug_16.docx", 0.9; "Test bug16 DOCX file")]
+#[test_case("eng-ocr.pdf", 0.9; "Test eng-ocr PDF file")]
+fn test_extract_bytes_to_stream(file_name: &str, target_dist: f64) {
+    let extractor = Extractor::new();
+
+    let bytes = fs::read(&format!("../test_files/documents/{}", file_name)).unwrap();
+    let mut stream= extractor.extract_bytes(&bytes).unwrap();
+
+    let mut buffer = Vec::new();
+    stream.read_to_end(&mut buffer).unwrap();
+    let extracted = String::from_utf8_lossy(&buffer);
+
+    // read expected string
+    let expected =
+        fs::read_to_string(format!("../test_files/expected_result/{}.txt", file_name)).unwrap();
+
+    let dist = cosine(&expected, &extracted);
+    assert!(
+        dist > target_dist,
+        "Cosine similarity is less than {} for file: {}, dist: {}",
+        target_dist,
+        file_name,
+        dist
+    );
+    println!("{}: {}", file_name, dist);
+}
\ No newline at end of file
diff --git a/extractous-core/tika-native/build.gradle b/extractous-core/tika-native/build.gradle
index 9ce61e1..793ae26 100644
--- a/extractous-core/tika-native/build.gradle
+++ b/extractous-core/tika-native/build.gradle
@@ -66,13 +66,13 @@ graalvmNative {
 
             buildArgs.addAll(
                     "-H:+AddAllCharsets", // Very important to get UTF8 working
+                    "--enable-https", // Very important https working
                     "-O3",
                     "--parallelism=$numThreads",
-                    "-march=compatibility", // VERY IMPORTANT to use compatibility flag. If not the libs will use the cpu arch of the build machine and will notwork on other CPUs if distributed
-                    "--enable-url-protocols=https",
+                    "-march=compatibility" // VERY IMPORTANT to use compatibility flag. If not the libs will use the cpu arch of the build machine and will notwork on other CPUs if distributed
             )
             jvmArgs.add('-Djava.awt.headless=true')
             requiredVersion = '22' // The minimal GraalVM version, can be `MAJOR`, `MAJOR.MINOR` or `MAJOR.MINOR.PATCH`
         }
     }
-}
+}
\ No newline at end of file
diff --git a/extractous-core/tika-native/src/main/java/ai/yobix/ByteBufferInputStream.java b/extractous-core/tika-native/src/main/java/ai/yobix/ByteBufferInputStream.java
new file mode 100644
index 0000000..9abf3a2
--- /dev/null
+++ b/extractous-core/tika-native/src/main/java/ai/yobix/ByteBufferInputStream.java
@@ -0,0 +1,90 @@
+package ai.yobix;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+
+public class ByteBufferInputStream extends InputStream {
+
+    private ByteBuffer bb;
+
+    public ByteBufferInputStream(ByteBuffer bb) {
+        this.bb = bb;
+    }
+
+    @Override
+    public int read() throws IOException {
+        if (bb == null) {
+            throw new IOException("read on a closed InputStream");
+        }
+
+        if (bb.remaining() == 0) {
+            return -1;
+        }
+
+        return (bb.get() & 0xFF);   // need to be in the range 0 to 255
+    }
+
+    @Override
+    public int read(byte[] b, int off, int len) throws IOException {
+
+        if (bb == null) {
+            throw new IOException("read on a closed InputStream");
+        }
+
+        if (b == null) {
+            throw new NullPointerException();
+        } else if (off < 0 || len < 0 || len > b.length - off) {
+            throw new IndexOutOfBoundsException();
+        } else if (len == 0) {
+            return 0;
+        }
+
+        int length = Math.min(bb.remaining(), len);
+        if (length == 0) {
+            return -1;
+        }
+
+        bb.get(b, off, length);
+        return length;
+    }
+
+    @Override
+    public long skip(long n) throws IOException {
+
+        if (bb == null) {
+            throw new IOException("skip on a closed InputStream");
+        }
+
+        if (n <= 0) {
+            return 0;
+        }
+
+        /*
+         * ByteBuffers have at most an int, so lose the upper bits.
+         * The contract allows this.
+         */
+        int nInt = (int) n;
+        int skip = Math.min(bb.remaining(), nInt);
+
+        bb.position(bb.position() + skip);
+
+        return nInt;
+    }
+
+    @Override
+    public int available() throws IOException {
+
+        if (bb == null) {
+            throw new IOException("available on a closed InputStream");
+        }
+
+        return bb.remaining();
+    }
+
+    @Override
+    public void close() throws IOException {
+        bb = null;
+    }
+
+}
diff --git a/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java b/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java
index ba83662..b524b40 100644
--- a/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java
+++ b/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java
@@ -1,34 +1,22 @@
 package ai.yobix;
 
 import org.apache.commons.io.input.ReaderInputStream;
-import org.apache.tika.exception.WriteLimitReachedException;
-import org.apache.tika.parser.ParsingReader;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.WriteOutContentHandler;
 import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Reader;
-import java.net.MalformedURLException;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.nio.charset.Charset;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-
+import org.apache.tika.exception.WriteLimitReachedException;
+import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParsingReader;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.ocr.TesseractOCRConfig;
 import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
 import org.graalvm.nativeimage.IsolateThread;
 import org.graalvm.nativeimage.c.function.CEntryPoint;
 import org.graalvm.nativeimage.c.type.CCharPointer;
@@ -36,6 +24,19 @@
 import org.graalvm.nativeimage.c.type.CTypeConversion;
 import org.xml.sax.SAXException;
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
 public class TikaNativeMain {
 
     private static final Tika tika = new Tika();
@@ -196,15 +197,17 @@ public static ReaderResult parseUrl(
      * @return ReaderResult
      */
     public static ReaderResult parseBytes(
-            byte[] data,
+            ByteBuffer data,
             String charsetName,
             PDFParserConfig pdfConfig,
             OfficeParserConfig officeConfig,
             TesseractOCRConfig tesseractConfig
     ) {
 
+
         final Metadata metadata = new Metadata();
-        final TikaInputStream stream = TikaInputStream.get(data, metadata);
+        final ByteBufferInputStream inStream = new ByteBufferInputStream(data);
+        final TikaInputStream stream = TikaInputStream.get(inStream, new TemporaryResources(), metadata);
 
         return parse(stream, metadata, charsetName, pdfConfig, officeConfig, tesseractConfig);
     }
diff --git a/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json b/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json
index 288d373..496d5d3 100644
--- a/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json
+++ b/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json
@@ -55,7 +55,7 @@
             {
                 "name": "parseBytes",
                 "parameterTypes": [
-                    "byte[]",
+                    "java.nio.ByteBuffer",
                     "java.lang.String",
                     "org.apache.tika.parser.pdf.PDFParserConfig",
                     "org.apache.tika.parser.microsoft.OfficeParserConfig",

From b265c55f4e6d1a55a02737f1b7074eb7f4f3aa06 Mon Sep 17 00:00:00 2001
From: nmammeri <nmammeri@gmail.com>
Date: Fri, 8 Nov 2024 16:45:02 +0100
Subject: [PATCH 03/11] test: add python extract bytes tests

---
 .../tests/test_extract_bytes_to_stream.py     | 39 +++++++++++++++++++
 bindings/extractous-python/tests/utils.py     |  5 +++
 2 files changed, 44 insertions(+)
 create mode 100644 bindings/extractous-python/tests/test_extract_bytes_to_stream.py

diff --git a/bindings/extractous-python/tests/test_extract_bytes_to_stream.py b/bindings/extractous-python/tests/test_extract_bytes_to_stream.py
new file mode 100644
index 0000000..2f8aae5
--- /dev/null
+++ b/bindings/extractous-python/tests/test_extract_bytes_to_stream.py
@@ -0,0 +1,39 @@
+import pytest
+
+from extractous import Extractor
+from utils import cosine_similarity, read_to_string, read_file_to_bytearray
+
+TEST_CASES = [
+    ("2022_Q3_AAPL.pdf", 0.9),
+    ("science-exploration-1p.pptx", 0.9),
+    ("simple.odt", 0.9),
+    ("table-multi-row-column-cells-actual.csv", 0.9),
+    ("vodafone.xlsx", 0.4),
+    ("category-level.docx", 0.9),
+    ("simple.doc", 0.9),
+    ("simple.pptx", 0.9),
+    ("table-multi-row-column-cells.png", -1.0),
+    ("winter-sports.epub", 0.9),
+    ("bug_16.docx", 0.9),
+    ("deu-ocr.pdf", 0.9),
+]
+
+@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
+def test_extract_bytes_to_stream(file_name, target_dist):
+    """Test the extraction from bytes of various file types."""
+    original_filepath = f"../../test_files/documents/{file_name}"
+    expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
+
+    file_bytes = read_file_to_bytearray(original_filepath)
+
+    extractor = Extractor()
+    reader = extractor.extract_bytes(file_bytes)
+    result = read_to_string(reader)
+
+    # Expected
+    with open(expected_result_filepath, "r",  encoding="utf8") as file:
+        expected = file.read()
+    
+    assert cosine_similarity(result, expected) > target_dist, \
+        f"Cosine similarity is less than {target_dist} for file: {file_name}"
+
diff --git a/bindings/extractous-python/tests/utils.py b/bindings/extractous-python/tests/utils.py
index fb0e28b..ac0f7b7 100644
--- a/bindings/extractous-python/tests/utils.py
+++ b/bindings/extractous-python/tests/utils.py
@@ -21,3 +21,8 @@ def read_to_string(reader):
         b = reader.read(4096)
     return result
 
+def read_file_to_bytearray(file_path: str):
+    """Read file to bytes array."""
+    with open(file_path, 'rb') as file:
+        file_content = bytearray(file.read())
+    return file_content
\ No newline at end of file

From c8da140776e91075db7961405b4ce80a568847bb Mon Sep 17 00:00:00 2001
From: nmammeri <nadjib@bouita.com>
Date: Mon, 11 Nov 2024 14:16:48 +0100
Subject: [PATCH 04/11] feat: add python read_into function to extractor

---
 bindings/extractous-python/src/extractor.rs | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/bindings/extractous-python/src/extractor.rs b/bindings/extractous-python/src/extractor.rs
index 4c6f5ba..ed95e7b 100644
--- a/bindings/extractous-python/src/extractor.rs
+++ b/bindings/extractous-python/src/extractor.rs
@@ -75,6 +75,18 @@ impl StreamReader {
             ))),
         }
     }
+
+    /// Reads into the specified buffer
+    pub fn readinto<'py>(&mut self, buf: Bound<'py, PyByteArray>) -> PyResult<usize> {
+        let bs = unsafe { buf.as_bytes_mut() };
+
+        let bytes_read = self.reader.read(bs)
+            .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(
+                format!("{}", e))
+            )?;
+        Ok(bytes_read)
+    }
+
 }
 
 /// `Extractor` is the entry for all extract APIs

From 88102e454b9f817bacc7946ad2d03513f2374da9 Mon Sep 17 00:00:00 2001
From: nmammeri <nadjib@bouita.com>
Date: Mon, 11 Nov 2024 14:18:47 +0100
Subject: [PATCH 05/11] tests: don't include tests for known ocr bug on mac

---
 .../tests/test_extract_bytes_to_stream.py     |  3 +-
 .../tests/test_extract_file_to_string.py      |  3 +-
 bindings/extractous-python/tests/utils.py     | 33 +++++++++++++++----
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/bindings/extractous-python/tests/test_extract_bytes_to_stream.py b/bindings/extractous-python/tests/test_extract_bytes_to_stream.py
index 2f8aae5..32be6a7 100644
--- a/bindings/extractous-python/tests/test_extract_bytes_to_stream.py
+++ b/bindings/extractous-python/tests/test_extract_bytes_to_stream.py
@@ -15,9 +15,10 @@
     ("table-multi-row-column-cells.png", -1.0),
     ("winter-sports.epub", 0.9),
     ("bug_16.docx", 0.9),
-    ("deu-ocr.pdf", 0.9),
+    #("eng-ocr.pdf", 0.9),
 ]
 
+
 @pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
 def test_extract_bytes_to_stream(file_name, target_dist):
     """Test the extraction from bytes of various file types."""
diff --git a/bindings/extractous-python/tests/test_extract_file_to_string.py b/bindings/extractous-python/tests/test_extract_file_to_string.py
index ed3dbe8..95b5bbb 100644
--- a/bindings/extractous-python/tests/test_extract_file_to_string.py
+++ b/bindings/extractous-python/tests/test_extract_file_to_string.py
@@ -15,9 +15,10 @@
     ("table-multi-row-column-cells.png", -1.0),
     ("winter-sports.epub", 0.9),
     ("bug_16.docx", 0.9),
-    ("deu-ocr.pdf", 0.9),
+    #("eng-ocr.pdf", 0.9),
 ]
 
+
 @pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
 def test_extract_file_to_string(file_name, target_dist):
     """Test the extraction and comparison of various file types."""
diff --git a/bindings/extractous-python/tests/utils.py b/bindings/extractous-python/tests/utils.py
index ac0f7b7..b153895 100644
--- a/bindings/extractous-python/tests/utils.py
+++ b/bindings/extractous-python/tests/utils.py
@@ -1,6 +1,7 @@
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.metrics.pairwise import cosine_similarity as cosine_sim
 
+
 def cosine_similarity(text1, text2):
     """Calculate the cosine similarity between two texts."""
 
@@ -12,17 +13,35 @@ def cosine_similarity(text1, text2):
     cos_sim = cosine_sim(vectors)
     return cos_sim[0][1]
 
+
+# def read_to_string(reader):
+#     """Read from stream to string."""
+#     result = ""
+#     b = reader.read(4096)
+#     while len(b) > 0:
+#         result += b.decode("utf-8")
+#         b = reader.read(4096)
+#     return result
+
 def read_to_string(reader):
     """Read from stream to string."""
-    result = ""
-    b = reader.read(4096)
-    while len(b) > 0:
-        result += b.decode("utf-8")
-        b = reader.read(4096)
-    return result
+    utf8_string = []
+    buffer = bytearray(4096)
+
+    while True:
+        bytes_read = reader.readinto(buffer)
+        # If no more data, exit the loop
+        if bytes_read == 0:
+            break
+        # Decode the valid portion of the buffer and append it to the result
+        utf8_string.append(buffer[:bytes_read].decode('utf-8'))
+
+    # Join all parts into a single string
+    return ''.join(utf8_string)
+
 
 def read_file_to_bytearray(file_path: str):
     """Read file to bytes array."""
     with open(file_path, 'rb') as file:
         file_content = bytearray(file.read())
-    return file_content
\ No newline at end of file
+    return file_content

From 76fb11b4e4172e3909d47ebebdb9e1f1b121782e Mon Sep 17 00:00:00 2001
From: nmammeri <nadjib@bouita.com>
Date: Mon, 11 Nov 2024 14:21:12 +0100
Subject: [PATCH 06/11] docs: add extract_to_stream python example

---
 .../examples/extract_to_stream.py             | 34 +++++++++++++++++++
 extractous-core/README.md                     |  2 ++
 2 files changed, 36 insertions(+)
 create mode 100755 bindings/extractous-python/examples/extract_to_stream.py

diff --git a/bindings/extractous-python/examples/extract_to_stream.py b/bindings/extractous-python/examples/extract_to_stream.py
new file mode 100755
index 0000000..8068f14
--- /dev/null
+++ b/bindings/extractous-python/examples/extract_to_stream.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+import os
+import sys
+
+from extractous import Extractor, PdfOcrStrategy, PdfParserConfig
+
+
+def extract_to_stream(file_path: str):
+
+    # Extract the file
+    extractor = Extractor()
+    reader = extractor.extract_file(in_file)
+
+    buffer = bytearray(4096 * 4096)
+    while True:
+        bytes_read = reader.readinto(buffer)
+        # If no more data, exit the loop
+        if bytes_read == 0:
+            break
+        # Decode the valid portion of the buffer and append it to the result
+        chunk = buffer[:bytes_read].decode('utf-8')
+        print(chunk)
+
+
+if __name__ == '__main__':
+    # Pare input args
+    if len(sys.argv) != 2:
+        print(f"Usage: '{sys.argv[0]}' <filename>")
+        sys.exit(1)
+    in_file = sys.argv[1]
+    if not os.path.isfile(in_file):
+        raise FileNotFoundError(f"No such file: '{in_file}'")
+
+    extract_to_stream(in_file)
diff --git a/extractous-core/README.md b/extractous-core/README.md
index 0328db6..3e55a42 100644
--- a/extractous-core/README.md
+++ b/extractous-core/README.md
@@ -125,6 +125,8 @@ installed on your system because some of the OCR tests will fail if no tesseract
 * `sudo apt install tesseract-ocr`
 * Install any language extensions you want. for example to install German and Arabic:
 * `sudo apt install tesseract-ocr-deu tesseract-ocr-ara`
+* On Mac 
+* `brew install tesseract tesseract-lang`
 
 ### Building Extractous
 * To build Extractous, just run:

From 6c5893bf887542e0c1cda2e96f02bc94308e325b Mon Sep 17 00:00:00 2001
From: nmammeri <nadjib@bouita.com>
Date: Mon, 11 Nov 2024 14:24:36 +0100
Subject: [PATCH 07/11] chore: add more vmoptions

---
 extractous-core/src/tika/jni_utils.rs | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/extractous-core/src/tika/jni_utils.rs b/extractous-core/src/tika/jni_utils.rs
index 7b3919e..203d1b4 100644
--- a/extractous-core/src/tika/jni_utils.rs
+++ b/extractous-core/src/tika/jni_utils.rs
@@ -112,20 +112,23 @@ pub fn jni_check_exception(env: &mut JNIEnv) -> ExtractResult<bool> {
 /// linked in by the build script.
 pub fn create_vm_isolate() -> JavaVM {
     unsafe {
-        // let mut option0 = sys::JavaVMOption {
-        //     optionString: "-Djava.awt.headless=true".as_ptr() as *mut c_char,
-        //     extraInfo: std::ptr::null_mut(),
-        // };
+        let mut vm_options : Vec<sys::JavaVMOption> = vec![];
 
         // Set java.library.path to be able to load libawt.so, which must be in the same dir as libtika_native.so
-        let mut options = sys::JavaVMOption {
+        vm_options.push(sys::JavaVMOption {
             optionString: "-Djava.library.path=.".as_ptr() as *mut c_char,
             extraInfo: std::ptr::null_mut(),
-        };
+        });
+        // enable awt headless mode
+        vm_options.push(sys::JavaVMOption {
+            optionString: "Djava.awt.headless=true".as_ptr() as *mut c_char,
+            extraInfo: std::ptr::null_mut(),
+        });
+
         let mut args = sys::JavaVMInitArgs {
             version: sys::JNI_VERSION_1_8,
-            nOptions: 1,
-            options: &mut options,
+            nOptions: vm_options.len() as sys::jint,
+            options: vm_options.as_ptr() as *mut sys::JavaVMOption,
             ignoreUnrecognized: sys::JNI_TRUE,
         };
         let mut ptr: *mut sys::JavaVM = std::ptr::null_mut();

From 8e5c919cde7ec5d7fbdea52d4895167b3eaa1732 Mon Sep 17 00:00:00 2001
From: nmammeri <nadjib@bouita.com>
Date: Mon, 11 Nov 2024 17:24:10 +0100
Subject: [PATCH 08/11] refactor: disable failing ocr tests on mac for now +
 clippy and fmt styling

---
 extractous-core/src/extractor.rs              | 29 +++++++-------
 extractous-core/src/tika/jni_utils.rs         | 31 +++++++--------
 extractous-core/src/tika/parse.rs             | 31 +++++++++++----
 .../tests/extract_to_stream_tests.rs          | 39 ++++++++++++++++---
 ...tor_test.rs => extract_to_string_tests.rs} | 18 ++++-----
 5 files changed, 96 insertions(+), 52 deletions(-)
 rename extractous-core/tests/{extractor_test.rs => extract_to_string_tests.rs} (87%)

diff --git a/extractous-core/src/extractor.rs b/extractous-core/src/extractor.rs
index ea586d1..9917afa 100644
--- a/extractous-core/src/extractor.rs
+++ b/extractous-core/src/extractor.rs
@@ -126,7 +126,7 @@ impl Extractor {
 
     /// Extracts text from a byte buffer. Returns a stream of the extracted text
     /// the stream is decoded using the extractor's `encoding`
-    pub fn extract_bytes(&self, buffer: &Vec<u8>) -> ExtractResult<StreamReader> {
+    pub fn extract_bytes(&self, buffer: &[u8]) -> ExtractResult<StreamReader> {
         tika::parse_bytes(
             buffer,
             &self.encoding,
@@ -148,7 +148,6 @@ impl Extractor {
         )
     }
 
-
     /// Extracts text from a file path. Returns a string that is of maximum length
     /// of the extractor's `extract_string_max_length`
     pub fn extract_file_to_string(&self, file_path: &str) -> ExtractResult<String> {
@@ -166,8 +165,8 @@ impl Extractor {
 mod tests {
     use crate::Extractor;
     use std::fs::File;
-    use std::io::{self, Read};
     use std::io::BufReader;
+    use std::io::{self, Read};
 
     use super::StreamReader;
 
@@ -214,15 +213,15 @@ mod tests {
         assert_eq!(content.trim(), expected_content.trim());
     }
 
-	fn read_file_as_bytes(path: &str) -> io::Result<Vec<u8>> {
-		let mut file = File::open(path)?;
-		let mut buffer = Vec::new();
-		file.read_to_end(&mut buffer)?;
-		Ok(buffer)
-	}
+    fn read_file_as_bytes(path: &str) -> io::Result<Vec<u8>> {
+        let mut file = File::open(path)?;
+        let mut buffer = Vec::new();
+        file.read_to_end(&mut buffer)?;
+        Ok(buffer)
+    }
 
-	#[test]
-	fn extract_bytes_test() {
+    #[test]
+    fn extract_bytes_test() {
         // Prepare expected_content
         let expected_content = expected_content();
 
@@ -232,14 +231,14 @@ mod tests {
         let result = extractor.extract_bytes(&file_bytes);
         let content = read_content_from_stream(result.unwrap());
         assert_eq!(content.trim(), expected_content.trim());
-	}
+    }
 
-	#[test]
-	fn extract_url_test() {
+    #[test]
+    fn extract_url_test() {
         // Parse url by extractous
         let extractor = Extractor::new();
         let result = extractor.extract_url(&TEST_URL);
         let content = read_content_from_stream(result.unwrap());
         assert!(content.contains("Google"));
-	}
+    }
 }
diff --git a/extractous-core/src/tika/jni_utils.rs b/extractous-core/src/tika/jni_utils.rs
index 203d1b4..a99bae2 100644
--- a/extractous-core/src/tika/jni_utils.rs
+++ b/extractous-core/src/tika/jni_utils.rs
@@ -10,11 +10,10 @@ use crate::errors::{Error, ExtractResult};
 pub fn jni_new_direct_buffer<'local>(
     env: &mut JNIEnv<'local>,
     data: *mut u8,
-    len: usize
+    len: usize,
 ) -> ExtractResult<JByteBuffer<'local>> {
-    let direct_byte_buffer = unsafe {
-        env.new_direct_byte_buffer(data, len)
-    }.map_err(|_e| Error::JniEnvCall("Failed to create direct byte buffer"))?;
+    let direct_byte_buffer = unsafe { env.new_direct_byte_buffer(data, len) }
+        .map_err(|_e| Error::JniEnvCall("Failed to create direct byte buffer"))?;
 
     Ok(direct_byte_buffer)
 }
@@ -112,18 +111,18 @@ pub fn jni_check_exception(env: &mut JNIEnv) -> ExtractResult<bool> {
 /// linked in by the build script.
 pub fn create_vm_isolate() -> JavaVM {
     unsafe {
-        let mut vm_options : Vec<sys::JavaVMOption> = vec![];
-
-        // Set java.library.path to be able to load libawt.so, which must be in the same dir as libtika_native.so
-        vm_options.push(sys::JavaVMOption {
-            optionString: "-Djava.library.path=.".as_ptr() as *mut c_char,
-            extraInfo: std::ptr::null_mut(),
-        });
-        // enable awt headless mode
-        vm_options.push(sys::JavaVMOption {
-            optionString: "Djava.awt.headless=true".as_ptr() as *mut c_char,
-            extraInfo: std::ptr::null_mut(),
-        });
+        let vm_options: Vec<sys::JavaVMOption> = vec![
+            // Set java.library.path to be able to load libawt.so, which must be in the same dir as libtika_native.so
+            sys::JavaVMOption {
+                optionString: "-Djava.library.path=.".as_ptr() as *mut c_char,
+                extraInfo: std::ptr::null_mut(),
+            },
+            // enable awt headless mode
+            sys::JavaVMOption {
+                optionString: "Djava.awt.headless=true".as_ptr() as *mut c_char,
+                extraInfo: std::ptr::null_mut(),
+            },
+        ];
 
         let mut args = sys::JavaVMInitArgs {
             version: sys::JNI_VERSION_1_8,
diff --git a/extractous-core/src/tika/parse.rs b/extractous-core/src/tika/parse.rs
index e355e59..8766d27 100644
--- a/extractous-core/src/tika/parse.rs
+++ b/extractous-core/src/tika/parse.rs
@@ -34,7 +34,6 @@ fn parse_to_stream(
     method_name: &str,
     signature: &str,
 ) -> ExtractResult<StreamReader> {
-
     let charset_name_val = jni_new_string_as_jvalue(&mut env, &char_set.to_string())?;
     let j_pdf_conf = JPDFParserConfig::new(&mut env, pdf_conf)?;
     let j_office_conf = JOfficeParserConfig::new(&mut env, office_conf)?;
@@ -73,7 +72,13 @@ pub fn parse_file(
     let mut env = get_vm_attach_current_thread()?;
 
     let file_path_val = jni_new_string_as_jvalue(&mut env, file_path)?;
-    return parse_to_stream(env, (&file_path_val).into(), char_set, pdf_conf, office_conf, ocr_conf,
+    parse_to_stream(
+        env,
+        (&file_path_val).into(),
+        char_set,
+        pdf_conf,
+        office_conf,
+        ocr_conf,
         "parseFile",
         "(Ljava/lang/String;\
         Ljava/lang/String;\
@@ -81,7 +86,7 @@ pub fn parse_file(
         Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
         Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\
         )Lai/yobix/ReaderResult;",
-        )
+    )
 }
 
 /// Parses a file to a string using the Apache Tika library.
@@ -138,7 +143,13 @@ pub fn parse_bytes(
 
     let byte_buffer = jni_new_direct_buffer(&mut env, mut_ptr, buffer.len())?;
 
-    return parse_to_stream(env, (&byte_buffer).into(), char_set, pdf_conf, office_conf, ocr_conf,
+    parse_to_stream(
+        env,
+        (&byte_buffer).into(),
+        char_set,
+        pdf_conf,
+        office_conf,
+        ocr_conf,
         "parseBytes",
         "(Ljava/nio/ByteBuffer;\
         Ljava/lang/String;\
@@ -146,7 +157,7 @@ pub fn parse_bytes(
         Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
         Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\
         )Lai/yobix/ReaderResult;",
-        )
+    )
 }
 
 pub fn parse_url(
@@ -159,7 +170,13 @@ pub fn parse_url(
     let mut env = get_vm_attach_current_thread()?;
 
     let url_val = jni_new_string_as_jvalue(&mut env, url)?;
-    return parse_to_stream(env, (&url_val).into(), char_set, pdf_conf, office_conf, ocr_conf,
+    parse_to_stream(
+        env,
+        (&url_val).into(),
+        char_set,
+        pdf_conf,
+        office_conf,
+        ocr_conf,
         "parseUrl",
         "(Ljava/lang/String;\
         Ljava/lang/String;\
@@ -167,5 +184,5 @@ pub fn parse_url(
         Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
         Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\
         )Lai/yobix/ReaderResult;",
-        )
+    )
 }
diff --git a/extractous-core/tests/extract_to_stream_tests.rs b/extractous-core/tests/extract_to_stream_tests.rs
index c77ff7d..c29d089 100644
--- a/extractous-core/tests/extract_to_stream_tests.rs
+++ b/extractous-core/tests/extract_to_stream_tests.rs
@@ -1,11 +1,11 @@
 extern crate test_case;
 extern crate textdistance;
 
-use extractous::{Extractor};
+use extractous::{Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig};
 use std::fs;
+use std::io::Read;
 use test_case::test_case;
 use textdistance::nstr::cosine;
-use std::io::Read;
 
 #[test_case("2022_Q3_AAPL.pdf", 0.9; "Test PDF file")]
 #[test_case("science-exploration-1p.pptx", 0.9; "Test PPTX file")]
@@ -18,12 +18,12 @@ use std::io::Read;
 #[test_case("table-multi-row-column-cells.png", -1.0; "Test PNG file")]
 #[test_case("winter-sports.epub", 0.9; "Test EPUB file")]
 #[test_case("bug_16.docx", 0.9; "Test bug16 DOCX file")]
-#[test_case("eng-ocr.pdf", 0.9; "Test eng-ocr PDF file")]
+//#[test_case("eng-ocr.pdf", 0.9; "Test eng-ocr PDF file")]
 fn test_extract_bytes_to_stream(file_name: &str, target_dist: f64) {
     let extractor = Extractor::new();
 
     let bytes = fs::read(&format!("../test_files/documents/{}", file_name)).unwrap();
-    let mut stream= extractor.extract_bytes(&bytes).unwrap();
+    let mut stream = extractor.extract_bytes(&bytes).unwrap();
 
     let mut buffer = Vec::new();
     stream.read_to_end(&mut buffer).unwrap();
@@ -42,4 +42,33 @@ fn test_extract_bytes_to_stream(file_name: &str, target_dist: f64) {
         dist
     );
     println!("{}: {}", file_name, dist);
-}
\ No newline at end of file
+}
+
+#[test]
+fn test_extract_bytes_to_stream_ara_ocr_png() {
+    let extractor = Extractor::new()
+        .set_ocr_config(TesseractOcrConfig::new().set_language("ara"))
+        .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::NO_OCR));
+
+    // extract file with extractor
+    let bytes = fs::read(&"../test_files/documents/ara-ocr.png".to_string()).unwrap();
+    let mut stream = extractor.extract_bytes(&bytes).unwrap();
+
+    let mut buffer = Vec::new();
+    stream.read_to_end(&mut buffer).unwrap();
+    let extracted = String::from_utf8_lossy(&buffer);
+
+    println!("{}", extracted);
+
+    // read expected string
+    let expected =
+        fs::read_to_string("../test_files/expected_result/ara-ocr.png.txt".to_string()).unwrap();
+
+    let dist = cosine(&expected, &extracted);
+    assert!(
+        dist > 0.9,
+        "Cosine similarity is less than 0.9 for file: ara-ocr.png, dist: {}",
+        dist
+    );
+    println!("{}: {}", "ara-ocr.png", dist);
+}
diff --git a/extractous-core/tests/extractor_test.rs b/extractous-core/tests/extract_to_string_tests.rs
similarity index 87%
rename from extractous-core/tests/extractor_test.rs
rename to extractous-core/tests/extract_to_string_tests.rs
index 5322c3f..7456442 100644
--- a/extractous-core/tests/extractor_test.rs
+++ b/extractous-core/tests/extract_to_string_tests.rs
@@ -17,7 +17,7 @@ use textdistance::nstr::cosine;
 #[test_case("table-multi-row-column-cells.png", -1.0; "Test PNG file")]
 #[test_case("winter-sports.epub", 0.9; "Test EPUB file")]
 #[test_case("bug_16.docx", 0.9; "Test bug16 DOCX file")]
-#[test_case("eng-ocr.pdf", 0.9; "Test eng-ocr PDF file")]
+//#[test_case("eng-ocr.pdf", 0.9; "Test eng-ocr PDF file")]
 fn test_extract_file_to_string(file_name: &str, target_dist: f64) {
     let extractor = Extractor::new().set_extract_string_max_length(1000000);
     // extract file with extractor
@@ -40,7 +40,7 @@ fn test_extract_file_to_string(file_name: &str, target_dist: f64) {
 }
 
 #[test]
-fn test_extract_ara_ocr_png_to_string() {
+fn test_extract_file_to_string_ara_ocr_png() {
     let extractor = Extractor::new()
         .set_ocr_config(TesseractOcrConfig::new().set_language("ara"))
         .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::NO_OCR));
@@ -61,18 +61,18 @@ fn test_extract_ara_ocr_png_to_string() {
         "Cosine similarity is less than 0.9 for file: ara-ocr.png, dist: {}",
         dist
     );
-    println!("{}: {}", "ara-ocr.png", dist);
 }
 
+#[cfg(not(target_os = "macos"))]
 #[test]
-fn test_ocr_only_strategy_extract_deu_ocr_pdf_to_string() {
+fn test_extract_file_to_string_ocr_only_strategy_deu_ocr_pdf() {
     let extractor = Extractor::new()
         .set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
         .set_pdf_config(
             PdfParserConfig::new()
-                .set_ocr_strategy(PdfOcrStrategy::OCR_ONLY)
-                .set_extract_inline_images(true)
-                .set_extract_unique_inline_images_only(true),
+                .set_ocr_strategy(PdfOcrStrategy::OCR_AND_TEXT_EXTRACTION)
+                .set_extract_inline_images(false)
+                .set_extract_unique_inline_images_only(false),
         );
     // extract file with extractor
     let extracted = extractor
@@ -89,11 +89,11 @@ fn test_ocr_only_strategy_extract_deu_ocr_pdf_to_string() {
         "Cosine similarity is less than 0.9 for file: ara-ocr.png, dist: {}",
         dist
     );
-    println!("{}: {}", "ara-ocr.png", dist);
 }
 
+#[cfg(not(target_os = "macos"))]
 #[test]
-fn test_no_ocr_strategy_extract_deu_ocr_pdf_to_string() {
+fn test_test_extract_file_to_string_no_ocr_strategy_deu_ocr_pdf() {
     let extractor = Extractor::new()
         .set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
         .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::NO_OCR));

From eca83f5c1cb4bf751b4a2e422c8b82e36a43e032 Mon Sep 17 00:00:00 2001
From: nmammeri <nadjib@bouita.com>
Date: Mon, 11 Nov 2024 17:24:44 +0100
Subject: [PATCH 09/11] tests: fix ocr test assertions

---
 bindings/extractous-python/tests/test_ocr.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/bindings/extractous-python/tests/test_ocr.py b/bindings/extractous-python/tests/test_ocr.py
index 7f4de09..4baaf76 100644
--- a/bindings/extractous-python/tests/test_ocr.py
+++ b/bindings/extractous-python/tests/test_ocr.py
@@ -1,19 +1,20 @@
 from extractous import Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig
 from utils import cosine_similarity
 
+
 def test_ara_ocr_png():
     ocr_config = TesseractOcrConfig().set_language("ara")
     extractor = Extractor().set_ocr_config(ocr_config)
     result = extractor.extract_file_to_string("../../test_files/documents/ara-ocr.png")
 
-    with open("../../test_files/expected_result/ara-ocr.png.txt", "r",  encoding="utf8") as file:
+    with open("../../test_files/expected_result/ara-ocr.png.txt", "r", encoding="utf8") as file:
         expected = file.read()
 
-    assert cosine_similarity(result, expected)
+    assert cosine_similarity(result, expected) > 0.9
 
 
-def test_ocr_only_strategy_extract_deu_ocr_pdf_to_string():
-    test_file = "../../test_files/documents/eng-ocr.pdf"
+def test_extract_file_to_string_ocr_only_strategy_deu_ocr_pdf():
+    test_file = "../../test_files/documents/deu-ocr.pdf"
     expected_result_file = "../../test_files/expected_result/deu-ocr.pdf.txt"
 
     pdf_config = PdfParserConfig().set_ocr_strategy(PdfOcrStrategy.OCR_ONLY)
@@ -26,12 +27,13 @@ def test_ocr_only_strategy_extract_deu_ocr_pdf_to_string():
 
     result = extractor.extract_file_to_string(test_file)
 
-    with open(expected_result_file, "r",  encoding="utf8") as file:
+    with open(expected_result_file, "r", encoding="utf8") as file:
         expected = file.read()
 
-    assert cosine_similarity(result, expected)
+    assert cosine_similarity(result, expected) > 0.9
+
 
-def test_no_ocr_strategy_extract_deu_ocr_pdf_to_string():
+def test_test_extract_file_to_string_no_ocr_strategy_deu_ocr_pdf():
     test_file = "../../test_files/documents/deu-ocr.pdf"
 
     pdf_config = PdfParserConfig()
@@ -39,8 +41,8 @@ def test_no_ocr_strategy_extract_deu_ocr_pdf_to_string():
     ocr_config = TesseractOcrConfig()
     ocr_config = ocr_config.set_language("deu")
 
-    extractor = Extractor().set_ocr_config(ocr_config).set_pdf_config(PdfParserConfig().set_ocr_strategy(PdfOcrStrategy.NO_OCR))
+    extractor = Extractor().set_ocr_config(ocr_config).set_pdf_config(pdf_config)
 
     result = extractor.extract_file_to_string(test_file)
 
-    assert result.strip() == ""
\ No newline at end of file
+    assert result.strip() == ""

From 9c9c4483194ebe02bd1fd45d5df1e47a59d25a40 Mon Sep 17 00:00:00 2001
From: nmammeri <nadjib@bouita.com>
Date: Mon, 11 Nov 2024 21:24:27 +0100
Subject: [PATCH 10/11] chore: update reflection data for macos

---
 .../META-INF/native-image/jni-config.json     | 180 ++++++++++++++++++
 .../META-INF/native-image/reflect-config.json |   9 +
 .../native-image/resource-config.json         |  41 ++++
 .../native-image/serialization-config.json    |   3 +
 4 files changed, 233 insertions(+)

diff --git a/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json b/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json
index 496d5d3..5c074ba 100644
--- a/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json
+++ b/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json
@@ -95,6 +95,31 @@
         ],
         "name": "ai.yobix.TikaNativeMain"
     },
+    {
+        "methods": [
+            {
+                "name": "handleNativeNotification",
+                "parameterTypes": [
+                    "int"
+                ]
+            }
+        ],
+        "name": "com.apple.eawt._AppEventHandler"
+    },
+    {
+        "methods": [
+            {
+                "name": "initMenuStates",
+                "parameterTypes": [
+                    "boolean",
+                    "boolean",
+                    "boolean",
+                    "boolean"
+                ]
+            }
+        ],
+        "name": "com.apple.eawt._AppMenuBarHandler"
+    },
     {
         "methods": [
             {
@@ -166,6 +191,41 @@
         ],
         "name": "com.sun.imageio.plugins.jpeg.JPEGImageReader"
     },
+    {
+        "methods": [
+            {
+                "name": "grabPixels",
+                "parameterTypes": [
+                    "int"
+                ]
+            },
+            {
+                "name": "warningOccurred",
+                "parameterTypes": [
+                    "int"
+                ]
+            },
+            {
+                "name": "warningWithMessage",
+                "parameterTypes": [
+                    "java.lang.String"
+                ]
+            },
+            {
+                "name": "writeMetadata",
+                "parameterTypes": []
+            },
+            {
+                "name": "writeOutputData",
+                "parameterTypes": [
+                    "byte[]",
+                    "int",
+                    "int"
+                ]
+            }
+        ],
+        "name": "com.sun.imageio.plugins.jpeg.JPEGImageWriter"
+    },
     {
         "fields": [
             {
@@ -202,6 +262,20 @@
         ],
         "name": "java.awt.Dimension"
     },
+    {
+        "methods": [
+            {
+                "name": "<init>",
+                "parameterTypes": [
+                    "int",
+                    "int",
+                    "int",
+                    "int"
+                ]
+            }
+        ],
+        "name": "java.awt.DisplayMode"
+    },
     {
         "methods": [
             {
@@ -268,6 +342,15 @@
         ],
         "name": "java.awt.Toolkit"
     },
+    {
+        "methods": [
+            {
+                "name": "getButtonDownMasks",
+                "parameterTypes": []
+            }
+        ],
+        "name": "java.awt.event.InputEvent"
+    },
     {
         "fields": [
             {
@@ -313,6 +396,20 @@
         ],
         "name": "java.awt.geom.Path2D$Float"
     },
+    {
+        "methods": [
+            {
+                "name": "<init>",
+                "parameterTypes": [
+                    "double",
+                    "double",
+                    "double",
+                    "double"
+                ]
+            }
+        ],
+        "name": "java.awt.geom.Rectangle2D$Double"
+    },
     {
         "fields": [
             {
@@ -508,11 +605,41 @@
     },
     {
         "methods": [
+            {
+                "name": "lastIndexOf",
+                "parameterTypes": [
+                    "int"
+                ]
+            },
+            {
+                "name": "substring",
+                "parameterTypes": [
+                    "int"
+                ]
+            }
+        ],
+        "name": "java.lang.String"
+    },
+    {
+        "methods": [
+            {
+                "name": "getProperty",
+                "parameterTypes": [
+                    "java.lang.String"
+                ]
+            },
             {
                 "name": "load",
                 "parameterTypes": [
                     "java.lang.String"
                 ]
+            },
+            {
+                "name": "setProperty",
+                "parameterTypes": [
+                    "java.lang.String",
+                    "java.lang.String"
+                ]
             }
         ],
         "name": "java.lang.System"
@@ -721,6 +848,19 @@
         ],
         "name": "org.apache.tika.parser.pdf.PDFParserConfig"
     },
+    {
+        "methods": [
+            {
+                "name": "notifyToolkitThreadBusy",
+                "parameterTypes": []
+            },
+            {
+                "name": "notifyToolkitThreadFree",
+                "parameterTypes": []
+            }
+        ],
+        "name": "sun.awt.AWTAutoShutdown"
+    },
     {
         "fields": [
             {
@@ -814,6 +954,26 @@
         ],
         "name": "sun.awt.image.ByteComponentRaster"
     },
+    {
+        "fields": [
+            {
+                "name": "data"
+            },
+            {
+                "name": "dataBitOffset"
+            },
+            {
+                "name": "pixelBitStride"
+            },
+            {
+                "name": "scanlineStride"
+            },
+            {
+                "name": "type"
+            }
+        ],
+        "name": "sun.awt.image.BytePackedRaster"
+    },
     {
         "fields": [
             {
@@ -1356,5 +1516,25 @@
             }
         ],
         "name": "sun.java2d.xr.XRSurfaceData"
+    },
+    {
+        "methods": [
+            {
+                "name": "installToolkitThreadInJava",
+                "parameterTypes": []
+            }
+        ],
+        "name": "sun.lwawt.macosx.LWCToolkit"
+    },
+    {
+        "methods": [
+            {
+                "name": "main",
+                "parameterTypes": [
+                    "java.lang.String[]"
+                ]
+            }
+        ],
+        "name": "worker.org.gradle.process.internal.worker.GradleWorkerMain"
     }
 ]
\ No newline at end of file
diff --git a/extractous-core/tika-native/src/main/resources/META-INF/native-image/reflect-config.json b/extractous-core/tika-native/src/main/resources/META-INF/native-image/reflect-config.json
index 7a2a2d9..2f7f67a 100644
--- a/extractous-core/tika-native/src/main/resources/META-INF/native-image/reflect-config.json
+++ b/extractous-core/tika-native/src/main/resources/META-INF/native-image/reflect-config.json
@@ -20,6 +20,15 @@
     {
         "name": "[Lsun.security.pkcs.SignerInfo;"
     },
+    {
+        "methods": [
+            {
+                "name": "<init>",
+                "parameterTypes": []
+            }
+        ],
+        "name": "apple.security.AppleProvider"
+    },
     {
         "methods": [
             {
diff --git a/extractous-core/tika-native/src/main/resources/META-INF/native-image/resource-config.json b/extractous-core/tika-native/src/main/resources/META-INF/native-image/resource-config.json
index 80776b9..ea2099b 100644
--- a/extractous-core/tika-native/src/main/resources/META-INF/native-image/resource-config.json
+++ b/extractous-core/tika-native/src/main/resources/META-INF/native-image/resource-config.json
@@ -3,6 +3,7 @@
         {
             "locales": [
                 "en",
+                "en-GB",
                 "en-US",
                 "und"
             ],
@@ -10,6 +11,7 @@
         },
         {
             "locales": [
+                "en-GB",
                 "en-US",
                 "und"
             ],
@@ -20,13 +22,25 @@
                 "sun.awt.resources.awt"
             ],
             "locales": [
+                "en-GB",
                 "en-US"
             ],
             "name": "sun.awt.resources.awt"
         },
+        {
+            "classNames": [
+                "sun.awt.resources.awtosx"
+            ],
+            "locales": [
+                "en-GB"
+            ],
+            "name": "sun.awt.resources.awtosx"
+        },
         {
             "locales": [
                 "en",
+                "en-001",
+                "en-GB",
                 "en-US",
                 "und"
             ],
@@ -224,6 +238,9 @@
             {
                 "pattern": "\\Qorg/apache/pdfbox/resources/glyphlist/zapfdingbats.txt\\E"
             },
+            {
+                "pattern": "\\Qorg/apache/pdfbox/resources/icc/ISOcoated_v2_300_bas.icc\\E"
+            },
             {
                 "pattern": "\\Qorg/apache/pdfbox/resources/text/BidiMirroring.txt\\E"
             },
@@ -749,6 +766,9 @@
             {
                 "pattern": "\\Qorg/apache/xerces/impl/msg/SAXMessages_en.properties\\E"
             },
+            {
+                "pattern": "\\Qorg/apache/xerces/impl/msg/SAXMessages_en_GB.properties\\E"
+            },
             {
                 "pattern": "\\Qorg/apache/xerces/impl/msg/SAXMessages_en_US.properties\\E"
             },
@@ -758,6 +778,9 @@
             {
                 "pattern": "\\Qorg/apache/xmlbeans/impl/regex/message_en.properties\\E"
             },
+            {
+                "pattern": "\\Qorg/apache/xmlbeans/impl/regex/message_en_GB.properties\\E"
+            },
             {
                 "pattern": "\\Qorg/apache/xmlbeans/impl/regex/message_en_US.properties\\E"
             },
@@ -785,6 +808,9 @@
             {
                 "pattern": "\\Qorg/slf4j/impl/StaticLoggerBinder.class\\E"
             },
+            {
+                "pattern": "java.base:\\Qjdk/internal/icu/impl/data/icudt72b/nfc.nrm\\E"
+            },
             {
                 "pattern": "java.base:\\Qjdk/internal/icu/impl/data/icudt72b/nfkc.nrm\\E"
             },
@@ -794,6 +820,9 @@
             {
                 "pattern": "java.base:\\Qjdk/internal/icu/impl/data/icudt72b/uprops.icu\\E"
             },
+            {
+                "pattern": "java.base:\\Qjdk/internal/icu/impl/data/icudt74b/nfc.nrm\\E"
+            },
             {
                 "pattern": "java.base:\\Qjdk/internal/icu/impl/data/icudt74b/nfkc.nrm\\E"
             },
@@ -809,9 +838,21 @@
             {
                 "pattern": "java.desktop:\\Qsun/awt/resources/awt_en.properties\\E"
             },
+            {
+                "pattern": "java.desktop:\\Qsun/awt/resources/awt_en_GB.properties\\E"
+            },
             {
                 "pattern": "java.desktop:\\Qsun/awt/resources/awt_en_US.properties\\E"
             },
+            {
+                "pattern": "java.desktop:\\Qsun/awt/resources/awtosx_en.properties\\E"
+            },
+            {
+                "pattern": "java.desktop:\\Qsun/awt/resources/awtosx_en_GB.properties\\E"
+            },
+            {
+                "pattern": "java.desktop:\\Qsun/java2d/cmm/profiles/GRAY.pf\\E"
+            },
             {
                 "pattern": "java.desktop:\\Qsun/java2d/cmm/profiles/sRGB.pf\\E"
             },
diff --git a/extractous-core/tika-native/src/main/resources/META-INF/native-image/serialization-config.json b/extractous-core/tika-native/src/main/resources/META-INF/native-image/serialization-config.json
index 69a1360..b287a71 100644
--- a/extractous-core/tika-native/src/main/resources/META-INF/native-image/serialization-config.json
+++ b/extractous-core/tika-native/src/main/resources/META-INF/native-image/serialization-config.json
@@ -8,6 +8,9 @@
         {
             "name": "java.lang.AssertionError"
         },
+        {
+            "name": "java.lang.Boolean"
+        },
         {
             "name": "java.lang.Enum"
         },

From 1bc7fa86e50452373a5a820da3ba54c1c23a3d20 Mon Sep 17 00:00:00 2001
From: nmammeri <nadjib@bouita.com>
Date: Tue, 12 Nov 2024 09:50:29 +0100
Subject: [PATCH 11/11] Revert "chore: update reflection data for macos"

This reverts commit 9c9c4483194ebe02bd1fd45d5df1e47a59d25a40.
---
 .../META-INF/native-image/jni-config.json     | 180 ------------------
 .../META-INF/native-image/reflect-config.json |   9 -
 .../native-image/resource-config.json         |  41 ----
 .../native-image/serialization-config.json    |   3 -
 4 files changed, 233 deletions(-)

diff --git a/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json b/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json
index 5c074ba..496d5d3 100644
--- a/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json
+++ b/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json
@@ -95,31 +95,6 @@
         ],
         "name": "ai.yobix.TikaNativeMain"
     },
-    {
-        "methods": [
-            {
-                "name": "handleNativeNotification",
-                "parameterTypes": [
-                    "int"
-                ]
-            }
-        ],
-        "name": "com.apple.eawt._AppEventHandler"
-    },
-    {
-        "methods": [
-            {
-                "name": "initMenuStates",
-                "parameterTypes": [
-                    "boolean",
-                    "boolean",
-                    "boolean",
-                    "boolean"
-                ]
-            }
-        ],
-        "name": "com.apple.eawt._AppMenuBarHandler"
-    },
     {
         "methods": [
             {
@@ -191,41 +166,6 @@
         ],
         "name": "com.sun.imageio.plugins.jpeg.JPEGImageReader"
     },
-    {
-        "methods": [
-            {
-                "name": "grabPixels",
-                "parameterTypes": [
-                    "int"
-                ]
-            },
-            {
-                "name": "warningOccurred",
-                "parameterTypes": [
-                    "int"
-                ]
-            },
-            {
-                "name": "warningWithMessage",
-                "parameterTypes": [
-                    "java.lang.String"
-                ]
-            },
-            {
-                "name": "writeMetadata",
-                "parameterTypes": []
-            },
-            {
-                "name": "writeOutputData",
-                "parameterTypes": [
-                    "byte[]",
-                    "int",
-                    "int"
-                ]
-            }
-        ],
-        "name": "com.sun.imageio.plugins.jpeg.JPEGImageWriter"
-    },
     {
         "fields": [
             {
@@ -262,20 +202,6 @@
         ],
         "name": "java.awt.Dimension"
     },
-    {
-        "methods": [
-            {
-                "name": "<init>",
-                "parameterTypes": [
-                    "int",
-                    "int",
-                    "int",
-                    "int"
-                ]
-            }
-        ],
-        "name": "java.awt.DisplayMode"
-    },
     {
         "methods": [
             {
@@ -342,15 +268,6 @@
         ],
         "name": "java.awt.Toolkit"
     },
-    {
-        "methods": [
-            {
-                "name": "getButtonDownMasks",
-                "parameterTypes": []
-            }
-        ],
-        "name": "java.awt.event.InputEvent"
-    },
     {
         "fields": [
             {
@@ -396,20 +313,6 @@
         ],
         "name": "java.awt.geom.Path2D$Float"
     },
-    {
-        "methods": [
-            {
-                "name": "<init>",
-                "parameterTypes": [
-                    "double",
-                    "double",
-                    "double",
-                    "double"
-                ]
-            }
-        ],
-        "name": "java.awt.geom.Rectangle2D$Double"
-    },
     {
         "fields": [
             {
@@ -605,41 +508,11 @@
     },
     {
         "methods": [
-            {
-                "name": "lastIndexOf",
-                "parameterTypes": [
-                    "int"
-                ]
-            },
-            {
-                "name": "substring",
-                "parameterTypes": [
-                    "int"
-                ]
-            }
-        ],
-        "name": "java.lang.String"
-    },
-    {
-        "methods": [
-            {
-                "name": "getProperty",
-                "parameterTypes": [
-                    "java.lang.String"
-                ]
-            },
             {
                 "name": "load",
                 "parameterTypes": [
                     "java.lang.String"
                 ]
-            },
-            {
-                "name": "setProperty",
-                "parameterTypes": [
-                    "java.lang.String",
-                    "java.lang.String"
-                ]
             }
         ],
         "name": "java.lang.System"
@@ -848,19 +721,6 @@
         ],
         "name": "org.apache.tika.parser.pdf.PDFParserConfig"
     },
-    {
-        "methods": [
-            {
-                "name": "notifyToolkitThreadBusy",
-                "parameterTypes": []
-            },
-            {
-                "name": "notifyToolkitThreadFree",
-                "parameterTypes": []
-            }
-        ],
-        "name": "sun.awt.AWTAutoShutdown"
-    },
     {
         "fields": [
             {
@@ -954,26 +814,6 @@
         ],
         "name": "sun.awt.image.ByteComponentRaster"
     },
-    {
-        "fields": [
-            {
-                "name": "data"
-            },
-            {
-                "name": "dataBitOffset"
-            },
-            {
-                "name": "pixelBitStride"
-            },
-            {
-                "name": "scanlineStride"
-            },
-            {
-                "name": "type"
-            }
-        ],
-        "name": "sun.awt.image.BytePackedRaster"
-    },
     {
         "fields": [
             {
@@ -1516,25 +1356,5 @@
             }
         ],
         "name": "sun.java2d.xr.XRSurfaceData"
-    },
-    {
-        "methods": [
-            {
-                "name": "installToolkitThreadInJava",
-                "parameterTypes": []
-            }
-        ],
-        "name": "sun.lwawt.macosx.LWCToolkit"
-    },
-    {
-        "methods": [
-            {
-                "name": "main",
-                "parameterTypes": [
-                    "java.lang.String[]"
-                ]
-            }
-        ],
-        "name": "worker.org.gradle.process.internal.worker.GradleWorkerMain"
     }
 ]
\ No newline at end of file
diff --git a/extractous-core/tika-native/src/main/resources/META-INF/native-image/reflect-config.json b/extractous-core/tika-native/src/main/resources/META-INF/native-image/reflect-config.json
index 2f7f67a..7a2a2d9 100644
--- a/extractous-core/tika-native/src/main/resources/META-INF/native-image/reflect-config.json
+++ b/extractous-core/tika-native/src/main/resources/META-INF/native-image/reflect-config.json
@@ -20,15 +20,6 @@
     {
         "name": "[Lsun.security.pkcs.SignerInfo;"
     },
-    {
-        "methods": [
-            {
-                "name": "<init>",
-                "parameterTypes": []
-            }
-        ],
-        "name": "apple.security.AppleProvider"
-    },
     {
         "methods": [
             {
diff --git a/extractous-core/tika-native/src/main/resources/META-INF/native-image/resource-config.json b/extractous-core/tika-native/src/main/resources/META-INF/native-image/resource-config.json
index ea2099b..80776b9 100644
--- a/extractous-core/tika-native/src/main/resources/META-INF/native-image/resource-config.json
+++ b/extractous-core/tika-native/src/main/resources/META-INF/native-image/resource-config.json
@@ -3,7 +3,6 @@
         {
             "locales": [
                 "en",
-                "en-GB",
                 "en-US",
                 "und"
             ],
@@ -11,7 +10,6 @@
         },
         {
             "locales": [
-                "en-GB",
                 "en-US",
                 "und"
             ],
@@ -22,25 +20,13 @@
                 "sun.awt.resources.awt"
             ],
             "locales": [
-                "en-GB",
                 "en-US"
             ],
             "name": "sun.awt.resources.awt"
         },
-        {
-            "classNames": [
-                "sun.awt.resources.awtosx"
-            ],
-            "locales": [
-                "en-GB"
-            ],
-            "name": "sun.awt.resources.awtosx"
-        },
         {
             "locales": [
                 "en",
-                "en-001",
-                "en-GB",
                 "en-US",
                 "und"
             ],
@@ -238,9 +224,6 @@
             {
                 "pattern": "\\Qorg/apache/pdfbox/resources/glyphlist/zapfdingbats.txt\\E"
             },
-            {
-                "pattern": "\\Qorg/apache/pdfbox/resources/icc/ISOcoated_v2_300_bas.icc\\E"
-            },
             {
                 "pattern": "\\Qorg/apache/pdfbox/resources/text/BidiMirroring.txt\\E"
             },
@@ -766,9 +749,6 @@
             {
                 "pattern": "\\Qorg/apache/xerces/impl/msg/SAXMessages_en.properties\\E"
             },
-            {
-                "pattern": "\\Qorg/apache/xerces/impl/msg/SAXMessages_en_GB.properties\\E"
-            },
             {
                 "pattern": "\\Qorg/apache/xerces/impl/msg/SAXMessages_en_US.properties\\E"
             },
@@ -778,9 +758,6 @@
             {
                 "pattern": "\\Qorg/apache/xmlbeans/impl/regex/message_en.properties\\E"
             },
-            {
-                "pattern": "\\Qorg/apache/xmlbeans/impl/regex/message_en_GB.properties\\E"
-            },
             {
                 "pattern": "\\Qorg/apache/xmlbeans/impl/regex/message_en_US.properties\\E"
             },
@@ -808,9 +785,6 @@
             {
                 "pattern": "\\Qorg/slf4j/impl/StaticLoggerBinder.class\\E"
             },
-            {
-                "pattern": "java.base:\\Qjdk/internal/icu/impl/data/icudt72b/nfc.nrm\\E"
-            },
             {
                 "pattern": "java.base:\\Qjdk/internal/icu/impl/data/icudt72b/nfkc.nrm\\E"
             },
@@ -820,9 +794,6 @@
             {
                 "pattern": "java.base:\\Qjdk/internal/icu/impl/data/icudt72b/uprops.icu\\E"
             },
-            {
-                "pattern": "java.base:\\Qjdk/internal/icu/impl/data/icudt74b/nfc.nrm\\E"
-            },
             {
                 "pattern": "java.base:\\Qjdk/internal/icu/impl/data/icudt74b/nfkc.nrm\\E"
             },
@@ -838,21 +809,9 @@
             {
                 "pattern": "java.desktop:\\Qsun/awt/resources/awt_en.properties\\E"
             },
-            {
-                "pattern": "java.desktop:\\Qsun/awt/resources/awt_en_GB.properties\\E"
-            },
             {
                 "pattern": "java.desktop:\\Qsun/awt/resources/awt_en_US.properties\\E"
             },
-            {
-                "pattern": "java.desktop:\\Qsun/awt/resources/awtosx_en.properties\\E"
-            },
-            {
-                "pattern": "java.desktop:\\Qsun/awt/resources/awtosx_en_GB.properties\\E"
-            },
-            {
-                "pattern": "java.desktop:\\Qsun/java2d/cmm/profiles/GRAY.pf\\E"
-            },
             {
                 "pattern": "java.desktop:\\Qsun/java2d/cmm/profiles/sRGB.pf\\E"
             },
diff --git a/extractous-core/tika-native/src/main/resources/META-INF/native-image/serialization-config.json b/extractous-core/tika-native/src/main/resources/META-INF/native-image/serialization-config.json
index b287a71..69a1360 100644
--- a/extractous-core/tika-native/src/main/resources/META-INF/native-image/serialization-config.json
+++ b/extractous-core/tika-native/src/main/resources/META-INF/native-image/serialization-config.json
@@ -8,9 +8,6 @@
         {
             "name": "java.lang.AssertionError"
         },
-        {
-            "name": "java.lang.Boolean"
-        },
         {
             "name": "java.lang.Enum"
         },