Merge pull request #22 from yobix-ai/7-implement-extracting-from-an-a…

…rray-of-bytes 7 implement extracting from an array of bytes
yobix-ai · Nov 12, 2024 · a08e218 · a08e218
2 parents 2db7f6e + 1bc7fa8
commit a08e218
Show file tree

Hide file tree

Showing 21 changed files with 653 additions and 114 deletions.
diff --git a/README.md b/README.md
@@ -19,15 +19,15 @@
 <div align="center">
 
 _Extractous offers a fast and efficient solution for extracting content and metadata from various documents types such as PDF, Word, HTML, and [many other formats](#supported-file-formats).
-Our goal is to deliver a fast and efficient comprehensive solution in Rust with bindings for many programming 
+Our goal is to deliver a fast and efficient comprehensive solution in Rust with bindings for many programming
 languages._
 
 </div>
 
 ---
 
 **Demo**: showing that [Extractous 🚀](https://github.com/yobix-ai/extractous) is **25x faster** than the popular
-[unstructured-io](https://github.com/Unstructured-IO/unstructured) library ($65m in funding and 8.5k GitHub stars). 
+[unstructured-io](https://github.com/Unstructured-IO/unstructured) library ($65m in funding and 8.5k GitHub stars).
 For complete benchmarking details please consult our [benchmarking repository](https://github.com/yobix-ai/extractous-benchmarks)
 
 ![unstructured_vs_extractous](https://github.com/yobix-ai/extractous-benchmarks/raw/main/docs/extractous_vs_unstructured.gif)
@@ -55,7 +55,7 @@ With Extractous, the need for external services or APIs is eliminated, making da
 * High-performance unstructured data extraction optimized for speed and low memory usage.
 * Clear and simple API for extracting text and metadata content.
 * Automatically identifies document types and extracts content accordingly
-* Supports [many file formats](#supported-file-formats) (most formats supported by Apache Tika). 
+* Supports [many file formats](#supported-file-formats) (most formats supported by Apache Tika).
 * Extracts text from images and scanned documents with OCR through [tesseract-ocr](https://github.com/tesseract-ocr/tesseract).
 * Core engine written in Rust with bindings for [Python](https://pypi.org/project/extractous/) and upcoming support for JavaScript/TypeScript.
 * Detailed documentation and examples to help you get started quickly and efficiently.
@@ -77,13 +77,20 @@ extractor.set_extract_string_max_length(1000)
 result = extractor.extract_file_to_string("README.md")
 print(result)
 ```
-* Extracting a file to a buffered stream:
+* Extracting a file (or a URL / bytearray) to a buffered stream:
 
 ```python
 from extractous import Extractor
 
 extractor = Extractor()
+# for file
 reader = extractor.extract_file("tests/quarkus.pdf")
+# for url
+# reader = extractor.extract_url("https://www.google.com")
+# for bytearray
+# with open("tests/quarkus.pdf", "rb") as file:
+#     buffer = bytearray(file.read())
+# reader = extractor.extract_bytes(buffer)
 
 result = ""
 buffer = reader.read(4096)
@@ -122,9 +129,10 @@ fn main() {
 }
 ```
 
-* Extract a content of a file to a `StreamReader` and perform buffered reading
+* Extract the content of a file (or a URL / bytes) to a `StreamReader` and perform buffered reading
 ```rust
-use std::io::Read;
+use std::io::{BufReader, Read};
+// use std::fs::File; // only needed for the bytes example below
 use extractous::Extractor;
 
 fn main() {
@@ -135,17 +143,25 @@ fn main() {
     // Extract the provided file content to a string
     let extractor = Extractor::new();
     let stream = extractor.extract_file(file_path).unwrap();
+    // Extract url
+    // let stream = extractor.extract_url("https://www.google.com/").unwrap();
+    // Extract bytes
+    // let mut file = File::open(file_path).unwrap();
+    // let mut file_bytes = Vec::new();
+    // file.read_to_end(&mut file_bytes).unwrap();
+    // let stream = extractor.extract_bytes(&file_bytes).unwrap();
 
     // Because stream implements std::io::Read trait we can perform buffered reading
     // For example we can use it to create a BufReader
+    let mut reader = BufReader::new(stream);
     let mut buffer = Vec::new();
-    stream.read_to_end(&mut buffer).unwrap();
+    reader.read_to_end(&mut buffer).unwrap();
 
     println!("{}", String::from_utf8(buffer).unwrap())
 }
 ```
 
-* Extract content of PDF with OCR. 
+* Extract content of PDF with OCR.
 
 You need to have Tesseract installed with the language pack. For example on debian `sudo apt install tesseract-ocr tesseract-ocr-deu`
 
@@ -154,7 +170,7 @@ use extractous::Extractor;
 
 fn main() {
   let file_path = "../test_files/documents/deu-ocr.pdf";
-  
+
     let extractor = Extractor::new()
           .set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
           .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY));
@@ -204,4 +220,4 @@ fn main() {
 Contributions are welcome! Please open an issue or submit a pull request if you have any improvements or new features to propose.
 
 ## 🕮 License
-This project is licensed under the Apache License 2.0. See the LICENSE file for details.
+This project is licensed under the Apache License 2.0. See the LICENSE file for details.
diff --git a/bindings/extractous-python/README.md b/bindings/extractous-python/README.md
@@ -1,6 +1,6 @@
 # Extractous Python Bindings
 
-This project provides Python bindings for the Extractous library, allowing you to use extractous functionality in 
+This project provides Python bindings for the Extractous library, allowing you to use extractous functionality in
 your Python applications.
 
 ## Installation
@@ -25,13 +25,20 @@ result = extractor.extract_file_to_string("README.md")
 print(result)
 ```
 
-Extracting a file to a buffered stream:
+Extracting a file (or a URL / bytearray) to a buffered stream:
 
 ```python
 from extractous import Extractor
 
 extractor = Extractor()
+# for file
 reader = extractor.extract_file("tests/quarkus.pdf")
+# for url
+# reader = extractor.extract_url("https://www.google.com")
+# for bytearray
+# with open("tests/quarkus.pdf", "rb") as file:
+#     buffer = bytearray(file.read())
+# reader = extractor.extract_bytes(buffer)
 
 result = ""
 buffer = reader.read(4096)
@@ -51,4 +58,4 @@ extractor = Extractor().set_ocr_config(TesseractOcrConfig().set_language("deu"))
 result = extractor.extract_file_to_string("../../test_files/documents/eng-ocr.pdf")
 
 print(result)
-```
+```
diff --git a/bindings/extractous-python/examples/extract_to_stream.py b/bindings/extractous-python/examples/extract_to_stream.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+import os
+import sys
+
+from extractous import Extractor, PdfOcrStrategy, PdfParserConfig
+
+
+def extract_to_stream(file_path: str):
+
+    # Extract the file
+    extractor = Extractor()
+    reader = extractor.extract_file(in_file)
+
+    buffer = bytearray(4096 * 4096)
+    while True:
+        bytes_read = reader.readinto(buffer)
+        # If no more data, exit the loop
+        if bytes_read == 0:
+            break
+        # Decode the valid portion of the buffer and append it to the result
+        chunk = buffer[:bytes_read].decode('utf-8')
+        print(chunk)
+
+
+# Script entry point: expects exactly one argument, the file to extract.
+if __name__ == '__main__':
+    # Parse input args
+    if len(sys.argv) != 2:
+        print(f"Usage: '{sys.argv[0]}' <filename>")
+        sys.exit(1)
+    in_file = sys.argv[1]
+    # Fail early with a clear error when the path is not an existing file
+    if not os.path.isfile(in_file):
+        raise FileNotFoundError(f"No such file: '{in_file}'")
+
+    extract_to_stream(in_file)
diff --git a/bindings/extractous-python/src/extractor.rs b/bindings/extractous-python/src/extractor.rs
@@ -75,6 +75,18 @@ impl StreamReader {
             ))),
         }
     }
+
+    /// Reads extracted bytes into the given Python `bytearray`.
+    ///
+    /// Performs a single `read` on the underlying reader, writing directly into
+    /// the memory of `buf`, and returns the number of bytes written.
+    ///
+    /// # Errors
+    ///
+    /// A failed read is raised on the Python side as an `IOError` carrying the
+    /// message of the underlying error.
+    pub fn readinto<'py>(&mut self, buf: Bound<'py, PyByteArray>) -> PyResult<usize> {
+        // SAFETY: the mutable slice is only used for the duration of this call,
+        // and `buf` keeps the bytearray alive meanwhile. This assumes that
+        // `self.reader.read` neither runs Python code nor releases the GIL, so
+        // the bytearray cannot be resized or mutated concurrently.
+        // TODO(review): confirm against the reader implementation.
+        let bs = unsafe { buf.as_bytes_mut() };
+
+        self.reader
+            .read(bs)
+            .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(e.to_string()))
+    }
+
 }
 
 /// `Extractor` is the entry for all extract APIs
@@ -147,6 +159,39 @@ impl Extractor {
             .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))
     }
 
+    /// Extracts text from the contents of a Python `bytearray`.
+    ///
+    /// The bytes are copied out of `buffer` before being handed to the wrapped
+    /// extractor. Returns a `StreamReader` over the extracted text; the stream
+    /// is decoded using the extractor's `encoding`.
+    ///
+    /// # Errors
+    ///
+    /// An extraction failure is raised as a Python `TypeError` whose message is
+    /// the debug representation of the underlying error.
+    pub fn extract_bytes(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult<StreamReader> {
+        // Take an owned copy of the bytearray contents
+        let bytes = buffer.to_vec();
+
+        match self.0.extract_bytes(&bytes) {
+            // Wrap the reader in a `StreamReader` whose buffer starts with a
+            // capacity of ecore::DEFAULT_BUF_SIZE bytes
+            Ok(reader) => Ok(StreamReader {
+                reader,
+                buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
+                py_bytes: None,
+            }),
+            Err(e) => Err(PyErr::new::<PyTypeError, _>(format!("{:?}", e))),
+        }
+    }
+
+    /// Extracts text from the resource at `url`.
+    ///
+    /// Returns a `StreamReader` over the extracted text, like `extract_bytes`
+    /// does for in-memory data.
+    ///
+    /// # Errors
+    ///
+    /// An extraction failure is raised as a Python `TypeError` whose message is
+    /// the debug representation of the underlying error.
+    pub fn extract_url(&self, url: &str) -> PyResult<StreamReader> {
+        // `url` is already a `&str`; pass it through without re-borrowing
+        let reader = self
+            .0
+            .extract_url(url)
+            .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
+
+        // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
+        Ok(StreamReader {
+            reader,
+            buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
+            py_bytes: None,
+        })
+    }
+
     fn __repr__(&self) -> String {
         format!("{:?}", self.0)
     }

diff --git a/bindings/extractous-python/tests/test_extract_bytes_to_stream.py b/bindings/extractous-python/tests/test_extract_bytes_to_stream.py
@@ -0,0 +1,40 @@
+import pytest
+
+from extractous import Extractor
+from utils import cosine_similarity, read_to_string, read_file_to_bytearray
+
+# Parametrization table: each entry is (document file name, similarity threshold).
+# The test asserts that the cosine similarity between the extracted text and the
+# stored expected text is strictly greater than the threshold.
+TEST_CASES = [
+    ("2022_Q3_AAPL.pdf", 0.9),
+    ("science-exploration-1p.pptx", 0.9),
+    ("simple.odt", 0.9),
+    ("table-multi-row-column-cells-actual.csv", 0.9),
+    ("vodafone.xlsx", 0.4),
+    ("category-level.docx", 0.9),
+    ("simple.doc", 0.9),
+    ("simple.pptx", 0.9),
+    # NOTE(review): a -1.0 threshold presumably makes the similarity check always pass - confirm
+    ("table-multi-row-column-cells.png", -1.0),
+    ("winter-sports.epub", 0.9),
+    ("bug_16.docx", 0.9),
+    # Currently disabled case
+    #("eng-ocr.pdf", 0.9),
+]
+
+
+@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
+def test_extract_bytes_to_stream(file_name, target_dist):
+    """Test the extraction from bytes of various file types."""
+    original_filepath = f"../../test_files/documents/{file_name}"
+    expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
+
+    file_bytes = read_file_to_bytearray(original_filepath)
+
+    extractor = Extractor()
+    reader = extractor.extract_bytes(file_bytes)
+    result = read_to_string(reader)
+
+    # Expected
+    with open(expected_result_filepath, "r",  encoding="utf8") as file:
+        expected = file.read()
+
+    assert cosine_similarity(result, expected) > target_dist, \
+        f"Cosine similarity is less than {target_dist} for file: {file_name}"
+
diff --git a/bindings/extractous-python/tests/test_extract_file_to_string.py b/bindings/extractous-python/tests/test_extract_file_to_string.py
@@ -15,9 +15,10 @@
     ("table-multi-row-column-cells.png", -1.0),
     ("winter-sports.epub", 0.9),
     ("bug_16.docx", 0.9),
-    ("deu-ocr.pdf", 0.9),
+    #("eng-ocr.pdf", 0.9),
 ]
 
+
 @pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
 def test_extract_file_to_string(file_name, target_dist):
     """Test the extraction and comparison of various file types."""

diff --git a/bindings/extractous-python/tests/test_extract_url.py b/bindings/extractous-python/tests/test_extract_url.py
@@ -0,0 +1,10 @@
+from extractous import Extractor
+from utils import read_to_string
+
+def test_extract_url():
+    extractor = Extractor()
+
+    reader = extractor.extract_url("https://www.google.com")
+    result = read_to_string(reader)
+
+    assert "Google" in result
diff --git a/bindings/extractous-python/tests/test_ocr.py b/bindings/extractous-python/tests/test_ocr.py
@@ -1,19 +1,20 @@
 from extractous import Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig
 from utils import cosine_similarity
 
+
 def test_ara_ocr_png():
     ocr_config = TesseractOcrConfig().set_language("ara")
     extractor = Extractor().set_ocr_config(ocr_config)
     result = extractor.extract_file_to_string("../../test_files/documents/ara-ocr.png")
 
-    with open("../../test_files/expected_result/ara-ocr.png.txt", "r",  encoding="utf8") as file:
+    with open("../../test_files/expected_result/ara-ocr.png.txt", "r", encoding="utf8") as file:
         expected = file.read()
 
-    assert cosine_similarity(result, expected)
+    assert cosine_similarity(result, expected) > 0.9
 
 
-def test_ocr_only_strategy_extract_deu_ocr_pdf_to_string():
-    test_file = "../../test_files/documents/eng-ocr.pdf"
+def test_extract_file_to_string_ocr_only_strategy_deu_ocr_pdf():
+    test_file = "../../test_files/documents/deu-ocr.pdf"
     expected_result_file = "../../test_files/expected_result/deu-ocr.pdf.txt"
 
     pdf_config = PdfParserConfig().set_ocr_strategy(PdfOcrStrategy.OCR_ONLY)
@@ -26,21 +27,22 @@ def test_ocr_only_strategy_extract_deu_ocr_pdf_to_string():
 
     result = extractor.extract_file_to_string(test_file)
 
-    with open(expected_result_file, "r",  encoding="utf8") as file:
+    with open(expected_result_file, "r", encoding="utf8") as file:
         expected = file.read()
 
-    assert cosine_similarity(result, expected)
+    assert cosine_similarity(result, expected) > 0.9
+
 
-def test_no_ocr_strategy_extract_deu_ocr_pdf_to_string():
+def test_test_extract_file_to_string_no_ocr_strategy_deu_ocr_pdf():
     test_file = "../../test_files/documents/deu-ocr.pdf"
 
     pdf_config = PdfParserConfig()
     pdf_config = pdf_config.set_ocr_strategy(PdfOcrStrategy.NO_OCR)
     ocr_config = TesseractOcrConfig()
     ocr_config = ocr_config.set_language("deu")
 
-    extractor = Extractor().set_ocr_config(ocr_config).set_pdf_config(PdfParserConfig().set_ocr_strategy(PdfOcrStrategy.NO_OCR))
+    extractor = Extractor().set_ocr_config(ocr_config).set_pdf_config(pdf_config)
 
     result = extractor.extract_file_to_string(test_file)
 
-    assert result.strip() == ""
+    assert result.strip() == ""
diff --git a/bindings/extractous-python/tests/test_pdf.py b/bindings/extractous-python/tests/test_pdf.py
@@ -1,4 +1,5 @@
 from extractous import Extractor
+from utils import read_to_string
 
 
 def expected_result():
@@ -12,16 +13,23 @@ def test_extract_file_to_string():
     #print(result)
     assert result == expected_result()
 
-
 def test_extract_file():
     extractor = Extractor()
     reader = extractor.extract_file("tests/quarkus.pdf")
 
-    result = ""
-    b = reader.read(4096)
-    while len(b) > 0:
-        result += b.decode("utf-8")
-        b = reader.read(4096)
+    result = read_to_string(reader)
 
     #print(result)
-    assert result == expected_result()
+    assert result == expected_result()
+
+def test_extract_bytes():
+    extractor = Extractor()
+
+    with open("tests/quarkus.pdf", "rb") as file:
+        buffer = bytearray(file.read())
+    reader = extractor.extract_bytes(buffer)
+
+    result = read_to_string(reader)
+
+    #print(result)
+    assert result == expected_result()