Skip to content

Commit

Permalink
Merge pull request #22 from yobix-ai/7-implement-extracting-from-an-a…
Browse files Browse the repository at this point in the history
…rray-of-bytes

7 implement extracting from an array of bytes
  • Loading branch information
nmammeri authored Nov 12, 2024
2 parents 2db7f6e + 1bc7fa8 commit a08e218
Show file tree
Hide file tree
Showing 21 changed files with 653 additions and 114 deletions.
36 changes: 26 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@
<div align="center">

_Extractous offers a fast and efficient solution for extracting content and metadata from various document types such as PDF, Word, HTML, and [many other formats](#supported-file-formats).
Our goal is to deliver a fast and efficient comprehensive solution in Rust with bindings for many programming
Our goal is to deliver a fast and efficient comprehensive solution in Rust with bindings for many programming
languages._

</div>

---

**Demo**: showing that [Extractous 🚀](https://github.com/yobix-ai/extractous) is **25x faster** than the popular
[unstructured-io](https://github.com/Unstructured-IO/unstructured) library ($65m in funding and 8.5k GitHub stars).
[unstructured-io](https://github.com/Unstructured-IO/unstructured) library ($65m in funding and 8.5k GitHub stars).
For complete benchmarking details please consult our [benchmarking repository](https://github.com/yobix-ai/extractous-benchmarks)

![unstructured_vs_extractous](https://github.com/yobix-ai/extractous-benchmarks/raw/main/docs/extractous_vs_unstructured.gif)
Expand Down Expand Up @@ -55,7 +55,7 @@ With Extractous, the need for external services or APIs is eliminated, making da
* High-performance unstructured data extraction optimized for speed and low memory usage.
* Clear and simple API for extracting text and metadata content.
* Automatically identifies document types and extracts content accordingly
* Supports [many file formats](#supported-file-formats) (most formats supported by Apache Tika).
* Supports [many file formats](#supported-file-formats) (most formats supported by Apache Tika).
* Extracts text from images and scanned documents with OCR through [tesseract-ocr](https://github.com/tesseract-ocr/tesseract).
* Core engine written in Rust with bindings for [Python](https://pypi.org/project/extractous/) and upcoming support for JavaScript/TypeScript.
* Detailed documentation and examples to help you get started quickly and efficiently.
Expand All @@ -77,13 +77,20 @@ extractor.set_extract_string_max_length(1000)
result = extractor.extract_file_to_string("README.md")
print(result)
```
* Extracting a file to a buffered stream:
* Extracting a file (or a URL / bytearray) to a buffered stream:

```python
from extractous import Extractor

extractor = Extractor()
# for file
reader = extractor.extract_file("tests/quarkus.pdf")
# for url
# reader = extractor.extract_url("https://www.google.com")
# for bytearray
# with open("tests/quarkus.pdf", "rb") as file:
# buffer = bytearray(file.read())
# reader = extractor.extract_bytes(buffer)

result = ""
buffer = reader.read(4096)
Expand Down Expand Up @@ -122,9 +129,10 @@ fn main() {
}
```

* Extract a content of a file to a `StreamReader` and perform buffered reading
* Extract the content of a file (or a URL / bytes) to a `StreamReader` and perform buffered reading
```rust
use std::io::Read;
use std::io::{BufReader, Read};
// use std::fs::File; // only needed for the bytes example below
use extractous::Extractor;

fn main() {
Expand All @@ -135,17 +143,25 @@ fn main() {
// Extract the provided file content to a stream
let extractor = Extractor::new();
let stream = extractor.extract_file(file_path).unwrap();
// Extract url
// let stream = extractor.extract_url("https://www.google.com/").unwrap();
// Extract bytes
// let mut file = File::open(file_path).unwrap();
// let mut file_bytes = Vec::new();
// file.read_to_end(&mut file_bytes).unwrap();
// let stream = extractor.extract_bytes(&file_bytes).unwrap();

// Because stream implements std::io::Read trait we can perform buffered reading
// For example we can use it to create a BufReader
let mut reader = BufReader::new(stream);
let mut buffer = Vec::new();
stream.read_to_end(&mut buffer).unwrap();
reader.read_to_end(&mut buffer).unwrap();

println!("{}", String::from_utf8(buffer).unwrap())
}
```

* Extract content of PDF with OCR.
* Extract content of PDF with OCR.

You need to have Tesseract installed with the language pack. For example, on Debian: `sudo apt install tesseract-ocr tesseract-ocr-deu`

Expand All @@ -154,7 +170,7 @@ use extractous::Extractor;

fn main() {
let file_path = "../test_files/documents/deu-ocr.pdf";

let extractor = Extractor::new()
.set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
.set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY));
Expand Down Expand Up @@ -204,4 +220,4 @@ fn main() {
Contributions are welcome! Please open an issue or submit a pull request if you have any improvements or new features to propose.

## 🕮 License
This project is licensed under the Apache License 2.0. See the LICENSE file for details.
This project is licensed under the Apache License 2.0. See the LICENSE file for details.
13 changes: 10 additions & 3 deletions bindings/extractous-python/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Extractous Python Bindings

This project provides Python bindings for the Extractous library, allowing you to use extractous functionality in
This project provides Python bindings for the Extractous library, allowing you to use extractous functionality in
your Python applications.

## Installation
Expand All @@ -25,13 +25,20 @@ result = extractor.extract_file_to_string("README.md")
print(result)
```

Extracting a file to a buffered stream:
Extracting a file (or a URL / bytearray) to a buffered stream:

```python
from extractous import Extractor

extractor = Extractor()
# for file
reader = extractor.extract_file("tests/quarkus.pdf")
# for url
# reader = extractor.extract_url("https://www.google.com")
# for bytearray
# with open("tests/quarkus.pdf", "rb") as file:
# buffer = bytearray(file.read())
# reader = extractor.extract_bytes(buffer)

result = ""
buffer = reader.read(4096)
Expand All @@ -51,4 +58,4 @@ extractor = Extractor().set_ocr_config(TesseractOcrConfig().set_language("deu"))
result = extractor.extract_file_to_string("../../test_files/documents/eng-ocr.pdf")

print(result)
```
```
34 changes: 34 additions & 0 deletions bindings/extractous-python/examples/extract_to_stream.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env python3
import os
import sys

from extractous import Extractor, PdfOcrStrategy, PdfParserConfig


def extract_to_stream(file_path: str):
    """Extract the text of ``file_path`` and print it chunk by chunk.

    The extracted content is pulled from the reader with ``readinto`` into a
    single reusable ``bytearray`` and decoded as UTF-8 before printing.

    Args:
        file_path: Path of the document to extract.

    Raises:
        UnicodeDecodeError: If the extracted bytes are not valid UTF-8, or the
            stream ends in the middle of a multi-byte character.
    """
    # Local import keeps this block self-contained
    import codecs

    # Extract the file (use the parameter, not the module-level `in_file`,
    # so the function also works when imported and called directly)
    extractor = Extractor()
    reader = extractor.extract_file(file_path)

    # A chunk boundary can fall in the middle of a multi-byte UTF-8 character;
    # the incremental decoder keeps the dangling bytes until the next chunk
    # instead of raising on every such split.
    decoder = codecs.getincrementaldecoder('utf-8')()

    buffer = bytearray(4096 * 4096)
    while True:
        bytes_read = reader.readinto(buffer)
        # If no more data, exit the loop
        if bytes_read == 0:
            break
        # Decode only the valid portion of the buffer
        chunk = decoder.decode(buffer[:bytes_read])
        print(chunk)

    # Signal end of input so a truncated trailing character is reported
    # rather than silently dropped
    decoder.decode(b'', final=True)


if __name__ == '__main__':
    # Parse input args: exactly one positional argument (the file to extract) is required
    if len(sys.argv) != 2:
        print(f"Usage: '{sys.argv[0]}' <filename>")
        sys.exit(1)

    in_file = sys.argv[1]

    # Fail early with a clear error when the path is not an existing regular file
    if not os.path.isfile(in_file):
        raise FileNotFoundError(f"No such file: '{in_file}'")

    extract_to_stream(in_file)
45 changes: 45 additions & 0 deletions bindings/extractous-python/src/extractor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,18 @@ impl StreamReader {
))),
}
}

/// Fills the given Python `bytearray` with bytes from the underlying reader.
///
/// Performs a single `read` on the reader, writing directly into the memory of
/// `buf`, and returns how many bytes were written.
///
/// An I/O error from the reader is raised as `PyIOError` carrying the error's
/// display text.
pub fn readinto<'py>(&mut self, buf: Bound<'py, PyByteArray>) -> PyResult<usize> {
    // SAFETY(review): `as_bytes_mut` hands out a mutable view of the bytearray's
    // storage; this is only sound if nothing resizes or otherwise touches `buf`
    // while `target` is alive. The slice is used solely for the single `read`
    // call below -- confirm that the reader never re-enters Python during it.
    let target = unsafe { buf.as_bytes_mut() };

    match self.reader.read(target) {
        Ok(count) => Ok(count),
        Err(err) => Err(PyErr::new::<pyo3::exceptions::PyIOError, _>(
            err.to_string(),
        )),
    }
}

}

/// `Extractor` is the entry for all extract APIs
Expand Down Expand Up @@ -147,6 +159,39 @@ impl Extractor {
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))
}

/// Extracts text from the contents of a Python `bytearray`.
///
/// The bytes are copied out of `buffer` into an owned `Vec` before being handed
/// to the wrapped extractor. On success returns a `StreamReader` over the
/// extracted text; the stream is decoded using the extractor's `encoding`.
///
/// An extraction failure is raised as `PyTypeError` carrying the error's debug
/// representation.
pub fn extract_bytes(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult<StreamReader> {
    let input = buffer.to_vec();

    match self.0.extract_bytes(&input) {
        // Wrap the reader in a `StreamReader` whose internal buffer starts with
        // a capacity of ecore::DEFAULT_BUF_SIZE bytes
        Ok(reader) => Ok(StreamReader {
            reader,
            buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
            py_bytes: None,
        }),
        Err(e) => Err(PyErr::new::<PyTypeError, _>(format!("{:?}", e))),
    }
}

/// Extracts text from the resource located at `url`.
///
/// Returns a `StreamReader` over the extracted text, so the content can be
/// consumed incrementally instead of being materialized as a single string.
///
/// An extraction failure is raised as `PyTypeError` carrying the error's debug
/// representation.
pub fn extract_url(&self, url: &str) -> PyResult<StreamReader> {
    // `url` is already a `&str`; pass it through without re-borrowing
    let reader = self
        .0
        .extract_url(url)
        .map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

    // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
    Ok(StreamReader {
        reader,
        buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
        py_bytes: None,
    })
}

fn __repr__(&self) -> String {
format!("{:?}", self.0)
}
Expand Down
40 changes: 40 additions & 0 deletions bindings/extractous-python/tests/test_extract_bytes_to_stream.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import pytest

from extractous import Extractor
from utils import cosine_similarity, read_to_string, read_file_to_bytearray

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
("science-exploration-1p.pptx", 0.9),
("simple.odt", 0.9),
("table-multi-row-column-cells-actual.csv", 0.9),
("vodafone.xlsx", 0.4),
("category-level.docx", 0.9),
("simple.doc", 0.9),
("simple.pptx", 0.9),
("table-multi-row-column-cells.png", -1.0),
("winter-sports.epub", 0.9),
("bug_16.docx", 0.9),
#("eng-ocr.pdf", 0.9),
]


@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
def test_extract_bytes_to_stream(file_name, target_dist):
    """Extract each sample document from an in-memory bytearray and compare
    the streamed text against the stored expected result."""
    source_path = f"../../test_files/documents/{file_name}"
    expected_path = f"../../test_files/expected_result/{file_name}.txt"

    # Actual: load the raw document bytes and stream the extraction back into a string
    reader = Extractor().extract_bytes(read_file_to_bytearray(source_path))
    result = read_to_string(reader)

    # Expected
    with open(expected_path, "r", encoding="utf8") as expected_file:
        expected = expected_file.read()

    similarity = cosine_similarity(result, expected)
    assert similarity > target_dist, \
        f"Cosine similarity is less than {target_dist} for file: {file_name}"

Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@
("table-multi-row-column-cells.png", -1.0),
("winter-sports.epub", 0.9),
("bug_16.docx", 0.9),
("deu-ocr.pdf", 0.9),
#("eng-ocr.pdf", 0.9),
]


@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
def test_extract_file_to_string(file_name, target_dist):
"""Test the extraction and comparison of various file types."""
Expand Down
10 changes: 10 additions & 0 deletions bindings/extractous-python/tests/test_extract_url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from extractous import Extractor
from utils import read_to_string

def test_extract_url():
    """Extract a live web page through `extract_url` and check that the
    streamed text contains the expected marker word."""
    reader = Extractor().extract_url("https://www.google.com")
    text = read_to_string(reader)

    assert "Google" in text
20 changes: 11 additions & 9 deletions bindings/extractous-python/tests/test_ocr.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
from extractous import Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig
from utils import cosine_similarity


def test_ara_ocr_png():
ocr_config = TesseractOcrConfig().set_language("ara")
extractor = Extractor().set_ocr_config(ocr_config)
result = extractor.extract_file_to_string("../../test_files/documents/ara-ocr.png")

with open("../../test_files/expected_result/ara-ocr.png.txt", "r", encoding="utf8") as file:
with open("../../test_files/expected_result/ara-ocr.png.txt", "r", encoding="utf8") as file:
expected = file.read()

assert cosine_similarity(result, expected)
assert cosine_similarity(result, expected) > 0.9


def test_ocr_only_strategy_extract_deu_ocr_pdf_to_string():
test_file = "../../test_files/documents/eng-ocr.pdf"
def test_extract_file_to_string_ocr_only_strategy_deu_ocr_pdf():
test_file = "../../test_files/documents/deu-ocr.pdf"
expected_result_file = "../../test_files/expected_result/deu-ocr.pdf.txt"

pdf_config = PdfParserConfig().set_ocr_strategy(PdfOcrStrategy.OCR_ONLY)
Expand All @@ -26,21 +27,22 @@ def test_ocr_only_strategy_extract_deu_ocr_pdf_to_string():

result = extractor.extract_file_to_string(test_file)

with open(expected_result_file, "r", encoding="utf8") as file:
with open(expected_result_file, "r", encoding="utf8") as file:
expected = file.read()

assert cosine_similarity(result, expected)
assert cosine_similarity(result, expected) > 0.9


def test_no_ocr_strategy_extract_deu_ocr_pdf_to_string():
def test_test_extract_file_to_string_no_ocr_strategy_deu_ocr_pdf():
test_file = "../../test_files/documents/deu-ocr.pdf"

pdf_config = PdfParserConfig()
pdf_config = pdf_config.set_ocr_strategy(PdfOcrStrategy.NO_OCR)
ocr_config = TesseractOcrConfig()
ocr_config = ocr_config.set_language("deu")

extractor = Extractor().set_ocr_config(ocr_config).set_pdf_config(PdfParserConfig().set_ocr_strategy(PdfOcrStrategy.NO_OCR))
extractor = Extractor().set_ocr_config(ocr_config).set_pdf_config(pdf_config)

result = extractor.extract_file_to_string(test_file)

assert result.strip() == ""
assert result.strip() == ""
22 changes: 15 additions & 7 deletions bindings/extractous-python/tests/test_pdf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from extractous import Extractor
from utils import read_to_string


def expected_result():
Expand All @@ -12,16 +13,23 @@ def test_extract_file_to_string():
#print(result)
assert result == expected_result()


def test_extract_file():
extractor = Extractor()
reader = extractor.extract_file("tests/quarkus.pdf")

result = ""
b = reader.read(4096)
while len(b) > 0:
result += b.decode("utf-8")
b = reader.read(4096)
result = read_to_string(reader)

#print(result)
assert result == expected_result()
assert result == expected_result()

def test_extract_bytes():
    """Extract the sample PDF from an in-memory bytearray and compare the
    streamed text with the expected result."""
    extractor = Extractor()

    with open("tests/quarkus.pdf", "rb") as pdf_file:
        pdf_bytes = bytearray(pdf_file.read())

    reader = extractor.extract_bytes(pdf_bytes)
    text = read_to_string(reader)

    assert text == expected_result()
Loading

0 comments on commit a08e218

Please sign in to comment.