Merge pull request #26 from s4zuk3/feature/3-return-tika-metadata
Feature/3 return tika metadata
nmammeri authored Nov 15, 2024
2 parents a08e218 + dfb991e commit ef326fa
Showing 35 changed files with 2,097 additions and 399 deletions.
33 changes: 33 additions & 0 deletions README.md
@@ -114,6 +114,23 @@ result = extractor.extract_file_to_string("../../test_files/documents/eng-ocr.pd
print(result)
```

* Extracting with metadata:

You can use the `_with_metadata` variants of the extraction methods to also return the file's metadata.

```python
from extractous import Extractor

# Create a new extractor
extractor = Extractor()
extractor.set_extract_string_max_length(1000)

# Extract text from a file
result, metadata = extractor.extract_file_to_string_with_metadata("README.md")
print(result)
print(metadata)
```
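
* Extracting a stream plus metadata:

The bindings added in this commit also expose streaming `_with_metadata` variants (see `extractor.rs` below). A minimal sketch of `extract_file_with_metadata`, which returns a `(StreamReader, dict)` pair; the 4096-byte buffer and the UTF-8 decoding are assumptions, not requirements:

```python
from extractous import Extractor

extractor = Extractor()

# Streaming variant: returns (StreamReader, dict)
reader, metadata = extractor.extract_file_with_metadata("README.md")

# readinto() fills a bytearray and returns the number of bytes read
buf = bytearray(4096)
chunks = []
while True:
    n = reader.readinto(buf)
    if n == 0:
        break
    chunks.append(bytes(buf[:n]))

print(b"".join(chunks).decode("utf-8"))  # assumes the default UTF-8 encoding
print(metadata)
```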

#### Rust
* Extract a file content to a string:
```rust
@@ -129,6 +146,22 @@ fn main() {
}
```

* Extracting with metadata:

```rust
use extractous::Extractor;

fn main() {
// Create a new extractor. Note it uses a consuming builder pattern
let mut extractor = Extractor::new().set_extract_string_max_length(1000);

// Extract text from a file
let (text, metadata) = extractor.extract_file_to_string_with_metadata("README.md").unwrap();
println!("{}", text);
println!("{:?}", metadata);
}
```

* Extract the content of a file (URL/bytes) to a `StreamReader` and perform buffered reading
```rust
use std::io::{BufReader, Read};
// ...
```
2 changes: 1 addition & 1 deletion bindings/extractous-python/Cargo.toml
@@ -18,5 +18,5 @@ doc = false

[dependencies]
# "abi3-py310" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.10
-pyo3 = { version = "0.22.2", features = ["abi3", "abi3-py38"] }
+pyo3 = { version = "0.22.2", features = ["abi3", "abi3-py38", "gil-refs"] }
extractous = { path = "../../extractous-core" }
114 changes: 109 additions & 5 deletions bindings/extractous-python/src/extractor.rs
@@ -2,6 +2,8 @@ use crate::{ecore, OfficeParserConfig, PdfParserConfig, TesseractOcrConfig};
use pyo3::exceptions::PyTypeError;
use pyo3::prelude::*;
use pyo3::types::PyByteArray;
use pyo3::types::PyDict;
use std::collections::HashMap;
use std::io::Read;

// PyO3 supports unit-only enums (which contain only unit variants)
@@ -80,13 +82,12 @@ impl StreamReader {
pub fn readinto<'py>(&mut self, buf: Bound<'py, PyByteArray>) -> PyResult<usize> {
let bs = unsafe { buf.as_bytes_mut() };

-let bytes_read = self.reader.read(bs)
-.map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(
-format!("{}", e))
-)?;
+let bytes_read = self
+.reader
+.read(bs)
+.map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("{}", e)))?;
Ok(bytes_read)
}

}

/// `Extractor` is the entry for all extract APIs
@@ -151,6 +152,30 @@ impl Extractor {
})
}

/// Extracts text from a file path. Returns a tuple of a stream of the extracted text
/// and the Tika metadata; the stream is decoded using the extractor's `encoding`.
pub fn extract_file_with_metadata<'py>(
&self,
filename: &str,
py: Python<'py>,
) -> PyResult<(StreamReader, PyObject)> {
let (reader, metadata) = self
.0
.extract_file_with_metadata(filename)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

// Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
Ok((
StreamReader {
reader,
buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
py_bytes: None,
},
py_metadata.into(),
))
}

/// Extracts text from a file path. Returns a string that is of maximum length
/// of the extractor's `extract_string_max_length`
pub fn extract_file_to_string(&self, filename: &str) -> PyResult<String> {
Expand All @@ -159,6 +184,22 @@ impl Extractor {
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))
}

/// Extracts text from a file path. Returns a tuple of a string, truncated to the
/// extractor's `extract_string_max_length`, and the Tika metadata.
pub fn extract_file_to_string_with_metadata<'py>(
&self,
filename: &str,
py: Python<'py>,
) -> PyResult<(String, PyObject)> {
let (content, metadata) = self
.0
.extract_file_to_string_with_metadata(filename)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
Ok((content, py_metadata.into()))
}

/// Extracts text from a bytearray. Returns a stream of the extracted text
/// the stream is decoded using the extractor's `encoding`
pub fn extract_bytes(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult<StreamReader> {
Expand All @@ -176,6 +217,31 @@ impl Extractor {
})
}

/// Extracts text from a bytearray. Returns a tuple of a stream of the extracted text
/// and the Tika metadata; the stream is decoded using the extractor's `encoding`.
pub fn extract_bytes_with_metadata<'py>(
&self,
buffer: &Bound<'_, PyByteArray>,
py: Python<'py>,
) -> PyResult<(StreamReader, PyObject)> {
let slice = buffer.to_vec();
let (reader, metadata) = self
.0
.extract_bytes_with_metadata(&slice)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

// Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
Ok((
StreamReader {
reader,
buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
py_bytes: None,
},
py_metadata.into(),
))
}

/// Extracts text from a URL. Returns a stream of the extracted text
/// that is decoded using the extractor's `encoding`
pub fn extract_url(&self, url: &str) -> PyResult<StreamReader> {
Expand All @@ -192,7 +258,45 @@ impl Extractor {
})
}

/// Extracts text from a URL. Returns a tuple of a stream of the extracted text
/// and the Tika metadata; the stream is decoded using the extractor's `encoding`.
pub fn extract_url_with_metadata<'py>(
&self,
url: &str,
py: Python<'py>,
) -> PyResult<(StreamReader, PyObject)> {
let (reader, metadata) = self
.0
.extract_url_with_metadata(&url)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

// Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
Ok((
StreamReader {
reader,
buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
py_bytes: None,
},
py_metadata.into(),
))
}

fn __repr__(&self) -> String {
format!("{:?}", self.0)
}
}

/// Converts a `HashMap<String, Vec<String>>` to a `PyDict`
fn metadata_hashmap_to_pydict<'py>(
py: Python<'py>,
hashmap: &HashMap<String, Vec<String>>,
) -> Result<Bound<'py, PyDict>, PyErr> {
let pydict = PyDict::new_bound(py);
for (key, value) in hashmap {
pydict
.set_item(key, value)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
}
Ok(pydict)
}
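
Since `metadata_hashmap_to_pydict` converts each `Vec<String>` value to a Python list, every value in the returned metadata dict is a `list[str]`, even for single-valued fields. A short sketch of consuming it from Python; `Content-Type` is only an illustrative Tika key, not guaranteed for every file:

```python
from extractous import Extractor

extractor = Extractor()
_text, metadata = extractor.extract_file_to_string_with_metadata("README.md")

# Every value is a list[str], so scalar fields still arrive as one-element lists
for key, values in metadata.items():
    print(f"{key}: {values}")

# Take the first element when a single value is expected
content_type = metadata.get("Content-Type", ["unknown"])[0]
print(content_type)
```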
32 changes: 31 additions & 1 deletion bindings/extractous-python/tests/test_extract_bytes_to_stream.py
@@ -1,7 +1,8 @@
import json
import pytest

from extractous import Extractor
-from utils import cosine_similarity, read_to_string, read_file_to_bytearray
+from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
@@ -38,3 +39,32 @@ def test_extract_bytes_to_stream(file_name, target_dist):
assert cosine_similarity(result, expected) > target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"


TEST_CASES_METADATA = [
("2022_Q3_AAPL.pdf", 0.9),
("science-exploration-1p.pptx", 0.9),
("simple.odt", 0.9),
("table-multi-row-column-cells-actual.csv", 0.6),
("vodafone.xlsx", 0.8),
("category-level.docx", 0.9),
("simple.doc", 0.9),
("simple.pptx", 0.9),
("table-multi-row-column-cells.png", 0.9),
("winter-sports.epub", 0.8),
("bug_16.docx", 0.9),
]


@pytest.mark.parametrize("file_name, similarity_percent", TEST_CASES_METADATA)
def test_extract_bytes_to_stream(file_name, similarity_percent):
"""Test the extraction from bytes of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
file_bytes = read_file_to_bytearray(original_filepath)
extractor = Extractor()
_reader, metadata = extractor.extract_bytes_with_metadata(file_bytes)
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity > similarity_percent, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
30 changes: 29 additions & 1 deletion bindings/extractous-python/tests/test_extract_file_to_string.py
@@ -1,7 +1,8 @@
import json
import pytest

from extractous import Extractor
-from utils import cosine_similarity
+from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
@@ -32,3 +33,30 @@ def test_extract_file_to_string(file_name, target_dist):
assert cosine_similarity(result, expected) > target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"


TEST_CASES_METADATA = [
"2022_Q3_AAPL.pdf",
"science-exploration-1p.pptx",
"simple.odt",
"table-multi-row-column-cells-actual.csv",
"vodafone.xlsx",
"category-level.docx",
"simple.doc",
"simple.pptx",
"table-multi-row-column-cells.png",
"winter-sports.epub",
"bug_16.docx",
]

@pytest.mark.parametrize("file_name", TEST_CASES_METADATA)
def test_extract_file_to_string_with_metadata(file_name):
"""Test the extraction and comparison of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
extractor = Extractor()
_result, metadata = extractor.extract_file_to_string_with_metadata(original_filepath)
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

assert is_expected_metadata_contained(expected_metadata, metadata)
5 changes: 5 additions & 0 deletions bindings/extractous-python/tests/test_extract_url.py
@@ -8,3 +8,8 @@ def test_extract_url():
result = read_to_string(reader)

assert "Google" in result

def test_extract_url_with_metadata():
extractor = Extractor()
_reader, metadata = extractor.extract_url_with_metadata("https://www.google.com")
assert len(metadata.keys()) > 0
33 changes: 33 additions & 0 deletions bindings/extractous-python/tests/utils.py
@@ -45,3 +45,36 @@ def read_file_to_bytearray(file_path: str):
with open(file_path, 'rb') as file:
file_content = bytearray(file.read())
return file_content


def is_expected_metadata_contained(expected: dict, current: dict) -> bool:
"""
Check if all keys in `expected` are present in `current` and have identical values.
"""
for key, expected_values in expected.items():
actual_values = current.get(key)
if actual_values is None:
print(f"\nexpected key = {key} not found !!")
return False
elif actual_values != expected_values:
print(f"\nvalues for key = {key} differ!! expected = {expected_values} and actual = {actual_values}")
return False
return True
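
`is_expected_metadata_contained` is a subset check: extra keys in `current` are ignored, while a missing key or a differing value fails. A quick illustration with hypothetical dicts:

```python
expected = {"dc:title": ["Report"]}
current = {"dc:title": ["Report"], "Content-Type": ["application/pdf"]}

assert is_expected_metadata_contained(expected, current)          # extra keys are fine
assert not is_expected_metadata_contained({"x": ["1"]}, current)  # missing key fails
```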


def calculate_similarity_percent(expected, current):
"""Return the ratio (0.0 to 1.0) of keys shared by both dicts whose values match."""
matches = 0
total = 0

# Iterate over all keys in the 'expected' dictionary
for key, value1 in expected.items():
if key in current:
total += 1
if value1 == current[key]:
matches += 1

if total == 0:
return 0.0

# Return the ratio of matching values over shared keys (not an actual percentage)
return matches / total
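
As a worked example, `calculate_similarity_percent` returns a ratio rather than a percentage: with the hypothetical dicts below, two of the three shared keys have equal values, so the result is 2/3:

```python
expected = {
    "dc:title": ["Report"],
    "Content-Type": ["application/pdf"],
    "xmp:CreatorTool": ["Word"],
}
current = {
    "dc:title": ["Report"],
    "Content-Type": ["application/pdf"],
    "xmp:CreatorTool": ["LibreOffice"],
}

# 3 shared keys, 2 with equal values -> 2/3
assert abs(calculate_similarity_percent(expected, current) - 2 / 3) < 1e-9
```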