Merge pull request #26 from s4zuk3/feature/3-return-tika-metadata
Feature/3 return tika metadata
nmammeri authored Nov 15, 2024
2 parents a08e218 + dfb991e commit ef326fa
Showing 35 changed files with 2,097 additions and 399 deletions.
33 changes: 33 additions & 0 deletions README.md
@@ -114,6 +114,23 @@ result = extractor.extract_file_to_string("../../test_files/documents/eng-ocr.pd
print(result)
```

* Extracting with metadata:

You can use the `_with_metadata` variants of the extraction methods to also return the file's metadata.

```python
from extractous import Extractor

# Create a new extractor
extractor = Extractor()
extractor.set_extract_string_max_length(1000)

# Extract text from a file
result, metadata = extractor.extract_file_to_string_with_metadata("README.md")
print(result)
print(metadata)
```
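
* Extracting a stream plus metadata:

The bindings added in this commit also expose streaming `_with_metadata` variants (see `extractor.rs` below). A minimal sketch of `extract_file_with_metadata`, which returns a `(StreamReader, dict)` pair; the 4096-byte buffer and the UTF-8 decoding are assumptions, not requirements:

```python
from extractous import Extractor

extractor = Extractor()

# Streaming variant: returns (StreamReader, dict)
reader, metadata = extractor.extract_file_with_metadata("README.md")

# readinto() fills a bytearray and returns the number of bytes read
buf = bytearray(4096)
chunks = []
while True:
    n = reader.readinto(buf)
    if n == 0:
        break
    chunks.append(bytes(buf[:n]))

print(b"".join(chunks).decode("utf-8"))  # assumes the default UTF-8 encoding
print(metadata)
```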

#### Rust
* Extract a file content to a string:
```rust
@@ -129,6 +146,22 @@ fn main() {
}
```

* Extracting with metadata:

```rust
use extractous::Extractor;

fn main() {
// Create a new extractor. Note it uses a consuming builder pattern
let mut extractor = Extractor::new().set_extract_string_max_length(1000);

// Extract text from a file
let (text, metadata) = extractor.extract_file_to_string_with_metadata("README.md").unwrap();
println!("{}", text);
println!("{:?}", metadata);
}
```

* Extract the content of a file (URL/bytes) to a `StreamReader` and perform buffered reading
```rust
use std::io::{BufReader, Read};
// ...
```
2 changes: 1 addition & 1 deletion bindings/extractous-python/Cargo.toml
@@ -18,5 +18,5 @@ doc = false

[dependencies]
# "abi3-py310" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.10
-pyo3 = { version = "0.22.2", features = ["abi3", "abi3-py38"] }
+pyo3 = { version = "0.22.2", features = ["abi3", "abi3-py38", "gil-refs"] }
extractous = { path = "../../extractous-core" }
114 changes: 109 additions & 5 deletions bindings/extractous-python/src/extractor.rs
@@ -2,6 +2,8 @@ use crate::{ecore, OfficeParserConfig, PdfParserConfig, TesseractOcrConfig};
use pyo3::exceptions::PyTypeError;
use pyo3::prelude::*;
use pyo3::types::PyByteArray;
use pyo3::types::PyDict;
use std::collections::HashMap;
use std::io::Read;

// PyO3 supports unit-only enums (which contain only unit variants)
@@ -80,13 +82,12 @@ impl StreamReader {
pub fn readinto<'py>(&mut self, buf: Bound<'py, PyByteArray>) -> PyResult<usize> {
let bs = unsafe { buf.as_bytes_mut() };

-let bytes_read = self.reader.read(bs)
-.map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(
-format!("{}", e))
-)?;
+let bytes_read = self
+.reader
+.read(bs)
+.map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("{}", e)))?;
Ok(bytes_read)
}

}

/// `Extractor` is the entry for all extract APIs
@@ -151,6 +152,30 @@ impl Extractor {
})
}

/// Extracts text from a file path. Returns a tuple of a stream of the extracted text
/// and the Tika metadata; the stream is decoded using the extractor's `encoding`.
pub fn extract_file_with_metadata<'py>(
&self,
filename: &str,
py: Python<'py>,
) -> PyResult<(StreamReader, PyObject)> {
let (reader, metadata) = self
.0
.extract_file_with_metadata(filename)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

// Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
Ok((
StreamReader {
reader,
buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
py_bytes: None,
},
py_metadata.into(),
))
}

/// Extracts text from a file path. Returns a string that is of maximum length
/// of the extractor's `extract_string_max_length`
pub fn extract_file_to_string(&self, filename: &str) -> PyResult<String> {
Expand All @@ -159,6 +184,22 @@ impl Extractor {
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))
}

/// Extracts text from a file path. Returns a tuple of a string, truncated to the
/// extractor's `extract_string_max_length`, and the Tika metadata.
pub fn extract_file_to_string_with_metadata<'py>(
&self,
filename: &str,
py: Python<'py>,
) -> PyResult<(String, PyObject)> {
let (content, metadata) = self
.0
.extract_file_to_string_with_metadata(filename)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
Ok((content, py_metadata.into()))
}

/// Extracts text from a bytearray. Returns a stream of the extracted text
/// the stream is decoded using the extractor's `encoding`
pub fn extract_bytes(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult<StreamReader> {
Expand All @@ -176,6 +217,31 @@ impl Extractor {
})
}

/// Extracts text from a bytearray. Returns a tuple of a stream of the extracted text
/// and the Tika metadata; the stream is decoded using the extractor's `encoding`.
pub fn extract_bytes_with_metadata<'py>(
&self,
buffer: &Bound<'_, PyByteArray>,
py: Python<'py>,
) -> PyResult<(StreamReader, PyObject)> {
let slice = buffer.to_vec();
let (reader, metadata) = self
.0
.extract_bytes_with_metadata(&slice)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

// Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
Ok((
StreamReader {
reader,
buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
py_bytes: None,
},
py_metadata.into(),
))
}

/// Extracts text from a URL. Returns a stream of the extracted text
/// that is decoded using the extractor's `encoding`
pub fn extract_url(&self, url: &str) -> PyResult<StreamReader> {
Expand All @@ -192,7 +258,45 @@ impl Extractor {
})
}

/// Extracts text from a URL. Returns a tuple of a stream of the extracted text
/// and the Tika metadata; the stream is decoded using the extractor's `encoding`.
pub fn extract_url_with_metadata<'py>(
&self,
url: &str,
py: Python<'py>,
) -> PyResult<(StreamReader, PyObject)> {
let (reader, metadata) = self
.0
.extract_url_with_metadata(&url)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;

// Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
let py_metadata = metadata_hashmap_to_pydict(py, &metadata)?;
Ok((
StreamReader {
reader,
buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
py_bytes: None,
},
py_metadata.into(),
))
}

fn __repr__(&self) -> String {
format!("{:?}", self.0)
}
}

/// Converts a `HashMap<String, Vec<String>>` to a `PyDict`
fn metadata_hashmap_to_pydict<'py>(
py: Python<'py>,
hashmap: &HashMap<String, Vec<String>>,
) -> Result<Bound<'py, PyDict>, PyErr> {
let pydict = PyDict::new_bound(py);
for (key, value) in hashmap {
pydict
.set_item(key, value)
.map_err(|e| PyErr::new::<PyTypeError, _>(format!("{:?}", e)))?;
}
Ok(pydict)
}
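
Since `metadata_hashmap_to_pydict` converts each `Vec<String>` value to a Python list, every value in the returned metadata dict is a `list[str]`, even for single-valued fields. A short sketch of consuming it from Python; `Content-Type` is only an illustrative Tika key, not guaranteed for every file:

```python
from extractous import Extractor

extractor = Extractor()
_text, metadata = extractor.extract_file_to_string_with_metadata("README.md")

# Every value is a list[str], so scalar fields still arrive as one-element lists
for key, values in metadata.items():
    print(f"{key}: {values}")

# Take the first element when a single value is expected
content_type = metadata.get("Content-Type", ["unknown"])[0]
print(content_type)
```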
32 changes: 31 additions & 1 deletion bindings/extractous-python/tests/test_extract_bytes_to_stream.py
@@ -1,7 +1,8 @@
import json
import pytest

from extractous import Extractor
-from utils import cosine_similarity, read_to_string, read_file_to_bytearray
+from utils import calculate_similarity_percent, cosine_similarity, read_to_string, read_file_to_bytearray

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
@@ -38,3 +39,32 @@ def test_extract_bytes_to_stream(file_name, target_dist):
assert cosine_similarity(result, expected) > target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"


TEST_CASES_METADATA = [
("2022_Q3_AAPL.pdf", 0.9),
("science-exploration-1p.pptx", 0.9),
("simple.odt", 0.9),
("table-multi-row-column-cells-actual.csv", 0.6),
("vodafone.xlsx", 0.8),
("category-level.docx", 0.9),
("simple.doc", 0.9),
("simple.pptx", 0.9),
("table-multi-row-column-cells.png", 0.9),
("winter-sports.epub", 0.8),
("bug_16.docx", 0.9),
]


@pytest.mark.parametrize("file_name, similarity_percent", TEST_CASES_METADATA)
def test_extract_bytes_to_stream(file_name, similarity_percent):
"""Test the extraction from bytes of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
file_bytes = read_file_to_bytearray(original_filepath)
extractor = Extractor()
_reader, metadata = extractor.extract_bytes_with_metadata(file_bytes)
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)
percent_similarity = calculate_similarity_percent(metadata, expected_metadata)
assert percent_similarity > similarity_percent, \
f"The metadata similarity is lower than expected. Current {percent_similarity}% | filename: {file_name}"
30 changes: 29 additions & 1 deletion bindings/extractous-python/tests/test_extract_file_to_string.py
@@ -1,7 +1,8 @@
import json
import pytest

from extractous import Extractor
-from utils import cosine_similarity
+from utils import calculate_similarity_percent, cosine_similarity, is_expected_metadata_contained

TEST_CASES = [
("2022_Q3_AAPL.pdf", 0.9),
@@ -32,3 +33,30 @@ def test_extract_file_to_string(file_name, target_dist):
assert cosine_similarity(result, expected) > target_dist, \
f"Cosine similarity is less than {target_dist} for file: {file_name}"


TEST_CASES_METADATA = [
"2022_Q3_AAPL.pdf",
"science-exploration-1p.pptx",
"simple.odt",
"table-multi-row-column-cells-actual.csv",
"vodafone.xlsx",
"category-level.docx",
"simple.doc",
"simple.pptx",
"table-multi-row-column-cells.png",
"winter-sports.epub",
"bug_16.docx",
]

@pytest.mark.parametrize("file_name", TEST_CASES_METADATA)
def test_extract_file_to_string_with_metadata(file_name):
"""Test the extraction and comparison of various file types."""
original_filepath = f"../../test_files/documents/{file_name}"
expected_metadata_result_filepath = f"../../test_files/expected_result/{file_name}.metadata.json"
extractor = Extractor()
_result, metadata = extractor.extract_file_to_string_with_metadata(original_filepath)
with open(expected_metadata_result_filepath, 'r', encoding="utf8") as file:
expected_metadata = json.load(file)

assert is_expected_metadata_contained(expected_metadata, metadata)
5 changes: 5 additions & 0 deletions bindings/extractous-python/tests/test_extract_url.py
@@ -8,3 +8,8 @@ def test_extract_url():
result = read_to_string(reader)

assert "Google" in result

def test_extract_url_with_metadata():
extractor = Extractor()
_reader, metadata = extractor.extract_url_with_metadata("https://www.google.com")
assert len(metadata.keys()) > 0
33 changes: 33 additions & 0 deletions bindings/extractous-python/tests/utils.py
@@ -45,3 +45,36 @@ def read_file_to_bytearray(file_path: str):
with open(file_path, 'rb') as file:
file_content = bytearray(file.read())
return file_content


def is_expected_metadata_contained(expected: dict, current: dict) -> bool:
"""
Check if all keys in `expected` are present in `current` and have identical values.
"""
for key, expected_values in expected.items():
actual_values = current.get(key)
if actual_values is None:
print(f"\nexpected key = {key} not found !!")
return False
elif actual_values != expected_values:
print(f"\nvalues for key = {key} differ!! expected = {expected_values} and actual = {actual_values}")
return False
return True
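
`is_expected_metadata_contained` is a subset check: extra keys in `current` are ignored, while a missing key or a differing value fails. A quick illustration with hypothetical dicts:

```python
expected = {"dc:title": ["Report"]}
current = {"dc:title": ["Report"], "Content-Type": ["application/pdf"]}

assert is_expected_metadata_contained(expected, current)          # extra keys are fine
assert not is_expected_metadata_contained({"x": ["1"]}, current)  # missing key fails
```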


def calculate_similarity_percent(expected, current):
"""Return the ratio (0.0 to 1.0) of keys shared by both dicts whose values match."""
matches = 0
total = 0

# Iterate over all keys in the 'expected' dictionary
for key, value1 in expected.items():
if key in current:
total += 1
if value1 == current[key]:
matches += 1

if total == 0:
return 0.0

# Return the ratio of matching values over shared keys (not an actual percentage)
return matches / total
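
As a worked example, `calculate_similarity_percent` returns a ratio rather than a percentage: with the hypothetical dicts below, two of the three shared keys have equal values, so the result is 2/3:

```python
expected = {
    "dc:title": ["Report"],
    "Content-Type": ["application/pdf"],
    "xmp:CreatorTool": ["Word"],
}
current = {
    "dc:title": ["Report"],
    "Content-Type": ["application/pdf"],
    "xmp:CreatorTool": ["LibreOffice"],
}

# 3 shared keys, 2 with equal values -> 2/3
assert abs(calculate_similarity_percent(expected, current) - 2 / 3) < 1e-9
```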