diff --git a/README.md b/README.md
index f439854..297db83 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@
_Extractous offers a fast and efficient solution for extracting content and metadata from various documents types such as PDF, Word, HTML, and [many other formats](#supported-file-formats).
-Our goal is to deliver a fast and efficient comprehensive solution in Rust with bindings for many programming
+Our goal is to deliver a fast and efficient comprehensive solution in Rust with bindings for many programming
languages._
@@ -27,7 +27,7 @@ languages._
---
**Demo**: showing that [Extractous 🚀](https://github.com/yobix-ai/extractous) is **25x faster** than the popular
-[unstructured-io](https://github.com/Unstructured-IO/unstructured) library ($65m in funding and 8.5k GitHub stars).
+[unstructured-io](https://github.com/Unstructured-IO/unstructured) library ($65m in funding and 8.5k GitHub stars).
For complete benchmarking details please consult our [benchmarking repository](https://github.com/yobix-ai/extractous-benchmarks)
![unstructured_vs_extractous](https://github.com/yobix-ai/extractous-benchmarks/raw/main/docs/extractous_vs_unstructured.gif)
@@ -55,7 +55,7 @@ With Extractous, the need for external services or APIs is eliminated, making da
* High-performance unstructured data extraction optimized for speed and low memory usage.
* Clear and simple API for extracting text and metadata content.
* Automatically identifies document types and extracts content accordingly
-* Supports [many file formats](#supported-file-formats) (most formats supported by Apache Tika).
+* Supports [many file formats](#supported-file-formats) (most formats supported by Apache Tika).
* Extracts text from images and scanned documents with OCR through [tesseract-ocr](https://github.com/tesseract-ocr/tesseract).
* Core engine written in Rust with bindings for [Python](https://pypi.org/project/extractous/) and upcoming support for JavaScript/TypeScript.
* Detailed documentation and examples to help you get started quickly and efficiently.
@@ -77,13 +77,20 @@ extractor.set_extract_string_max_length(1000)
result = extractor.extract_file_to_string("README.md")
print(result)
```
-* Extracting a file to a buffered stream:
+* Extracting a file(URL / bytearray) to a buffered stream:
```python
from extractous import Extractor
extractor = Extractor()
+# for file
reader = extractor.extract_file("tests/quarkus.pdf")
+# for url
+# reader = extractor.extract_url("https://www.google.com")
+# for bytearray
+# with open("tests/quarkus.pdf", "rb") as file:
+# buffer = bytearray(file.read())
+# reader = extractor.extract_bytes(buffer)
result = ""
buffer = reader.read(4096)
@@ -122,9 +129,10 @@ fn main() {
}
```
-* Extract a content of a file to a `StreamReader` and perform buffered reading
+* Extract a content of a file(URL/ bytes) to a `StreamReader` and perform buffered reading
```rust
-use std::io::Read;
+use std::io::{BufReader, Read};
+// use std::fs::File; use for bytes
use extractous::Extractor;
fn main() {
@@ -135,17 +143,25 @@ fn main() {
// Extract the provided file content to a string
let extractor = Extractor::new();
let stream = extractor.extract_file(file_path).unwrap();
+ // Extract url
+ // let stream = extractor.extract_url("https://www.google.com/").unwrap();
+ // Extract bytes
+ // let mut file = File::open(file_path)?;
+ // let mut buffer = Vec::new();
+ // file.read_to_end(&mut buffer)?;
+ // let stream= extractor.extract_bytes(&file_bytes);
// Because stream implements std::io::Read trait we can perform buffered reading
// For example we can use it to create a BufReader
+ let mut reader = BufReader::new(stream);
let mut buffer = Vec::new();
- stream.read_to_end(&mut buffer).unwrap();
+ reader.read_to_end(&mut buffer).unwrap();
println!("{}", String::from_utf8(buffer).unwrap())
}
```
-* Extract content of PDF with OCR.
+* Extract content of PDF with OCR.
You need to have Tesseract installed with the language pack. For example on debian `sudo apt install tesseract-ocr tesseract-ocr-deu`
@@ -154,7 +170,7 @@ use extractous::Extractor;
fn main() {
let file_path = "../test_files/documents/deu-ocr.pdf";
-
+
let extractor = Extractor::new()
.set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
.set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY));
@@ -204,4 +220,4 @@ fn main() {
Contributions are welcome! Please open an issue or submit a pull request if you have any improvements or new features to propose.
## 🕮 License
-This project is licensed under the Apache License 2.0. See the LICENSE file for details.
\ No newline at end of file
+This project is licensed under the Apache License 2.0. See the LICENSE file for details.
diff --git a/bindings/extractous-python/README.md b/bindings/extractous-python/README.md
index 9aa83ee..998bbae 100644
--- a/bindings/extractous-python/README.md
+++ b/bindings/extractous-python/README.md
@@ -1,6 +1,6 @@
# Extractous Python Bindings
-This project provides Python bindings for the Extractous library, allowing you to use extractous functionality in
+This project provides Python bindings for the Extractous library, allowing you to use extractous functionality in
your Python applications.
## Installation
@@ -25,13 +25,20 @@ result = extractor.extract_file_to_string("README.md")
print(result)
```
-Extracting a file to a buffered stream:
+Extracting a file(URL / bytearray) to a buffered stream:
```python
from extractous import Extractor
extractor = Extractor()
+# for file
reader = extractor.extract_file("tests/quarkus.pdf")
+# for url
+# reader = extractor.extract_url("https://www.google.com")
+# for bytearray
+# with open("tests/quarkus.pdf", "rb") as file:
+# buffer = bytearray(file.read())
+# reader = extractor.extract_bytes(buffer)
result = ""
buffer = reader.read(4096)
@@ -51,4 +58,4 @@ extractor = Extractor().set_ocr_config(TesseractOcrConfig().set_language("deu"))
result = extractor.extract_file_to_string("../../test_files/documents/eng-ocr.pdf")
print(result)
-```
\ No newline at end of file
+```
diff --git a/bindings/extractous-python/examples/extract_to_stream.py b/bindings/extractous-python/examples/extract_to_stream.py
new file mode 100755
index 0000000..8068f14
--- /dev/null
+++ b/bindings/extractous-python/examples/extract_to_stream.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+import os
+import sys
+
+from extractous import Extractor, PdfOcrStrategy, PdfParserConfig
+
+
+def extract_to_stream(file_path: str):
+
+ # Extract the file
+ extractor = Extractor()
+ reader = extractor.extract_file(in_file)
+
+ buffer = bytearray(4096 * 4096)
+ while True:
+ bytes_read = reader.readinto(buffer)
+ # If no more data, exit the loop
+ if bytes_read == 0:
+ break
+ # Decode the valid portion of the buffer and append it to the result
+ chunk = buffer[:bytes_read].decode('utf-8')
+ print(chunk)
+
+
+if __name__ == '__main__':
+ # Pare input args
+ if len(sys.argv) != 2:
+ print(f"Usage: '{sys.argv[0]}' ")
+ sys.exit(1)
+ in_file = sys.argv[1]
+ if not os.path.isfile(in_file):
+ raise FileNotFoundError(f"No such file: '{in_file}'")
+
+ extract_to_stream(in_file)
diff --git a/bindings/extractous-python/src/extractor.rs b/bindings/extractous-python/src/extractor.rs
index 7376cca..ed95e7b 100644
--- a/bindings/extractous-python/src/extractor.rs
+++ b/bindings/extractous-python/src/extractor.rs
@@ -75,6 +75,18 @@ impl StreamReader {
))),
}
}
+
+ /// Reads into the specified buffer
+ pub fn readinto<'py>(&mut self, buf: Bound<'py, PyByteArray>) -> PyResult {
+ let bs = unsafe { buf.as_bytes_mut() };
+
+ let bytes_read = self.reader.read(bs)
+ .map_err(|e| PyErr::new::(
+ format!("{}", e))
+ )?;
+ Ok(bytes_read)
+ }
+
}
/// `Extractor` is the entry for all extract APIs
@@ -147,6 +159,39 @@ impl Extractor {
.map_err(|e| PyErr::new::(format!("{:?}", e)))
}
+ /// Extracts text from a bytearray. Returns a stream of the extracted text
+ /// the stream is decoded using the extractor's `encoding`
+ pub fn extract_bytes(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult {
+ let slice = buffer.to_vec();
+ let reader = self
+ .0
+ .extract_bytes(&slice)
+ .map_err(|e| PyErr::new::(format!("{:?}", e)))?;
+
+ // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
+ Ok(StreamReader {
+ reader,
+ buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
+ py_bytes: None,
+ })
+ }
+
+ /// Extracts text from a url. Returns a string that is of maximum length
+ /// of the extractor's `extract_string_max_length`
+ pub fn extract_url(&self, url: &str) -> PyResult {
+ let reader = self
+ .0
+ .extract_url(&url)
+ .map_err(|e| PyErr::new::(format!("{:?}", e)))?;
+
+ // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes
+ Ok(StreamReader {
+ reader,
+ buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE),
+ py_bytes: None,
+ })
+ }
+
fn __repr__(&self) -> String {
format!("{:?}", self.0)
}
diff --git a/bindings/extractous-python/tests/test_extract_bytes_to_stream.py b/bindings/extractous-python/tests/test_extract_bytes_to_stream.py
new file mode 100644
index 0000000..32be6a7
--- /dev/null
+++ b/bindings/extractous-python/tests/test_extract_bytes_to_stream.py
@@ -0,0 +1,40 @@
+import pytest
+
+from extractous import Extractor
+from utils import cosine_similarity, read_to_string, read_file_to_bytearray
+
+TEST_CASES = [
+ ("2022_Q3_AAPL.pdf", 0.9),
+ ("science-exploration-1p.pptx", 0.9),
+ ("simple.odt", 0.9),
+ ("table-multi-row-column-cells-actual.csv", 0.9),
+ ("vodafone.xlsx", 0.4),
+ ("category-level.docx", 0.9),
+ ("simple.doc", 0.9),
+ ("simple.pptx", 0.9),
+ ("table-multi-row-column-cells.png", -1.0),
+ ("winter-sports.epub", 0.9),
+ ("bug_16.docx", 0.9),
+ #("eng-ocr.pdf", 0.9),
+]
+
+
+@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
+def test_extract_bytes_to_stream(file_name, target_dist):
+ """Test the extraction from bytes of various file types."""
+ original_filepath = f"../../test_files/documents/{file_name}"
+ expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt"
+
+ file_bytes = read_file_to_bytearray(original_filepath)
+
+ extractor = Extractor()
+ reader = extractor.extract_bytes(file_bytes)
+ result = read_to_string(reader)
+
+ # Expected
+ with open(expected_result_filepath, "r", encoding="utf8") as file:
+ expected = file.read()
+
+ assert cosine_similarity(result, expected) > target_dist, \
+ f"Cosine similarity is less than {target_dist} for file: {file_name}"
+
diff --git a/bindings/extractous-python/tests/test_extract_file_to_string.py b/bindings/extractous-python/tests/test_extract_file_to_string.py
index ed3dbe8..95b5bbb 100644
--- a/bindings/extractous-python/tests/test_extract_file_to_string.py
+++ b/bindings/extractous-python/tests/test_extract_file_to_string.py
@@ -15,9 +15,10 @@
("table-multi-row-column-cells.png", -1.0),
("winter-sports.epub", 0.9),
("bug_16.docx", 0.9),
- ("deu-ocr.pdf", 0.9),
+ #("eng-ocr.pdf", 0.9),
]
+
@pytest.mark.parametrize("file_name, target_dist", TEST_CASES)
def test_extract_file_to_string(file_name, target_dist):
"""Test the extraction and comparison of various file types."""
diff --git a/bindings/extractous-python/tests/test_extract_url.py b/bindings/extractous-python/tests/test_extract_url.py
new file mode 100644
index 0000000..b6f4158
--- /dev/null
+++ b/bindings/extractous-python/tests/test_extract_url.py
@@ -0,0 +1,10 @@
+from extractous import Extractor
+from utils import read_to_string
+
+def test_extract_url():
+ extractor = Extractor()
+
+ reader = extractor.extract_url("https://www.google.com")
+ result = read_to_string(reader)
+
+ assert "Google" in result
diff --git a/bindings/extractous-python/tests/test_ocr.py b/bindings/extractous-python/tests/test_ocr.py
index 7f4de09..4baaf76 100644
--- a/bindings/extractous-python/tests/test_ocr.py
+++ b/bindings/extractous-python/tests/test_ocr.py
@@ -1,19 +1,20 @@
from extractous import Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig
from utils import cosine_similarity
+
def test_ara_ocr_png():
ocr_config = TesseractOcrConfig().set_language("ara")
extractor = Extractor().set_ocr_config(ocr_config)
result = extractor.extract_file_to_string("../../test_files/documents/ara-ocr.png")
- with open("../../test_files/expected_result/ara-ocr.png.txt", "r", encoding="utf8") as file:
+ with open("../../test_files/expected_result/ara-ocr.png.txt", "r", encoding="utf8") as file:
expected = file.read()
- assert cosine_similarity(result, expected)
+ assert cosine_similarity(result, expected) > 0.9
-def test_ocr_only_strategy_extract_deu_ocr_pdf_to_string():
- test_file = "../../test_files/documents/eng-ocr.pdf"
+def test_extract_file_to_string_ocr_only_strategy_deu_ocr_pdf():
+ test_file = "../../test_files/documents/deu-ocr.pdf"
expected_result_file = "../../test_files/expected_result/deu-ocr.pdf.txt"
pdf_config = PdfParserConfig().set_ocr_strategy(PdfOcrStrategy.OCR_ONLY)
@@ -26,12 +27,13 @@ def test_ocr_only_strategy_extract_deu_ocr_pdf_to_string():
result = extractor.extract_file_to_string(test_file)
- with open(expected_result_file, "r", encoding="utf8") as file:
+ with open(expected_result_file, "r", encoding="utf8") as file:
expected = file.read()
- assert cosine_similarity(result, expected)
+ assert cosine_similarity(result, expected) > 0.9
+
-def test_no_ocr_strategy_extract_deu_ocr_pdf_to_string():
+def test_test_extract_file_to_string_no_ocr_strategy_deu_ocr_pdf():
test_file = "../../test_files/documents/deu-ocr.pdf"
pdf_config = PdfParserConfig()
@@ -39,8 +41,8 @@ def test_no_ocr_strategy_extract_deu_ocr_pdf_to_string():
ocr_config = TesseractOcrConfig()
ocr_config = ocr_config.set_language("deu")
- extractor = Extractor().set_ocr_config(ocr_config).set_pdf_config(PdfParserConfig().set_ocr_strategy(PdfOcrStrategy.NO_OCR))
+ extractor = Extractor().set_ocr_config(ocr_config).set_pdf_config(pdf_config)
result = extractor.extract_file_to_string(test_file)
- assert result.strip() == ""
\ No newline at end of file
+ assert result.strip() == ""
diff --git a/bindings/extractous-python/tests/test_pdf.py b/bindings/extractous-python/tests/test_pdf.py
index 5e85f3c..a14d9ed 100644
--- a/bindings/extractous-python/tests/test_pdf.py
+++ b/bindings/extractous-python/tests/test_pdf.py
@@ -1,4 +1,5 @@
from extractous import Extractor
+from utils import read_to_string
def expected_result():
@@ -12,16 +13,23 @@ def test_extract_file_to_string():
#print(result)
assert result == expected_result()
-
def test_extract_file():
extractor = Extractor()
reader = extractor.extract_file("tests/quarkus.pdf")
- result = ""
- b = reader.read(4096)
- while len(b) > 0:
- result += b.decode("utf-8")
- b = reader.read(4096)
+ result = read_to_string(reader)
#print(result)
- assert result == expected_result()
\ No newline at end of file
+ assert result == expected_result()
+
+def test_extract_bytes():
+ extractor = Extractor()
+
+ with open("tests/quarkus.pdf", "rb") as file:
+ buffer = bytearray(file.read())
+ reader = extractor.extract_bytes(buffer)
+
+ result = read_to_string(reader)
+
+ #print(result)
+ assert result == expected_result()
diff --git a/bindings/extractous-python/tests/utils.py b/bindings/extractous-python/tests/utils.py
index 30c3944..b153895 100644
--- a/bindings/extractous-python/tests/utils.py
+++ b/bindings/extractous-python/tests/utils.py
@@ -1,6 +1,7 @@
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cosine_sim
+
def cosine_similarity(text1, text2):
"""Calculate the cosine similarity between two texts."""
@@ -10,4 +11,37 @@ def cosine_similarity(text1, text2):
# Calculate cosine similarity between the two vectors
cos_sim = cosine_sim(vectors)
- return cos_sim[0][1]
\ No newline at end of file
+ return cos_sim[0][1]
+
+
+# def read_to_string(reader):
+# """Read from stream to string."""
+# result = ""
+# b = reader.read(4096)
+# while len(b) > 0:
+# result += b.decode("utf-8")
+# b = reader.read(4096)
+# return result
+
+def read_to_string(reader):
+ """Read from stream to string."""
+ utf8_string = []
+ buffer = bytearray(4096)
+
+ while True:
+ bytes_read = reader.readinto(buffer)
+ # If no more data, exit the loop
+ if bytes_read == 0:
+ break
+ # Decode the valid portion of the buffer and append it to the result
+ utf8_string.append(buffer[:bytes_read].decode('utf-8'))
+
+ # Join all parts into a single string
+ return ''.join(utf8_string)
+
+
+def read_file_to_bytearray(file_path: str):
+ """Read file to bytes array."""
+ with open(file_path, 'rb') as file:
+ file_content = bytearray(file.read())
+ return file_content
diff --git a/extractous-core/README.md b/extractous-core/README.md
index 4e04bbb..3e55a42 100644
--- a/extractous-core/README.md
+++ b/extractous-core/README.md
@@ -49,8 +49,9 @@ fn main() {
}
```
-* Extract a content of a file to a `StreamReader` and perform buffered reading
+* Extract a content of a file(URL/ bytes) to a `StreamReader` and perform buffered reading
```rust
+// use std::fs::File; use for bytes
use std::io::{BufReader, Read};
use extractous::Extractor;
@@ -62,6 +63,13 @@ fn main() {
// Extract the provided file content to a string
let extractor = Extractor::new();
let stream = extractor.extract_file(file_path).unwrap();
+ // Extract url
+ // let stream = extractor.extract_url("https://www.google.com/").unwrap();
+ // Extract bytes
+ // let mut file = File::open(file_path)?;
+ // let mut buffer = Vec::new();
+ // file.read_to_end(&mut buffer)?;
+ // let stream= extractor.extract_bytes(&file_bytes);
// Because stream implements std::io::Read trait we can perform buffered reading
// For example we can use it to create a BufReader
@@ -80,7 +88,7 @@ use extractous::Extractor;
fn main() {
let file_path = "../test_files/documents/deu-ocr.pdf";
-
+
let extractor = Extractor::new()
.set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
.set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY));
@@ -94,11 +102,11 @@ fn main() {
## Building
### Requirements
-* Extractous uses [Apache Tika](https://tika.apache.org/) for file formats that are not natively supported in Rust.
- However, to achieve one of Extractous goals, which is speed and efficiency, we do not set up any Tika as a servers or
- run any Java code. We instead, compile [Apache Tika](https://tika.apache.org/) as native shared libraries and use
- them on our Rust core as ffi. [GraalVm](https://www.graalvm.org/) is required to build Tika as native libs.
-* The provided build script already takes care of installing the required GraalVM JDK. However, if you want to use a
+* Extractous uses [Apache Tika](https://tika.apache.org/) for file formats that are not natively supported in Rust.
+ However, to achieve one of Extractous goals, which is speed and efficiency, we do not set up any Tika as a servers or
+ run any Java code. We instead, compile [Apache Tika](https://tika.apache.org/) as native shared libraries and use
+ them on our Rust core as ffi. [GraalVm](https://www.graalvm.org/) is required to build Tika as native libs.
+* The provided build script already takes care of installing the required GraalVM JDK. However, if you want to use a
specific local version, you can do so by setting the GRAALVM_HOME environment variable
* We recommend using [sdkman](https://sdkman.io/install) to install GraalVM JDKs
* `sdk install java 22.0.1-graalce`
@@ -112,16 +120,18 @@ OpenJDK 64-Bit Server VM Liberica-NIK-24.0.1-1 (build 22.0.1+10, mixed mode, sha
* On macOS the official GraalVM JDKs fail to work with code that use java awt. On macOS, we recommend using
Bellsoft Liberica NIK
* `sdk install java 24.0.1.r22-nik`
-* Extractous supports OCR through [tesseract](https://github.com/tesseract-ocr/tesseract), make sure tesseract is
+* Extractous supports OCR through [tesseract](https://github.com/tesseract-ocr/tesseract), make sure tesseract is
installed on your system because some of the OCR tests will fail if no tesseract is found.
* `sudo apt install tesseract-ocr`
-* Install any language extensions you want. for example to install German and Arabic:
+* Install any language extensions you want. for example to install German and Arabic:
* `sudo apt install tesseract-ocr-deu tesseract-ocr-ara`
+* On Mac
+* `brew install tesseract tesseract-lang`
### Building Extractous
-* To build Extractous, just run:
+* To build Extractous, just run:
* `cargo build`
### Running Tests
* To run tests, just run:
-* `cargo test`
\ No newline at end of file
+* `cargo test`
diff --git a/extractous-core/examples/extract_to_stream.rs b/extractous-core/examples/extract_to_stream.rs
index 7c99f85..9bbb142 100644
--- a/extractous-core/examples/extract_to_stream.rs
+++ b/extractous-core/examples/extract_to_stream.rs
@@ -1,4 +1,5 @@
use extractous::Extractor;
+// use std::fs::File; use for bytes
use std::io::{BufReader, Read};
fn main() {
@@ -9,6 +10,14 @@ fn main() {
// Extract the provided file content to a string
let extractor = Extractor::new();
let stream = extractor.extract_file(file_path).unwrap();
+ // Extract url
+ // let stream = extractor.extract_url("https://www.google.com/").unwrap();
+ // Extract bytes
+ // let mut file = File::open(file_path)?;
+ // let mut buffer = Vec::new();
+ // file.read_to_end(&mut buffer)?;
+ // let stream= extractor.extract_bytes(&file_bytes).unwrap();
+
// Because stream implements std::io::Read trait we can perform buffered reading
// For example we can use it to create a BufReader
let mut reader = BufReader::new(stream);
diff --git a/extractous-core/src/extractor.rs b/extractous-core/src/extractor.rs
index 113e303..9917afa 100644
--- a/extractous-core/src/extractor.rs
+++ b/extractous-core/src/extractor.rs
@@ -124,6 +124,30 @@ impl Extractor {
)
}
+ /// Extracts text from a byte buffer. Returns a stream of the extracted text
+ /// the stream is decoded using the extractor's `encoding`
+ pub fn extract_bytes(&self, buffer: &[u8]) -> ExtractResult {
+ tika::parse_bytes(
+ buffer,
+ &self.encoding,
+ &self.pdf_config,
+ &self.office_config,
+ &self.ocr_config,
+ )
+ }
+
+ /// Extracts text from a url. Returns a stream of the extracted text
+ /// the stream is decoded using the extractor's `encoding`
+ pub fn extract_url(&self, url: &str) -> ExtractResult {
+ tika::parse_url(
+ url,
+ &self.encoding,
+ &self.pdf_config,
+ &self.office_config,
+ &self.ocr_config,
+ )
+ }
+
/// Extracts text from a file path. Returns a string that is of maximum length
/// of the extractor's `extract_string_max_length`
pub fn extract_file_to_string(&self, file_path: &str) -> ExtractResult {
@@ -141,10 +165,13 @@ impl Extractor {
mod tests {
use crate::Extractor;
use std::fs::File;
- use std::io::prelude::*;
use std::io::BufReader;
+ use std::io::{self, Read};
+
+ use super::StreamReader;
const TEST_FILE: &str = "README.md";
+ const TEST_URL: &str = "https://www.google.com/";
fn expected_content() -> String {
let mut file = File::open(TEST_FILE).unwrap();
@@ -153,6 +180,15 @@ mod tests {
content
}
+ fn read_content_from_stream(stream: StreamReader) -> String {
+ let mut reader = BufReader::new(stream);
+ let mut buffer = Vec::new();
+ reader.read_to_end(&mut buffer).unwrap();
+
+ let content = String::from_utf8(buffer).unwrap();
+ content
+ }
+
#[test]
fn extract_file_test() {
// Prepare expected_content
@@ -161,17 +197,8 @@ mod tests {
// Parse the files using extractous
let extractor = Extractor::new();
let result = extractor.extract_file(TEST_FILE);
- let mut reader = BufReader::new(result.unwrap());
- let mut buffer = Vec::new();
- reader.read_to_end(&mut buffer).unwrap();
-
- let content = String::from_utf8(buffer).unwrap();
+ let content = read_content_from_stream(result.unwrap());
assert_eq!(content.trim(), expected_content.trim());
-
- // let mut reader = BufReader::new(result.unwrap());
- // let mut line = String::new();
- // let _len = reader.read_line(&mut line).unwrap();
- //assert_eq!("# Extractous", line.trim());
}
#[test]
@@ -185,4 +212,33 @@ mod tests {
let content = result.unwrap();
assert_eq!(content.trim(), expected_content.trim());
}
+
+ fn read_file_as_bytes(path: &str) -> io::Result> {
+ let mut file = File::open(path)?;
+ let mut buffer = Vec::new();
+ file.read_to_end(&mut buffer)?;
+ Ok(buffer)
+ }
+
+ #[test]
+ fn extract_bytes_test() {
+ // Prepare expected_content
+ let expected_content = expected_content();
+
+ // Parse the bytes using extractous
+ let file_bytes = read_file_as_bytes(TEST_FILE).unwrap();
+ let extractor = Extractor::new();
+ let result = extractor.extract_bytes(&file_bytes);
+ let content = read_content_from_stream(result.unwrap());
+ assert_eq!(content.trim(), expected_content.trim());
+ }
+
+ #[test]
+ fn extract_url_test() {
+ // Parse url by extractous
+ let extractor = Extractor::new();
+ let result = extractor.extract_url(&TEST_URL);
+ let content = read_content_from_stream(result.unwrap());
+ assert!(content.contains("Google"));
+ }
}
diff --git a/extractous-core/src/tika/jni_utils.rs b/extractous-core/src/tika/jni_utils.rs
index 3eb9de6..a99bae2 100644
--- a/extractous-core/src/tika/jni_utils.rs
+++ b/extractous-core/src/tika/jni_utils.rs
@@ -1,11 +1,23 @@
use std::os::raw::{c_char, c_void};
use jni::errors::jni_error_code_to_result;
-use jni::objects::{JObject, JString, JValue, JValueOwned};
+use jni::objects::{JByteBuffer, JObject, JString, JValue, JValueOwned};
use jni::{sys, JNIEnv, JavaVM};
use crate::errors::{Error, ExtractResult};
+/// Calls a static method and prints any thrown exceptions to stderr
+pub fn jni_new_direct_buffer<'local>(
+ env: &mut JNIEnv<'local>,
+ data: *mut u8,
+ len: usize,
+) -> ExtractResult> {
+ let direct_byte_buffer = unsafe { env.new_direct_byte_buffer(data, len) }
+ .map_err(|_e| Error::JniEnvCall("Failed to create direct byte buffer"))?;
+
+ Ok(direct_byte_buffer)
+}
+
/// Calls a static method and prints any thrown exceptions to stderr
pub fn jni_call_static_method<'local>(
env: &mut JNIEnv<'local>,
@@ -99,20 +111,23 @@ pub fn jni_check_exception(env: &mut JNIEnv) -> ExtractResult {
/// linked in by the build script.
pub fn create_vm_isolate() -> JavaVM {
unsafe {
- // let mut option0 = sys::JavaVMOption {
- // optionString: "-Djava.awt.headless=true".as_ptr() as *mut c_char,
- // extraInfo: std::ptr::null_mut(),
- // };
-
- // Set java.library.path to be able to load libawt.so, which must be in the same dir as libtika_native.so
- let mut options = sys::JavaVMOption {
- optionString: "-Djava.library.path=.".as_ptr() as *mut c_char,
- extraInfo: std::ptr::null_mut(),
- };
+ let vm_options: Vec = vec![
+ // Set java.library.path to be able to load libawt.so, which must be in the same dir as libtika_native.so
+ sys::JavaVMOption {
+ optionString: "-Djava.library.path=.".as_ptr() as *mut c_char,
+ extraInfo: std::ptr::null_mut(),
+ },
+ // enable awt headless mode
+ sys::JavaVMOption {
+ optionString: "Djava.awt.headless=true".as_ptr() as *mut c_char,
+ extraInfo: std::ptr::null_mut(),
+ },
+ ];
+
let mut args = sys::JavaVMInitArgs {
version: sys::JNI_VERSION_1_8,
- nOptions: 1,
- options: &mut options,
+ nOptions: vm_options.len() as sys::jint,
+ options: vm_options.as_ptr() as *mut sys::JavaVMOption,
ignoreUnrecognized: sys::JNI_TRUE,
};
let mut ptr: *mut sys::JavaVM = std::ptr::null_mut();
diff --git a/extractous-core/src/tika/parse.rs b/extractous-core/src/tika/parse.rs
index a019e9b..8766d27 100644
--- a/extractous-core/src/tika/parse.rs
+++ b/extractous-core/src/tika/parse.rs
@@ -1,7 +1,7 @@
use std::sync::OnceLock;
use jni::objects::JValue;
-use jni::JavaVM;
+use jni::{AttachGuard, JavaVM};
use crate::errors::ExtractResult;
use crate::tika::jni_utils::*;
@@ -17,18 +17,23 @@ pub(crate) fn vm() -> &'static JavaVM {
GRAAL_VM.get_or_init(create_vm_isolate)
}
-pub fn parse_file(
- file_path: &str,
+fn get_vm_attach_current_thread<'local>() -> ExtractResult> {
+ // Attaching a thead that is already attached is a no-op. Good to have this in case this method
+ // is called from another thread
+ let env = vm().attach_current_thread()?;
+ Ok(env)
+}
+
+fn parse_to_stream(
+ mut env: AttachGuard,
+ data_source_val: JValue,
char_set: &CharSet,
pdf_conf: &PdfParserConfig,
office_conf: &OfficeParserConfig,
ocr_conf: &TesseractOcrConfig,
+ method_name: &str,
+ signature: &str,
) -> ExtractResult {
- // Attaching a thead that is already attached is a no-op. Good to have this in case this method
- // is called from another thread
- let mut env = vm().attach_current_thread()?;
-
- let file_path_val = jni_new_string_as_jvalue(&mut env, file_path)?;
let charset_name_val = jni_new_string_as_jvalue(&mut env, &char_set.to_string())?;
let j_pdf_conf = JPDFParserConfig::new(&mut env, pdf_conf)?;
let j_office_conf = JOfficeParserConfig::new(&mut env, office_conf)?;
@@ -38,15 +43,10 @@ pub fn parse_file(
let call_result = jni_call_static_method(
&mut env,
"ai/yobix/TikaNativeMain",
- "parseFile",
- "(Ljava/lang/String;\
- Ljava/lang/String;\
- Lorg/apache/tika/parser/pdf/PDFParserConfig;\
- Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
- Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\
- )Lai/yobix/ReaderResult;",
+ method_name,
+ signature,
&[
- (&file_path_val).into(),
+ data_source_val,
(&charset_name_val).into(),
(&j_pdf_conf.internal).into(),
(&j_office_conf.internal).into(),
@@ -62,6 +62,33 @@ pub fn parse_file(
Ok(StreamReader { inner: j_reader })
}
+pub fn parse_file(
+ file_path: &str,
+ char_set: &CharSet,
+ pdf_conf: &PdfParserConfig,
+ office_conf: &OfficeParserConfig,
+ ocr_conf: &TesseractOcrConfig,
+) -> ExtractResult {
+ let mut env = get_vm_attach_current_thread()?;
+
+ let file_path_val = jni_new_string_as_jvalue(&mut env, file_path)?;
+ parse_to_stream(
+ env,
+ (&file_path_val).into(),
+ char_set,
+ pdf_conf,
+ office_conf,
+ ocr_conf,
+ "parseFile",
+ "(Ljava/lang/String;\
+ Ljava/lang/String;\
+ Lorg/apache/tika/parser/pdf/PDFParserConfig;\
+ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
+ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\
+ )Lai/yobix/ReaderResult;",
+ )
+}
+
/// Parses a file to a string using the Apache Tika library.
pub fn parse_file_to_string(
file_path: &str,
@@ -70,9 +97,7 @@ pub fn parse_file_to_string(
office_conf: &OfficeParserConfig,
ocr_conf: &TesseractOcrConfig,
) -> ExtractResult {
- // Attaching a thead that is already attached is a no-op. Good to have this in case this method
- // is called from another thread
- let mut env = vm().attach_current_thread()?;
+ let mut env = get_vm_attach_current_thread()?;
// Create a new Java string from the Rust string
let file_path_val = jni_new_string_as_jvalue(&mut env, file_path)?;
@@ -102,3 +127,62 @@ pub fn parse_file_to_string(
Ok(result.content)
}
+
+pub fn parse_bytes(
+ buffer: &[u8],
+ char_set: &CharSet,
+ pdf_conf: &PdfParserConfig,
+ office_conf: &OfficeParserConfig,
+ ocr_conf: &TesseractOcrConfig,
+) -> ExtractResult {
+ let mut env = get_vm_attach_current_thread()?;
+
+ // Because we know the buffer is used for reading only, cast it to *mut u8 to satisfy the
+ // jni_new_direct_buffer call, which requires a mutable pointer
+ let mut_ptr: *mut u8 = buffer.as_ptr() as *mut u8;
+
+ let byte_buffer = jni_new_direct_buffer(&mut env, mut_ptr, buffer.len())?;
+
+ parse_to_stream(
+ env,
+ (&byte_buffer).into(),
+ char_set,
+ pdf_conf,
+ office_conf,
+ ocr_conf,
+ "parseBytes",
+ "(Ljava/nio/ByteBuffer;\
+ Ljava/lang/String;\
+ Lorg/apache/tika/parser/pdf/PDFParserConfig;\
+ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
+ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\
+ )Lai/yobix/ReaderResult;",
+ )
+}
+
+pub fn parse_url(
+ url: &str,
+ char_set: &CharSet,
+ pdf_conf: &PdfParserConfig,
+ office_conf: &OfficeParserConfig,
+ ocr_conf: &TesseractOcrConfig,
+) -> ExtractResult {
+ let mut env = get_vm_attach_current_thread()?;
+
+ let url_val = jni_new_string_as_jvalue(&mut env, url)?;
+ parse_to_stream(
+ env,
+ (&url_val).into(),
+ char_set,
+ pdf_conf,
+ office_conf,
+ ocr_conf,
+ "parseUrl",
+ "(Ljava/lang/String;\
+ Ljava/lang/String;\
+ Lorg/apache/tika/parser/pdf/PDFParserConfig;\
+ Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\
+ Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\
+ )Lai/yobix/ReaderResult;",
+ )
+}
diff --git a/extractous-core/tests/extract_to_stream_tests.rs b/extractous-core/tests/extract_to_stream_tests.rs
new file mode 100644
index 0000000..c29d089
--- /dev/null
+++ b/extractous-core/tests/extract_to_stream_tests.rs
@@ -0,0 +1,74 @@
+extern crate test_case;
+extern crate textdistance;
+
+use extractous::{Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig};
+use std::fs;
+use std::io::Read;
+use test_case::test_case;
+use textdistance::nstr::cosine;
+
+#[test_case("2022_Q3_AAPL.pdf", 0.9; "Test PDF file")]
+#[test_case("science-exploration-1p.pptx", 0.9; "Test PPTX file")]
+#[test_case("simple.odt", 0.8; "Test ODT file")]
+#[test_case("table-multi-row-column-cells-actual.csv", 0.8; "Test CSV file")]
+#[test_case("vodafone.xlsx", 0.4; "Test XLSX file")]
+#[test_case("category-level.docx", 0.9; "Test DOCX file")]
+#[test_case("simple.doc", 0.9; "Test DOC file")]
+#[test_case("simple.pptx", 0.9; "Test another PPTX file")]
+#[test_case("table-multi-row-column-cells.png", -1.0; "Test PNG file")]
+#[test_case("winter-sports.epub", 0.9; "Test EPUB file")]
+#[test_case("bug_16.docx", 0.9; "Test bug16 DOCX file")]
+//#[test_case("eng-ocr.pdf", 0.9; "Test eng-ocr PDF file")]
+fn test_extract_bytes_to_stream(file_name: &str, target_dist: f64) {
+ let extractor = Extractor::new();
+
+ let bytes = fs::read(&format!("../test_files/documents/{}", file_name)).unwrap();
+ let mut stream = extractor.extract_bytes(&bytes).unwrap();
+
+ let mut buffer = Vec::new();
+ stream.read_to_end(&mut buffer).unwrap();
+ let extracted = String::from_utf8_lossy(&buffer);
+
+ // read expected string
+ let expected =
+ fs::read_to_string(format!("../test_files/expected_result/{}.txt", file_name)).unwrap();
+
+ let dist = cosine(&expected, &extracted);
+ assert!(
+ dist > target_dist,
+ "Cosine similarity is less than {} for file: {}, dist: {}",
+ target_dist,
+ file_name,
+ dist
+ );
+ println!("{}: {}", file_name, dist);
+}
+
+#[test]
+fn test_extract_bytes_to_stream_ara_ocr_png() {
+ let extractor = Extractor::new()
+ .set_ocr_config(TesseractOcrConfig::new().set_language("ara"))
+ .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::NO_OCR));
+
+ // extract file with extractor
+ let bytes = fs::read(&"../test_files/documents/ara-ocr.png".to_string()).unwrap();
+ let mut stream = extractor.extract_bytes(&bytes).unwrap();
+
+ let mut buffer = Vec::new();
+ stream.read_to_end(&mut buffer).unwrap();
+ let extracted = String::from_utf8_lossy(&buffer);
+
+ println!("{}", extracted);
+
+ // read expected string
+ let expected =
+ fs::read_to_string("../test_files/expected_result/ara-ocr.png.txt".to_string()).unwrap();
+
+ let dist = cosine(&expected, &extracted);
+ assert!(
+ dist > 0.9,
+ "Cosine similarity is less than 0.9 for file: ara-ocr.png, dist: {}",
+ dist
+ );
+ println!("{}: {}", "ara-ocr.png", dist);
+}
diff --git a/extractous-core/tests/extractor_test.rs b/extractous-core/tests/extract_to_string_tests.rs
similarity index 87%
rename from extractous-core/tests/extractor_test.rs
rename to extractous-core/tests/extract_to_string_tests.rs
index 5322c3f..7456442 100644
--- a/extractous-core/tests/extractor_test.rs
+++ b/extractous-core/tests/extract_to_string_tests.rs
@@ -17,7 +17,7 @@ use textdistance::nstr::cosine;
#[test_case("table-multi-row-column-cells.png", -1.0; "Test PNG file")]
#[test_case("winter-sports.epub", 0.9; "Test EPUB file")]
#[test_case("bug_16.docx", 0.9; "Test bug16 DOCX file")]
-#[test_case("eng-ocr.pdf", 0.9; "Test eng-ocr PDF file")]
+//#[test_case("eng-ocr.pdf", 0.9; "Test eng-ocr PDF file")]
fn test_extract_file_to_string(file_name: &str, target_dist: f64) {
let extractor = Extractor::new().set_extract_string_max_length(1000000);
// extract file with extractor
@@ -40,7 +40,7 @@ fn test_extract_file_to_string(file_name: &str, target_dist: f64) {
}
#[test]
-fn test_extract_ara_ocr_png_to_string() {
+fn test_extract_file_to_string_ara_ocr_png() {
let extractor = Extractor::new()
.set_ocr_config(TesseractOcrConfig::new().set_language("ara"))
.set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::NO_OCR));
@@ -61,18 +61,18 @@ fn test_extract_ara_ocr_png_to_string() {
"Cosine similarity is less than 0.9 for file: ara-ocr.png, dist: {}",
dist
);
- println!("{}: {}", "ara-ocr.png", dist);
}
+#[cfg(not(target_os = "macos"))]
#[test]
-fn test_ocr_only_strategy_extract_deu_ocr_pdf_to_string() {
+fn test_extract_file_to_string_ocr_only_strategy_deu_ocr_pdf() {
let extractor = Extractor::new()
.set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
.set_pdf_config(
PdfParserConfig::new()
- .set_ocr_strategy(PdfOcrStrategy::OCR_ONLY)
- .set_extract_inline_images(true)
- .set_extract_unique_inline_images_only(true),
+ .set_ocr_strategy(PdfOcrStrategy::OCR_AND_TEXT_EXTRACTION)
+ .set_extract_inline_images(false)
+ .set_extract_unique_inline_images_only(false),
);
// extract file with extractor
let extracted = extractor
@@ -89,11 +89,11 @@ fn test_ocr_only_strategy_extract_deu_ocr_pdf_to_string() {
"Cosine similarity is less than 0.9 for file: ara-ocr.png, dist: {}",
dist
);
- println!("{}: {}", "ara-ocr.png", dist);
}
+#[cfg(not(target_os = "macos"))]
#[test]
-fn test_no_ocr_strategy_extract_deu_ocr_pdf_to_string() {
+fn test_test_extract_file_to_string_no_ocr_strategy_deu_ocr_pdf() {
let extractor = Extractor::new()
.set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
.set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::NO_OCR));
diff --git a/extractous-core/tika-native/build.gradle b/extractous-core/tika-native/build.gradle
index d153548..793ae26 100644
--- a/extractous-core/tika-native/build.gradle
+++ b/extractous-core/tika-native/build.gradle
@@ -66,6 +66,7 @@ graalvmNative {
buildArgs.addAll(
"-H:+AddAllCharsets", // Very important to get UTF8 working
+ "--enable-https", // Very important https working
"-O3",
"--parallelism=$numThreads",
"-march=compatibility" // VERY IMPORTANT to use compatibility flag. If not the libs will use the cpu arch of the build machine and will notwork on other CPUs if distributed
diff --git a/extractous-core/tika-native/src/main/java/ai/yobix/ByteBufferInputStream.java b/extractous-core/tika-native/src/main/java/ai/yobix/ByteBufferInputStream.java
new file mode 100644
index 0000000..9abf3a2
--- /dev/null
+++ b/extractous-core/tika-native/src/main/java/ai/yobix/ByteBufferInputStream.java
@@ -0,0 +1,90 @@
+package ai.yobix;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+
+public class ByteBufferInputStream extends InputStream {
+
+ private ByteBuffer bb;
+
+ public ByteBufferInputStream(ByteBuffer bb) {
+ this.bb = bb;
+ }
+
+ @Override
+ public int read() throws IOException {
+ if (bb == null) {
+ throw new IOException("read on a closed InputStream");
+ }
+
+ if (bb.remaining() == 0) {
+ return -1;
+ }
+
+ return (bb.get() & 0xFF); // need to be in the range 0 to 255
+ }
+
+ @Override
+ public int read(byte[] b, int off, int len) throws IOException {
+
+ if (bb == null) {
+ throw new IOException("read on a closed InputStream");
+ }
+
+ if (b == null) {
+ throw new NullPointerException();
+ } else if (off < 0 || len < 0 || len > b.length - off) {
+ throw new IndexOutOfBoundsException();
+ } else if (len == 0) {
+ return 0;
+ }
+
+ int length = Math.min(bb.remaining(), len);
+ if (length == 0) {
+ return -1;
+ }
+
+ bb.get(b, off, length);
+ return length;
+ }
+
+ @Override
+ public long skip(long n) throws IOException {
+
+ if (bb == null) {
+ throw new IOException("skip on a closed InputStream");
+ }
+
+ if (n <= 0) {
+ return 0;
+ }
+
+ /*
+ * ByteBuffers have at most an int, so lose the upper bits.
+ * The contract allows this.
+ */
+ int nInt = (int) n;
+ int skip = Math.min(bb.remaining(), nInt);
+
+ bb.position(bb.position() + skip);
+
+ return nInt;
+ }
+
+ @Override
+ public int available() throws IOException {
+
+ if (bb == null) {
+ throw new IOException("available on a closed InputStream");
+ }
+
+ return bb.remaining();
+ }
+
+ @Override
+ public void close() throws IOException {
+ bb = null;
+ }
+
+}
diff --git a/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java b/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java
index ba83662..b524b40 100644
--- a/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java
+++ b/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java
@@ -1,34 +1,22 @@
package ai.yobix;
import org.apache.commons.io.input.ReaderInputStream;
-import org.apache.tika.exception.WriteLimitReachedException;
-import org.apache.tika.parser.ParsingReader;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.WriteOutContentHandler;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Reader;
-import java.net.MalformedURLException;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.nio.charset.Charset;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-
+import org.apache.tika.exception.WriteLimitReachedException;
+import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParsingReader;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
import org.graalvm.nativeimage.IsolateThread;
import org.graalvm.nativeimage.c.function.CEntryPoint;
import org.graalvm.nativeimage.c.type.CCharPointer;
@@ -36,6 +24,19 @@
import org.graalvm.nativeimage.c.type.CTypeConversion;
import org.xml.sax.SAXException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
public class TikaNativeMain {
private static final Tika tika = new Tika();
@@ -196,15 +197,17 @@ public static ReaderResult parseUrl(
* @return ReaderResult
*/
public static ReaderResult parseBytes(
- byte[] data,
+ ByteBuffer data,
String charsetName,
PDFParserConfig pdfConfig,
OfficeParserConfig officeConfig,
TesseractOCRConfig tesseractConfig
) {
+
final Metadata metadata = new Metadata();
- final TikaInputStream stream = TikaInputStream.get(data, metadata);
+ final ByteBufferInputStream inStream = new ByteBufferInputStream(data);
+ final TikaInputStream stream = TikaInputStream.get(inStream, new TemporaryResources(), metadata);
return parse(stream, metadata, charsetName, pdfConfig, officeConfig, tesseractConfig);
}
diff --git a/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json b/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json
index 288d373..496d5d3 100644
--- a/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json
+++ b/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json
@@ -55,7 +55,7 @@
{
"name": "parseBytes",
"parameterTypes": [
- "byte[]",
+ "java.nio.ByteBuffer",
"java.lang.String",
"org.apache.tika.parser.pdf.PDFParserConfig",
"org.apache.tika.parser.microsoft.OfficeParserConfig",