diff --git a/README.md b/README.md index f439854..297db83 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@
_Extractous offers a fast and efficient solution for extracting content and metadata from various documents types such as PDF, Word, HTML, and [many other formats](#supported-file-formats). -Our goal is to deliver a fast and efficient comprehensive solution in Rust with bindings for many programming +Our goal is to deliver a fast and efficient comprehensive solution in Rust with bindings for many programming languages._
@@ -27,7 +27,7 @@ languages._ --- **Demo**: showing that [Extractous 🚀](https://github.com/yobix-ai/extractous) is **25x faster** than the popular -[unstructured-io](https://github.com/Unstructured-IO/unstructured) library ($65m in funding and 8.5k GitHub stars). +[unstructured-io](https://github.com/Unstructured-IO/unstructured) library ($65m in funding and 8.5k GitHub stars). For complete benchmarking details please consult our [benchmarking repository](https://github.com/yobix-ai/extractous-benchmarks) ![unstructured_vs_extractous](https://github.com/yobix-ai/extractous-benchmarks/raw/main/docs/extractous_vs_unstructured.gif) @@ -55,7 +55,7 @@ With Extractous, the need for external services or APIs is eliminated, making da * High-performance unstructured data extraction optimized for speed and low memory usage. * Clear and simple API for extracting text and metadata content. * Automatically identifies document types and extracts content accordingly -* Supports [many file formats](#supported-file-formats) (most formats supported by Apache Tika). +* Supports [many file formats](#supported-file-formats) (most formats supported by Apache Tika). * Extracts text from images and scanned documents with OCR through [tesseract-ocr](https://github.com/tesseract-ocr/tesseract). * Core engine written in Rust with bindings for [Python](https://pypi.org/project/extractous/) and upcoming support for JavaScript/TypeScript. * Detailed documentation and examples to help you get started quickly and efficiently. 
@@ -77,13 +77,20 @@ extractor.set_extract_string_max_length(1000) result = extractor.extract_file_to_string("README.md") print(result) ``` -* Extracting a file to a buffered stream: +* Extracting a file(URL / bytearray) to a buffered stream: ```python from extractous import Extractor extractor = Extractor() +# for file reader = extractor.extract_file("tests/quarkus.pdf") +# for url +# reader = extractor.extract_url("https://www.google.com") +# for bytearray +# with open("tests/quarkus.pdf", "rb") as file: +# buffer = bytearray(file.read()) +# reader = extractor.extract_bytes(buffer) result = "" buffer = reader.read(4096) @@ -122,9 +129,10 @@ fn main() { } ``` -* Extract a content of a file to a `StreamReader` and perform buffered reading +* Extract a content of a file(URL/ bytes) to a `StreamReader` and perform buffered reading ```rust -use std::io::Read; +use std::io::{BufReader, Read}; +// use std::fs::File; use for bytes use extractous::Extractor; fn main() { @@ -135,17 +143,25 @@ fn main() { // Extract the provided file content to a string let extractor = Extractor::new(); let stream = extractor.extract_file(file_path).unwrap(); + // Extract url + // let stream = extractor.extract_url("https://www.google.com/").unwrap(); + // Extract bytes + // let mut file = File::open(file_path)?; + // let mut buffer = Vec::new(); + // file.read_to_end(&mut buffer)?; + // let stream= extractor.extract_bytes(&file_bytes); // Because stream implements std::io::Read trait we can perform buffered reading // For example we can use it to create a BufReader + let mut reader = BufReader::new(stream); let mut buffer = Vec::new(); - stream.read_to_end(&mut buffer).unwrap(); + reader.read_to_end(&mut buffer).unwrap(); println!("{}", String::from_utf8(buffer).unwrap()) } ``` -* Extract content of PDF with OCR. +* Extract content of PDF with OCR. You need to have Tesseract installed with the language pack. 
For example on debian `sudo apt install tesseract-ocr tesseract-ocr-deu` @@ -154,7 +170,7 @@ use extractous::Extractor; fn main() { let file_path = "../test_files/documents/deu-ocr.pdf"; - + let extractor = Extractor::new() .set_ocr_config(TesseractOcrConfig::new().set_language("deu")) .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY)); @@ -204,4 +220,4 @@ fn main() { Contributions are welcome! Please open an issue or submit a pull request if you have any improvements or new features to propose. ## 🕮 License -This project is licensed under the Apache License 2.0. See the LICENSE file for details. \ No newline at end of file +This project is licensed under the Apache License 2.0. See the LICENSE file for details. diff --git a/bindings/extractous-python/README.md b/bindings/extractous-python/README.md index 9aa83ee..998bbae 100644 --- a/bindings/extractous-python/README.md +++ b/bindings/extractous-python/README.md @@ -1,6 +1,6 @@ # Extractous Python Bindings -This project provides Python bindings for the Extractous library, allowing you to use extractous functionality in +This project provides Python bindings for the Extractous library, allowing you to use extractous functionality in your Python applications. 
## Installation @@ -25,13 +25,20 @@ result = extractor.extract_file_to_string("README.md") print(result) ``` -Extracting a file to a buffered stream: +Extracting a file(URL / bytearray) to a buffered stream: ```python from extractous import Extractor extractor = Extractor() +# for file reader = extractor.extract_file("tests/quarkus.pdf") +# for url +# reader = extractor.extract_url("https://www.google.com") +# for bytearray +# with open("tests/quarkus.pdf", "rb") as file: +# buffer = bytearray(file.read()) +# reader = extractor.extract_bytes(buffer) result = "" buffer = reader.read(4096) @@ -51,4 +58,4 @@ extractor = Extractor().set_ocr_config(TesseractOcrConfig().set_language("deu")) result = extractor.extract_file_to_string("../../test_files/documents/eng-ocr.pdf") print(result) -``` \ No newline at end of file +``` diff --git a/bindings/extractous-python/examples/extract_to_stream.py b/bindings/extractous-python/examples/extract_to_stream.py new file mode 100755 index 0000000..8068f14 --- /dev/null +++ b/bindings/extractous-python/examples/extract_to_stream.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +import os +import sys + +from extractous import Extractor, PdfOcrStrategy, PdfParserConfig + + +def extract_to_stream(file_path: str): + + # Extract the file + extractor = Extractor() + reader = extractor.extract_file(in_file) + + buffer = bytearray(4096 * 4096) + while True: + bytes_read = reader.readinto(buffer) + # If no more data, exit the loop + if bytes_read == 0: + break + # Decode the valid portion of the buffer and append it to the result + chunk = buffer[:bytes_read].decode('utf-8') + print(chunk) + + +if __name__ == '__main__': + # Pare input args + if len(sys.argv) != 2: + print(f"Usage: '{sys.argv[0]}' ") + sys.exit(1) + in_file = sys.argv[1] + if not os.path.isfile(in_file): + raise FileNotFoundError(f"No such file: '{in_file}'") + + extract_to_stream(in_file) diff --git a/bindings/extractous-python/src/extractor.rs 
b/bindings/extractous-python/src/extractor.rs index 7376cca..ed95e7b 100644 --- a/bindings/extractous-python/src/extractor.rs +++ b/bindings/extractous-python/src/extractor.rs @@ -75,6 +75,18 @@ impl StreamReader { ))), } } + + /// Reads into the specified buffer + pub fn readinto<'py>(&mut self, buf: Bound<'py, PyByteArray>) -> PyResult { + let bs = unsafe { buf.as_bytes_mut() }; + + let bytes_read = self.reader.read(bs) + .map_err(|e| PyErr::new::( + format!("{}", e)) + )?; + Ok(bytes_read) + } + } /// `Extractor` is the entry for all extract APIs @@ -147,6 +159,39 @@ impl Extractor { .map_err(|e| PyErr::new::(format!("{:?}", e))) } + /// Extracts text from a bytearray. Returns a stream of the extracted text + /// the stream is decoded using the extractor's `encoding` + pub fn extract_bytes(&self, buffer: &Bound<'_, PyByteArray>) -> PyResult { + let slice = buffer.to_vec(); + let reader = self + .0 + .extract_bytes(&slice) + .map_err(|e| PyErr::new::(format!("{:?}", e)))?; + + // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes + Ok(StreamReader { + reader, + buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE), + py_bytes: None, + }) + } + + /// Extracts text from a url. 
Returns a string that is of maximum length + /// of the extractor's `extract_string_max_length` + pub fn extract_url(&self, url: &str) -> PyResult { + let reader = self + .0 + .extract_url(&url) + .map_err(|e| PyErr::new::(format!("{:?}", e)))?; + + // Create a new `StreamReader` with initial buffer capacity of ecore::DEFAULT_BUF_SIZE bytes + Ok(StreamReader { + reader, + buffer: Vec::with_capacity(ecore::DEFAULT_BUF_SIZE), + py_bytes: None, + }) + } + fn __repr__(&self) -> String { format!("{:?}", self.0) } diff --git a/bindings/extractous-python/tests/test_extract_bytes_to_stream.py b/bindings/extractous-python/tests/test_extract_bytes_to_stream.py new file mode 100644 index 0000000..32be6a7 --- /dev/null +++ b/bindings/extractous-python/tests/test_extract_bytes_to_stream.py @@ -0,0 +1,40 @@ +import pytest + +from extractous import Extractor +from utils import cosine_similarity, read_to_string, read_file_to_bytearray + +TEST_CASES = [ + ("2022_Q3_AAPL.pdf", 0.9), + ("science-exploration-1p.pptx", 0.9), + ("simple.odt", 0.9), + ("table-multi-row-column-cells-actual.csv", 0.9), + ("vodafone.xlsx", 0.4), + ("category-level.docx", 0.9), + ("simple.doc", 0.9), + ("simple.pptx", 0.9), + ("table-multi-row-column-cells.png", -1.0), + ("winter-sports.epub", 0.9), + ("bug_16.docx", 0.9), + #("eng-ocr.pdf", 0.9), +] + + +@pytest.mark.parametrize("file_name, target_dist", TEST_CASES) +def test_extract_bytes_to_stream(file_name, target_dist): + """Test the extraction from bytes of various file types.""" + original_filepath = f"../../test_files/documents/{file_name}" + expected_result_filepath = f"../../test_files/expected_result/{file_name}.txt" + + file_bytes = read_file_to_bytearray(original_filepath) + + extractor = Extractor() + reader = extractor.extract_bytes(file_bytes) + result = read_to_string(reader) + + # Expected + with open(expected_result_filepath, "r", encoding="utf8") as file: + expected = file.read() + + assert cosine_similarity(result, expected) > 
target_dist, \ + f"Cosine similarity is less than {target_dist} for file: {file_name}" + diff --git a/bindings/extractous-python/tests/test_extract_file_to_string.py b/bindings/extractous-python/tests/test_extract_file_to_string.py index ed3dbe8..95b5bbb 100644 --- a/bindings/extractous-python/tests/test_extract_file_to_string.py +++ b/bindings/extractous-python/tests/test_extract_file_to_string.py @@ -15,9 +15,10 @@ ("table-multi-row-column-cells.png", -1.0), ("winter-sports.epub", 0.9), ("bug_16.docx", 0.9), - ("deu-ocr.pdf", 0.9), + #("eng-ocr.pdf", 0.9), ] + @pytest.mark.parametrize("file_name, target_dist", TEST_CASES) def test_extract_file_to_string(file_name, target_dist): """Test the extraction and comparison of various file types.""" diff --git a/bindings/extractous-python/tests/test_extract_url.py b/bindings/extractous-python/tests/test_extract_url.py new file mode 100644 index 0000000..b6f4158 --- /dev/null +++ b/bindings/extractous-python/tests/test_extract_url.py @@ -0,0 +1,10 @@ +from extractous import Extractor +from utils import read_to_string + +def test_extract_url(): + extractor = Extractor() + + reader = extractor.extract_url("https://www.google.com") + result = read_to_string(reader) + + assert "Google" in result diff --git a/bindings/extractous-python/tests/test_ocr.py b/bindings/extractous-python/tests/test_ocr.py index 7f4de09..4baaf76 100644 --- a/bindings/extractous-python/tests/test_ocr.py +++ b/bindings/extractous-python/tests/test_ocr.py @@ -1,19 +1,20 @@ from extractous import Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig from utils import cosine_similarity + def test_ara_ocr_png(): ocr_config = TesseractOcrConfig().set_language("ara") extractor = Extractor().set_ocr_config(ocr_config) result = extractor.extract_file_to_string("../../test_files/documents/ara-ocr.png") - with open("../../test_files/expected_result/ara-ocr.png.txt", "r", encoding="utf8") as file: + with 
open("../../test_files/expected_result/ara-ocr.png.txt", "r", encoding="utf8") as file: expected = file.read() - assert cosine_similarity(result, expected) + assert cosine_similarity(result, expected) > 0.9 -def test_ocr_only_strategy_extract_deu_ocr_pdf_to_string(): - test_file = "../../test_files/documents/eng-ocr.pdf" +def test_extract_file_to_string_ocr_only_strategy_deu_ocr_pdf(): + test_file = "../../test_files/documents/deu-ocr.pdf" expected_result_file = "../../test_files/expected_result/deu-ocr.pdf.txt" pdf_config = PdfParserConfig().set_ocr_strategy(PdfOcrStrategy.OCR_ONLY) @@ -26,12 +27,13 @@ def test_ocr_only_strategy_extract_deu_ocr_pdf_to_string(): result = extractor.extract_file_to_string(test_file) - with open(expected_result_file, "r", encoding="utf8") as file: + with open(expected_result_file, "r", encoding="utf8") as file: expected = file.read() - assert cosine_similarity(result, expected) + assert cosine_similarity(result, expected) > 0.9 + -def test_no_ocr_strategy_extract_deu_ocr_pdf_to_string(): +def test_extract_file_to_string_no_ocr_strategy_deu_ocr_pdf(): test_file = "../../test_files/documents/deu-ocr.pdf" pdf_config = PdfParserConfig() @@ -39,8 +41,8 @@ ocr_config = TesseractOcrConfig() ocr_config = ocr_config.set_language("deu") - extractor = Extractor().set_ocr_config(ocr_config).set_pdf_config(PdfParserConfig().set_ocr_strategy(PdfOcrStrategy.NO_OCR)) + extractor = Extractor().set_ocr_config(ocr_config).set_pdf_config(pdf_config) result = extractor.extract_file_to_string(test_file) - assert result.strip() == "" \ No newline at end of file + assert result.strip() == "" diff --git a/bindings/extractous-python/tests/test_pdf.py b/bindings/extractous-python/tests/test_pdf.py index 5e85f3c..a14d9ed 100644 --- a/bindings/extractous-python/tests/test_pdf.py +++ b/bindings/extractous-python/tests/test_pdf.py @@ -1,4 +1,5 @@ from extractous import Extractor +from utils import 
read_to_string def expected_result(): @@ -12,16 +13,23 @@ def test_extract_file_to_string(): #print(result) assert result == expected_result() - def test_extract_file(): extractor = Extractor() reader = extractor.extract_file("tests/quarkus.pdf") - result = "" - b = reader.read(4096) - while len(b) > 0: - result += b.decode("utf-8") - b = reader.read(4096) + result = read_to_string(reader) #print(result) - assert result == expected_result() \ No newline at end of file + assert result == expected_result() + +def test_extract_bytes(): + extractor = Extractor() + + with open("tests/quarkus.pdf", "rb") as file: + buffer = bytearray(file.read()) + reader = extractor.extract_bytes(buffer) + + result = read_to_string(reader) + + #print(result) + assert result == expected_result() diff --git a/bindings/extractous-python/tests/utils.py b/bindings/extractous-python/tests/utils.py index 30c3944..b153895 100644 --- a/bindings/extractous-python/tests/utils.py +++ b/bindings/extractous-python/tests/utils.py @@ -1,6 +1,7 @@ from sklearn.feature_extraction.text import CountVectorizer from sklearn.metrics.pairwise import cosine_similarity as cosine_sim + def cosine_similarity(text1, text2): """Calculate the cosine similarity between two texts.""" @@ -10,4 +11,37 @@ def cosine_similarity(text1, text2): # Calculate cosine similarity between the two vectors cos_sim = cosine_sim(vectors) - return cos_sim[0][1] \ No newline at end of file + return cos_sim[0][1] + + +# def read_to_string(reader): +# """Read from stream to string.""" +# result = "" +# b = reader.read(4096) +# while len(b) > 0: +# result += b.decode("utf-8") +# b = reader.read(4096) +# return result + +def read_to_string(reader): + """Read from stream to string.""" + utf8_string = [] + buffer = bytearray(4096) + + while True: + bytes_read = reader.readinto(buffer) + # If no more data, exit the loop + if bytes_read == 0: + break + # Decode the valid portion of the buffer and append it to the result + 
utf8_string.append(buffer[:bytes_read].decode('utf-8')) + + # Join all parts into a single string + return ''.join(utf8_string) + + +def read_file_to_bytearray(file_path: str): + """Read file to bytes array.""" + with open(file_path, 'rb') as file: + file_content = bytearray(file.read()) + return file_content diff --git a/extractous-core/README.md b/extractous-core/README.md index 4e04bbb..3e55a42 100644 --- a/extractous-core/README.md +++ b/extractous-core/README.md @@ -49,8 +49,9 @@ fn main() { } ``` -* Extract a content of a file to a `StreamReader` and perform buffered reading +* Extract a content of a file(URL/ bytes) to a `StreamReader` and perform buffered reading ```rust +// use std::fs::File; use for bytes use std::io::{BufReader, Read}; use extractous::Extractor; @@ -62,6 +63,13 @@ fn main() { // Extract the provided file content to a string let extractor = Extractor::new(); let stream = extractor.extract_file(file_path).unwrap(); + // Extract url + // let stream = extractor.extract_url("https://www.google.com/").unwrap(); + // Extract bytes + // let mut file = File::open(file_path)?; + // let mut buffer = Vec::new(); + // file.read_to_end(&mut buffer)?; + // let stream= extractor.extract_bytes(&file_bytes); // Because stream implements std::io::Read trait we can perform buffered reading // For example we can use it to create a BufReader @@ -80,7 +88,7 @@ use extractous::Extractor; fn main() { let file_path = "../test_files/documents/deu-ocr.pdf"; - + let extractor = Extractor::new() .set_ocr_config(TesseractOcrConfig::new().set_language("deu")) .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY)); @@ -94,11 +102,11 @@ fn main() { ## Building ### Requirements -* Extractous uses [Apache Tika](https://tika.apache.org/) for file formats that are not natively supported in Rust. - However, to achieve one of Extractous goals, which is speed and efficiency, we do not set up any Tika as a servers or - run any Java code. 
We instead, compile [Apache Tika](https://tika.apache.org/) as native shared libraries and use - them on our Rust core as ffi. [GraalVm](https://www.graalvm.org/) is required to build Tika as native libs. -* The provided build script already takes care of installing the required GraalVM JDK. However, if you want to use a +* Extractous uses [Apache Tika](https://tika.apache.org/) for file formats that are not natively supported in Rust. + However, to achieve one of Extractous goals, which is speed and efficiency, we do not set up any Tika as a servers or + run any Java code. We instead, compile [Apache Tika](https://tika.apache.org/) as native shared libraries and use + them on our Rust core as ffi. [GraalVm](https://www.graalvm.org/) is required to build Tika as native libs. +* The provided build script already takes care of installing the required GraalVM JDK. However, if you want to use a specific local version, you can do so by setting the GRAALVM_HOME environment variable * We recommend using [sdkman](https://sdkman.io/install) to install GraalVM JDKs * `sdk install java 22.0.1-graalce` @@ -112,16 +120,18 @@ OpenJDK 64-Bit Server VM Liberica-NIK-24.0.1-1 (build 22.0.1+10, mixed mode, sha * On macOS the official GraalVM JDKs fail to work with code that use java awt. On macOS, we recommend using Bellsoft Liberica NIK * `sdk install java 24.0.1.r22-nik` -* Extractous supports OCR through [tesseract](https://github.com/tesseract-ocr/tesseract), make sure tesseract is +* Extractous supports OCR through [tesseract](https://github.com/tesseract-ocr/tesseract), make sure tesseract is installed on your system because some of the OCR tests will fail if no tesseract is found. * `sudo apt install tesseract-ocr` -* Install any language extensions you want. for example to install German and Arabic: +* Install any language extensions you want. 
for example to install German and Arabic: * `sudo apt install tesseract-ocr-deu tesseract-ocr-ara` +* On Mac +* `brew install tesseract tesseract-lang` ### Building Extractous -* To build Extractous, just run: +* To build Extractous, just run: * `cargo build` ### Running Tests * To run tests, just run: -* `cargo test` \ No newline at end of file +* `cargo test` diff --git a/extractous-core/examples/extract_to_stream.rs b/extractous-core/examples/extract_to_stream.rs index 7c99f85..9bbb142 100644 --- a/extractous-core/examples/extract_to_stream.rs +++ b/extractous-core/examples/extract_to_stream.rs @@ -1,4 +1,5 @@ use extractous::Extractor; +// use std::fs::File; use for bytes use std::io::{BufReader, Read}; fn main() { @@ -9,6 +10,14 @@ fn main() { // Extract the provided file content to a string let extractor = Extractor::new(); let stream = extractor.extract_file(file_path).unwrap(); + // Extract url + // let stream = extractor.extract_url("https://www.google.com/").unwrap(); + // Extract bytes + // let mut file = File::open(file_path)?; + // let mut buffer = Vec::new(); + // file.read_to_end(&mut buffer)?; + // let stream= extractor.extract_bytes(&file_bytes).unwrap(); + // Because stream implements std::io::Read trait we can perform buffered reading // For example we can use it to create a BufReader let mut reader = BufReader::new(stream); diff --git a/extractous-core/src/extractor.rs b/extractous-core/src/extractor.rs index 113e303..9917afa 100644 --- a/extractous-core/src/extractor.rs +++ b/extractous-core/src/extractor.rs @@ -124,6 +124,30 @@ impl Extractor { ) } + /// Extracts text from a byte buffer. Returns a stream of the extracted text + /// the stream is decoded using the extractor's `encoding` + pub fn extract_bytes(&self, buffer: &[u8]) -> ExtractResult { + tika::parse_bytes( + buffer, + &self.encoding, + &self.pdf_config, + &self.office_config, + &self.ocr_config, + ) + } + + /// Extracts text from a url. 
Returns a stream of the extracted text + /// the stream is decoded using the extractor's `encoding` + pub fn extract_url(&self, url: &str) -> ExtractResult { + tika::parse_url( + url, + &self.encoding, + &self.pdf_config, + &self.office_config, + &self.ocr_config, + ) + } + /// Extracts text from a file path. Returns a string that is of maximum length /// of the extractor's `extract_string_max_length` pub fn extract_file_to_string(&self, file_path: &str) -> ExtractResult { @@ -141,10 +165,13 @@ impl Extractor { mod tests { use crate::Extractor; use std::fs::File; - use std::io::prelude::*; use std::io::BufReader; + use std::io::{self, Read}; + + use super::StreamReader; const TEST_FILE: &str = "README.md"; + const TEST_URL: &str = "https://www.google.com/"; fn expected_content() -> String { let mut file = File::open(TEST_FILE).unwrap(); @@ -153,6 +180,15 @@ mod tests { content } + fn read_content_from_stream(stream: StreamReader) -> String { + let mut reader = BufReader::new(stream); + let mut buffer = Vec::new(); + reader.read_to_end(&mut buffer).unwrap(); + + let content = String::from_utf8(buffer).unwrap(); + content + } + #[test] fn extract_file_test() { // Prepare expected_content @@ -161,17 +197,8 @@ mod tests { // Parse the files using extractous let extractor = Extractor::new(); let result = extractor.extract_file(TEST_FILE); - let mut reader = BufReader::new(result.unwrap()); - let mut buffer = Vec::new(); - reader.read_to_end(&mut buffer).unwrap(); - - let content = String::from_utf8(buffer).unwrap(); + let content = read_content_from_stream(result.unwrap()); assert_eq!(content.trim(), expected_content.trim()); - - // let mut reader = BufReader::new(result.unwrap()); - // let mut line = String::new(); - // let _len = reader.read_line(&mut line).unwrap(); - //assert_eq!("# Extractous", line.trim()); } #[test] @@ -185,4 +212,33 @@ mod tests { let content = result.unwrap(); assert_eq!(content.trim(), expected_content.trim()); } + + fn 
read_file_as_bytes(path: &str) -> io::Result<Vec<u8>> { + let mut file = File::open(path)?; + let mut buffer = Vec::new(); + file.read_to_end(&mut buffer)?; + Ok(buffer) + } + + #[test] + fn extract_bytes_test() { + // Prepare expected_content + let expected_content = expected_content(); + + // Parse the bytes using extractous + let file_bytes = read_file_as_bytes(TEST_FILE).unwrap(); + let extractor = Extractor::new(); + let result = extractor.extract_bytes(&file_bytes); + let content = read_content_from_stream(result.unwrap()); + assert_eq!(content.trim(), expected_content.trim()); + } + + #[test] + fn extract_url_test() { + // Parse url by extractous + let extractor = Extractor::new(); + let result = extractor.extract_url(&TEST_URL); + let content = read_content_from_stream(result.unwrap()); + assert!(content.contains("Google")); + } } diff --git a/extractous-core/src/tika/jni_utils.rs b/extractous-core/src/tika/jni_utils.rs index 3eb9de6..a99bae2 100644 --- a/extractous-core/src/tika/jni_utils.rs +++ b/extractous-core/src/tika/jni_utils.rs @@ -1,11 +1,23 @@ use std::os::raw::{c_char, c_void}; use jni::errors::jni_error_code_to_result; -use jni::objects::{JObject, JString, JValue, JValueOwned}; +use jni::objects::{JByteBuffer, JObject, JString, JValue, JValueOwned}; use jni::{sys, JNIEnv, JavaVM}; use crate::errors::{Error, ExtractResult}; +/// Creates a JNI direct ByteBuffer that wraps the given raw memory without copying it. +/// The caller must keep the backing memory alive and unmoved while the buffer is in use. +pub fn jni_new_direct_buffer<'local>( + env: &mut JNIEnv<'local>, + data: *mut u8, + len: usize, +) -> ExtractResult<JByteBuffer<'local>> { + let direct_byte_buffer = unsafe { env.new_direct_byte_buffer(data, len) } + .map_err(|_e| Error::JniEnvCall("Failed to create direct byte buffer"))?; + + Ok(direct_byte_buffer) +} + /// Calls a static method and prints any thrown exceptions to stderr pub fn jni_call_static_method<'local>( env: &mut JNIEnv<'local>, @@ -99,20 +111,23 @@ pub fn jni_check_exception(env: &mut JNIEnv) -> ExtractResult { /// linked in by the build script. 
pub fn create_vm_isolate() -> JavaVM { unsafe { - // let mut option0 = sys::JavaVMOption { - // optionString: "-Djava.awt.headless=true".as_ptr() as *mut c_char, - // extraInfo: std::ptr::null_mut(), - // }; - - // Set java.library.path to be able to load libawt.so, which must be in the same dir as libtika_native.so - let mut options = sys::JavaVMOption { - optionString: "-Djava.library.path=.".as_ptr() as *mut c_char, - extraInfo: std::ptr::null_mut(), - }; + let vm_options: Vec<sys::JavaVMOption> = vec![ + // Set java.library.path to be able to load libawt.so, which must be in the same dir as libtika_native.so + sys::JavaVMOption { + optionString: "-Djava.library.path=.".as_ptr() as *mut c_char, + extraInfo: std::ptr::null_mut(), + }, + // enable awt headless mode + sys::JavaVMOption { + optionString: "-Djava.awt.headless=true".as_ptr() as *mut c_char, + extraInfo: std::ptr::null_mut(), + }, + ]; + let mut args = sys::JavaVMInitArgs { version: sys::JNI_VERSION_1_8, - nOptions: 1, - options: &mut options, + nOptions: vm_options.len() as sys::jint, + options: vm_options.as_ptr() as *mut sys::JavaVMOption, ignoreUnrecognized: sys::JNI_TRUE, }; let mut ptr: *mut sys::JavaVM = std::ptr::null_mut(); diff --git a/extractous-core/src/tika/parse.rs b/extractous-core/src/tika/parse.rs index a019e9b..8766d27 100644 --- a/extractous-core/src/tika/parse.rs +++ b/extractous-core/src/tika/parse.rs @@ -1,7 +1,7 @@ use std::sync::OnceLock; use jni::objects::JValue; -use jni::JavaVM; +use jni::{AttachGuard, JavaVM}; use crate::errors::ExtractResult; use crate::tika::jni_utils::*; @@ -17,18 +17,23 @@ pub(crate) fn vm() -> &'static JavaVM { GRAAL_VM.get_or_init(create_vm_isolate) } -pub fn parse_file( - file_path: &str, +fn get_vm_attach_current_thread<'local>() -> ExtractResult<AttachGuard<'local>> { + // Attaching a thread that is already attached is a no-op. 
Good to have this in case this method + // is called from another thread + let env = vm().attach_current_thread()?; + Ok(env) +} + +fn parse_to_stream( + mut env: AttachGuard, + data_source_val: JValue, char_set: &CharSet, pdf_conf: &PdfParserConfig, office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, + method_name: &str, + signature: &str, ) -> ExtractResult { - // Attaching a thead that is already attached is a no-op. Good to have this in case this method - // is called from another thread - let mut env = vm().attach_current_thread()?; - - let file_path_val = jni_new_string_as_jvalue(&mut env, file_path)?; let charset_name_val = jni_new_string_as_jvalue(&mut env, &char_set.to_string())?; let j_pdf_conf = JPDFParserConfig::new(&mut env, pdf_conf)?; let j_office_conf = JOfficeParserConfig::new(&mut env, office_conf)?; @@ -38,15 +43,10 @@ pub fn parse_file( let call_result = jni_call_static_method( &mut env, "ai/yobix/TikaNativeMain", - "parseFile", - "(Ljava/lang/String;\ - Ljava/lang/String;\ - Lorg/apache/tika/parser/pdf/PDFParserConfig;\ - Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ - Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ - )Lai/yobix/ReaderResult;", + method_name, + signature, &[ - (&file_path_val).into(), + data_source_val, (&charset_name_val).into(), (&j_pdf_conf.internal).into(), (&j_office_conf.internal).into(), @@ -62,6 +62,33 @@ pub fn parse_file( Ok(StreamReader { inner: j_reader }) } +pub fn parse_file( + file_path: &str, + char_set: &CharSet, + pdf_conf: &PdfParserConfig, + office_conf: &OfficeParserConfig, + ocr_conf: &TesseractOcrConfig, +) -> ExtractResult { + let mut env = get_vm_attach_current_thread()?; + + let file_path_val = jni_new_string_as_jvalue(&mut env, file_path)?; + parse_to_stream( + env, + (&file_path_val).into(), + char_set, + pdf_conf, + office_conf, + ocr_conf, + "parseFile", + "(Ljava/lang/String;\ + Ljava/lang/String;\ + Lorg/apache/tika/parser/pdf/PDFParserConfig;\ + 
Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ + Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + )Lai/yobix/ReaderResult;", + ) +} + /// Parses a file to a string using the Apache Tika library. pub fn parse_file_to_string( file_path: &str, @@ -70,9 +97,7 @@ pub fn parse_file_to_string( office_conf: &OfficeParserConfig, ocr_conf: &TesseractOcrConfig, ) -> ExtractResult { - // Attaching a thead that is already attached is a no-op. Good to have this in case this method - // is called from another thread - let mut env = vm().attach_current_thread()?; + let mut env = get_vm_attach_current_thread()?; // Create a new Java string from the Rust string let file_path_val = jni_new_string_as_jvalue(&mut env, file_path)?; @@ -102,3 +127,62 @@ pub fn parse_file_to_string( Ok(result.content) } + +pub fn parse_bytes( + buffer: &[u8], + char_set: &CharSet, + pdf_conf: &PdfParserConfig, + office_conf: &OfficeParserConfig, + ocr_conf: &TesseractOcrConfig, +) -> ExtractResult { + let mut env = get_vm_attach_current_thread()?; + + // Because we know the buffer is used for reading only, cast it to *mut u8 to satisfy the + // jni_new_direct_buffer call, which requires a mutable pointer + let mut_ptr: *mut u8 = buffer.as_ptr() as *mut u8; + + let byte_buffer = jni_new_direct_buffer(&mut env, mut_ptr, buffer.len())?; + + parse_to_stream( + env, + (&byte_buffer).into(), + char_set, + pdf_conf, + office_conf, + ocr_conf, + "parseBytes", + "(Ljava/nio/ByteBuffer;\ + Ljava/lang/String;\ + Lorg/apache/tika/parser/pdf/PDFParserConfig;\ + Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ + Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + )Lai/yobix/ReaderResult;", + ) +} + +pub fn parse_url( + url: &str, + char_set: &CharSet, + pdf_conf: &PdfParserConfig, + office_conf: &OfficeParserConfig, + ocr_conf: &TesseractOcrConfig, +) -> ExtractResult { + let mut env = get_vm_attach_current_thread()?; + + let url_val = jni_new_string_as_jvalue(&mut env, url)?; + parse_to_stream( + env, + 
(&url_val).into(), + char_set, + pdf_conf, + office_conf, + ocr_conf, + "parseUrl", + "(Ljava/lang/String;\ + Ljava/lang/String;\ + Lorg/apache/tika/parser/pdf/PDFParserConfig;\ + Lorg/apache/tika/parser/microsoft/OfficeParserConfig;\ + Lorg/apache/tika/parser/ocr/TesseractOCRConfig;\ + )Lai/yobix/ReaderResult;", + ) +} diff --git a/extractous-core/tests/extract_to_stream_tests.rs b/extractous-core/tests/extract_to_stream_tests.rs new file mode 100644 index 0000000..c29d089 --- /dev/null +++ b/extractous-core/tests/extract_to_stream_tests.rs @@ -0,0 +1,74 @@ +extern crate test_case; +extern crate textdistance; + +use extractous::{Extractor, PdfOcrStrategy, PdfParserConfig, TesseractOcrConfig}; +use std::fs; +use std::io::Read; +use test_case::test_case; +use textdistance::nstr::cosine; + +#[test_case("2022_Q3_AAPL.pdf", 0.9; "Test PDF file")] +#[test_case("science-exploration-1p.pptx", 0.9; "Test PPTX file")] +#[test_case("simple.odt", 0.8; "Test ODT file")] +#[test_case("table-multi-row-column-cells-actual.csv", 0.8; "Test CSV file")] +#[test_case("vodafone.xlsx", 0.4; "Test XLSX file")] +#[test_case("category-level.docx", 0.9; "Test DOCX file")] +#[test_case("simple.doc", 0.9; "Test DOC file")] +#[test_case("simple.pptx", 0.9; "Test another PPTX file")] +#[test_case("table-multi-row-column-cells.png", -1.0; "Test PNG file")] +#[test_case("winter-sports.epub", 0.9; "Test EPUB file")] +#[test_case("bug_16.docx", 0.9; "Test bug16 DOCX file")] +//#[test_case("eng-ocr.pdf", 0.9; "Test eng-ocr PDF file")] +fn test_extract_bytes_to_stream(file_name: &str, target_dist: f64) { + let extractor = Extractor::new(); + + let bytes = fs::read(&format!("../test_files/documents/{}", file_name)).unwrap(); + let mut stream = extractor.extract_bytes(&bytes).unwrap(); + + let mut buffer = Vec::new(); + stream.read_to_end(&mut buffer).unwrap(); + let extracted = String::from_utf8_lossy(&buffer); + + // read expected string + let expected = + 
fs::read_to_string(format!("../test_files/expected_result/{}.txt", file_name)).unwrap(); + + let dist = cosine(&expected, &extracted); + assert!( + dist > target_dist, + "Cosine similarity is less than {} for file: {}, dist: {}", + target_dist, + file_name, + dist + ); + println!("{}: {}", file_name, dist); +} + +#[test] +fn test_extract_bytes_to_stream_ara_ocr_png() { + let extractor = Extractor::new() + .set_ocr_config(TesseractOcrConfig::new().set_language("ara")) + .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::NO_OCR)); + + // extract file with extractor + let bytes = fs::read(&"../test_files/documents/ara-ocr.png".to_string()).unwrap(); + let mut stream = extractor.extract_bytes(&bytes).unwrap(); + + let mut buffer = Vec::new(); + stream.read_to_end(&mut buffer).unwrap(); + let extracted = String::from_utf8_lossy(&buffer); + + println!("{}", extracted); + + // read expected string + let expected = + fs::read_to_string("../test_files/expected_result/ara-ocr.png.txt".to_string()).unwrap(); + + let dist = cosine(&expected, &extracted); + assert!( + dist > 0.9, + "Cosine similarity is less than 0.9 for file: ara-ocr.png, dist: {}", + dist + ); + println!("{}: {}", "ara-ocr.png", dist); +} diff --git a/extractous-core/tests/extractor_test.rs b/extractous-core/tests/extract_to_string_tests.rs similarity index 87% rename from extractous-core/tests/extractor_test.rs rename to extractous-core/tests/extract_to_string_tests.rs index 5322c3f..7456442 100644 --- a/extractous-core/tests/extractor_test.rs +++ b/extractous-core/tests/extract_to_string_tests.rs @@ -17,7 +17,7 @@ use textdistance::nstr::cosine; #[test_case("table-multi-row-column-cells.png", -1.0; "Test PNG file")] #[test_case("winter-sports.epub", 0.9; "Test EPUB file")] #[test_case("bug_16.docx", 0.9; "Test bug16 DOCX file")] -#[test_case("eng-ocr.pdf", 0.9; "Test eng-ocr PDF file")] +//#[test_case("eng-ocr.pdf", 0.9; "Test eng-ocr PDF file")] fn 
test_extract_file_to_string(file_name: &str, target_dist: f64) { let extractor = Extractor::new().set_extract_string_max_length(1000000); // extract file with extractor @@ -40,7 +40,7 @@ fn test_extract_file_to_string(file_name: &str, target_dist: f64) { } #[test] -fn test_extract_ara_ocr_png_to_string() { +fn test_extract_file_to_string_ara_ocr_png() { let extractor = Extractor::new() .set_ocr_config(TesseractOcrConfig::new().set_language("ara")) .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::NO_OCR)); @@ -61,18 +61,18 @@ fn test_extract_ara_ocr_png_to_string() { "Cosine similarity is less than 0.9 for file: ara-ocr.png, dist: {}", dist ); - println!("{}: {}", "ara-ocr.png", dist); } +#[cfg(not(target_os = "macos"))] #[test] -fn test_ocr_only_strategy_extract_deu_ocr_pdf_to_string() { +fn test_extract_file_to_string_ocr_only_strategy_deu_ocr_pdf() { let extractor = Extractor::new() .set_ocr_config(TesseractOcrConfig::new().set_language("deu")) .set_pdf_config( PdfParserConfig::new() - .set_ocr_strategy(PdfOcrStrategy::OCR_ONLY) - .set_extract_inline_images(true) - .set_extract_unique_inline_images_only(true), + .set_ocr_strategy(PdfOcrStrategy::OCR_AND_TEXT_EXTRACTION) + .set_extract_inline_images(false) + .set_extract_unique_inline_images_only(false), ); // extract file with extractor let extracted = extractor @@ -89,11 +89,11 @@ fn test_ocr_only_strategy_extract_deu_ocr_pdf_to_string() { "Cosine similarity is less than 0.9 for file: ara-ocr.png, dist: {}", dist ); - println!("{}: {}", "ara-ocr.png", dist); } +#[cfg(not(target_os = "macos"))] #[test] -fn test_no_ocr_strategy_extract_deu_ocr_pdf_to_string() { +fn test_test_extract_file_to_string_no_ocr_strategy_deu_ocr_pdf() { let extractor = Extractor::new() .set_ocr_config(TesseractOcrConfig::new().set_language("deu")) .set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::NO_OCR)); diff --git a/extractous-core/tika-native/build.gradle 
b/extractous-core/tika-native/build.gradle index d153548..793ae26 100644 --- a/extractous-core/tika-native/build.gradle +++ b/extractous-core/tika-native/build.gradle @@ -66,6 +66,7 @@ graalvmNative { buildArgs.addAll( "-H:+AddAllCharsets", // Very important to get UTF8 working + "--enable-https", // Very important to get https working "-O3", "--parallelism=$numThreads", "-march=compatibility" // VERY IMPORTANT to use compatibility flag. If not the libs will use the cpu arch of the build machine and will notwork on other CPUs if distributed diff --git a/extractous-core/tika-native/src/main/java/ai/yobix/ByteBufferInputStream.java b/extractous-core/tika-native/src/main/java/ai/yobix/ByteBufferInputStream.java new file mode 100644 index 0000000..9abf3a2 --- /dev/null +++ b/extractous-core/tika-native/src/main/java/ai/yobix/ByteBufferInputStream.java @@ -0,0 +1,90 @@ +package ai.yobix; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; + +public class ByteBufferInputStream extends InputStream { + + private ByteBuffer bb; + + public ByteBufferInputStream(ByteBuffer bb) { + this.bb = bb; + } + + @Override + public int read() throws IOException { + if (bb == null) { + throw new IOException("read on a closed InputStream"); + } + + if (bb.remaining() == 0) { + return -1; + } + + return (bb.get() & 0xFF); // need to be in the range 0 to 255 + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + + if (bb == null) { + throw new IOException("read on a closed InputStream"); + } + + if (b == null) { + throw new NullPointerException(); + } else if (off < 0 || len < 0 || len > b.length - off) { + throw new IndexOutOfBoundsException(); + } else if (len == 0) { + return 0; + } + + int length = Math.min(bb.remaining(), len); + if (length == 0) { + return -1; + } + + bb.get(b, off, length); + return length; + } + + @Override + public long skip(long n) throws IOException { + + if (bb == null) { + throw new IOException("skip 
on a closed InputStream"); + } + + if (n <= 0) { + return 0; + } + + /* + * ByteBuffers have at most an int, so lose the upper bits. + * The contract allows this. + */ + int nInt = (int) n; + int skip = Math.min(bb.remaining(), nInt); + + bb.position(bb.position() + skip); + + return nInt; + } + + @Override + public int available() throws IOException { + + if (bb == null) { + throw new IOException("available on a closed InputStream"); + } + + return bb.remaining(); + } + + @Override + public void close() throws IOException { + bb = null; + } + +} diff --git a/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java b/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java index ba83662..b524b40 100644 --- a/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java +++ b/extractous-core/tika-native/src/main/java/ai/yobix/TikaNativeMain.java @@ -1,34 +1,22 @@ package ai.yobix; import org.apache.commons.io.input.ReaderInputStream; -import org.apache.tika.exception.WriteLimitReachedException; -import org.apache.tika.parser.ParsingReader; -import org.apache.tika.sax.BodyContentHandler; -import org.apache.tika.sax.WriteOutContentHandler; import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; - -import java.io.IOException; -import java.io.InputStream; -import java.io.Reader; -import java.net.MalformedURLException; -import java.net.URI; -import java.net.URISyntaxException; -import java.net.URL; -import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; -import java.nio.file.Path; -import java.nio.file.Paths; - +import org.apache.tika.exception.WriteLimitReachedException; +import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import 
org.apache.tika.parser.ParsingReader; import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.pdf.PDFParserConfig; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.WriteOutContentHandler; import org.graalvm.nativeimage.IsolateThread; import org.graalvm.nativeimage.c.function.CEntryPoint; import org.graalvm.nativeimage.c.type.CCharPointer; @@ -36,6 +24,19 @@ import org.graalvm.nativeimage.c.type.CTypeConversion; import org.xml.sax.SAXException; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.nio.file.Paths; + public class TikaNativeMain { private static final Tika tika = new Tika(); @@ -196,15 +197,17 @@ public static ReaderResult parseUrl( * @return ReaderResult */ public static ReaderResult parseBytes( - byte[] data, + ByteBuffer data, String charsetName, PDFParserConfig pdfConfig, OfficeParserConfig officeConfig, TesseractOCRConfig tesseractConfig ) { + final Metadata metadata = new Metadata(); - final TikaInputStream stream = TikaInputStream.get(data, metadata); + final ByteBufferInputStream inStream = new ByteBufferInputStream(data); + final TikaInputStream stream = TikaInputStream.get(inStream, new TemporaryResources(), metadata); return parse(stream, metadata, charsetName, pdfConfig, officeConfig, tesseractConfig); } diff --git a/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json b/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json index 288d373..496d5d3 100644 --- a/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json +++ 
b/extractous-core/tika-native/src/main/resources/META-INF/native-image/jni-config.json @@ -55,7 +55,7 @@ { "name": "parseBytes", "parameterTypes": [ - "byte[]", + "java.nio.ByteBuffer", "java.lang.String", "org.apache.tika.parser.pdf.PDFParserConfig", "org.apache.tika.parser.microsoft.OfficeParserConfig",