diff --git a/connectors/idp-extraction/element-templates/hybrid/hybrid-idp-extraction-outbound-connector-hybrid.json b/connectors/idp-extraction/element-templates/hybrid/hybrid-idp-extraction-outbound-connector-hybrid.json index 59add31cad..ad44180683 100644 --- a/connectors/idp-extraction/element-templates/hybrid/hybrid-idp-extraction-outbound-connector-hybrid.json +++ b/connectors/idp-extraction/element-templates/hybrid/hybrid-idp-extraction-outbound-connector-hybrid.json @@ -57,17 +57,17 @@ }, "type" : "Hidden" }, { - "id" : "input.documentUrl", - "label" : "Document URL", - "description" : "Specify the URL where the document is hosted", + "id" : "input.document", + "label" : "Document", + "description" : "Specify the document", "optional" : false, - "value" : "= input.documentUrl", + "value" : "= input.document", "constraints" : { "notEmpty" : true }, "group" : "input", "binding" : { - "name" : "input.documentUrl", + "name" : "input.document", "type" : "zeebe:input" }, "type" : "Hidden" diff --git a/connectors/idp-extraction/element-templates/idp-extraction-outbound-connector.json b/connectors/idp-extraction/element-templates/idp-extraction-outbound-connector.json index 5e10d1a3ed..b3773bcb18 100644 --- a/connectors/idp-extraction/element-templates/idp-extraction-outbound-connector.json +++ b/connectors/idp-extraction/element-templates/idp-extraction-outbound-connector.json @@ -52,17 +52,17 @@ }, "type" : "Hidden" }, { - "id" : "input.documentUrl", - "label" : "Document URL", - "description" : "Specify the URL where the document is hosted", + "id" : "input.document", + "label" : "Document", + "description" : "Specify the document", "optional" : false, - "value" : "= input.documentUrl", + "value" : "= input.document", "constraints" : { "notEmpty" : true }, "group" : "input", "binding" : { - "name" : "input.documentUrl", + "name" : "input.document", "type" : "zeebe:input" }, "type" : "Hidden" diff --git a/connectors/idp-extraction/src/main/java/io/camunda/connector/idp/extraction/ExtractionConnectorFunction.java b/connectors/idp-extraction/src/main/java/io/camunda/connector/idp/extraction/ExtractionConnectorFunction.java index 47b0579cdb..8ae477f15c 100644 --- a/connectors/idp-extraction/src/main/java/io/camunda/connector/idp/extraction/ExtractionConnectorFunction.java +++ b/connectors/idp-extraction/src/main/java/io/camunda/connector/idp/extraction/ExtractionConnectorFunction.java @@ -18,10 +18,7 @@ import io.camunda.connector.idp.extraction.supplier.BedrockRuntimeClientSupplier; import io.camunda.connector.idp.extraction.supplier.S3ClientSupplier; import io.camunda.connector.idp.extraction.supplier.TextractClientSupplier; -import java.net.URI; -import java.net.URL; import org.apache.pdfbox.Loader; -import org.apache.pdfbox.io.IOUtils; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.slf4j.Logger; @@ -97,16 +94,14 @@ public Object execute(OutboundConnectorContext context) { private String extractTextUsingAwsTextract(ExtractionRequest extractionRequest) throws Exception { return pollingTextractCaller.call( - extractionRequest.input().documentUrl(), + extractionRequest.input().document(), extractionRequest.input().s3BucketName(), textractClientSupplier.getTextractClient(extractionRequest), s3ClientSupplier.getAsyncS3Client(extractionRequest)); } private String extractTextUsingApachePdf(ExtractionRequest extractionRequest) throws Exception { - String documentUrl = extractionRequest.input().documentUrl(); - URL url = URI.create(documentUrl).toURL(); - PDDocument document = Loader.loadPDF(IOUtils.toByteArray(url.openStream())); + PDDocument document = Loader.loadPDF(extractionRequest.input().document().asByteArray()); PDFTextStripper pdfStripper = new PDFTextStripper(); return pdfStripper.getText(document); } diff --git a/connectors/idp-extraction/src/main/java/io/camunda/connector/idp/extraction/caller/PollingTextractCaller.java b/connectors/idp-extraction/src/main/java/io/camunda/connector/idp/extraction/caller/PollingTextractCaller.java index 4d20735f0e..7e7b5e6dc0 100644 --- a/connectors/idp-extraction/src/main/java/io/camunda/connector/idp/extraction/caller/PollingTextractCaller.java +++ b/connectors/idp-extraction/src/main/java/io/camunda/connector/idp/extraction/caller/PollingTextractCaller.java @@ -11,6 +11,7 @@ import io.camunda.connector.api.error.ConnectorException; import io.camunda.connector.idp.extraction.model.TextractTask; import io.camunda.connector.idp.extraction.utils.AwsS3Util; +import io.camunda.document.Document; import java.util.ArrayList; import java.util.List; import java.util.concurrent.Executors; @@ -31,13 +32,13 @@ public class PollingTextractCaller { private static final Logger LOGGER = LoggerFactory.getLogger(PollingTextractCaller.class); public String call( - String documentUrl, + Document document, String bucketName, TextractClient textractClient, S3AsyncClient s3AsyncClient) throws Exception { - S3Object s3Object = AwsS3Util.buildS3ObjectFromUrl(documentUrl, bucketName, s3AsyncClient); + S3Object s3Object = AwsS3Util.buildS3ObjectFromDocument(document, bucketName, s3AsyncClient); LOGGER.debug("Starting polling task for document analysis with document: {}", s3Object.name()); diff --git a/connectors/idp-extraction/src/main/java/io/camunda/connector/idp/extraction/model/ExtractionRequestData.java b/connectors/idp-extraction/src/main/java/io/camunda/connector/idp/extraction/model/ExtractionRequestData.java index fc207bafb7..09422a523b 100644 --- a/connectors/idp-extraction/src/main/java/io/camunda/connector/idp/extraction/model/ExtractionRequestData.java +++ b/connectors/idp-extraction/src/main/java/io/camunda/connector/idp/extraction/model/ExtractionRequestData.java @@ -10,6 +10,7 @@ import io.camunda.connector.generator.java.annotation.TemplateProperty; import io.camunda.connector.generator.java.annotation.TemplateProperty.PropertyBinding; import io.camunda.connector.generator.java.annotation.TemplateProperty.PropertyConstraints; +import io.camunda.document.Document; import jakarta.validation.constraints.NotNull; import java.util.List; @@ -27,17 +28,17 @@ public record ExtractionRequestData( @NotNull TextExtractionEngineType extractionEngineType, @TemplateProperty( - id = "documentUrl", - label = "Document URL", + id = "document", + label = "Document", group = "input", type = TemplateProperty.PropertyType.Hidden, - description = "Specify the URL where the document is hosted", - defaultValue = "= input.documentUrl", - binding = @PropertyBinding(name = "documentUrl"), + description = "Specify the document", + defaultValue = "= input.document", + binding = @PropertyBinding(name = "document"), feel = Property.FeelMode.disabled, constraints = @PropertyConstraints(notEmpty = true)) @NotNull - String documentUrl, + Document document, @TemplateProperty( id = "s3BucketName", label = "AWS S3 Bucket name", diff --git a/connectors/idp-extraction/src/main/java/io/camunda/connector/idp/extraction/utils/AwsS3Util.java b/connectors/idp-extraction/src/main/java/io/camunda/connector/idp/extraction/utils/AwsS3Util.java index 084adb4490..9e3c5dc560 100644 --- a/connectors/idp-extraction/src/main/java/io/camunda/connector/idp/extraction/utils/AwsS3Util.java +++ b/connectors/idp-extraction/src/main/java/io/camunda/connector/idp/extraction/utils/AwsS3Util.java @@ -6,11 +6,9 @@ */ package io.camunda.connector.idp.extraction.utils; +import io.camunda.document.Document; import java.io.IOException; import java.io.InputStream; -import java.net.URI; -import java.net.URL; -import java.net.URLConnection; import java.util.UUID; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutorService; @@ -28,22 +26,16 @@ public class AwsS3Util { private static final Logger LOGGER = LoggerFactory.getLogger(AwsS3Util.class); - private static String uploadNewFileFromUrl( - final String documentUrl, final String bucketName, final S3AsyncClient s3AsyncClient) + private static String uploadNewFileFromDocument( + final Document document, final String bucketName, final S3AsyncClient s3AsyncClient) throws IOException { String documentKey = UUID.randomUUID().toString(); LOGGER.debug("Starting document upload to AWS S3 with key {}", documentKey); - URL url = URI.create(documentUrl).toURL(); - URLConnection urlConnection = url.openConnection(); - long contentLength = urlConnection.getContentLength(); + long contentLength = document.asByteArray().length; - if (contentLength == -1) { - throw new IOException("Unable to determine file size for URL: " + documentUrl); - } - - try (InputStream inputStream = urlConnection.getInputStream()) { + try (InputStream inputStream = document.asInputStream()) { PutObjectRequest putObjectRequest = PutObjectRequest.builder().bucket(bucketName).key(documentKey).build(); @@ -82,12 +74,12 @@ public static void deleteS3ObjectFromBucketAsync( response.thenApply(r -> null); } - public static S3Object buildS3ObjectFromUrl( - final String documentUrl, final String bucketName, final S3AsyncClient s3AsyncClient) + public static S3Object buildS3ObjectFromDocument( + final Document document, final String bucketName, final S3AsyncClient s3AsyncClient) throws IOException { return S3Object.builder() .bucket(bucketName) - .name(uploadNewFileFromUrl(documentUrl, bucketName, s3AsyncClient)) + .name(uploadNewFileFromDocument(document, bucketName, s3AsyncClient)) .build(); } diff --git a/connectors/idp-extraction/src/test/java/io/camunda/connector/idp/extraction/caller/BedrockCallerTest.java b/connectors/idp-extraction/src/test/java/io/camunda/connector/idp/extraction/caller/BedrockCallerTest.java index 24a0677e3b..8f17a0095c 100644 --- a/connectors/idp-extraction/src/test/java/io/camunda/connector/idp/extraction/caller/BedrockCallerTest.java +++ b/connectors/idp-extraction/src/test/java/io/camunda/connector/idp/extraction/caller/BedrockCallerTest.java @@ -49,6 +49,6 @@ void executeSuccessfulExtraction() { String bedrockResponse = bedrockCaller.call(extractionRequest, "", bedrockRuntimeClient); - assertEquals(bedrockResponse, expectedResponse); + assertEquals(expectedResponse, bedrockResponse); } } diff --git a/connectors/idp-extraction/src/test/java/io/camunda/connector/idp/extraction/caller/PollingTextractCallerTest.java b/connectors/idp-extraction/src/test/java/io/camunda/connector/idp/extraction/caller/PollingTextractCallerTest.java index 548b09c809..271a9bb01f 100644 --- a/connectors/idp-extraction/src/test/java/io/camunda/connector/idp/extraction/caller/PollingTextractCallerTest.java +++ b/connectors/idp-extraction/src/test/java/io/camunda/connector/idp/extraction/caller/PollingTextractCallerTest.java @@ -15,6 +15,7 @@ import io.camunda.connector.api.error.ConnectorException; import io.camunda.connector.idp.extraction.utils.AwsS3Util; +import io.camunda.document.Document; import java.util.List; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -34,6 +35,7 @@ class PollingTextractCallerTest { S3AsyncClient s3AsyncClient = Mockito.mock(S3AsyncClient.class); S3Object s3Object = Mockito.mock(S3Object.class); MockedStatic awsS3UtilMockedStatic; + Document mockedDocument = Mockito.mock(Document.class); @BeforeEach void beforeEach() { @@ -43,7 +45,7 @@ void beforeEach() { awsS3UtilMockedStatic .when( () -> - AwsS3Util.buildS3ObjectFromUrl(any(), any(String.class), any(S3AsyncClient.class))) + AwsS3Util.buildS3ObjectFromDocument(any(), any(String.class), any(S3AsyncClient.class))) .thenReturn(s3Object); awsS3UtilMockedStatic .when( @@ -85,7 +87,7 @@ void callTextractDocumentAnalysisWithSuccess() throws Exception { String expectedExtractedText = "AAA\nBBB"; String extractedText = new PollingTextractCaller() - .call("test Url", "test-aws-s3-bucket-name", textractClient, s3AsyncClient); + .call(mockedDocument, "test-aws-s3-bucket-name", textractClient, s3AsyncClient); assertThat(extractedText).isEqualTo(expectedExtractedText); } @@ -114,7 +116,7 @@ void callTextractDocumentAnalysisWithEmptyResult() throws Exception { String expectedExtractedText = ""; String extractedText = new PollingTextractCaller() - .call("test Url", "test-aws-s3-bucket-name", textractClient, s3AsyncClient); + .call(mockedDocument, "test-aws-s3-bucket-name", textractClient, s3AsyncClient); assertThat(extractedText).isEqualTo(expectedExtractedText); } @@ -147,7 +149,7 @@ void callTextractDocumentAnalysisWithFailure() { ConnectorException.class, () -> pollingTextractCaller.call( - "test Url", "test-aws-s3-bucket-name", textractClient, s3AsyncClient)); + mockedDocument, "test-aws-s3-bucket-name", textractClient, s3AsyncClient)); assertEquals("Test exception message", exception.getMessage()); } diff --git a/connectors/idp-extraction/src/test/java/io/camunda/connector/idp/extraction/util/ExtractionTestUtils.java b/connectors/idp-extraction/src/test/java/io/camunda/connector/idp/extraction/util/ExtractionTestUtils.java index 942edfbc90..d1d57dca64 100644 --- a/connectors/idp-extraction/src/test/java/io/camunda/connector/idp/extraction/util/ExtractionTestUtils.java +++ b/connectors/idp-extraction/src/test/java/io/camunda/connector/idp/extraction/util/ExtractionTestUtils.java @@ -6,10 +6,19 @@ */ package io.camunda.connector.idp.extraction.util; +import static org.apache.hc.core5.http.ContentType.APPLICATION_PDF; + import io.camunda.connector.idp.extraction.model.ConverseData; import io.camunda.connector.idp.extraction.model.ExtractionRequestData; import io.camunda.connector.idp.extraction.model.TaxonomyItem; import io.camunda.connector.idp.extraction.model.TextExtractionEngineType; +import io.camunda.document.Document; +import io.camunda.document.factory.DocumentFactory; +import io.camunda.document.factory.DocumentFactoryImpl; +import io.camunda.document.store.DocumentCreationRequest; +import io.camunda.document.store.InMemoryDocumentStore; +import java.io.FileInputStream; +import java.io.FileNotFoundException; import java.util.List; public class ExtractionTestUtils { @@ -21,7 +30,12 @@ public class ExtractionTestUtils { { "input": { "extractionEngineType": "AWS_TEXTRACT", - "documentUrl": "https://some-url-containing-your-document/documemt.pdf", + "document": { + "camunda.document.type": "camunda", + "storeId": "test", + "documentId": "test", + "metadata": {} + }, "s3BucketName": "test-aws-s3-bucket-name", "converseData": { "modelId": "anthropic.claude-3-5-sonnet-20240620-v1:0" @@ -53,10 +67,25 @@ public class ExtractionTestUtils { public static final ExtractionRequestData TEXTRACT_EXTRACTION_REQUEST_DATA = new ExtractionRequestData( TextExtractionEngineType.AWS_TEXTRACT, - "https://some-url-containing-your-document/documemt.pdf", + loadTestFile(), "test-aws-s3-bucket-name", List.of( new TaxonomyItem("sum", "the total amount that was paid for this invoice"), new TaxonomyItem("supplier", "who provided the goods or services")), new ConverseData("anthropic.claude-3-5-sonnet-20240620-v1:0", 512, 0.5f, 0.9f)); + + private static Document loadTestFile() { + DocumentFactory documentFactory = new DocumentFactoryImpl(InMemoryDocumentStore.INSTANCE); + try { + FileInputStream fileInputStream = + new FileInputStream("src/test/resources/sample-invoice.pdf"); + return documentFactory.create( + DocumentCreationRequest.from(fileInputStream) + .contentType(APPLICATION_PDF.getMimeType()) + .fileName("sample-invoice") + .build()); + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } + } } diff --git a/connectors/idp-extraction/src/test/resources/sample-invoice.pdf b/connectors/idp-extraction/src/test/resources/sample-invoice.pdf new file mode 100644 index 0000000000..b9ae2d7ff7 Binary files /dev/null and b/connectors/idp-extraction/src/test/resources/sample-invoice.pdf differ