From eaf99be9a0b3fc3a15667e2d0d76d690ae852691 Mon Sep 17 00:00:00 2001
From: carolyncole <1599081+carolyncole@users.noreply.github.com>
Date: Wed, 9 Aug 2023 08:51:08 -0400
Subject: [PATCH] Allowing Multipart upload for files larger than 5GB (#1401)

Files smaller than 5GB are still uploaded in one piece
---
 app/jobs/attach_file_to_work_job.rb      |  2 +-
 app/models/readme.rb                     |  3 ++-
 app/services/pul_dspace_aws_connector.rb |  3 ++-
 app/services/s3_query_service.rb         | 34 +++++++++++++++++++++---
 spec/models/readme_spec.rb               |  6 ++---
 spec/services/s3_query_service_spec.rb   | 34 +++++++++++++++++++++---
 6 files changed, 69 insertions(+), 13 deletions(-)

diff --git a/app/jobs/attach_file_to_work_job.rb b/app/jobs/attach_file_to_work_job.rb
index acef86603..42b264d03 100644
--- a/app/jobs/attach_file_to_work_job.rb
+++ b/app/jobs/attach_file_to_work_job.rb
@@ -11,7 +11,7 @@ def perform(file_path:, file_name:, size:, background_upload_snapshot_id:)
     @background_upload_snapshot_id = background_upload_snapshot_id
 
     File.open(file_path) do |file|
-      unless work.s3_query_service.upload_file(io: file.to_io, filename: file_name)
+      unless work.s3_query_service.upload_file(io: file.to_io, filename: file_name, size: @size)
         raise "An error uploading #{file_name} was encountered for work #{work}"
       end
     end
diff --git a/app/models/readme.rb b/app/models/readme.rb
index bb0cc1e6e..e1e753290 100644
--- a/app/models/readme.rb
+++ b/app/models/readme.rb
@@ -15,7 +15,8 @@ def attach(readme_file_param)
     extension = File.extname(readme_file_param.original_filename)
     readme_name = "README#{extension}"
 
-    key = work.s3_query_service.upload_file(io: readme_file_param.to_io, filename: readme_name)
+    size = readme_file_param.size
+    key = work.s3_query_service.upload_file(io: readme_file_param.to_io, filename: readme_name, size: size)
     if key
       log_change(key)
       nil
diff --git a/app/services/pul_dspace_aws_connector.rb b/app/services/pul_dspace_aws_connector.rb
index a487e34f9..733ba271f 100644
--- a/app/services/pul_dspace_aws_connector.rb
+++ b/app/services/pul_dspace_aws_connector.rb
@@ -15,7 +15,8 @@ def upload_to_s3(dspace_files)
         basename = File.basename(dspace_file.filename_display)
         match_dspace_file.filename = dspace_file.filename_display
         io = File.open(filename)
-        key = work.s3_query_service.upload_file(io: io, filename: basename, md5_digest: dspace_file.checksum)
+        size = File.size(filename)
+        key = work.s3_query_service.upload_file(io: io, filename: basename, md5_digest: dspace_file.checksum, size: size)
         if key
           { key: key, file: match_dspace_file, error: nil }
         else
diff --git a/app/services/s3_query_service.rb b/app/services/s3_query_service.rb
index 47310928a..28838bdc9 100644
--- a/app/services/s3_query_service.rb
+++ b/app/services/s3_query_service.rb
@@ -264,11 +264,15 @@ def create_directory
     Rails.logger.error("An error was encountered when requesting to create the AWS S3 directory Object in the bucket #{bucket_name} with the key #{prefix}: #{aws_service_error}")
   end
 
-  def upload_file(io:, filename:, md5_digest: nil)
+  def upload_file(io:, filename:, size:, md5_digest: nil)
     # upload file from io in a single request, may not exceed 5GB
-    md5_digest ||= md5(io: io)
     key = "#{prefix}#{filename}"
-    @last_response = client.put_object(bucket: bucket_name, key: key, body: io, content_md5: md5_digest)
+    if size > part_size
+      upload_multipart_file(target_bucket: bucket_name, target_key: key, size: size, io: io)
+    else
+      md5_digest ||= md5(io: io)
+      @last_response = client.put_object(bucket: bucket_name, key: key, body: io, content_md5: md5_digest)
+    end
     key
   rescue Aws::S3::Errors::SignatureDoesNotMatch => e
     Honeybadger.notify("Error Uploading file #{filename} for object: #{s3_address} Signature did not match! error: #{e}")
@@ -318,5 +322,27 @@ def parse_continuation(resp_hash, bucket_name: self.bucket_name, prefix: self.pr
     Rails.logger.error("An error was encountered when requesting to list the AWS S3 Objects in the bucket #{bucket_name} with the key #{prefix}: #{aws_service_error}")
     []
   end
-end
+
+  def upload_multipart_file(target_bucket:, target_key:, size:, io:)
+    multi = client.create_multipart_upload(bucket: target_bucket, key: target_key)
+    part_num = 0
+    start_byte = 0
+    parts = []
+    while start_byte < size
+      part_num += 1
+      Tempfile.open("multipart-upload") do |file|
+        IO.copy_stream(io, file, part_size)
+        file.rewind
+        checksum = md5(io: file)
+        resp = client.upload_part(body: file, bucket: target_bucket, key: multi.key, part_number: part_num, upload_id: multi.upload_id, content_md5: checksum)
+        parts << { etag: resp.etag, part_number: part_num }
+      end
+      start_byte += part_size
+    end
+    @last_response = client.complete_multipart_upload(bucket: target_bucket, key: target_key, upload_id: multi.upload_id, multipart_upload: { parts: parts })
+  rescue Aws::Errors::ServiceError => aws_service_error
+    Rails.logger.error("An error was encountered when requesting to multipart upload to AWS S3 Object to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}")
+  end
+
+end
 # rubocop:enable Metrics/ClassLength
diff --git a/spec/models/readme_spec.rb b/spec/models/readme_spec.rb
index 0e1e36bda..8e31ccc78 100644
--- a/spec/models/readme_spec.rb
+++ b/spec/models/readme_spec.rb
@@ -35,7 +35,7 @@
 
   it "attaches the file and renames to to README" do
     expect { expect(readme.attach(uploaded_file)).to be_nil }.to change { UploadSnapshot.count }.by 1
-    expect(fake_s3_service).to have_received(:upload_file).with(io: uploaded_file.to_io, filename: "README.csv")
+    expect(fake_s3_service).to have_received(:upload_file).with(io: uploaded_file.to_io, filename: "README.csv", size: 287)
   end
 
   context "when no uploaded file is sent" do
@@ -63,7 +63,7 @@
 
     it "returns an error message" do
       expect(readme.attach(uploaded_file)).to eq("An error uploading your README was encountered. Please try again.")
-      expect(fake_s3_service).to have_received(:upload_file).with(io: uploaded_file.to_io, filename: "README.csv")
+      expect(fake_s3_service).to have_received(:upload_file).with(io: uploaded_file.to_io, filename: "README.csv", size: 287)
     end
   end
 
@@ -72,7 +72,7 @@
 
     it "returns no error message" do
      expect { expect(readme.attach(uploaded_file)).to be_nil }.to change { UploadSnapshot.count }.by 1
-      expect(fake_s3_service).to have_received(:upload_file).with(io: uploaded_file.to_io, filename: "README.csv")
+      expect(fake_s3_service).to have_received(:upload_file).with(io: uploaded_file.to_io, filename: "README.csv", size: 287)
       expect(fake_s3_service).to have_received(:delete_s3_object).with(s3_files.last.key)
     end
   end
diff --git a/spec/services/s3_query_service_spec.rb b/spec/services/s3_query_service_spec.rb
index cc4907828..38f188e11 100644
--- a/spec/services/s3_query_service_spec.rb
+++ b/spec/services/s3_query_service_spec.rb
@@ -570,17 +570,45 @@
     end
 
     it "uploads the readme" do
-      expect(s3_query_service.upload_file(io: file, filename: filename)).to eq("10.34770/pe9w-x904/#{work.id}/README.txt")
+      expect(s3_query_service.upload_file(io: file, filename: filename, size: 2852)).to eq("10.34770/pe9w-x904/#{work.id}/README.txt")
       assert_requested(:put, "https://example-bucket.s3.amazonaws.com/#{s3_query_service.prefix}#{filename}", headers: { "Content-Length" => 2852 })
     end
 
+    context "when the file is large" do
+      let(:fake_aws_client) { double(Aws::S3::Client) }
+      let(:fake_multi) { instance_double(Aws::S3::Types::CreateMultipartUploadOutput, key: "abc", upload_id: "upload id", bucket: "bucket") }
+      let(:fake_upload) { instance_double(Aws::S3::Types::UploadPartOutput, etag: "etag123abc") }
+      let(:fake_completion) { instance_double(Seahorse::Client::Response, "successful?": true) }
+      let(:key) { "10.34770/pe9w-x904/#{work.id}/README.txt" }
+
+      before do
+        allow(s3_query_service).to receive(:client).and_return(fake_aws_client)
+        allow(s3_query_service.client).to receive(:create_multipart_upload).and_return(fake_multi)
+        allow(s3_query_service.client).to receive(:upload_part).and_return(fake_upload)
+        allow(s3_query_service.client).to receive(:complete_multipart_upload).and_return(fake_completion)
+      end
+
+      it "uploads the large file" do
+        expect(s3_query_service.upload_file(io: file, filename: filename, size: 6_000_000_000)).to eq(key)
+        expect(s3_query_service.client).to have_received(:create_multipart_upload)
+          .with({ bucket: "example-bucket", key: key })
+        expect(s3_query_service.client).to have_received(:upload_part)
+          .with(hash_including(bucket: "example-bucket", key: "abc", part_number: 1, upload_id: "upload id"))
+        expect(s3_query_service.client).to have_received(:upload_part)
+          .with(hash_including(bucket: "example-bucket", key: "abc", part_number: 2, upload_id: "upload id"))
+        expect(s3_query_service.client).to have_received(:complete_multipart_upload)
+          .with({ bucket: "example-bucket", key: key, multipart_upload: { parts: [{ etag: "etag123abc", part_number: 1 },
+                                                                                  { etag: "etag123abc", part_number: 2 }] }, upload_id: "upload id" })
+      end
+    end
+
     context "when checksum does not match" do
       before do
         stub_request(:put, "https://example-bucket.s3.amazonaws.com/#{s3_query_service.prefix}#{filename}").to_raise(Aws::S3::Errors::SignatureDoesNotMatch.new(nil, nil))
       end
 
       it "detects the upload error" do
-        expect(s3_query_service.upload_file(io: file, filename: filename)).to be_falsey
+        expect(s3_query_service.upload_file(io: file, filename: filename, size: 2852)).to be_falsey
         assert_requested(:put, "https://example-bucket.s3.amazonaws.com/#{s3_query_service.prefix}#{filename}", headers: { "Content-Length" => 2852 })
"https://example-bucket.s3.amazonaws.com/#{s3_query_service.prefix}#{filename}", headers: { "Content-Length" => 2852 }) end end @@ -604,7 +632,7 @@ it "logs the error" do s3_query_service = described_class.new(work) - result = s3_query_service.upload_file(io: file, filename: filename) + result = s3_query_service.upload_file(io: file, filename: filename, size: 2852) expect(result).to be false # rubocop:disable Layout/LineLength expect(Rails.logger).to have_received(:error).with("An error was encountered when requesting to create the AWS S3 Object in the bucket example-bucket with the key #{prefix}README.txt: test AWS service error")