Skip to content

Commit

Permalink
Allowing Multipart upload for files larger than 5GB (#1401)
Browse files Browse the repository at this point in the history
Files smaller than 5GB are still uploaded in one piece
  • Loading branch information
carolyncole authored Aug 9, 2023
1 parent 4b95db8 commit eaf99be
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 13 deletions.
2 changes: 1 addition & 1 deletion app/jobs/attach_file_to_work_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def perform(file_path:, file_name:, size:, background_upload_snapshot_id:)
@background_upload_snapshot_id = background_upload_snapshot_id

File.open(file_path) do |file|
unless work.s3_query_service.upload_file(io: file.to_io, filename: file_name)
unless work.s3_query_service.upload_file(io: file.to_io, filename: file_name, size: @size)
raise "An error uploading #{file_name} was encountered for work #{work}"
end
end
Expand Down
3 changes: 2 additions & 1 deletion app/models/readme.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ def attach(readme_file_param)

extension = File.extname(readme_file_param.original_filename)
readme_name = "README#{extension}"
key = work.s3_query_service.upload_file(io: readme_file_param.to_io, filename: readme_name)
size = readme_file_param.size
key = work.s3_query_service.upload_file(io: readme_file_param.to_io, filename: readme_name, size: size)
if key
log_change(key)
nil
Expand Down
3 changes: 2 additions & 1 deletion app/services/pul_dspace_aws_connector.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ def upload_to_s3(dspace_files)
basename = File.basename(dspace_file.filename_display)
match_dspace_file.filename = dspace_file.filename_display
io = File.open(filename)
key = work.s3_query_service.upload_file(io: io, filename: basename, md5_digest: dspace_file.checksum)
size = File.size(filename)
key = work.s3_query_service.upload_file(io: io, filename: basename, md5_digest: dspace_file.checksum, size: size)
if key
{ key: key, file: match_dspace_file, error: nil }
else
Expand Down
34 changes: 30 additions & 4 deletions app/services/s3_query_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -264,11 +264,15 @@ def create_directory
Rails.logger.error("An error was encountered when requesting to create the AWS S3 directory Object in the bucket #{bucket_name} with the key #{prefix}: #{aws_service_error}")
end

def upload_file(io:, filename:, md5_digest: nil)
def upload_file(io:, filename:, size:, md5_digest: nil)
# upload file from io in a single request, may not exceed 5GB
md5_digest ||= md5(io: io)
key = "#{prefix}#{filename}"
@last_response = client.put_object(bucket: bucket_name, key: key, body: io, content_md5: md5_digest)
if size > part_size
upload_multipart_file(target_bucket: bucket_name, target_key: key, size: size, io: io)
else
md5_digest ||= md5(io: io)
@last_response = client.put_object(bucket: bucket_name, key: key, body: io, content_md5: md5_digest)
end
key
rescue Aws::S3::Errors::SignatureDoesNotMatch => e
Honeybadger.notify("Error Uploading file #{filename} for object: #{s3_address} Signature did not match! error: #{e}")
Expand Down Expand Up @@ -318,5 +322,27 @@ def parse_continuation(resp_hash, bucket_name: self.bucket_name, prefix: self.pr
Rails.logger.error("An error was encountered when requesting to list the AWS S3 Objects in the bucket #{bucket_name} with the key #{prefix}: #{aws_service_error}")
[]
end
end

# Uploads the content of +io+ to S3 as a multipart upload, one part per
# +part_size+ bytes, and stores the completion response in @last_response.
#
# Each part is first copied from +io+ into a temporary file so that its MD5
# checksum can be computed (via #md5) and sent along with the part.
#
# @param target_bucket [String] the bucket receiving the object
# @param target_key [String] the key of the object being created
# @param size [Integer] the total number of bytes expected from +io+; it drives
#   how many parts are uploaded
# @param io [IO] the stream to read the content from, consumed sequentially
# @return the response of the complete_multipart_upload request
# @raise [Aws::Errors::ServiceError] when any S3 request fails. The error is
#   logged, the multipart upload is aborted, and the error is re-raised so the
#   caller does not treat a failed upload as a success.
#   NOTE(review): upload_file appears to rescue AWS service errors itself and
#   return false -- confirm against its full rescue chain.
def upload_multipart_file(target_bucket:, target_key:, size:, io:)
  multi = client.create_multipart_upload(bucket: target_bucket, key: target_key)
  part_num = 0
  start_byte = 0
  parts = []
  while start_byte < size
    part_num += 1
    part_file = Tempfile.new('mutlipart-upload')
    begin
      IO.copy_stream(io, part_file, part_size)
      part_file.rewind
      checksum = md5(io: part_file)
      resp = client.upload_part(body: part_file, bucket: target_bucket, key: multi.key, part_number: part_num, upload_id: multi.upload_id, content_md5: checksum)
      parts << { etag: resp.etag, part_number: part_num }
    ensure
      # Close and unlink explicitly: every part file can be as large as
      # part_size, so it must not linger on disk until garbage collection.
      part_file.close!
    end
    start_byte += part_size
  end
  @last_response = client.complete_multipart_upload(bucket: target_bucket, key: target_key, upload_id: multi.upload_id, multipart_upload: { parts: parts })
rescue Aws::Errors::ServiceError => aws_service_error
  Rails.logger.error("An error was encountered when requesting to multipart upload to AWS S3 Object to #{target_key} in the bucket #{target_bucket}: #{aws_service_error}")
  # multi is nil when create_multipart_upload itself failed; nothing to abort then.
  if multi
    begin
      # Abort so the parts that were already uploaded do not stay stored in S3.
      client.abort_multipart_upload(bucket: target_bucket, key: target_key, upload_id: multi.upload_id)
    rescue Aws::Errors::ServiceError => abort_error
      # A failed abort must not mask the original upload error.
      Rails.logger.error("An error was encountered when requesting to abort the multipart upload to AWS S3 Object to #{target_key} in the bucket #{target_bucket}: #{abort_error}")
    end
  end
  raise aws_service_error
end

end
# rubocop:enable Metrics/ClassLength
6 changes: 3 additions & 3 deletions spec/models/readme_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

it "attaches the file and renames to to README" do
expect { expect(readme.attach(uploaded_file)).to be_nil }.to change { UploadSnapshot.count }.by 1
expect(fake_s3_service).to have_received(:upload_file).with(io: uploaded_file.to_io, filename: "README.csv")
expect(fake_s3_service).to have_received(:upload_file).with(io: uploaded_file.to_io, filename: "README.csv", size: 287)
end

context "when no uploaded file is sent" do
Expand Down Expand Up @@ -63,7 +63,7 @@

it "returns an error message" do
expect(readme.attach(uploaded_file)).to eq("An error uploading your README was encountered. Please try again.")
expect(fake_s3_service).to have_received(:upload_file).with(io: uploaded_file.to_io, filename: "README.csv")
expect(fake_s3_service).to have_received(:upload_file).with(io: uploaded_file.to_io, filename: "README.csv", size: 287)
end
end

Expand All @@ -72,7 +72,7 @@

it "returns no error message" do
expect { expect(readme.attach(uploaded_file)).to be_nil }.to change { UploadSnapshot.count }.by 1
expect(fake_s3_service).to have_received(:upload_file).with(io: uploaded_file.to_io, filename: "README.csv")
expect(fake_s3_service).to have_received(:upload_file).with(io: uploaded_file.to_io, filename: "README.csv", size: 287)
expect(fake_s3_service).to have_received(:delete_s3_object).with(s3_files.last.key)
end
end
Expand Down
34 changes: 31 additions & 3 deletions spec/services/s3_query_service_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -570,17 +570,45 @@
end

it "uploads the readme" do
expect(s3_query_service.upload_file(io: file, filename: filename)).to eq("10.34770/pe9w-x904/#{work.id}/README.txt")
expect(s3_query_service.upload_file(io: file, filename: filename, size: 2852)).to eq("10.34770/pe9w-x904/#{work.id}/README.txt")
assert_requested(:put, "https://example-bucket.s3.amazonaws.com/#{s3_query_service.prefix}#{filename}", headers: { "Content-Length" => 2852 })
end

context "when the file is large" do
let(:fake_aws_client) { double(Aws::S3::Client) }
let(:fake_multi) { instance_double(Aws::S3::Types::CreateMultipartUploadOutput, key: "abc", upload_id: "upload id", bucket: "bucket") }
let(:fake_upload) { instance_double(Aws::S3::Types::UploadPartOutput, etag: "etag123abc") }
let(:fake_completion) { instance_double(Seahorse::Client::Response, "successful?": true) }
let(:key) { "10.34770/pe9w-x904/#{work.id}/README.txt" }

before do
s3_query_service.stub(:client).and_return(fake_aws_client)
allow(s3_query_service.client).to receive(:create_multipart_upload).and_return(fake_multi)
allow(s3_query_service.client).to receive(:upload_part).and_return(fake_upload)
allow(s3_query_service.client).to receive(:complete_multipart_upload).and_return(fake_completion)
end

it "uploads the large file" do
expect(s3_query_service.upload_file(io: file, filename: filename, size: 6_000_000_000)).to eq(key)
expect(s3_query_service.client).to have_received(:create_multipart_upload)
.with({ bucket: "example-bucket", key: key })
expect(subject.client).to have_received(:upload_part)
.with(hash_including(bucket: "example-bucket", key: "abc", part_number: 1, upload_id: "upload id"))
expect(subject.client).to have_received(:upload_part)
.with(hash_including(bucket: "example-bucket", key: "abc", part_number: 2, upload_id: "upload id"))
expect(subject.client).to have_received(:complete_multipart_upload)
.with({ bucket: "example-bucket", key: key, multipart_upload: { parts: [{ etag: "etag123abc", part_number: 1 },
{ etag: "etag123abc", part_number: 2 }] }, upload_id: "upload id" })
end
end

context "when checksum does not match" do
before do
stub_request(:put, "https://example-bucket.s3.amazonaws.com/#{s3_query_service.prefix}#{filename}").to_raise(Aws::S3::Errors::SignatureDoesNotMatch.new(nil, nil))
end

it "detects the upload error" do
expect(s3_query_service.upload_file(io: file, filename: filename)).to be_falsey
expect(s3_query_service.upload_file(io: file, filename: filename, size: 2852)).to be_falsey
assert_requested(:put, "https://example-bucket.s3.amazonaws.com/#{s3_query_service.prefix}#{filename}", headers: { "Content-Length" => 2852 })
end
end
Expand All @@ -604,7 +632,7 @@

it "logs the error" do
s3_query_service = described_class.new(work)
result = s3_query_service.upload_file(io: file, filename: filename)
result = s3_query_service.upload_file(io: file, filename: filename, size: 2852)
expect(result).to be false
# rubocop:disable Layout/LineLength
expect(Rails.logger).to have_received(:error).with("An error was encountered when requesting to create the AWS S3 Object in the bucket example-bucket with the key #{prefix}README.txt: test AWS service error")
Expand Down

0 comments on commit eaf99be

Please sign in to comment.