Skip to content

Commit

Permalink
Adding a method to discover empty files to the s3query service (#1589)
Browse files Browse the repository at this point in the history
* Adding a method to discover empty files to the s3query service
  • Loading branch information
carolyncole authored Oct 27, 2023
1 parent 1e7a90d commit 7d13ef2
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 10 deletions.
4 changes: 4 additions & 0 deletions app/models/s3_file.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ def directory?
size == 0
end

# Reports whether this S3 object has no content, judged only by its size.
# NOTE(review): the directory? check just above appears to use the same
# comparison, so a caller that must tell an empty file from a directory
# placeholder also needs to look at the object's name - confirm.
# @return [Boolean] true when size equals 0
def empty?
size == 0
end

def globus_url
encoded_filename = filename.split("/").map { |name| ERB::Util.url_encode(name) }.join("/")
File.join(Rails.configuration.globus["post_curation_base_url"], encoded_filename)
Expand Down
28 changes: 19 additions & 9 deletions app/services/s3_query_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -167,15 +167,14 @@ def find_s3_file(filename:)
# @return [Array<S3File>]
def client_s3_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix, ignore_directories: true)
@client_s3_files = nil if reload # force a reload
@client_s3_files ||= begin
start = Time.zone.now
resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix: })
resp_hash = resp.to_h
objects = parse_objects(resp_hash, ignore_directories:)
objects += parse_continuation(resp_hash, bucket_name:, prefix:, ignore_directories:)
elapsed = Time.zone.now - start
Rails.logger.info("Loading S3 objects. Bucket: #{bucket_name}. Prefix: #{prefix}. Elapsed: #{elapsed} seconds")
objects
@client_s3_files ||= get_s3_objects(bucket_name:, prefix:, ignore_directories:)
end

# Lists the zero-byte files stored under an S3 prefix, leaving out any entry
# whose name ends in "/" (those are treated as directory placeholders).
#
# The result is memoized in @client_s3_empty_files, mirroring client_s3_files.
# The cache is not keyed by bucket or prefix, so pass reload: true whenever
# either one changes or the bucket contents may have changed.
#
# @param reload [Boolean] discard the cached list and query S3 again
# @param bucket_name [String] bucket to inspect; defaults to this service's bucket
# @param prefix [String] key prefix to inspect; defaults to this service's prefix
# @return [Array] the empty, non-directory objects (presumably S3File instances,
#   as returned by get_s3_objects - confirm)
def client_s3_empty_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix)
  @client_s3_empty_files = nil if reload # force a reload
  @client_s3_empty_files ||= begin
    # Directories must be requested here; otherwise zero-byte entries could be
    # dropped by the listing before the name-based filter below ever sees them.
    files_and_directories = get_s3_objects(bucket_name:, prefix:, ignore_directories: false)
    files_and_directories.select { |object| !object.filename.end_with?("/") && object.empty? }
  end
end

Expand Down Expand Up @@ -315,6 +314,17 @@ def md5(io:)

private

# Fetches every object under the given bucket/prefix and logs how long the
# listing took.
#
# The first page comes from a single list_objects_v2 request (at most 1000
# keys); parse_continuation is then handed that same page so it can collect
# whatever follows it.
#
# @param bucket_name [String] bucket to list
# @param prefix [String] key prefix to list
# @param ignore_directories [Boolean] forwarded unchanged to the parse helpers
# @return [Array] parsed objects from the first page followed by those from the continuation
def get_s3_objects(bucket_name:, prefix:, ignore_directories:)
  started_at = Time.zone.now
  first_page = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix: }).to_h
  s3_objects = parse_objects(first_page, ignore_directories:) +
               parse_continuation(first_page, bucket_name:, prefix:, ignore_directories:)
  elapsed = Time.zone.now - started_at
  Rails.logger.info("Loading S3 objects. Bucket: #{bucket_name}. Prefix: #{prefix}. Elapsed: #{elapsed} seconds")
  s3_objects
end

def parse_objects(resp, ignore_directories: true)
objects = []
resp_hash = resp.to_h
Expand Down
34 changes: 33 additions & 1 deletion spec/services/s3_query_service_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
},
{
etag: "\"7bd3d4339c034ebc663b99065771111\"",
key: "A directory",
key: "a_directory/",
last_modified: s3_last_modified2,
size: 0,
storage_class: "STANDARD"
Expand Down Expand Up @@ -709,6 +709,38 @@
end
end

describe "#client_s3_empty_files" do
  let(:fake_aws_client) { double(Aws::S3::Client) }
  # Presumably the size of the s3_key2 fixture object; zero makes it an empty file - confirm against s3_hash.
  let(:s3_size2) { 0 }

  before do
    subject.stub(:client).and_return(fake_aws_client)
    fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output)
    fake_aws_client.stub(:list_objects_v2).and_return(fake_s3_resp)
    # Two pages are simulated: the first to_h call reports a truncated listing,
    # the second returns the untruncated fixture.
    s3_hash_truncated = s3_hash.clone
    s3_hash_truncated[:is_truncated] = true
    fake_s3_resp.stub(:to_h).and_return(s3_hash_truncated, s3_hash)
  end

  it "retrieves only the empty files for the work" do
    files = subject.client_s3_empty_files
    # Both pages carry the same fixture, so the empty file is expected once per page.
    expect(files.count).to eq 2
    expect(files.first.filename).to eq(s3_key2)
    expect(files[1].filename).to eq(s3_key2)
    expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "example-bucket", max_keys: 1000, prefix: "10.34770/pe9w-x904/#{work.id}/")
    expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "example-bucket", continuation_token: nil, max_keys: 1000, prefix: "10.34770/pe9w-x904/#{work.id}/")
  end

  it "retrieves only the empty files for a bucket and prefix" do
    files = subject.client_s3_empty_files(reload: true, bucket_name: "other-bucket", prefix: "new-prefix")
    expect(files.count).to eq 2
    expect(files.first.filename).to eq(s3_key2)
    expect(files[1].filename).to eq(s3_key2)
    expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", max_keys: 1000, prefix: "new-prefix")
    expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: nil, max_keys: 1000, prefix: "new-prefix")
  end
end

describe "#copy_directory" do
let(:fake_aws_client) { double(Aws::S3::Client) }
let(:fake_completion) { instance_double(Seahorse::Client::Response, "successful?": true) }
Expand Down

0 comments on commit 7d13ef2

Please sign in to comment.