diff --git a/app/models/s3_file.rb b/app/models/s3_file.rb
index dcdad20f6..869107b46 100644
--- a/app/models/s3_file.rb
+++ b/app/models/s3_file.rb
@@ -43,6 +43,11 @@ def directory?
     size == 0
   end
 
+  # True when the S3 object contains zero bytes.
+  def empty?
+    size == 0
+  end
+
   def globus_url
     encoded_filename = filename.split("/").map { |name| ERB::Util.url_encode(name) }.join("/")
     File.join(Rails.configuration.globus["post_curation_base_url"], encoded_filename)
diff --git a/app/services/s3_query_service.rb b/app/services/s3_query_service.rb
index 635c59ac4..1dc48a96b 100644
--- a/app/services/s3_query_service.rb
+++ b/app/services/s3_query_service.rb
@@ -167,15 +167,16 @@ def find_s3_file(filename:)
   # @return [Array]
   def client_s3_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix, ignore_directories: true)
     @client_s3_files = nil if reload # force a reload
-    @client_s3_files ||= begin
-      start = Time.zone.now
-      resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix: })
-      resp_hash = resp.to_h
-      objects = parse_objects(resp_hash, ignore_directories:)
-      objects += parse_continuation(resp_hash, bucket_name:, prefix:, ignore_directories:)
-      elapsed = Time.zone.now - start
-      Rails.logger.info("Loading S3 objects. Bucket: #{bucket_name}. Prefix: #{prefix}. Elapsed: #{elapsed} seconds")
-      objects
+    @client_s3_files ||= get_s3_objects(bucket_name:, prefix:, ignore_directories:)
+  end
+
+  # Zero-byte files under the prefix, excluding directory placeholder keys (ending in "/").
+  # @return [Array]
+  def client_s3_empty_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix)
+    @client_s3_empty_files = nil if reload # force a reload
+    @client_s3_empty_files ||= begin
+      files_and_directories = get_s3_objects(bucket_name:, prefix:, ignore_directories: false)
+      files_and_directories.select { |object| !object.filename.end_with?("/") && object.empty? }
     end
   end
 
@@ -315,6 +316,19 @@ def md5(io:)
 
   private
 
+  # Retrieve the objects from S3 for the given bucket and prefix (following
+  # any continuation tokens) and log the elapsed time.
+  def get_s3_objects(bucket_name:, prefix:, ignore_directories:)
+    start = Time.zone.now
+    resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix: })
+    resp_hash = resp.to_h
+    objects = parse_objects(resp_hash, ignore_directories:)
+    objects += parse_continuation(resp_hash, bucket_name:, prefix:, ignore_directories:)
+    elapsed = Time.zone.now - start
+    Rails.logger.info("Loading S3 objects. Bucket: #{bucket_name}. Prefix: #{prefix}. Elapsed: #{elapsed} seconds")
+    objects
+  end
+
   def parse_objects(resp, ignore_directories: true)
     objects = []
     resp_hash = resp.to_h
diff --git a/spec/services/s3_query_service_spec.rb b/spec/services/s3_query_service_spec.rb
index 924dd08fa..b8b0b6ca9 100644
--- a/spec/services/s3_query_service_spec.rb
+++ b/spec/services/s3_query_service_spec.rb
@@ -33,7 +33,7 @@
     },
     {
       etag: "\"7bd3d4339c034ebc663b99065771111\"",
-      key: "A directory",
+      key: "a_directory/",
       last_modified: s3_last_modified2,
       size: 0,
       storage_class: "STANDARD"
@@ -709,6 +709,38 @@
     end
   end
 
+  describe "#client_s3_empty_files" do
+    let(:fake_aws_client) { double(Aws::S3::Client) }
+    let(:s3_size2) { 0 }
+
+    before do
+      subject.stub(:client).and_return(fake_aws_client)
+      fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output)
+      fake_aws_client.stub(:list_objects_v2).and_return(fake_s3_resp)
+      s3_hash_truncated = s3_hash.clone
+      s3_hash_truncated[:is_truncated] = true
+      fake_s3_resp.stub(:to_h).and_return(s3_hash_truncated, s3_hash)
+    end
+
+    it "retrieves the files for the work" do
+      files = subject.client_s3_empty_files
+      expect(files.count).to eq 2
+      expect(files.first.filename).to eq(s3_key2)
+      expect(files[1].filename).to match(s3_key2)
+      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "example-bucket", max_keys: 1000, prefix: "10.34770/pe9w-x904/#{work.id}/")
+      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "example-bucket", continuation_token: nil, max_keys: 1000, prefix: "10.34770/pe9w-x904/#{work.id}/")
+    end
+
+    it "retrieves the files for a bucket and prefix" do
+      files = subject.client_s3_empty_files(reload: true, bucket_name: "other-bucket", prefix: "new-prefix")
+      expect(files.count).to eq 2
+      expect(files.first.filename).to eq(s3_key2)
+      expect(files[1].filename).to eq(s3_key2)
+      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", max_keys: 1000, prefix: "new-prefix")
+      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: nil, max_keys: 1000, prefix: "new-prefix")
+    end
+  end
+
   describe "#copy_directory" do
     let(:fake_aws_client) { double(Aws::S3::Client) }
     let(:fake_completion) { instance_double(Seahorse::Client::Response, "successful?": true) }