Skip to content

Commit

Permalink
Rename files with S3 illegal characters
Browse files Browse the repository at this point in the history
- Rename files with S3 illegal characters to something that is S3 legal by
replacing all illegal characters with a _ (underscore)
- Ensure there are no duplicate file names after the renaming by appending
a (1), (2) at the end of the filename if the file has been renamed
- Keep a record of all of the file names as they originally existed and what
they were renamed to
- The record goes into a file called files_renamed.txt, which contains a
list of all files that have been renamed and what they were renamed to,
along with a date.
- This files_renamed.txt file gets added to the dataset as a payload file
- Update the migration upload snapshot so it doesn't get confused about
  the files now having different names

Co-authored-by: Carolyn Cole <[email protected]>
  • Loading branch information
2 people authored and jrgriffiniii committed Sep 25, 2023
1 parent 3dcad9e commit e731591
Show file tree
Hide file tree
Showing 8 changed files with 206 additions and 10 deletions.
34 changes: 31 additions & 3 deletions app/jobs/dspace_bitstream_copy_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,24 @@
class DspaceBitstreamCopyJob < ApplicationJob
queue_as :default

# For each file from DSpace, queue up a migration.
# If the file contains characters that are not S3 safe, re-name the file.
# Note that the dspace_file.filename_display will be the file's key in S3.
# Files that are re-named must be re-named sequentially, and we must provide
# a list of all of the re-naming that occurred, and include that file in
# what is uploaded to S3.
def perform(dspace_files_json:, work_id:, migration_snapshot_id:)
dspace_files = JSON.parse(dspace_files_json).map { |json_file| S3File.from_json(json_file) }
@work = Work.find(work_id)

frms = FileRenameMappingService.new(upload_snapshot: MigrationUploadSnapshot.find(migration_snapshot_id))
dspace_files.each do |dspace_file|
if FileRenameService.new(filename: dspace_file.filename_display).needs_rename?
# Rename files so they are S3 safe
dspace_file.filename_display = frms.renamed_files[dspace_file.filename_display]
end
migrate_file(dspace_file, migration_snapshot_id)
end
upload_rename_mapping(frms)
end

private
Expand Down Expand Up @@ -55,9 +66,26 @@ def update_migration_status(migration_snapshot_id)
end
end

def file_complete?(migratoion_snapshot, dspace_file)
def file_complete?(migration_snapshot, dspace_file)
s3_file = dspace_file.clone
s3_file.filename = s3_file.filename_display
migratoion_snapshot.complete?(s3_file)
migration_snapshot.complete?(s3_file)
end

# If any files were renamed, upload a text file containing
# the original file names and what they were renamed to.
def upload_rename_mapping(frms)
return unless frms.rename_needed?

filename = "renamed_files.txt"
io = StringIO.new
io.write frms.renaming_document
io.rewind
size = io.size
checksum = Digest::MD5.new
checksum.update(io.read)
base64 = checksum.base64digest
io.rewind
@work.s3_query_service.upload_file(io: io, filename: filename, size: size, md5_digest: base64)
end
end
7 changes: 7 additions & 0 deletions app/models/migration_upload_snapshot.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@ def mark_error(s3_file, error_message)
end
end

# Rename a file
def rename(old_filename, new_filename)
index = find_file(old_filename)
files[index]["original_filename"] = old_filename
files[index]["filename"] = new_filename
end

def mark_complete(s3_file)
index = find_file(s3_file.filename)
if index.present?
Expand Down
2 changes: 1 addition & 1 deletion app/models/work_activity_notification.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class WorkActivityNotification < ApplicationRecord
if send_message?
mailer = NotificationMailer.with(user: user, work_activity: work_activity)
message = mailer.build_message
message.deliver_later(wait: 10.seconds)
message.deliver_later(wait: 10.seconds) unless Rails.env.development?
end
end

Expand Down
73 changes: 73 additions & 0 deletions app/services/file_rename_mapping_service.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# frozen_string_literal: true

# We sometimes have data with filenames that contain characters that AWS S3 cannot handle. In those cases we want to:
# 1. Rename the files to something that is AWS legal. Replace all illegal characters with a _ (underscore)
# 2. Ensure there are no duplicate file names after the renaming by appending a (1), (2) at the end of the filename
# if the file has been renamed
# 3. Keep a record of all of the file names as they originally existed and what they were renamed to
# 4. The record goes into a file called files_renamed.txt, which contains a list of all files that have been renamed
# and what they were renamed to, along with a timestamp
# 5. This files_renamed.txt file gets added to the dataset as a payload file, akin to a README.txt or license.txt
class FileRenameMappingService
attr_reader :upload_snapshot, :files, :renamed_files, :original_filenames

def initialize(upload_snapshot:)
@upload_snapshot = upload_snapshot
@original_filenames = @upload_snapshot.files.map { |a| a["filename"] }
@files = parse_files_to_rename
@renamed_files = rename_files
end

def parse_files_to_rename
files = []
@original_filenames.each do |original_filename|
files << FileRenameService.new(filename: original_filename)
end
files
end

# Make a hash containing all files that need renaming.
# The key of the hash is the original filename.
# The value of the hash is the re-named file with an index number appended.
def rename_files
@upload_snapshot.with_lock do
@upload_snapshot.reload
rename_index = 1
renamed_files = {}
@files.each do |file|
next unless file.needs_rename?
new_filename = file.new_filename(rename_index)
renamed_files[file.original_filename] = new_filename
# Also update the filename in the MigrationSnapshot
@upload_snapshot.rename(file.original_filename, new_filename)
rename_index += 1
end
@upload_snapshot.save
renamed_files
end
end

# A rename is needed if any of the original filenames need renaming
def rename_needed?
@files.each do |file|
return true if file.needs_rename?
end
false
end

# Format: "Sep 19 2023"
def rename_date
Time.zone.now.strftime("%d %b %Y")
end

def renaming_document
message = "Some files have been renamed to comply with AWS S3 storage requirements\n"
message += "Rename date: #{rename_date}\n"
message += "Original Filename\t Renamed File\n"
@files.each do |file|
next unless file.needs_rename?
message += "#{file.original_filename}\t#{@renamed_files[file.original_filename]}\n"
end
message
end
end
15 changes: 11 additions & 4 deletions app/services/file_rename_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ class FileRenameService
# https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html
# This service will only attempt to fix the most likely problems. For example, we will not try to
# handle "ASCII character ranges 00–1F hex (0–31 decimal) and 7F (127 decimal)"
# Note that we do not rename for a single space, but we do for two spaces together, because according to S3 docs:
# "Significant sequences of spaces might be lost in some uses (especially multiple spaces)"
ILLEGAL_CHARACTERS = [
"&", "$", "@", "=", ";", "/", ":", "+", " ", ",", "?", "\\", "{", "}", "^", "%", "`", "[", "]", "'", '"', ">", "<", "~", "#", "|"
"&", "$", "@", "=", ";", ":", "+", " ", ",", "?", "\\", "{", "}", "^", "%", "`", "[", "]", "'", '"', ">", "<", "~", "#", "|"
].freeze

attr_reader :original_filename
Expand All @@ -34,12 +36,17 @@ def check_if_file_needs_rename
false
end

# Replace every instance of an illegal character with an underscore
def new_filename
# Replace every instance of an illegal character with an underscore.
# Append an index number in parentheses just before the file extension,
# so we avoid ever accidentally naming two files identically and causing
# one to over-write the other.
def new_filename(index)
nf = @original_filename.dup
ILLEGAL_CHARACTERS.each do |char|
nf.gsub!(char, "_")
end
nf
split = nf.split(".")
split[-2] = "#{split[-2]}(#{index})"
split.join(".")
end
end
37 changes: 37 additions & 0 deletions spec/factories/migration_upload_snapshot.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# frozen_string_literal: true

FactoryBot.define do
factory :migration_upload_snapshot do
url { "https://localhost.localdomain/file.txt" }
version { 1 }
work { FactoryBot.create(:approved_work) }
files { [] }

factory :migration_upload_snapshot_with_illegal_characters do
files do
[
{
"filename" => "10.34770/tbd/4/laser width.xlsx",
"checksum" => "dGFh+f5CnwifPlEhkT1Amg==",
"migrate_status" => "started"
},
{
"filename" => "10.34770/tbd/4/all OH LIF decays.xlsx",
"checksum" => "oCovyV5XT+jNMsDbUpP/xA==",
"migrate_status" => "started"
},
{
"filename" => "10.34770/tbd/4/Dry He 2mm 10kV le=0.8mJ RH 50%.csv",
"checksum" => "4sUs+2GkGPPFHgjyY3NsPw==",
"migrate_status" => "started"
},
{
"filename" => "10.34770/tbd/4/Dry He 2mm 20kV le=0.8mJ RH 50%.csv",
"checksum" => "nY0PImdocFIffUu0oAIpoA==",
"migrate_status" => "started"
}
]
end
end
end
end
44 changes: 44 additions & 0 deletions spec/services/file_rename_mapping_service_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# frozen_string_literal: true
require "rails_helper"

RSpec.describe FileRenameMappingService do
let(:upload_snapshot) { FactoryBot.create(:migration_upload_snapshot_with_illegal_characters) }
let(:subject) { described_class.new(upload_snapshot: upload_snapshot) }
let(:file_needing_rename) { "10.34770/tbd/4/Dry He 2mm 20kV le=0.8mJ RH 50%.csv" }
let(:file_not_needing_rename) { "10.34770/tbd/4/laser width.xlsx" }

it "has an upload snapshot" do
expect(subject.upload_snapshot).to eq upload_snapshot
end

it "has an array of FileRenameService objects" do
expect(subject.files.count).to eq 4
expect(subject.files.first).to be_instance_of FileRenameService
end

it "has a hash of renamed files" do
expect(subject.renamed_files).to be_instance_of Hash
end

it "adds sequential numbers when it renames files" do
expect(subject.renamed_files[file_needing_rename]).to eq "10.34770/tbd/4/Dry He 2mm 20kV le_0.8mJ RH 50_(2).csv"
end

it "has a list of original filenames" do
original_filenames = [
"10.34770/tbd/4/laser width.xlsx",
"10.34770/tbd/4/all OH LIF decays.xlsx",
"10.34770/tbd/4/Dry He 2mm 10kV le=0.8mJ RH 50%.csv",
"10.34770/tbd/4/Dry He 2mm 20kV le=0.8mJ RH 50%.csv"
]
expect(subject.original_filenames).to eq original_filenames
end

it "knows whether it needs to rename any files" do
expect(subject.rename_needed?).to eq true
end

it "produces a mapping of all the file renaming" do
expect(subject.renaming_document).to match(/Some files have been renamed/)
end
end
4 changes: 2 additions & 2 deletions spec/services/file_rename_service_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
end

context "a file with S3 illegal characters" do
let(:filename) { "Dry He 2mm 10kV le=0.8mJ RH 50%.csv" }
let(:filename) { "10.80021/4dy5-dh78/473/Dry He 2mm 10kV le=0.8mJ RH 50%.csv" }
let(:subject) { described_class.new(filename: filename) }

it "knows the original filename" do
Expand All @@ -21,7 +21,7 @@
end

it "knows the new filename" do
expect(subject.new_filename).to eq "Dry_He_2mm_10kV_le_0.8mJ_RH_50_.csv"
expect(subject.new_filename(2)).to eq "10.80021/4dy5-dh78/473/Dry He 2mm 10kV le_0.8mJ RH 50_(2).csv"
end
end

Expand Down

0 comments on commit e731591

Please sign in to comment.