Skip to content

Commit

Permalink
Utilizing wget so that the download can restart when the dspace conne…
Browse files Browse the repository at this point in the history
…ction fails

Also change to run through the entire download upload delete cycle for one file at a time.
This shold allow for better restart capabilities and also keep disk usage down.
  • Loading branch information
carolyncole authored and jrgriffiniii committed Aug 7, 2023
1 parent de61681 commit 78f44be
Show file tree
Hide file tree
Showing 7 changed files with 88 additions and 73 deletions.
57 changes: 27 additions & 30 deletions app/jobs/dspace_bitstream_copy_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,48 +4,45 @@ class DspaceBitstreamCopyJob < ApplicationJob

def perform(dspace_files_json:, work_id:, migration_snapshot_id:)
dspace_files = JSON.parse(dspace_files_json).map { |json_file| S3File.from_json(json_file) }
work = Work.find(work_id)
@work = Work.find(work_id)

dspace_files = download_dspace_bitstreams(dspace_files, work, migration_snapshot_id)
upload_dspace_files(dspace_files, work, migration_snapshot_id)

dspace_files.each { |file| File.delete(file.filename) }
dspace_files.each do |dspace_file|
migrate_file(dspace_file, migration_snapshot_id)
end
end

private

def download_dspace_bitstreams(dspace_files, work, migration_snapshot_id)
dspace_connector = PULDspaceConnector.new(work)
def migrate_file(dspace_file, migration_snapshot_id)
# Allow a restart if there is an error with one file
snapshot = MigrationUploadSnapshot.find(migration_snapshot_id)
return if snapshot.complete?(dspace_file)

downloaded_files = dspace_connector.download_bitstreams(dspace_files)
if downloaded_files.any?(Hash)
error_files = downloaded_files.select { |file| file.is_a? Hash }
update_migration_status(migration_snapshot_id) do |migration_snapshot|
error_files.each do |error_file|
migration_snapshot.mark_error(error_file[:file], error_file[:error])
end
downloaded_file = download_dspace_file(dspace_file, migration_snapshot_id)
return if downloaded_file.nil?
aws_connector = PULDspaceAwsConnector.new(@work, @work.doi)
result = aws_connector.upload_to_s3([dspace_file]).first
update_migration_status(migration_snapshot_id) do |migration_snapshot|
if result[:error].present?
migration_snapshot.mark_error(result[:file], result[:error])
else
migration_snapshot.mark_complete(result[:file])
end
dspace_files = downloaded_files.reject { |file| file.is_a? Hash }
end
dspace_files
File.delete(downloaded_file.filename) if File.exist?(downloaded_file.filename)
end

def upload_dspace_files(dspace_files, work, migration_snapshot_id)
aws_connector = PULDspaceAwsConnector.new(work, work.doi)

results = aws_connector.upload_to_s3(dspace_files)
error_results = results.select { |result| result[:error].present? }
good_results = results.select { |result| result[:key].present? }
update_migration_status(migration_snapshot_id) do |migration_snapshot|
good_results.each do |result|
migration_snapshot.mark_complete(result[:file])
end
error_results.each do |result|
migration_snapshot.mark_error(result[:file], result[:error])
def download_dspace_file(dspace_file, migration_snapshot_id)
dspace_connector = PULDspaceConnector.new(@work)
downloaded_file = dspace_connector.download_bitstreams([dspace_file]).first
if downloaded_file.is_a?(Hash)
update_migration_status(migration_snapshot_id) do |migration_snapshot|
migration_snapshot.mark_error(downloaded_file[:file], downloaded_file[:error])
end
nil
else
downloaded_file
end
@migration_snapshot&.save
error_results.map { |result| result[:error] }
end

def update_migration_status(migration_snapshot_id)
Expand Down
6 changes: 6 additions & 0 deletions app/models/migration_upload_snapshot.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ def mark_complete(s3_file)
end
end

def complete?(s3_file)
index = find_file(s3_file.filename)
return false if index.nil?
files[index]["migrate_status"] == "complete" && files[index]["checksum"] == s3_file.checksum
end

def migration_complete?
files.select { |file| file.keys.include?("migrate_status") }.map { |file| file["migrate_status"] }.uniq == ["complete"]
end
Expand Down
22 changes: 8 additions & 14 deletions app/services/pul_dspace_connector.rb
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# frozen_string_literal: true
class PULDspaceConnector
attr_reader :work, :ark
attr_reader :work, :ark, :download_base

def initialize(work)
@work = work
@ark = work.ark&.gsub("ark:/", "")
@download_base = "#{Rails.configuration.dspace.base_url.gsub('rest/', '')}bitstream/#{ark}"
end

def id
Expand Down Expand Up @@ -41,7 +42,7 @@ def list_bitsteams
end

S3File.new(filename_display: bitstream["name"], checksum: base64digest(bitstream["checkSum"]["value"]), last_modified: DateTime.now,
size: -1, work: work, url: bitstream["retrieveLink"], filename: filename)
size: -1, work: work, url: "#{download_base}/#{bitstream['sequenceId']}", filename: filename)
end
end

Expand Down Expand Up @@ -81,24 +82,17 @@ def get_data(url_path)
JSON.parse(response.body)
end

def download_bitstream(retrieval_path, filename)
url = "#{Rails.configuration.dspace.base_url}#{retrieval_path}"
def download_bitstream(retrieval_url, filename)
path = File.join(Rails.configuration.dspace.download_file_path, "dspace_download", work.id.to_s)
FileUtils.mkdir_p path
download_file(url, filename)
download_file(retrieval_url, filename)
filename
end

def download_file(url, filename)
http = request_http(url)
uri = URI(url)
req = Net::HTTP::Get.new uri.path
http.request req do |response|
io = File.open(filename, "w")
response.read_body do |chunk|
io.write chunk.force_encoding("UTF-8")
end
io.close
stdout_and_stderr_str, status = Open3.capture2e("wget -c '#{url}' -O #{filename}")
unless status.success?
Honeybadger.notify("Error dowloading file #{url} for work id #{work.id} to #{filename}! Error: #{stdout_and_stderr_str}")
end
end

Expand Down
Binary file modified dump.rdb
Binary file not shown.
27 changes: 17 additions & 10 deletions spec/services/pul_dspace_connector_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,22 +36,26 @@
let(:handle_body) { File.read(Rails.root.join("spec", "fixtures", "files", "dspace_handle.json")) }
let(:bitsreams_body) { File.read(Rails.root.join("spec", "fixtures", "files", "dspace_bitstreams_response.json")) }
let(:metadata_body) { File.read(Rails.root.join("spec", "fixtures", "files", "dspace_metadata_response.json")) }
let(:bitsream1_body) { File.read(Rails.root.join("spec", "fixtures", "files", "bitstreams", "SCoData_combined_v1_2020-07_README.txt")) }
let(:bitsream2_body) { File.read(Rails.root.join("spec", "fixtures", "files", "bitstreams", "SCoData_combined_v1_2020-07_datapackage.json")) }
let(:bitsream3_body) { File.read(Rails.root.join("spec", "fixtures", "files", "bitstreams", "license.txt")) }
let(:process_status) { instance_double Process::Status, "success?": true }
before do
stub_request(:get, "https://dataspace.example.com/rest/handle/88435/dsp01zc77st047")
.to_return(status: 200, body: handle_body, headers: {})
stub_request(:get, "https://dataspace.example.com/rest/items/104718/bitstreams")
.to_return(status: 200, body: bitsreams_body, headers: {})
stub_request(:get, "https://dataspace.example.com/rest/items/104718/metadata")
.to_return(status: 200, body: metadata_body, headers: {})
stub_request(:get, "https://dataspace.example.com/rest//bitstreams/145784/retrieve")
.to_return(status: 200, body: bitsream1_body, headers: {})
stub_request(:get, "https://dataspace.example.com/rest//bitstreams/145785/retrieve")
.to_return(status: 200, body: bitsream2_body, headers: {})
stub_request(:get, "https://dataspace.example.com/rest//bitstreams/145762/retrieve")
.to_return(status: 200, body: bitsream3_body, headers: {})
allow(Open3).to receive(:capture2e).and_return(["", process_status])
FileUtils.mkdir_p("/tmp/dspace_download/#{work.id}")
FileUtils.cp(Rails.root.join("spec", "fixtures", "files", "bitstreams", "SCoData_combined_v1_2020-07_README.txt"),
"/tmp/dspace_download/#{work.id}/SCoData_combined_v1_2020-07_README.txt")
FileUtils.cp(Rails.root.join("spec", "fixtures", "files", "bitstreams", "SCoData_combined_v1_2020-07_datapackage.json"),
"/tmp/dspace_download/#{work.id}/SCoData_combined_v1_2020-07_datapackage.json")
FileUtils.cp(Rails.root.join("spec", "fixtures", "files", "bitstreams", "license.txt"),
"/tmp/dspace_download/#{work.id}/license.txt")
end

after do
FileUtils.rm_r("/tmp/dspace_download/#{work.id}")
end

it "downloads the bitstreams" do
Expand All @@ -62,7 +66,10 @@

context "a bitstream missmatch" do
# realy should be the readme, but we are intetionally returning the wrong data
let(:bitsream1_body) { "not the readme!!" }
before do
FileUtils.cp(Rails.root.join("spec", "fixtures", "files", "bitstreams", "license.txt"),
"/tmp/dspace_download/#{work.id}/SCoData_combined_v1_2020-07_README.txt")
end

it "downloads the bitstreams" do
allow(Honeybadger).to receive(:notify)
Expand Down
27 changes: 17 additions & 10 deletions spec/services/pul_dspace_migrate_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@
let(:handle_body) { File.read(Rails.root.join("spec", "fixtures", "files", "dspace_handle.json")) }
let(:bitsreams_body) { File.read(Rails.root.join("spec", "fixtures", "files", "dspace_bitstreams_response.json")) }
let(:metadata_body) { File.read(Rails.root.join("spec", "fixtures", "files", "dspace_metadata_response.json")) }
let(:bitsream1_body) { File.read(Rails.root.join("spec", "fixtures", "files", "bitstreams", "SCoData_combined_v1_2020-07_README.txt")) }
let(:bitsream2_body) { File.read(Rails.root.join("spec", "fixtures", "files", "bitstreams", "SCoData_combined_v1_2020-07_datapackage.json")) }
let(:bitsream3_body) { File.read(Rails.root.join("spec", "fixtures", "files", "bitstreams", "license.txt")) }
let(:process_status) { instance_double Process::Status, "success?": true }
before do
stub_request(:get, "https://dataspace.example.com/rest/handle/88435/dsp01zc77st047")
.to_return(status: 200, body: handle_body, headers: {})
Expand All @@ -34,12 +32,18 @@
.to_return(status: 200, body: bitsreams_body, headers: {})
stub_request(:get, "https://dataspace.example.com/rest/items/104718/metadata")
.to_return(status: 200, body: metadata_body, headers: {})
stub_request(:get, "https://dataspace.example.com/rest//bitstreams/145784/retrieve")
.to_return(status: 200, body: bitsream1_body, headers: {})
stub_request(:get, "https://dataspace.example.com/rest//bitstreams/145785/retrieve")
.to_return(status: 200, body: bitsream2_body, headers: {})
stub_request(:get, "https://dataspace.example.com/rest//bitstreams/145762/retrieve")
.to_return(status: 200, body: bitsream3_body, headers: {})
allow(Open3).to receive(:capture2e).and_return(["", process_status])
FileUtils.mkdir_p("/tmp/dspace_download/#{work.id}")
FileUtils.cp(Rails.root.join("spec", "fixtures", "files", "bitstreams", "SCoData_combined_v1_2020-07_README.txt"),
"/tmp/dspace_download/#{work.id}/SCoData_combined_v1_2020-07_README.txt")
FileUtils.cp(Rails.root.join("spec", "fixtures", "files", "bitstreams", "SCoData_combined_v1_2020-07_datapackage.json"),
"/tmp/dspace_download/#{work.id}/SCoData_combined_v1_2020-07_datapackage.json")
FileUtils.cp(Rails.root.join("spec", "fixtures", "files", "bitstreams", "license.txt"),
"/tmp/dspace_download/#{work.id}/license.txt")
end

after do
FileUtils.rm_r("/tmp/dspace_download/#{work.id}")
end

describe "#migrate" do
Expand Down Expand Up @@ -210,7 +214,10 @@

context "a dspace bitstream missmatch" do
# realy should be the readme, but we are intetionally returning the wrong data
let(:bitsream1_body) { "not the readme!!" }
before do
FileUtils.cp(Rails.root.join("spec", "fixtures", "files", "bitstreams", "license.txt"),
"/tmp/dspace_download/#{work.id}/SCoData_combined_v1_2020-07_README.txt")
end

it "downloads the bitstreams" do
allow(Honeybadger).to receive(:notify)
Expand Down
22 changes: 13 additions & 9 deletions spec/system/migrate_submission_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -225,9 +225,7 @@
let(:handle_body) { File.read(Rails.root.join("spec", "fixtures", "files", "dspace_handle.json")) }
let(:bitsreams_body) { File.read(Rails.root.join("spec", "fixtures", "files", "dspace_bitstreams_response.json")) }
let(:metadata_body) { File.read(Rails.root.join("spec", "fixtures", "files", "dspace_metadata_response.json")) }
let(:bitsream1_body) { File.read(Rails.root.join("spec", "fixtures", "files", "bitstreams", "SCoData_combined_v1_2020-07_README.txt")) }
let(:bitsream2_body) { File.read(Rails.root.join("spec", "fixtures", "files", "bitstreams", "SCoData_combined_v1_2020-07_datapackage.json")) }
let(:bitsream3_body) { File.read(Rails.root.join("spec", "fixtures", "files", "bitstreams", "license.txt")) }
let(:process_status) { instance_double Process::Status, "success?": true }

before do
stub_request(:get, "https://dataspace.example.com/rest/handle/88435/dsp01zc77st047")
Expand All @@ -236,12 +234,14 @@
.to_return(status: 200, body: bitsreams_body, headers: {})
stub_request(:get, "https://dataspace.example.com/rest/items/104718/metadata")
.to_return(status: 200, body: metadata_body, headers: {})
stub_request(:get, "https://dataspace.example.com/rest//bitstreams/145784/retrieve")
.to_return(status: 200, body: bitsream1_body, headers: {})
stub_request(:get, "https://dataspace.example.com/rest//bitstreams/145785/retrieve")
.to_return(status: 200, body: bitsream2_body, headers: {})
stub_request(:get, "https://dataspace.example.com/rest//bitstreams/145762/retrieve")
.to_return(status: 200, body: bitsream3_body, headers: {})
allow(Open3).to receive(:capture2e).and_return(["", process_status])
FileUtils.mkdir_p("/tmp/dspace_download/#{work.id}")
FileUtils.cp(Rails.root.join("spec", "fixtures", "files", "bitstreams", "SCoData_combined_v1_2020-07_README.txt"),
"/tmp/dspace_download/#{work.id}/SCoData_combined_v1_2020-07_README.txt")
FileUtils.cp(Rails.root.join("spec", "fixtures", "files", "bitstreams", "SCoData_combined_v1_2020-07_datapackage.json"),
"/tmp/dspace_download/#{work.id}/SCoData_combined_v1_2020-07_datapackage.json")
FileUtils.cp(Rails.root.join("spec", "fixtures", "files", "bitstreams", "license.txt"),
"/tmp/dspace_download/#{work.id}/license.txt")

work.resource.migrated = true
work.draft!(user)
Expand All @@ -250,6 +250,10 @@
allow(fake_s3_service).to receive(:copy_file).and_return(fake_completion)
end

after do
FileUtils.rm_r("/tmp/dspace_download/#{work.id}")
end

it "allows the user to click migrate and the migration gets run" do
sign_in user
visit(work_path(work))
Expand Down

0 comments on commit 78f44be

Please sign in to comment.