Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

i16 Process subsets #862

Merged
merged 10 commits into from
Apr 24, 2024
9 changes: 7 additions & 2 deletions app/services/ams/asset_destroyer.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
require 'ruby-progressbar'

module AMS
class AssetDestroyer
attr_accessor :asset_ids, :user_email, :logger
Expand All @@ -10,8 +12,10 @@ def initialize(asset_ids: [], user_email: nil)

# Destroys each of the given Assets in turn, advancing a progress bar after
# every Asset.
#
# @param asset_ids [Array<String>, String, nil] one or more Asset IDs; the
#   value is coerced with Array(), so a single ID or nil is accepted
# @return [Array] the coerced list of Asset IDs that was iterated
def destroy(asset_ids)
  # Coerce once, up front, so the logged count, the progress bar total and
  # the loop all see the same list. Calling #count / #size on the raw
  # argument would raise for a single String ID or for nil.
  ids = Array(asset_ids)

  logger.info "Initiating destruction sequence for #{ids.size} Assets..."
  progressbar = ProgressBar.create(total: ids.size, format: "Destroying Assets: %a %e %c/%C %P%")
  ids.each do |asset_id|
    destroy_asset_by_id asset_id
    progressbar.increment
  end
end

Expand Down Expand Up @@ -65,8 +69,9 @@ def destroy_in_postgres(asset_id)
.with_step_args('work_resource.delete' => { user: user },
'work_resource.delete_all_file_sets' => { user: user })
.call(asset_resource).value!
rescue Valkyrie::Persistence::ObjectNotFoundError
puts "No AssetResource found with ID #{asset_id}"
logger.debug "AssetResource '#{asset_id}' (and children) destroyed."
rescue => e
error_rescue(e, 'AssetResource', asset_id)
end

def actor
Expand Down
100 changes: 85 additions & 15 deletions app/services/ams/missing_instantiations_locator.rb
Original file line number Diff line number Diff line change
@@ -1,22 +1,21 @@
# frozen_string_literal: true
require 'ruby-progressbar'
require 'parallel'

module AMS
# @see https://github.com/scientist-softserv/ams/issues/16
class MissingInstantiationsLocator
class MissingInstantiationsLocator # rubocop:disable Metrics/ClassLength
WORKING_DIR = Rails.root.join('tmp', 'imports')

attr_reader :search_dirs, :current_dir, :truncated_dir_name, :results_path, :results, :progressbar, :logger
attr_reader :current_dir, :truncated_dir_name, :results_path, :results, :progressbar, :logger

# @param [Array<String>] search_dirs
def initialize(search_dirs)
@search_dirs = search_dirs.map { |dir| WORKING_DIR.join(dir) }
@logger = ActiveSupport::Logger.new(
WORKING_DIR.join('i16-missing-instantiations-locator.log')
)
def initialize
@logger = Logger.new(WORKING_DIR.join('i16-missing-instantiations-locator.log'))
end

def map_all_instantiation_identifiers
# @param [Array<String>] search_dirs
def map_all_instantiation_identifiers(search_dirnames)
search_dirs = search_dirnames.map { |dir| WORKING_DIR.join(dir) }
search_dirs.each do |current_dir|
@current_dir = current_dir
@truncated_dir_name = File.basename(current_dir)
Expand Down Expand Up @@ -57,27 +56,98 @@ def merge_all_instantiation_maps
end
end

def create_subsets_from_merged_map
# @param [Integer] num_processes
def create_subsets_from_merged_map(num_processes: 4)
results = JSON.parse(File.read(WORKING_DIR.join('i16-combined-results.json')))
uniq_assset_paths = results.values.flatten.uniq
subsets = uniq_assset_paths.each_slice(10_000).to_a
uniq_asset_paths = results.values.flatten.uniq
subsets = uniq_asset_paths.each_slice(10_000).to_a

subsets.each_with_index do |set, i|
Parallel.each_with_index(subsets, in_processes: num_processes) do |set, i|
set_path = WORKING_DIR.join("i16-subset-#{i}")
FileUtils.mkdir_p(set_path)
pb_format = "Copying XML files to #{File.basename(set_path)}: %c/%C %P%"
pb_format = "Copying XML files to #{File.basename(set_path)}: %a %e %c/%C %P%"
progressbar = ProgressBar.create(total: set.size, format: pb_format)

set.each do |asset_path|
importer_dir, asset_id = asset_path.split('/')
xml_filename = "#{asset_id.sub('cpb-aacip-', '')}.xml"

FileUtils.cp(WORKING_DIR.join(importer_dir, xml_filename), WORKING_DIR.join(set_path, xml_filename))
if File.exist?(WORKING_DIR.join(set_path, xml_filename))
logger.debug "#{xml_filename} already exists in #{File.basename(set_path)}"
else
begin
FileUtils.cp(WORKING_DIR.join(importer_dir, xml_filename), WORKING_DIR.join(set_path, xml_filename))
rescue => e
logger.error "#{e.class} - (#{File.basename(set_path)}/#{xml_filename}) - #{e.message}"
end
end
progressbar.increment
end
end
end

# Writes an audit of XML filenames that are referenced from more than one
# directory in the combined results.
#
# Reads WORKING_DIR/i16-combined-results.json, derives an XML filename from
# each unique "<dir>/<asset_id>" entry (the 'cpb-aacip-' prefix is dropped and
# '.xml' appended), and for every filename seen under two or more directories
# records those directories plus whether the file contents differ between
# them. The report is written as pretty-printed JSON to
# WORKING_DIR/i16-duplicate-xml-files-audit.json.
def audit_duplicate_xml_files
  combined_results = JSON.parse(File.read(WORKING_DIR.join('i16-combined-results.json')))
  unique_asset_paths = combined_results.values.flatten.uniq

  # filename => { paths: [directories that reference that filename] }
  dirs_by_filename = unique_asset_paths.each_with_object({}) do |asset_path, map|
    importer_dir, asset_id = asset_path.split('/')
    xml_filename = "#{asset_id.sub('cpb-aacip-', '')}.xml"

    entry = (map[xml_filename] ||= {})
    (entry[:paths] ||= []) << importer_dir
  end

  audit = dirs_by_filename.select { |_xml_filename, entry| entry[:paths].size > 1 }

  # Compare the raw contents of every copy of each duplicated file.
  audit.each do |xml_filename, entry|
    copies = entry[:paths].map { |dir| File.read(WORKING_DIR.join(dir, xml_filename)) }
    entry[:content_differs] = copies.uniq.size > 1
  end

  File.open(WORKING_DIR.join('i16-duplicate-xml-files-audit.json'), 'w') do |report|
    report.puts JSON.pretty_generate(audit)
  end
end

# Destroys the Assets whose XML files sit directly inside the given subset
# directory. Asset IDs are rebuilt from the filenames: "<name>.xml" becomes
# "cpb-aacip-<name>". Errors raised while building or running the
# AssetDestroyer are logged here and not re-raised.
#
# @param subset_path [Pathname, String] directory holding the subset's *.xml files
def destroy_assets(subset_path)
  # File.join works for both Pathname and String arguments; Dir.glob returns
  # Strings, so callers are likely to hold String paths.
  xml_files = Dir.glob(File.join(subset_path.to_s, '*.xml'))
  # File.basename(f, '.xml') strips only the trailing extension, whereas
  # sub('.xml', '') would remove the first '.xml' found anywhere in the name.
  asset_ids = xml_files.map { |f| "cpb-aacip-#{File.basename(f, '.xml')}" }

  begin
    logger.info "Destroying #{asset_ids.size} Assets via the AssetDestroyer. See asset_destroyer.log"
    # NOTE(review): this email literal looks obfuscated/redacted — confirm the intended address.
    ad = AMS::AssetDestroyer.new(asset_ids: asset_ids, user_email: '[email protected]')
    ad.destroy(ad.asset_ids)
  rescue => e
    logger.error "Error destroying Assets. See asset_destroyer.log (#{e.class} - #{e.message})"
  end
end

# Creates one Bulkrax importer per "i16-subset*" entry found in WORKING_DIR.
# Each new importer is a copy of the 'AMS1Importer_0-10000' importer, named
# after the subset entry, keeping only a whitelisted set of parser fields and
# pointing 'import_file_path' at the subset path.
#
# @return [Array<String>] the subset paths that were processed
# @raise [RuntimeError] if subsets exist but the base importer cannot be found
def create_subset_importers
  base_importer_name = 'AMS1Importer_0-10000'
  # Use the class-level WORKING_DIR rather than rebuilding the same path.
  subset_paths = Dir.glob(WORKING_DIR.join('i16-subset*').to_s)
  # Nothing to create; stay a no-op even if the base importer is absent.
  return subset_paths if subset_paths.empty?

  base_imp = Bulkrax::Importer.find_by(name: base_importer_name)
  # Fail with a clear message instead of a NoMethodError on nil further down.
  raise "Base importer '#{base_importer_name}' not found; cannot create subset importers" if base_imp.nil?

  desired_parser_field_attrs = %w[
    record_element
    import_type
    visibility
    rights_statement
    override_rights_statement
    file_style
  ]

  subset_paths.each do |path|
    imp = base_imp.dup

    imp.name = File.basename(path)
    # slice returns a new Hash, so the base importer's parser_fields are not mutated.
    imp.parser_fields = base_imp.parser_fields.slice(*desired_parser_field_attrs)
    imp.parser_fields['import_file_path'] = path

    imp.save!
  end
end

private

def map_asset_id_to_inst_ids(xml_file)
Expand Down
3 changes: 2 additions & 1 deletion db/schema.test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.

ActiveRecord::Schema.define(version: 2023_08_30_155065) do
ActiveRecord::Schema.define(version: 2024_03_07_053156) do

# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
Expand Down Expand Up @@ -414,6 +414,7 @@
t.datetime "updated_at", null: false
t.string "internal_resource"
t.integer "lock_version"
t.index "(((metadata -> 'bulkrax_identifier'::text) ->> 0))", name: "index_on_bulkrax_identifier", where: "((metadata -> 'bulkrax_identifier'::text) IS NOT NULL)"
t.index ["internal_resource"], name: "index_orm_resources_on_internal_resource"
t.index ["metadata"], name: "index_orm_resources_on_metadata", using: :gin
t.index ["metadata"], name: "index_orm_resources_on_metadata_jsonb_path_ops", opclass: :jsonb_path_ops, using: :gin
Expand Down
Loading