diff --git a/app/services/ams/asset_destroyer.rb b/app/services/ams/asset_destroyer.rb index 5ca14ea7..3d1d6cf6 100644 --- a/app/services/ams/asset_destroyer.rb +++ b/app/services/ams/asset_destroyer.rb @@ -1,3 +1,5 @@ +require 'ruby-progressbar' + module AMS class AssetDestroyer attr_accessor :asset_ids, :user_email, :logger @@ -10,8 +12,10 @@ def initialize(asset_ids: [], user_email: nil) def destroy(asset_ids) logger.info "Initiating destruction sequence for #{asset_ids.count} Assets..." + progressbar = ProgressBar.create(total: asset_ids.size, format: "Destroying Assets: %a %e %c/%C %P%") Array(asset_ids).each do |asset_id| destroy_asset_by_id asset_id + progressbar.increment end end @@ -65,8 +69,9 @@ def destroy_in_postgres(asset_id) .with_step_args('work_resource.delete' => { user: user }, 'work_resource.delete_all_file_sets' => { user: user }) .call(asset_resource).value! - rescue Valkyrie::Persistence::ObjectNotFoundError - puts "No AssetResource found with ID #{asset_id}" + logger.debug "AssetResource '#{asset_id}' (and children) destroyed." + rescue => e + error_rescue(e, 'AssetResource', asset_id) end def actor diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb index b3f78799..0fbd779e 100644 --- a/app/services/ams/missing_instantiations_locator.rb +++ b/app/services/ams/missing_instantiations_locator.rb @@ -1,22 +1,21 @@ # frozen_string_literal: true require 'ruby-progressbar' +require 'parallel' module AMS # @see https://github.com/scientist-softserv/ams/issues/16 - class MissingInstantiationsLocator + class MissingInstantiationsLocator # rubocop:disable Metrics/ClassLength WORKING_DIR = Rails.root.join('tmp', 'imports') - attr_reader :search_dirs, :current_dir, :truncated_dir_name, :results_path, :results, :progressbar, :logger + attr_reader :current_dir, :truncated_dir_name, :results_path, :results, :progressbar, :logger - # @param [Array] search_dirs - def initialize(search_dirs) - @search_dirs = search_dirs.map { |dir| WORKING_DIR.join(dir) } - @logger = ActiveSupport::Logger.new( - WORKING_DIR.join('i16-missing-instantiations-locator.log') - ) + def initialize + @logger = Logger.new(WORKING_DIR.join('i16-missing-instantiations-locator.log')) end - def map_all_instantiation_identifiers + # @param [Array] search_dirs + def map_all_instantiation_identifiers(search_dirnames) + search_dirs = search_dirnames.map { |dir| WORKING_DIR.join(dir) } search_dirs.each do |current_dir| @current_dir = current_dir @truncated_dir_name = File.basename(current_dir) @@ -57,27 +56,98 @@ def merge_all_instantiation_maps end end - def create_subsets_from_merged_map + # @param [Integer] num_processes + def create_subsets_from_merged_map(num_processes: 4) results = JSON.parse(File.read(WORKING_DIR.join('i16-combined-results.json'))) - uniq_assset_paths = results.values.flatten.uniq - subsets = uniq_assset_paths.each_slice(10_000).to_a + uniq_asset_paths = results.values.flatten.uniq + subsets = uniq_asset_paths.each_slice(10_000).to_a - subsets.each_with_index do |set, i| + Parallel.each_with_index(subsets, in_processes: num_processes) do |set, i| set_path = WORKING_DIR.join("i16-subset-#{i}") FileUtils.mkdir_p(set_path) - pb_format = "Copying XML files to #{File.basename(set_path)}: %c/%C %P%" + pb_format = "Copying XML files to #{File.basename(set_path)}: %a %e %c/%C %P%" progressbar = ProgressBar.create(total: set.size, format: pb_format) set.each do |asset_path| importer_dir, asset_id = asset_path.split('/') xml_filename = "#{asset_id.sub('cpb-aacip-', '')}.xml" - FileUtils.cp(WORKING_DIR.join(importer_dir, xml_filename), WORKING_DIR.join(set_path, xml_filename)) + if File.exist?(WORKING_DIR.join(set_path, xml_filename)) + logger.debug "#{xml_filename} already exists in #{File.basename(set_path)}" + else + begin + FileUtils.cp(WORKING_DIR.join(importer_dir, xml_filename), WORKING_DIR.join(set_path, xml_filename)) + rescue => e + logger.error "#{e.class} - (#{File.basename(set_path)}/#{xml_filename}) - #{e.message}" + end + end progressbar.increment end end end + def audit_duplicate_xml_files + results = JSON.parse(File.read(WORKING_DIR.join('i16-combined-results.json'))) + asset_paths = results.values.flatten.uniq + filename_map = {} + + asset_paths.each do |path| + path, asset_id = path.split('/') + filename = "#{asset_id.sub('cpb-aacip-', '')}.xml" + + filename_map[filename] ||= {} + filename_map[filename][:paths] ||= [] + filename_map[filename][:paths] << path + end + + duplicate_files = filename_map.select { |_filename, attrs| attrs[:paths].size > 1 } + + duplicate_files.each do |filename, attrs| + file_contents = attrs[:paths].map { |path| File.read(WORKING_DIR.join(path, filename)) } + duplicate_files[filename][:content_differs] = file_contents.uniq.size > 1 + end + + File.open(WORKING_DIR.join('i16-duplicate-xml-files-audit.json'), 'w') do |file| + file.puts JSON.pretty_generate(duplicate_files) + end + end + + def destroy_assets(subset_path) + xml_files = Dir.glob(subset_path.join('*.xml')) + asset_ids = xml_files.map { |f| "cpb-aacip-#{File.basename(f).sub('.xml', '')}" } + + begin + logger.info "Destroying #{asset_ids.size} Assets via the AssetDestroyer. See asset_destroyer.log" + ad = AMS::AssetDestroyer.new(asset_ids: asset_ids, user_email: 'wgbh_admin@wgbh-mla.org') + ad.destroy(ad.asset_ids) + rescue => e + logger.error "Error destroying Assets. See asset_destroyer.log (#{e.class} - #{e.message})" + end + end + + def create_subset_importers + subset_paths = Dir.glob(Rails.root.join('tmp', 'imports', 'i16-subset*')) + base_imp = Bulkrax::Importer.find_by(name: 'AMS1Importer_0-10000') + desired_parser_field_attrs = %w[ + record_element + import_type + visibility + rights_statement + override_rights_statement + file_style + ] + + subset_paths.each do |path| + imp = base_imp.dup + + imp.name = File.basename(path) + imp.parser_fields = base_imp.parser_fields.slice(*desired_parser_field_attrs) + imp.parser_fields['import_file_path'] = path + + imp.save! + end + end + private def map_asset_id_to_inst_ids(xml_file) diff --git a/db/schema.test.rb b/db/schema.test.rb index 10e689f8..f5ba4d0c 100644 --- a/db/schema.test.rb +++ b/db/schema.test.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema.define(version: 2023_08_30_155065) do +ActiveRecord::Schema.define(version: 2024_03_07_053156) do # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" @@ -414,6 +414,7 @@ t.datetime "updated_at", null: false t.string "internal_resource" t.integer "lock_version" + t.index "(((metadata -> 'bulkrax_identifier'::text) ->> 0))", name: "index_on_bulkrax_identifier", where: "((metadata -> 'bulkrax_identifier'::text) IS NOT NULL)" t.index ["internal_resource"], name: "index_orm_resources_on_internal_resource" t.index ["metadata"], name: "index_orm_resources_on_metadata", using: :gin t.index ["metadata"], name: "index_orm_resources_on_metadata_jsonb_path_ops", opclass: :jsonb_path_ops, using: :gin