From ad42f331815d181d2d6c92e733614a0381d032be Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Mon, 15 Apr 2024 18:10:44 -0700 Subject: [PATCH 01/10] various minor improvements --- .../ams/missing_instantiations_locator.rb | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb index b3f78799..162ef326 100644 --- a/app/services/ams/missing_instantiations_locator.rb +++ b/app/services/ams/missing_instantiations_locator.rb @@ -1,22 +1,21 @@ # frozen_string_literal: true require 'ruby-progressbar' +require 'parallel' module AMS # @see https://github.com/scientist-softserv/ams/issues/16 - class MissingInstantiationsLocator + class MissingInstantiationsLocator # rubocop:disable Metrics/ClassLength WORKING_DIR = Rails.root.join('tmp', 'imports') - attr_reader :search_dirs, :current_dir, :truncated_dir_name, :results_path, :results, :progressbar, :logger + attr_reader :current_dir, :truncated_dir_name, :results_path, :results, :progressbar, :logger - # @param [Array] search_dirs - def initialize(search_dirs) - @search_dirs = search_dirs.map { |dir| WORKING_DIR.join(dir) } - @logger = ActiveSupport::Logger.new( - WORKING_DIR.join('i16-missing-instantiations-locator.log') - ) + def initialize + @logger = Logger.new(WORKING_DIR.join('i16-missing-instantiations-locator.log')) end - def map_all_instantiation_identifiers + # @param [Array] search_dirs + def map_all_instantiation_identifiers(search_dirnames) + search_dirs = search_dirnames.map { |dir| WORKING_DIR.join(dir) } search_dirs.each do |current_dir| @current_dir = current_dir @truncated_dir_name = File.basename(current_dir) @@ -57,12 +56,13 @@ def merge_all_instantiation_maps end end - def create_subsets_from_merged_map + # @param [Integer] num_processes + def create_subsets_from_merged_map(num_processes: 4) results = JSON.parse(File.read(WORKING_DIR.join('i16-combined-results.json'))) uniq_assset_paths = results.values.flatten.uniq subsets = uniq_assset_paths.each_slice(10_000).to_a - subsets.each_with_index do |set, i| + Parallel.each_with_index(subsets, in_processes: num_processes) do |set, i| set_path = WORKING_DIR.join("i16-subset-#{i}") FileUtils.mkdir_p(set_path) pb_format = "Copying XML files to #{File.basename(set_path)}: %c/%C %P%" @@ -72,7 +72,9 @@ def create_subsets_from_merged_map importer_dir, asset_id = asset_path.split('/') xml_filename = "#{asset_id.sub('cpb-aacip-', '')}.xml" - FileUtils.cp(WORKING_DIR.join(importer_dir, xml_filename), WORKING_DIR.join(set_path, xml_filename)) + unless File.exist?(WORKING_DIR.join(set_path, xml_filename)) + FileUtils.cp(WORKING_DIR.join(importer_dir, xml_filename), WORKING_DIR.join(set_path, xml_filename)) + end progressbar.increment end end From 9ae4e472e7a4284875abe37b1904d6952f3ac8ea Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Tue, 16 Apr 2024 17:41:02 -0700 Subject: [PATCH 02/10] fix typo --- app/services/ams/missing_instantiations_locator.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb index 162ef326..e800a7b4 100644 --- a/app/services/ams/missing_instantiations_locator.rb +++ b/app/services/ams/missing_instantiations_locator.rb @@ -59,8 +59,8 @@ def merge_all_instantiation_maps # @param [Integer] num_processes def create_subsets_from_merged_map(num_processes: 4) results = JSON.parse(File.read(WORKING_DIR.join('i16-combined-results.json'))) - uniq_assset_paths = results.values.flatten.uniq - subsets = uniq_assset_paths.each_slice(10_000).to_a + uniq_asset_paths = results.values.flatten.uniq + subsets = uniq_asset_paths.each_slice(10_000).to_a Parallel.each_with_index(subsets, in_processes: num_processes) do |set, i| set_path = WORKING_DIR.join("i16-subset-#{i}") From 91e3fb8e428222f2f56951eefe37e3d8b29e6302 Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Wed, 17 Apr 2024 10:22:32 -0700 Subject: [PATCH 03/10] log if file already exists in target dir --- app/services/ams/missing_instantiations_locator.rb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb index e800a7b4..ceb8c938 100644 --- a/app/services/ams/missing_instantiations_locator.rb +++ b/app/services/ams/missing_instantiations_locator.rb @@ -65,14 +65,16 @@ def create_subsets_from_merged_map(num_processes: 4) Parallel.each_with_index(subsets, in_processes: num_processes) do |set, i| set_path = WORKING_DIR.join("i16-subset-#{i}") FileUtils.mkdir_p(set_path) - pb_format = "Copying XML files to #{File.basename(set_path)}: %c/%C %P%" + pb_format = "Copying XML files to #{File.basename(set_path)}: %a %e %c/%C %P%" progressbar = ProgressBar.create(total: set.size, format: pb_format) set.each do |asset_path| importer_dir, asset_id = asset_path.split('/') xml_filename = "#{asset_id.sub('cpb-aacip-', '')}.xml" - unless File.exist?(WORKING_DIR.join(set_path, xml_filename)) + if File.exist?(WORKING_DIR.join(set_path, xml_filename)) + logger.debug "#{xml_filename} already exists in #{File.basename(set_path)}" + else FileUtils.cp(WORKING_DIR.join(importer_dir, xml_filename), WORKING_DIR.join(set_path, xml_filename)) end progressbar.increment From 0ff6956105b8e592feea17a96def7b193c3e060c Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Wed, 17 Apr 2024 10:36:55 -0700 Subject: [PATCH 04/10] add method to audit duplicate XML files Specifically, we're auditing whether their content differs from each other or not. If not, we aren't concerned with handling the duplicates further, but if the content does differ, we need to figure out how --- .../ams/missing_instantiations_locator.rb | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb index ceb8c938..4032ee81 100644 --- a/app/services/ams/missing_instantiations_locator.rb +++ b/app/services/ams/missing_instantiations_locator.rb @@ -82,6 +82,32 @@ def create_subsets_from_merged_map(num_processes: 4) end end + def audit_duplicate_xml_files + results = JSON.parse(File.read(WORKING_DIR.join('i16-combined-results.json'))) + asset_paths = results.values.flatten.uniq + filename_map = {} + + asset_paths.each do |path| + path, asset_id = path.split('/') + filename = "#{asset_id.sub('cpb-aacip-', '')}.xml" + + filename_map[filename] ||= {} + filename_map[filename][:paths] ||= [] + filename_map[filename][:paths] << path + end + + duplicate_files = filename_map.select { |_filename, attrs| attrs[:paths].size > 1 } + + duplicate_files.each do |filename, attrs| + file_contents = attrs[:paths].map { |path| File.read(WORKING_DIR.join(path, filename)) } + duplicate_files[filename][:content_differs] = file_contents.uniq.size > 1 + end + + File.open(WORKING_DIR.join('i16-duplicate-xml-files-audit.json'), 'w') do |file| + file.puts JSON.pretty_generate(duplicate_files) + end + end + private def map_asset_id_to_inst_ids(xml_file) From 51957bb4104dc4dcfa3c28d379604aed9943c22f Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Fri, 19 Apr 2024 18:25:05 -0700 Subject: [PATCH 05/10] if file doesn't exist, capture and log error --- app/services/ams/missing_instantiations_locator.rb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb index 4032ee81..81895651 100644 --- a/app/services/ams/missing_instantiations_locator.rb +++ b/app/services/ams/missing_instantiations_locator.rb @@ -75,7 +75,11 @@ def create_subsets_from_merged_map(num_processes: 4) if File.exist?(WORKING_DIR.join(set_path, xml_filename)) logger.debug "#{xml_filename} already exists in #{File.basename(set_path)}" else - FileUtils.cp(WORKING_DIR.join(importer_dir, xml_filename), WORKING_DIR.join(set_path, xml_filename)) + begin + FileUtils.cp(WORKING_DIR.join(importer_dir, xml_filename), WORKING_DIR.join(set_path, xml_filename)) + rescue => e + logger.error "#{e.class} - (#{File.basename(set_path)}/#{xml_filename}) - #{e.message}" + end end progressbar.increment end From d22935c6ceee437fcbcb1813803bcf676bb504d4 Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Tue, 23 Apr 2024 11:50:49 -0700 Subject: [PATCH 06/10] add methods to delete/recreate subsets --- .../ams/missing_instantiations_locator.rb | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb index 81895651..a19c39db 100644 --- a/app/services/ams/missing_instantiations_locator.rb +++ b/app/services/ams/missing_instantiations_locator.rb @@ -112,6 +112,41 @@ def audit_duplicate_xml_files end end + def destroy_assets(subset_path) + xml_files = Dir.glob(subset_path.join('*.xml')) + asset_ids = xml_files.map { |f| "cpb-aacip-#{File.basename(f).sub('.xml', '')}" } + + begin + AMS::AssetDestroyer.new(asset_ids: asset_ids, user_email: 'wgbh_admin@wgbh-mla.org').destroy + rescue => e + logger.error "Error destroying Assets. See asset_destroyer.log (#{e.class} - #{e.message})" + end + end + + def create_subset_importers + subset_paths = Dir.glob(Rails.root.join('tmp', 'imports', 'i16-subset*')) + desired_parser_field_attrs = %w[ + record_element + import_type + visibility + rights_statement + override_rights_statement + file_style + import_file_path + ] + + subset_paths.each do |path| + base_imp = Bulkrax::Importer.find_by(name: 'AMS1Importer_0-10000') + imp = base_imp.dup + + imp.name = File.basename(path) + imp.parser_fields = base_imp.parser_fields.slice(*desired_parser_field_attrs) + imp.parser_fields['import_file_path'] = path.to_s + + imp.save! + end + end + private def map_asset_id_to_inst_ids(xml_file) From 26bacb7a9236130a0a9b7bd4b16c047e1bb76e74 Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Tue, 23 Apr 2024 12:24:43 -0700 Subject: [PATCH 07/10] run migrations on test db schema --- db/schema.test.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/db/schema.test.rb b/db/schema.test.rb index 10e689f8..f5ba4d0c 100644 --- a/db/schema.test.rb +++ b/db/schema.test.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema.define(version: 2023_08_30_155065) do +ActiveRecord::Schema.define(version: 2024_03_07_053156) do # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" @@ -414,6 +414,7 @@ t.datetime "updated_at", null: false t.string "internal_resource" t.integer "lock_version" + t.index "(((metadata -> 'bulkrax_identifier'::text) ->> 0))", name: "index_on_bulkrax_identifier", where: "((metadata -> 'bulkrax_identifier'::text) IS NOT NULL)" t.index ["internal_resource"], name: "index_orm_resources_on_internal_resource" t.index ["metadata"], name: "index_orm_resources_on_metadata", using: :gin t.index ["metadata"], name: "index_orm_resources_on_metadata_jsonb_path_ops", opclass: :jsonb_path_ops, using: :gin From 531f4513667f174dd0f10c7d12fe4c5add379902 Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Tue, 23 Apr 2024 16:38:36 -0700 Subject: [PATCH 08/10] minor clean up --- app/services/ams/missing_instantiations_locator.rb | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb index a19c39db..e9aade95 100644 --- a/app/services/ams/missing_instantiations_locator.rb +++ b/app/services/ams/missing_instantiations_locator.rb @@ -125,6 +125,7 @@ def destroy_assets(subset_path) def create_subset_importers subset_paths = Dir.glob(Rails.root.join('tmp', 'imports', 'i16-subset*')) + base_imp = Bulkrax::Importer.find_by(name: 'AMS1Importer_0-10000') desired_parser_field_attrs = %w[ record_element import_type @@ -132,16 +133,14 @@ def create_subset_importers rights_statement override_rights_statement file_style - import_file_path ] subset_paths.each do |path| - base_imp = Bulkrax::Importer.find_by(name: 'AMS1Importer_0-10000') imp = base_imp.dup imp.name = File.basename(path) imp.parser_fields = base_imp.parser_fields.slice(*desired_parser_field_attrs) - imp.parser_fields['import_file_path'] = path.to_s + imp.parser_fields['import_file_path'] = path imp.save! end From 8e82c18dbd6f09593124dedd512af962889df9a8 Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Wed, 24 Apr 2024 13:29:09 -0700 Subject: [PATCH 09/10] fix bug due to missing arg --- app/services/ams/missing_instantiations_locator.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/app/services/ams/missing_instantiations_locator.rb b/app/services/ams/missing_instantiations_locator.rb index e9aade95..0fbd779e 100644 --- a/app/services/ams/missing_instantiations_locator.rb +++ b/app/services/ams/missing_instantiations_locator.rb @@ -117,7 +117,9 @@ def destroy_assets(subset_path) asset_ids = xml_files.map { |f| "cpb-aacip-#{File.basename(f).sub('.xml', '')}" } begin - AMS::AssetDestroyer.new(asset_ids: asset_ids, user_email: 'wgbh_admin@wgbh-mla.org').destroy + logger.info "Destroying #{asset_ids.size} Assets via the AssetDestroyer. See asset_destroyer.log" + ad = AMS::AssetDestroyer.new(asset_ids: asset_ids, user_email: 'wgbh_admin@wgbh-mla.org') + ad.destroy(ad.asset_ids) rescue => e logger.error "Error destroying Assets. See asset_destroyer.log (#{e.class} - #{e.message})" end From 285285d57c8bb689f24defdcab3b5500cc3831a8 Mon Sep 17 00:00:00 2001 From: Benjamin Kiah Stroud <32469930+bkiahstroud@users.noreply.github.com> Date: Wed, 24 Apr 2024 13:29:44 -0700 Subject: [PATCH 10/10] follow established patterns --- app/services/ams/asset_destroyer.rb | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/app/services/ams/asset_destroyer.rb b/app/services/ams/asset_destroyer.rb index 5ca14ea7..3d1d6cf6 100644 --- a/app/services/ams/asset_destroyer.rb +++ b/app/services/ams/asset_destroyer.rb @@ -1,3 +1,5 @@ +require 'ruby-progressbar' + module AMS class AssetDestroyer attr_accessor :asset_ids, :user_email, :logger @@ -10,8 +12,10 @@ def initialize(asset_ids: [], user_email: nil) def destroy(asset_ids) logger.info "Initiating destruction sequence for #{asset_ids.count} Assets..." + progressbar = ProgressBar.create(total: asset_ids.size, format: "Destroying Assets: %a %e %c/%C %P%") Array(asset_ids).each do |asset_id| destroy_asset_by_id asset_id + progressbar.increment end end @@ -65,8 +69,9 @@ def destroy_in_postgres(asset_id) .with_step_args('work_resource.delete' => { user: user }, 'work_resource.delete_all_file_sets' => { user: user }) .call(asset_resource).value! - rescue Valkyrie::Persistence::ObjectNotFoundError - puts "No AssetResource found with ID #{asset_id}" + logger.debug "AssetResource '#{asset_id}' (and children) destroyed." + rescue => e + error_rescue(e, 'AssetResource', asset_id) end def actor