adjust job-id params and namespace for sending files to s3

sul-dlss · Sep 27, 2024 · 9ebbfc1 · 9ebbfc1
1 parent d3260ae
commit 9ebbfc1
Show file tree

Hide file tree

Showing 6 changed files with 38 additions and 23 deletions.
diff --git a/lib/dor/text_extraction/speech_to_text.rb b/lib/dor/text_extraction/speech_to_text.rb
@@ -4,13 +4,12 @@ module Dor
   module TextExtraction
     # Determine if speech to text is required and possible for a given object
     class SpeechToText
-      attr_reader :cocina_object, :workflow_context, :bare_druid, :logger
+      attr_reader :cocina_object, :workflow_context, :bare_druid
 
-      def initialize(cocina_object:, workflow_context: {}, logger: nil)
+      def initialize(cocina_object:, workflow_context: {})
         @cocina_object = cocina_object
         @workflow_context = workflow_context
         @bare_druid = cocina_object.externalIdentifier.delete_prefix('druid:')
-        @logger = logger || Logger.new($stdout)
       end
 
       def possible?
@@ -40,6 +39,11 @@ def filenames_to_stt
         stt_files.map(&:filename)
       end
 
+      # return the job_id for the stt job, defined as the druid-version of the object
+      def job_id
+        "#{bare_druid}-v#{cocina_object.version}"
+      end
+
       private
 
       # iterate through cocina structural contains and return all File objects for files that need to be stt'd

diff --git a/lib/robots/dor_repo/speech_to_text/fetch_files.rb b/lib/robots/dor_repo/speech_to_text/fetch_files.rb
@@ -12,14 +12,20 @@ def initialize
         # available from LyberCore::Robot: druid, bare_druid, workflow_service, object_client, cocina_object, logger
         def perform_work
           sttable_filenames.each do |filename|
-            raise "Unable to fetch #{filename} for #{druid}" unless file_fetcher.write_file_with_retries(filename:, location: aws_provider.bucket.object(File.join(bare_druid, filename)), max_tries: 3)
+            raise "Unable to fetch #{filename} for #{druid}" unless file_fetcher.write_file_with_retries(filename:, location: aws_provider.bucket.object(File.join(job_id, filename)), max_tries: 3)
           end
         end
 
         private
 
         def sttable_filenames
-          Dor::TextExtraction::SpeechToText.new(cocina_object:, workflow_context: workflow.context).filenames_to_stt
+          Dor::TextExtraction::SpeechToText.new(cocina_object:).filenames_to_stt
+        end
+
+        # this will be the base of the S3 key for the files sent (to namespace them in the bucket)
+        # it is the same as the job_id when we send the SQS message
+        def job_id
+          @job_id ||= Dor::TextExtraction::SpeechToText.new(cocina_object:).job_id
         end
 
         def file_fetcher

diff --git a/lib/robots/dor_repo/speech_to_text/stt_create.rb b/lib/robots/dor_repo/speech_to_text/stt_create.rb
@@ -26,9 +26,7 @@ def perform_work
 
         def send_sqs_message
           message_body = {
-            id: job_id,
-            druid:,
-            media:
+            id: job_id
           }.merge(whisper_options).to_json
 
           # Send the message to the SQS queue
@@ -37,15 +35,11 @@ def send_sqs_message
                                           message_body:
                                         })
 
-          logger.info("Sent SQS message for druid #{druid} to queue #{aws_provider.sqs_todo_queue_url}")
+          logger.info("Sent SQS message for druid #{druid} to queue #{aws_provider.sqs_todo_queue_url} with job_id #{job_id}")
         end
 
         def job_id
-          @job_id ||= SecureRandom.uuid
-        end
-
-        def media
-          Dor::TextExtraction::SpeechToText.new(cocina_object:, workflow_context: workflow.context).filenames_to_stt
+          @job_id ||= Dor::TextExtraction::SpeechToText.new(cocina_object:).job_id
         end
 
         # pulled from config, could later be overridden by settings in the workflow context

diff --git a/spec/lib/dor/text_extraction/speech_to_text_spec.rb b/spec/lib/dor/text_extraction/speech_to_text_spec.rb
@@ -20,6 +20,7 @@
   let(:text_file) { build_file(true, true, 'file1.txt') }
   let(:text_file2) { build_file(true, true, 'file2.txt') }
   let(:druid) { 'druid:bc123df4567' }
+  let(:bare_druid) { 'bc123df4567' }
 
   def build_file(sdr_preserve, shelve, filename)
     extension = File.extname(filename)
@@ -107,7 +108,16 @@ def build_file(sdr_preserve, shelve, filename)
     let(:cocina_object) { instance_double(Cocina::Models::DRO, externalIdentifier: druid, structural:, type: object_type) }
 
     it 'returns a list of all filenames' do
       expect(stt.send(:stt_files)).to eq([m4a_file, mp4_file])
     end
   end
+
+  describe '#job_id' do
+    let(:cocina_object) { instance_double(Cocina::Models::DRO, version: 3, externalIdentifier: druid, dro?: true, type: object_type) }
+
+    it 'returns the job_id for the STT job' do
+      expect(stt.job_id).to eq("#{bare_druid}-v#{version}")
+    end
+  end
 end
diff --git a/spec/robots/dor_repo/speech_to_text/fetch_files_spec.rb b/spec/robots/dor_repo/speech_to_text/fetch_files_spec.rb
@@ -9,7 +9,7 @@
   let(:bare_druid) { 'bb222cc3333' }
   let(:robot) { described_class.new }
   let(:file_fetcher) { instance_double(Dor::TextExtraction::FileFetcher, write_file_with_retries: written) }
-  let(:stt) { instance_double(Dor::TextExtraction::SpeechToText, filenames_to_stt: ['file1.mov', 'file2.mp3']) }
+  let(:stt) { instance_double(Dor::TextExtraction::SpeechToText, job_id:, filenames_to_stt: ['file1.mov', 'file2.mp3']) }
   let(:cocina_model) { build(:dro, id: druid).new(structural: {}, type: object_type, access: { view: 'world' }) }
   let(:object_type) { 'https://cocina.sul.stanford.edu/models/media' }
   let(:dsa_object_client) do
@@ -22,17 +22,18 @@
     instance_double(Dor::Workflow::Response::Process, lane_id: 'lane1', context: { 'runSpeechToText' => true })
   end
   let(:aws_client) { instance_double(Aws::S3::Client) }
-  let(:mov_location) { instance_double(Aws::S3::Object, bucket_name: Settings.aws.speech_to_text.base_s3_bucket, key: "#{bare_druid}/file1.mov", client: aws_client) }
-  let(:mp3_location) { instance_double(Aws::S3::Object, bucket_name: Settings.aws.speech_to_text.base_s3_bucket, key: "#{bare_druid}/file2.mp3", client: aws_client) }
+  let(:mov_location) { instance_double(Aws::S3::Object, bucket_name: Settings.aws.speech_to_text.base_s3_bucket, key: "#{job_id}/file1.mov", client: aws_client) }
+  let(:mp3_location) { instance_double(Aws::S3::Object, bucket_name: Settings.aws.speech_to_text.base_s3_bucket, key: "#{job_id}/file2.mp3", client: aws_client) }
+  let(:job_id) { "#{bare_druid}-v1" }
 
   before do
     allow(Dor::Services::Client).to receive(:object).and_return(dsa_object_client)
     allow(LyberCore::WorkflowClientFactory).to receive(:build).and_return(workflow_client)
     allow(Dor::TextExtraction::FileFetcher).to receive(:new).and_return(file_fetcher)
     allow(Dor::TextExtraction::SpeechToText).to receive(:new).and_return(stt)
     allow(Aws::S3::Client).to receive(:new).and_return(aws_client)
-    allow(Aws::S3::Object).to receive(:new).with(bucket_name: Settings.aws.speech_to_text.base_s3_bucket, key: "#{bare_druid}/file1.mov", client: aws_client).and_return(mov_location)
-    allow(Aws::S3::Object).to receive(:new).with(bucket_name: Settings.aws.speech_to_text.base_s3_bucket, key: "#{bare_druid}/file2.mp3", client: aws_client).and_return(mp3_location)
+    allow(Aws::S3::Object).to receive(:new).with(bucket_name: Settings.aws.speech_to_text.base_s3_bucket, key: "#{job_id}/file1.mov", client: aws_client).and_return(mov_location)
+    allow(Aws::S3::Object).to receive(:new).with(bucket_name: Settings.aws.speech_to_text.base_s3_bucket, key: "#{job_id}/file2.mp3", client: aws_client).and_return(mp3_location)
   end
 
   context 'when fetching files is successful' do

diff --git a/spec/robots/dor_repo/speech_to_text/stt_create_spec.rb b/spec/robots/dor_repo/speech_to_text/stt_create_spec.rb
@@ -6,9 +6,10 @@
   subject(:perform) { test_perform(robot, druid) }
 
   let(:druid) { 'druid:bb222cc3333' }
+  let(:bare_druid) { 'bb222cc3333' }
   let(:robot) { described_class.new }
   let(:aws_client) { instance_double(Aws::SQS::Client) }
-  let(:stt) { instance_double(Dor::TextExtraction::SpeechToText, filenames_to_stt: ['file1.mov', 'file2.mp3']) }
+  let(:stt) { instance_double(Dor::TextExtraction::SpeechToText, job_id:, filenames_to_stt: ['file1.mov', 'file2.mp3']) }
   let(:cocina_model) { build(:dro, id: druid).new(structural: {}, type: object_type, access: { view: 'world' }) }
   let(:object_type) { 'https://cocina.sul.stanford.edu/models/media' }
   let(:dsa_object_client) do
@@ -20,18 +21,17 @@
   let(:workflow_process) do
     instance_double(Dor::Workflow::Response::Process, lane_id: 'lane1', context: { 'runSpeechToText' => true })
   end
-  let(:job_id) { '1234-5678-0000' }
+  let(:job_id) { "#{bare_druid}-v1" }
 
   before do
     allow(Aws::SQS::Client).to receive(:new).and_return(aws_client)
     allow(Dor::Services::Client).to receive(:object).and_return(dsa_object_client)
     allow(Dor::TextExtraction::SpeechToText).to receive(:new).and_return(stt)
     allow(LyberCore::WorkflowClientFactory).to receive(:build).and_return(workflow_client)
-    allow(SecureRandom).to receive(:uuid).and_return(job_id)
   end
 
   context 'when the message is sent successfully' do
-    let(:message_body) { { id: job_id, druid:, media: ['file1.mov', 'file2.mp3'], options: { model: 'large', max_line_count: 80, beam_size: 10 } }.to_json }
+    let(:message_body) { { id: job_id, options: { model: 'large', max_line_count: 80, beam_size: 10 } }.to_json }
 
     before do
       allow(aws_client).to receive(:send_message).with({ queue_url: Settings.aws.speech_to_text.sqs_todo_queue_url, message_body: }).and_return(true)