Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates to scheduler #22

Merged
merged 6 commits into from
Jan 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
FROM python:3.9

ADD requirements.txt /tmp/requirements.txt
RUN \
pip install poetry && \
poetry config virtualenvs.create false

RUN pip install -r /tmp/requirements.txt
ADD pyproject.toml poetry.lock README.md /src/
WORKDIR /src
RUN \
poetry install --only=main --no-root

RUN pip install semver

ADD . /src

WORKDIR /src
55 changes: 4 additions & 51 deletions configs/workflows.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,14 @@ Workflows:
Outputs:
- output: filtered_final
name: Reads QC result fastq (clean data)
suffix: "_filtered.fastq.gz"
data_object_type: Filtered Sequencing Reads
description: "Reads QC for {id}"
- output: filtered_stats_final
name: Reads QC summary statistics
suffix: "_filterStats.txt"
data_object_type: QC Statistics
description: "Reads QC summary for {id}"
- output: rqc_info
name: File containing read filtering information
suffix: "_readsQC.info"
data_object_type: Read Filtering Info File
description: "Read filtering info for {id}"

Expand Down Expand Up @@ -85,17 +82,14 @@ Workflows:
Outputs:
- output: filtered_final
name: Reads QC result fastq (clean data)
suffix: "_filtered.fastq.gz"
data_object_type: Filtered Sequencing Reads
description: "Reads QC for {id}"
- output: filtered_stats_final
name: Reads QC summary statistics
suffix: "_filterStats.txt"
data_object_type: QC Statistics
description: "Reads QC summary for {id}"
- output: rqc_info
name: File containing read filtering information
suffix: "_readsQC.info"
data_object_type: Read Filtering Info File
description: "Read filtering info for {id}"

Expand Down Expand Up @@ -145,32 +139,26 @@ Workflows:
Outputs:
- output: contig
name: Final assembly contigs fasta
suffix: "_contigs.fna"
data_object_type: Assembly Contigs
description: "Assembly contigs for {id}"
- output: scaffold
name: Final assembly scaffolds fasta
suffix: "_scaffolds.fna"
data_object_type: Assembly Scaffolds
description: "Assembly scaffolds for {id}"
- output: covstats
name: Assembled contigs coverage information
suffix: "_covstats.txt"
data_object_type: Assembly Coverage Stats
description: "Coverage Stats for {id}"
- output: agp
name: An AGP format file that describes the assembly
suffix: "_assembly.agp"
data_object_type: Assembly AGP
description: "AGP for {id}"
- output: bam
name: Sorted bam file of reads mapping back to the final assembly
suffix: "_pairedMapped_sorted.bam"
data_object_type: Assembly Coverage BAM
description: "Sorted Bam for {id}"
- output: asminfo
name: File containing assembly info
suffix: "_metaAsm.info"
data_object_type: Assembly Info File
description: "Assembly info for {id}"

Expand All @@ -196,22 +184,18 @@ Workflows:
data_object_type: Annotation Amino Acid FASTA
description: FASTA Amino Acid File for {id}
name: FASTA amino acid file for annotated proteins
suffix: _proteins.faa
- output: structural_gff
data_object_type: Structural Annotation GFF
description: Structural Annotation for {id}
name: GFF3 format file with structural annotations
suffix: _structural_annotation.gff
- output: functional_gff
data_object_type: Functional Annotation GFF
description: Functional Annotation for {id}
name: GFF3 format file with functional annotations
suffix: _functional_annotation.gff
- output: ko_tsv
data_object_type: Annotation KEGG Orthology
description: KEGG Orthology for {id}
name: Tab delimited file for KO annotation
suffix: _ko.tsv
- output: ec_tsv
data_object_type: Annotation Enzyme Commission
description: EC Annotations for {id}
Expand All @@ -226,92 +210,74 @@ Workflows:
data_object_type: Clusters of Orthologous Groups (COG) Annotation GFF
description: COGs for {id}
name: GFF3 format file with COGs
suffix: _cog.gff
- output: pfam_gff
data_object_type: Pfam Annotation GFF
description: Pfam Annotation for {id}
name: GFF3 format file with Pfam
suffix: _pfam.gff
- output: tigrfam_gff
data_object_type: TIGRFam Annotation GFF
description: TIGRFam for {id}
name: GFF3 format file with TIGRfam
suffix: _tigrfam.gff
- output: smart_gff
data_object_type: SMART Annotation GFF
description: SMART Annotations for {id}
name: GFF3 format file with SMART
suffix: _smart.gff
- output: supfam_gff
data_object_type: SUPERFam Annotation GFF
description: SUPERFam Annotations for {id}
name: GFF3 format file with SUPERFam
suffix: _supfam.gff
- output: cath_funfam_gff
data_object_type: CATH FunFams (Functional Families) Annotation GFF
description: CATH FunFams for {id}
name: GFF3 format file with CATH FunFams
suffix: _cath_funfam.gff
- output: crt_gff
data_object_type: CRT Annotation GFF
description: CRT Annotations for {id}
name: GFF3 format file with CRT
suffix: _crt.gff
- output: genemark_gff
data_object_type: Genemark Annotation GFF
description: Genemark Annotations for {id}
name: GFF3 format file with Genemark
suffix: _genemark.gff
- output: prodigal_gff
data_object_type: Prodigal Annotation GFF
description: Prodigal Annotations {id}
name: GFF3 format file with Prodigal
suffix: _prodigal.gff
- output: trna_gff
data_object_type: TRNA Annotation GFF
description: TRNA Annotations {id}
name: GFF3 format file with TRNA
suffix: _trna.gff
- output: final_rfam_gff
data_object_type: RFAM Annotation GFF
description: RFAM Annotations for {id}
name: GFF3 format file with RFAM
suffix: _rfam.gff
- output: ko_ec_gff
data_object_type: KO_EC Annotation GFF
description: KO_EC Annotations for {id}
name: GFF3 format file with KO_EC
suffix: _ko_ec.gff
- output: product_names_tsv
data_object_type: Product Names
description: Product names for {id}
name: Product names file
suffix: _product_names.tsv
- output: gene_phylogeny_tsv
data_object_type: Gene Phylogeny tsv
description: Gene Phylogeny for {id}
name: Gene Phylogeny file
suffix: _gene_phylogeny.tsv
- output: crt_crisprs
data_object_type: Crispr Terms
description: Crispr Terms for {id}
name: Crispr Terms
suffix: _crt.crisprs
- output: stats_tsv
data_object_type: Annotation Statistics
description: Annotation Stats for {id}
name: Annotation statistics report
suffix: _stats.tsv
# - output: contig_mapping
# data_object_type: Contig Mapping File
# description: Contig mappings file for {id}
# name: Contig mappings between contigs and scaffolds
# suffix: _contig_names_mapping.tsv
- output: contig_mapping
data_object_type: Contig Mapping File
description: Contig mappings file for {id}
name: Contig mappings between contigs and scaffolds
- output: imgap_version
data_object_type: Annotation Info File
description: Annotation info for {id}
name: File containing annotation info
suffix: _imgap.info

- Name: MAGs
Type: nmdc:MagsAnalysisActivity
Expand Down Expand Up @@ -350,17 +316,14 @@ Workflows:
data_object_type: CheckM Statistics
description: CheckM for {id}
name: CheckM statistics report
suffix: _checkm_qa.out
- output: final_hqmq_bins_zip
data_object_type: Metagenome Bins
description: Metagenome Bins for {id}
name: Metagenome bin tarfiles archive
suffix: _hqmq_bin.zip
- output: final_gtdbtk_bac_summary
data_object_type: GTDBTK Bacterial Summary
description: Bacterial Summary for {id}
name: GTDBTK bacterial summary
suffix: _gtdbtk.bac122.summary.tsv
- output: final_gtdbtk_ar_summary
data_object_type: GTDBTK Archaeal Summary
description: Archaeal Summary for {id}
Expand All @@ -370,7 +333,6 @@ Workflows:
data_object_type: Metagenome Bins Info File
description: Metagenome Bins Info File for {id}
name: Metagenome Bins Info File
suffix: _bin.info

- Name: Readbased Analysis
Type: nmdc:ReadBasedTaxonomyAnalysisActivity
Expand All @@ -394,47 +356,38 @@ Workflows:
data_object_type: GOTTCHA2 Classification Report
description: GOTTCHA2 Classification for {id}
name: GOTTCHA2 classification report file
suffix: _gottcha2_report.tsv
- output: final_gottcha2_full_tsv
data_object_type: GOTTCHA2 Report Full
description: GOTTCHA2 Full Report for {id}
name: GOTTCHA2 report file
suffix: _gottcha2_full_tsv
- output: final_gottcha2_krona_html
data_object_type: GOTTCHA2 Krona Plot
description: GOTTCHA2 Krona for {id}
name: GOTTCHA2 krona plot HTML file
suffix: _gottcha2_krona.html
- output: final_centrifuge_classification_tsv
data_object_type: Centrifuge Taxonomic Classification
description: Centrifuge Classification for {id}
name: Centrifuge output read classification file
suffix: _centrifuge_classification.tsv
- output: final_centrifuge_report_tsv
data_object_type: Centrifuge output report file
description: Centrifuge Report for {id}
name: Centrifuge Classification Report
suffix: _centrifuge_report.tsv
- output: final_centrifuge_krona_html
data_object_type: Centrifuge Krona Plot
description: Centrifuge Krona for {id}
name: Centrifuge krona plot HTML file
suffix: _centrifuge_krona.html
- output: final_kraken2_classification_tsv
data_object_type: Kraken2 Taxonomic Classification
description: Kraken2 Classification for {id}
name: Kraken2 output read classification file
suffix: _kraken2_classification.tsv
- output: final_kraken2_report_tsv
data_object_type: Kraken2 Classification Report
description: Kraken2 Report for {id}
name: Kraken2 output report file
suffix: _kraken2_report.tsv
- output: final_kraken2_krona_html
data_object_type: Kraken2 Krona Plot
description: Kraken2 Krona for {id}
name: Kraken2 Krona plot HTML file
suffix: _kraken2_krona.html
- output: info_file
data_object_type: Read Based Analysis Info File
description: Read based analysis info for {id}
Expand Down
34 changes: 21 additions & 13 deletions nmdc_automation/workflow_automation/activities.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,31 +24,43 @@ def _load_data_objects(db, workflows: List[Workflow]):


def _check(match_types, data_object_ids, data_objs):
if not data_object_ids:
return False
if not match_types or len(match_types) == 0:
return True
match_set = set(match_types)
do_types = set()
for doid in data_object_ids:
do_types.add(data_objs[doid].data_object_type)
if doid in data_objs:
do_types.add(data_objs[doid].data_object_type)
return match_set.issubset(do_types)


def _filter_skip(wf, rec, data_objs):
    """
    Decide whether a record should be skipped for a workflow.

    A record is kept only when its inputs satisfy the workflow's
    input-object type filter AND its outputs satisfy the output-object
    type filter (see _check).

    :param wf: workflow exposing ``filter_input_objects`` and
        ``filter_output_objects``.
    :param rec: record dict; ``has_input``/``has_output`` may be absent.
    :param data_objs: mapping of data object id -> data object.
    :return: True if the record should be skipped, False to keep it.
    """
    # Use .get() so records missing has_input/has_output don't raise
    # KeyError; _check treats a missing list as "no match".
    match_in = _check(wf.filter_input_objects,
                      rec.get("has_input"),
                      data_objs)
    match_out = _check(wf.filter_output_objects,
                       rec.get("has_output"),
                       data_objs)
    return not (match_in and match_out)


def _read_acitivites(db, workflows: List[Workflow], data_objects: dict, filter: dict):
def _read_acitivites(db, workflows: List[Workflow],
data_objects: dict, filter: dict):
"""
Read in all the activities for the defined workflows.
"""
activities = []
for wf in workflows:
logging.debug(f"Checking {wf.name}:{wf.version}")
q = filter
q["git_url"] = wf.git_repo
q["version"] = wf.version
for rec in db[wf.collection].find(q):
if wf.collection == "omics_processing_set" and \
rec["id"].startswith("gold"):
continue
if _filter_skip(wf, rec, data_objects):
continue
act = Activity(rec, wf)
Expand Down Expand Up @@ -86,10 +98,8 @@ def _resolve_relationships(activities, data_obj_act):
# Let's make sure these came from the same source
# This is just a safeguard
if act.was_informed_by != parent_act.was_informed_by:
logging.warning(
"Mismatched informed by found for"
f"{do_id} in {act.id} ({act.name})"
)
logging.warning("Mismatched informed by for "
f"{do_id} in {act.id}")
continue
# We only want to use it as a parent if it is the right
# parent workflow. Some inputs may come from ancestors
Expand All @@ -98,13 +108,11 @@ def _resolve_relationships(activities, data_obj_act):
# This is the one
act.parent = parent_act
parent_act.children.append(act)
logging.debug(f"Found parent: {parent_act.id} {parent_act.name}")
logging.debug(f"Found parent: {parent_act.id}"
f" {parent_act.name}")
break
if len(act.workflow.parents) > 0 and not act.parent:
logging.warning(
"Didn't find a parent for "
f"{act.id} ({act.name}) - {act.workflow.name}"
)
logging.warning(f"Didn't find a parent for {act.id}")
# Now all the activities have their parent
return activities

Expand Down
Loading