Merge pull request #223 from microbiomedata/222-write-metatranscriptome-workflows-yaml

Initial metatranscriptome workflow
microbiomedata · Aug 8, 2024 · 6b15b5f · 6b15b5f
2 parents 7bce3d5 + 9a76be9
commit 6b15b5f
Show file tree

Hide file tree

Showing 2 changed files with 388 additions and 8 deletions.
diff --git a/configs/workflows-mt.yaml b/configs/workflows-mt.yaml
@@ -0,0 +1,380 @@
+Workflows:  # list of workflow entries; an entry refers to others by Name under Predecessors
+  - Name: Sequencing Noninterleaved  # root entry (no Predecessors); filters on the Raw Read 1 / Raw Read 2 types
+    Analyte Category: Metatranscriptome
+    Enabled: true
+    Collection: omics_processing_set
+    Filter Output Objects:
+      - Metagenome Raw Read 1
+      - Metagenome Raw Read 2
+
+  - Name: Sequencing Interleaved  # root entry (no Predecessors); filters on the single "Metagenome Raw Reads" type
+    Analyte Category: Metatranscriptome
+    Enabled: true
+    Collection: omics_processing_set
+    Filter Output Objects:
+      - Metagenome Raw Reads
+
+  - Name: Metatranscriptome Reads QC  # QC entry fed by the single-object "Metagenome Raw Reads" type
+    Type: nmdc:ReadQcAnalysisActivity
+    Analyte Category: Metatranscriptome
+    Enabled: true
+    Collection: read_qc_analysis_activity_set
+    Git_repo: https://github.com/microbiomedata/metaT_ReadsQC
+    Version: v0.0.3
+    WDL: rqcfilter.wdl
+    Predecessors:
+      - Sequencing  # NOTE(review): no entry named "Sequencing" exists in this file; confirm or drop
+      - Sequencing Interleaved
+    Filter Input Objects:
+      - Metagenome Raw Reads
+    Input_prefix: nmdc_rqcfilter
+    Inputs:
+      proj: '{activity_id}'
+      input_files: do:Metagenome Raw Reads  # "do:" names a data object type; presumably resolved by the consumer (confirm)
+    Activity:
+      type: nmdc:ReadQcAnalysisActivity
+      name: 'Read QC Activity for {id}'
+      input_read_count: '{outputs.stats.input_read_count}'
+      input_read_bases: '{outputs.stats.input_read_bases}'
+      output_read_count: '{outputs.stats.output_read_count}'
+      output_read_bases: '{outputs.stats.output_read_bases}'
+    Outputs:
+      - output: filtered_final
+        data_object_type: Filtered Sequencing Reads
+        name: Reads QC result fastq (clean data)
+        description: 'Reads QC for {id}'
+      - output: filtered_stats_final
+        data_object_type: QC Statistics
+        name: Reads QC summary statistics
+        description: 'Reads QC summary for {id}'
+      - output: rqc_info
+        data_object_type: Read Filtering Info File
+        name: File containing read filtering information
+        description: 'Read filtering info for {id}'
+      - output: filtered_ribo_final
+        data_object_type: rRNA Filtered Sequencing Reads
+        name: Fastq file containing filtered ribosomal sequences
+        description: 'rRNA fastq for {id}'
+
+  - Name: Metatranscriptome Reads QC Interleave  # QC entry fed by separate Raw Read 1 / Raw Read 2 objects
+    Type: nmdc:ReadQcAnalysisActivity
+    Analyte Category: Metatranscriptome
+    Enabled: true
+    Collection: read_qc_analysis_activity_set
+    Git_repo: https://github.com/microbiomedata/metaT_ReadsQC
+    Version: v0.0.3
+    WDL: interleave_rqcfilter.wdl
+    Predecessors:
+      - Sequencing Noninterleaved
+    Filter Input Objects:
+      - Metagenome Raw Read 1
+      - Metagenome Raw Read 2
+    Input_prefix: nmdc_rqcfilter
+    Inputs:
+      input_fastq1: do:Metagenome Raw Read 1
+      input_fastq2: do:Metagenome Raw Read 2
+      proj: '{activity_id}'
+    Activity:
+      type: nmdc:ReadQcAnalysisActivity
+      name: 'Read QC Activity for {id}'
+      input_read_count: '{outputs.stats.input_read_count}'
+      input_read_bases: '{outputs.stats.input_read_bases}'
+      output_read_count: '{outputs.stats.output_read_count}'
+      output_read_bases: '{outputs.stats.output_read_bases}'
+    Outputs:
+      - output: filtered_final
+        data_object_type: Filtered Sequencing Reads
+        name: Reads QC result fastq (clean data)
+        description: 'Reads QC for {id}'
+      - output: filtered_stats_final
+        data_object_type: QC Statistics
+        name: Reads QC summary statistics
+        description: 'Reads QC summary for {id}'
+      - output: rqc_info
+        data_object_type: Read Filtering Info File
+        name: File containing read filtering information
+        description: 'Read filtering info for {id}'
+      - output: rrna_fastq_final  # placeholder until https://github.com/microbiomedata/metaT_ReadsQC/issues/5 is resolved
+        data_object_type: rRNA Filtered Sequencing Reads  # "Metatranscriptome Reads QC" names this output filtered_ribo_final
+        name: Fastq file containing filtered ribosomal sequences
+        description: 'rRNA fastq for {id}'
+
+  - Name: Metatranscriptome Assembly  # consumes the "Filtered Sequencing Reads" output of either Reads QC entry
+    Type: nmdc:MetatranscriptomeAssembly
+    Enabled: true
+    Analyte Category: Metatranscriptome
+    Git_repo: https://github.com/microbiomedata/metaT_Assembly
+    Version: v0.0.1
+    WDL: metaT_assembly.wdl
+    Collection: metatranscriptome_assembly_set  # matches Type nmdc:MetatranscriptomeAssembly, not the metagenome set
+    Predecessors:
+    - Metatranscriptome Reads QC
+    - Metatranscriptome Reads QC Interleave
+    Input_prefix: jgi_metaASM
+    Inputs:
+      input_files: do:Filtered Sequencing Reads
+      proj: "{activity_id}"
+    Activity:  # each statistic below is templated from the same-named field under outputs.stats
+      name: "Metatranscriptome Assembly Activity for {id}"
+      type: nmdc:MetatranscriptomeAssembly
+      asm_score: "{outputs.stats.asm_score}"
+      contig_bp: "{outputs.stats.contig_bp}"
+      contigs: "{outputs.stats.contigs}"
+      ctg_l50: "{outputs.stats.ctg_l50}"
+      ctg_l90: "{outputs.stats.ctg_l90}"
+      ctg_logsum: "{outputs.stats.ctg_logsum}"
+      ctg_max: "{outputs.stats.ctg_max}"
+      ctg_n50: "{outputs.stats.ctg_n50}"
+      ctg_n90: "{outputs.stats.ctg_n90}"
+      ctg_powsum: "{outputs.stats.ctg_powsum}"
+      gap_pct: "{outputs.stats.gap_pct}"
+      gc_avg: "{outputs.stats.gc_avg}"
+      gc_std: "{outputs.stats.gc_std}"
+    Outputs:
+      - output: final_contigs  # read by the annotation entry as do:Assembly Contigs
+        name: Final assembly contigs fasta
+        data_object_type: Assembly Contigs
+        description: "Assembly contigs for {id}"
+      - output: final_cov
+        name: Assembled contigs coverage information
+        data_object_type: Assembly Coverage Stats
+        description: "Coverage Stats for {id}"
+      - output: final_bam  # read by the expression-analysis entries as do:Assembly Coverage BAM
+        name: Sorted bam file of reads mapping back to the final assembly
+        data_object_type: Assembly Coverage BAM
+        description: "Sorted Bam for {id}"
+      - output: info_file
+        name: File containing assembly info
+        data_object_type: Assembly Info File
+        description: "Assembly info for {id}"
+      - output: final_bamidx
+        name: Indexed bam file
+        data_object_type: BAI File
+        description: "Alignment index file for {id}"
+
+  - Name: Metatranscriptome Annotation  # annotates the contigs emitted by Metatranscriptome Assembly
+    Type: nmdc:MetatranscriptomeAnnotationActivity
+    Analyte Category: Metatranscriptome
+    Enabled: true
+    Collection: metatranscriptome_annotation_set
+    Git_repo: https://github.com/microbiomedata/mg_annotation
+    Version: v1.1.0
+    WDL: annotation_full.wdl
+    Predecessors:
+      - Metatranscriptome Assembly
+    Input_prefix: annotation
+    Inputs:
+      proj: '{activity_id}'
+      imgap_project_id: 'scaffold'
+      input_file: do:Assembly Contigs
+    Activity:
+      type: nmdc:MetatranscriptomeAnnotationActivity
+      name: 'Metatranscriptome Annotation Analysis Activity for {id}'
+    Outputs:
+      - output: proteins_faa
+        name: FASTA amino acid file for annotated proteins
+        data_object_type: Annotation Amino Acid FASTA
+        description: 'FASTA Amino Acid File for {id}'
+      - output: structural_gff
+        name: GFF3 format file with structural annotations
+        data_object_type: Structural Annotation GFF
+        description: 'Structural Annotation for {id}'
+      - output: functional_gff  # read by the expression-analysis entries as do:Functional Annotation GFF
+        name: GFF3 format file with functional annotations
+        data_object_type: Functional Annotation GFF
+        description: 'Functional Annotation for {id}'
+      - output: ko_tsv
+        name: Tab delimited file for KO annotation
+        data_object_type: Annotation KEGG Orthology
+        description: 'KEGG Orthology for {id}'
+      - output: ec_tsv
+        name: Tab delimited file for EC annotation
+        data_object_type: Annotation Enzyme Commission
+        description: 'EC Annotations for {id}'
+        suffix: _ec.tsv
+      - output: lineage_tsv
+        name: Phylogeny at the scaffold level
+        data_object_type: Scaffold Lineage tsv
+        description: 'Scaffold Lineage tsv for {id}'
+        suffix: _scaffold_lineage.tsv
+      - output: cog_gff
+        name: GFF3 format file with COGs
+        data_object_type: Clusters of Orthologous Groups (COG) Annotation GFF
+        description: 'COGs for {id}'
+      - output: pfam_gff
+        name: GFF3 format file with Pfam
+        data_object_type: Pfam Annotation GFF
+        description: 'Pfam Annotation for {id}'
+      - output: tigrfam_gff
+        name: GFF3 format file with TIGRfam
+        data_object_type: TIGRFam Annotation GFF
+        description: 'TIGRFam for {id}'
+      - output: smart_gff
+        name: GFF3 format file with SMART
+        data_object_type: SMART Annotation GFF
+        description: 'SMART Annotations for {id}'
+      - output: supfam_gff
+        name: GFF3 format file with SUPERFam
+        data_object_type: SUPERFam Annotation GFF
+        description: 'SUPERFam Annotations for {id}'
+      - output: cath_funfam_gff
+        name: GFF3 format file with CATH FunFams
+        data_object_type: CATH FunFams (Functional Families) Annotation GFF
+        description: 'CATH FunFams for {id}'
+      - output: crt_gff
+        name: GFF3 format file with CRT
+        data_object_type: CRT Annotation GFF
+        description: 'CRT Annotations for {id}'
+      - output: genemark_gff
+        name: GFF3 format file with Genemark
+        data_object_type: Genemark Annotation GFF
+        description: 'Genemark Annotations for {id}'
+      - output: prodigal_gff
+        name: GFF3 format file with Prodigal
+        data_object_type: Prodigal Annotation GFF
+        description: 'Prodigal Annotations {id}'  # NOTE(review): siblings say "... for {id}"; confirm wording
+      - output: trna_gff
+        name: GFF3 format file with TRNA
+        data_object_type: TRNA Annotation GFF
+        description: 'TRNA Annotations {id}'  # NOTE(review): siblings say "... for {id}"; confirm wording
+      - output: final_rfam_gff
+        name: GFF3 format file with RFAM
+        data_object_type: RFAM Annotation GFF
+        description: 'RFAM Annotations for {id}'
+      - output: ko_ec_gff
+        name: GFF3 format file with KO_EC
+        data_object_type: KO_EC Annotation GFF
+        description: 'KO_EC Annotations for {id}'
+      - output: product_names_tsv
+        name: Product names file
+        data_object_type: Product Names
+        description: 'Product names for {id}'
+      - output: gene_phylogeny_tsv
+        name: Gene Phylogeny file
+        data_object_type: Gene Phylogeny tsv
+        description: 'Gene Phylogeny for {id}'
+      - output: crt_crisprs
+        name: Crispr Terms
+        data_object_type: Crispr Terms
+        description: 'Crispr Terms for {id}'
+      - output: stats_tsv
+        name: Annotation statistics report
+        data_object_type: Annotation Statistics
+        description: 'Annotation Stats for {id}'
+      - output: map_file  # read by the expression-analysis entries as do:Contig Mapping File
+        name: Contig mappings between contigs and scaffolds
+        data_object_type: Contig Mapping File
+        description: 'Contig mappings file for {id}'
+      - output: imgap_version
+        name: File containing annotation info
+        data_object_type: Annotation Info File
+        description: 'Annotation info for {id}'
+      - output: renamed_fasta
+        name: File containing contigs with annotation headers
+        data_object_type: Assembly Contigs  # NOTE(review): same type as the assembly's final_contigs; confirm lookups stay unambiguous
+        description: 'Assembly contigs (remapped) for {id}'
+
+
+  - Name: Expression Analysis Antisense  # one of three readcount.wdl entries; they differ only in rna_type
+    Type: nmdc:MetatranscriptomeExpressionAnalysis
+    Analyte Category: Metatranscriptome
+    Enabled: true
+    Collection: metatranscriptome_expression_analysis_set
+    Git_repo: https://github.com/microbiomedata/metaT_ReadCounts
+    Version: v0.0.1
+    WDL: readcount.wdl
+    Predecessors:
+      - Metatranscriptome Annotation
+    Input_prefix: nmdc_expression
+    Inputs:
+      rna_type: 'aRNA'
+      proj: '{activity_id}'
+      gff_file: do:Functional Annotation GFF
+      map: do:Contig Mapping File
+      bam: do:Assembly Coverage BAM  # this type is an output of Metatranscriptome Assembly, not of the direct predecessor
+    Activity:
+      type: nmdc:MetatranscriptomeExpressionAnalysis
+      name: 'Metatranscriptome Expression Analysis for {id}'
+    Outputs:
+      - output: count_table
+        name: Metatranscriptome expression table
+        data_object_type: Metatranscriptome Expression
+        description: 'Expression counts for {id}'
+      - output: count_ig
+        name: Metatranscriptome expression intergenic table
+        data_object_type: Metatranscriptome Expression Intergenic
+        description: 'Metatranscriptome expression intergenic regions for {id}'
+        optional: true
+      - output: info_file
+        name: Metatranscriptome Expression Info File
+        data_object_type: Metatranscriptome Expression Info File
+        description: 'Expression info for {id}'
+
+  - Name: Expression Analysis Sense  # one of three readcount.wdl entries sharing the same predecessor and outputs
+    Type: nmdc:MetatranscriptomeExpressionAnalysis
+    Analyte Category: Metatranscriptome
+    Enabled: true
+    Collection: metatranscriptome_expression_analysis_set
+    Git_repo: https://github.com/microbiomedata/metaT_ReadCounts
+    Version: v0.0.1
+    WDL: readcount.wdl
+    Predecessors:
+      - Metatranscriptome Annotation
+    Input_prefix: nmdc_expression
+    Inputs:  # NOTE(review): no rna_type here, unlike the Antisense/Nonstranded entries; confirm the WDL default means sense
+      proj: '{activity_id}'
+      gff_file: do:Functional Annotation GFF
+      map: do:Contig Mapping File
+      bam: do:Assembly Coverage BAM  # this type is an output of Metatranscriptome Assembly, not of the direct predecessor
+    Activity:
+      type: nmdc:MetatranscriptomeExpressionAnalysis
+      name: 'Metatranscriptome Expression Analysis for {id}'
+    Outputs:
+      - output: count_table
+        name: Metatranscriptome expression table
+        data_object_type: Metatranscriptome Expression
+        description: 'Expression counts for {id}'
+      - output: count_ig
+        name: Metatranscriptome expression intergenic table
+        data_object_type: Metatranscriptome Expression Intergenic
+        description: 'Metatranscriptome expression intergenic regions for {id}'
+        optional: true
+      - output: info_file
+        name: Metatranscriptome Expression Info File
+        data_object_type: Metatranscriptome Expression Info File
+        description: 'Expression info for {id}'
+
+  - Name: Expression Analysis Nonstranded  # one of three readcount.wdl entries; they differ only in rna_type
+    Type: nmdc:MetatranscriptomeExpressionAnalysis
+    Analyte Category: Metatranscriptome
+    Enabled: true
+    Collection: metatranscriptome_expression_analysis_set
+    Git_repo: https://github.com/microbiomedata/metaT_ReadCounts
+    Version: v0.0.1
+    WDL: readcount.wdl
+    Predecessors:
+      - Metatranscriptome Annotation
+    Input_prefix: nmdc_expression
+    Inputs:
+      rna_type: 'non_stranded_RNA'
+      proj: '{activity_id}'
+      gff_file: do:Functional Annotation GFF
+      map: do:Contig Mapping File
+      bam: do:Assembly Coverage BAM  # this type is an output of Metatranscriptome Assembly, not of the direct predecessor
+    Activity:
+      type: nmdc:MetatranscriptomeExpressionAnalysis
+      name: 'Metatranscriptome Expression Analysis for {id}'
+    Outputs:
+      - output: count_table
+        name: Metatranscriptome expression table
+        data_object_type: Metatranscriptome Expression
+        description: 'Expression counts for {id}'
+      - output: count_ig
+        name: Metatranscriptome expression intergenic table
+        data_object_type: Metatranscriptome Expression Intergenic
+        description: 'Metatranscriptome expression intergenic regions for {id}'
+        optional: true
+      - output: info_file
+        name: Metatranscriptome Expression Info File
+        data_object_type: Metatranscriptome Expression Info File
+        description: 'Expression info for {id}'