Skip to content

Commit

Permalink
Merge pull request #223 from microbiomedata/222-write-metatranscripto…
Browse files Browse the repository at this point in the history
…me-workflows-yaml

Initial metatranscriptome workflow
  • Loading branch information
aclum authored Aug 8, 2024
2 parents 7bce3d5 + 9a76be9 commit 6b15b5f
Show file tree
Hide file tree
Showing 2 changed files with 388 additions and 8 deletions.
380 changes: 380 additions & 0 deletions configs/workflows-mt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,380 @@
# Workflow definitions; every entry below is tagged with the
# Metatranscriptome analyte category.
Workflows:
# Source entry for runs whose raw reads come as two separate objects
# (Read 1 / Read 2). It declares no Git_repo or WDL, only a collection
# and the data object types listed under "Filter Output Objects".
- Name: Sequencing Noninterleaved
  Collection: omics_processing_set
  Enabled: true
  Analyte Category: Metatranscriptome
  # NOTE(review): presumably restricts which data objects are exposed to
  # successor workflows - confirm against the consumer of this file.
  Filter Output Objects:
  - Metagenome Raw Read 1
  - Metagenome Raw Read 2

# Source entry for runs whose raw reads come as one "Metagenome Raw Reads"
# object. Like the noninterleaved entry it declares no Git_repo or WDL.
- Name: Sequencing Interleaved
  Collection: omics_processing_set
  Enabled: true
  Analyte Category: Metatranscriptome
  # NOTE(review): presumably restricts which data objects are exposed to
  # successor workflows - confirm against the consumer of this file.
  Filter Output Objects:
  - Metagenome Raw Reads

# Read QC for runs that provide a single "Metagenome Raw Reads" object.
# Runs rqcfilter.wdl from the metaT_ReadsQC repository at the pinned version.
- Name: Metatranscriptome Reads QC
  Type: nmdc:ReadQcAnalysisActivity
  Enabled: true
  Analyte Category: Metatranscriptome
  Git_repo: https://github.com/microbiomedata/metaT_ReadsQC
  Version: v0.0.3
  WDL: rqcfilter.wdl
  Collection: read_qc_analysis_activity_set
  Filter Input Objects:
  - Metagenome Raw Reads
  # "Sequencing Interleaved" is the entry in this file whose
  # "Filter Output Objects" list Metagenome Raw Reads; each name given here
  # has to match the Name of an entry defined in this file.
  Predecessors:
  - Sequencing Interleaved
  Input_prefix: nmdc_rqcfilter
  Inputs:
    # NOTE(review): "do:<type>" presumably resolves to the data object of
    # that type supplied by a predecessor - confirm.
    input_files: do:Metagenome Raw Reads
    proj: '{activity_id}'
  Activity:
    name: 'Read QC Activity for {id}'
    # NOTE(review): the {outputs.stats.*} placeholders are presumably filled
    # from the workflow's stats output - confirm.
    input_read_bases: '{outputs.stats.input_read_bases}'
    input_read_count: '{outputs.stats.input_read_count}'
    output_read_bases: '{outputs.stats.output_read_bases}'
    output_read_count: '{outputs.stats.output_read_count}'
    type: nmdc:ReadQcAnalysisActivity
  # One item per workflow output that is registered as a data object.
  Outputs:
  - output: filtered_final
    name: Reads QC result fastq (clean data)
    data_object_type: Filtered Sequencing Reads
    description: 'Reads QC for {id}'
  - output: filtered_stats_final
    name: Reads QC summary statistics
    data_object_type: QC Statistics
    description: 'Reads QC summary for {id}'
  - output: rqc_info
    name: File containing read filtering information
    data_object_type: Read Filtering Info File
    description: 'Read filtering info for {id}'
  - output: filtered_ribo_final
    name: Fastq file containing filtered ribosomal sequences
    data_object_type: rRNA Filtered Sequencing Reads
    description: 'rRNA fastq for {id}'

# Read QC for runs that provide separate Read 1 / Read 2 objects.
# Runs interleave_rqcfilter.wdl from the metaT_ReadsQC repository and
# declares the same Type, Collection and Input_prefix as the
# "Metatranscriptome Reads QC" entry.
- Name: Metatranscriptome Reads QC Interleave
  Type: nmdc:ReadQcAnalysisActivity
  Enabled: true
  Analyte Category: Metatranscriptome
  Git_repo: https://github.com/microbiomedata/metaT_ReadsQC
  Version: v0.0.3
  WDL: interleave_rqcfilter.wdl
  Collection: read_qc_analysis_activity_set
  Filter Input Objects:
  - Metagenome Raw Read 1
  - Metagenome Raw Read 2
  Predecessors:
  - Sequencing Noninterleaved
  Input_prefix: nmdc_rqcfilter
  Inputs:
    # NOTE(review): "do:<type>" presumably resolves to the data object of
    # that type supplied by a predecessor - confirm.
    input_fastq1: do:Metagenome Raw Read 1
    input_fastq2: do:Metagenome Raw Read 2
    proj: '{activity_id}'
  Activity:
    name: 'Read QC Activity for {id}'
    type: nmdc:ReadQcAnalysisActivity
    input_read_bases: '{outputs.stats.input_read_bases}'
    input_read_count: '{outputs.stats.input_read_count}'
    output_read_bases: '{outputs.stats.output_read_bases}'
    output_read_count: '{outputs.stats.output_read_count}'
  # One item per workflow output that is registered as a data object.
  Outputs:
  - output: filtered_final
    name: Reads QC result fastq (clean data)
    data_object_type: Filtered Sequencing Reads
    description: 'Reads QC for {id}'
  - output: filtered_stats_final
    name: Reads QC summary statistics
    data_object_type: QC Statistics
    description: 'Reads QC summary for {id}'
  - output: rqc_info
    name: File containing read filtering information
    data_object_type: Read Filtering Info File
    description: 'Read filtering info for {id}'
  # Placeholder output name until
  # https://github.com/microbiomedata/metaT_ReadsQC/issues/5 is resolved.
  - output: rrna_fastq_final
    name: Fastq file containing filtered ribosomal sequences
    data_object_type: rRNA Filtered Sequencing Reads
    description: 'rRNA fastq for {id}'

# Assembly of the QC-filtered reads. Runs metaT_assembly.wdl from the
# metaT_Assembly repository and follows either Reads QC entry.
- Name: Metatranscriptome Assembly
  Type: nmdc:MetatranscriptomeAssembly
  Enabled: true
  Analyte Category: Metatranscriptome
  Git_repo: https://github.com/microbiomedata/metaT_Assembly
  Version: v0.0.1
  WDL: metaT_assembly.wdl
  # Metatranscriptome-specific collection, in line with the
  # metatranscriptome_* collections used by the annotation and expression
  # entries in this file and with the Type declared above.
  Collection: metatranscriptome_assembly_set
  Predecessors:
  - Metatranscriptome Reads QC
  - Metatranscriptome Reads QC Interleave
  Input_prefix: jgi_metaASM
  Inputs:
    # Both Reads QC entries emit a "Filtered Sequencing Reads" object.
    input_files: do:Filtered Sequencing Reads
    proj: '{activity_id}'
  Activity:
    name: 'Metatranscriptome Assembly Activity for {id}'
    type: nmdc:MetatranscriptomeAssembly
    # NOTE(review): the {outputs.stats.*} placeholders are presumably filled
    # from the workflow's stats output - confirm.
    asm_score: '{outputs.stats.asm_score}'
    contig_bp: '{outputs.stats.contig_bp}'
    contigs: '{outputs.stats.contigs}'
    ctg_l50: '{outputs.stats.ctg_l50}'
    ctg_l90: '{outputs.stats.ctg_l90}'
    ctg_logsum: '{outputs.stats.ctg_logsum}'
    ctg_max: '{outputs.stats.ctg_max}'
    ctg_n50: '{outputs.stats.ctg_n50}'
    ctg_n90: '{outputs.stats.ctg_n90}'
    ctg_powsum: '{outputs.stats.ctg_powsum}'
    gap_pct: '{outputs.stats.gap_pct}'
    gc_avg: '{outputs.stats.gc_avg}'
    gc_std: '{outputs.stats.gc_std}'
  # One item per workflow output that is registered as a data object.
  Outputs:
  - output: final_contigs
    name: Final assembly contigs fasta
    data_object_type: Assembly Contigs
    description: 'Assembly contigs for {id}'
  - output: final_cov
    name: Assembled contigs coverage information
    data_object_type: Assembly Coverage Stats
    description: 'Coverage Stats for {id}'
  - output: final_bam
    name: Sorted bam file of reads mapping back to the final assembly
    data_object_type: Assembly Coverage BAM
    description: 'Sorted Bam for {id}'
  - output: info_file
    name: File containing assembly info
    data_object_type: Assembly Info File
    description: 'Assembly info for {id}'
  - output: final_bamidx
    name: Indexed bam file
    data_object_type: BAI File
    description: 'Alignment index file for {id}'

# Annotation of the assembled contigs. Runs annotation_full.wdl from the
# mg_annotation repository and follows the Metatranscriptome Assembly entry.
- Name: Metatranscriptome Annotation
  Type: nmdc:MetatranscriptomeAnnotationActivity
  Enabled: true
  Analyte Category: Metatranscriptome
  Git_repo: https://github.com/microbiomedata/mg_annotation
  Version: v1.1.0
  WDL: annotation_full.wdl
  Collection: metatranscriptome_annotation_set
  Predecessors:
  - Metatranscriptome Assembly
  Input_prefix: annotation
  Inputs:
    # "Assembly Contigs" is the type of the assembly's final_contigs output.
    input_file: do:Assembly Contigs
    imgap_project_id: 'scaffold'
    proj: '{activity_id}'
  Activity:
    name: 'Metatranscriptome Annotation Analysis Activity for {id}'
    type: nmdc:MetatranscriptomeAnnotationActivity
  # One item per workflow output that is registered as a data object.
  # Every description follows the "<label> for {id}" template.
  Outputs:
  - output: proteins_faa
    data_object_type: Annotation Amino Acid FASTA
    description: FASTA Amino Acid File for {id}
    name: FASTA amino acid file for annotated proteins
  - output: structural_gff
    data_object_type: Structural Annotation GFF
    description: Structural Annotation for {id}
    name: GFF3 format file with structural annotations
  - output: functional_gff
    data_object_type: Functional Annotation GFF
    description: Functional Annotation for {id}
    name: GFF3 format file with functional annotations
  - output: ko_tsv
    data_object_type: Annotation KEGG Orthology
    description: KEGG Orthology for {id}
    name: Tab delimited file for KO annotation
  # ec_tsv and lineage_tsv are the only outputs that set an explicit suffix.
  - output: ec_tsv
    data_object_type: Annotation Enzyme Commission
    description: EC Annotations for {id}
    name: Tab delimited file for EC annotation
    suffix: _ec.tsv
  - output: lineage_tsv
    data_object_type: Scaffold Lineage tsv
    description: Scaffold Lineage tsv for {id}
    name: Phylogeny at the scaffold level
    suffix: _scaffold_lineage.tsv
  - output: cog_gff
    data_object_type: Clusters of Orthologous Groups (COG) Annotation GFF
    description: COGs for {id}
    name: GFF3 format file with COGs
  - output: pfam_gff
    data_object_type: Pfam Annotation GFF
    description: Pfam Annotation for {id}
    name: GFF3 format file with Pfam
  - output: tigrfam_gff
    data_object_type: TIGRFam Annotation GFF
    description: TIGRFam for {id}
    name: GFF3 format file with TIGRfam
  - output: smart_gff
    data_object_type: SMART Annotation GFF
    description: SMART Annotations for {id}
    name: GFF3 format file with SMART
  - output: supfam_gff
    data_object_type: SUPERFam Annotation GFF
    description: SUPERFam Annotations for {id}
    name: GFF3 format file with SUPERFam
  - output: cath_funfam_gff
    data_object_type: CATH FunFams (Functional Families) Annotation GFF
    description: CATH FunFams for {id}
    name: GFF3 format file with CATH FunFams
  - output: crt_gff
    data_object_type: CRT Annotation GFF
    description: CRT Annotations for {id}
    name: GFF3 format file with CRT
  - output: genemark_gff
    data_object_type: Genemark Annotation GFF
    description: Genemark Annotations for {id}
    name: GFF3 format file with Genemark
  - output: prodigal_gff
    data_object_type: Prodigal Annotation GFF
    description: Prodigal Annotations for {id}
    name: GFF3 format file with Prodigal
  - output: trna_gff
    data_object_type: TRNA Annotation GFF
    description: TRNA Annotations for {id}
    name: GFF3 format file with TRNA
  - output: final_rfam_gff
    data_object_type: RFAM Annotation GFF
    description: RFAM Annotations for {id}
    name: GFF3 format file with RFAM
  - output: ko_ec_gff
    data_object_type: KO_EC Annotation GFF
    description: KO_EC Annotations for {id}
    name: GFF3 format file with KO_EC
  - output: product_names_tsv
    data_object_type: Product Names
    description: Product names for {id}
    name: Product names file
  - output: gene_phylogeny_tsv
    data_object_type: Gene Phylogeny tsv
    description: Gene Phylogeny for {id}
    name: Gene Phylogeny file
  - output: crt_crisprs
    data_object_type: Crispr Terms
    description: Crispr Terms for {id}
    name: Crispr Terms
  - output: stats_tsv
    data_object_type: Annotation Statistics
    description: Annotation Stats for {id}
    name: Annotation statistics report
  - output: map_file
    data_object_type: Contig Mapping File
    description: Contig mappings file for {id}
    name: Contig mappings between contigs and scaffolds
  - output: imgap_version
    data_object_type: Annotation Info File
    description: Annotation info for {id}
    name: File containing annotation info
  # Shares the "Assembly Contigs" type with the assembly's final_contigs.
  - output: renamed_fasta
    data_object_type: Assembly Contigs
    description: Assembly contigs (remapped) for {id}
    name: File containing contigs with annotation headers


# Read-count / expression workflow run with rna_type "aRNA".
# Runs readcount.wdl from the metaT_ReadCounts repository and follows the
# Metatranscriptome Annotation entry.
- Name: Expression Analysis Antisense
  Type: nmdc:MetatranscriptomeExpressionAnalysis
  Enabled: true
  Analyte Category: Metatranscriptome
  Git_repo: https://github.com/microbiomedata/metaT_ReadCounts
  Version: v0.0.1
  WDL: readcount.wdl
  Collection: metatranscriptome_expression_analysis_set
  Predecessors:
  - Metatranscriptome Annotation
  Input_prefix: nmdc_expression
  Inputs:
    # The GFF and mapping file types are declared by the annotation entry's
    # outputs; the BAM type by the assembly entry's final_bam output.
    gff_file: do:Functional Annotation GFF
    map: do:Contig Mapping File
    bam: do:Assembly Coverage BAM
    rna_type: 'aRNA'
    proj: '{activity_id}'
  Activity:
    name: 'Metatranscriptome Expression Analysis for {id}'
    type: nmdc:MetatranscriptomeExpressionAnalysis
  # One item per workflow output that is registered as a data object.
  Outputs:
  - output: count_table
    data_object_type: Metatranscriptome Expression
    description: Expression counts for {id}
    name: Metatranscriptome expression table
  # The intergenic table is the only output flagged optional.
  - output: count_ig
    data_object_type: Metatranscriptome Expression Intergenic
    description: Metatranscriptome expression intergenic regions for {id}
    name: Metatranscriptome expression intergenic table
    optional: true
  - output: info_file
    data_object_type: Metatranscriptome Expression Info File
    description: Expression info for {id}
    name: Metatranscriptome Expression Info File

# Read-count / expression workflow for the "Sense" variant.
# Runs readcount.wdl from the metaT_ReadCounts repository and follows the
# Metatranscriptome Annotation entry.
- Name: Expression Analysis Sense
  Type: nmdc:MetatranscriptomeExpressionAnalysis
  Enabled: true
  Analyte Category: Metatranscriptome
  Git_repo: https://github.com/microbiomedata/metaT_ReadCounts
  Version: v0.0.1
  WDL: readcount.wdl
  Collection: metatranscriptome_expression_analysis_set
  Predecessors:
  - Metatranscriptome Annotation
  Input_prefix: nmdc_expression
  Inputs:
    # NOTE(review): unlike the Antisense and Nonstranded entries, no rna_type
    # input is set here; presumably the WDL default selects sense-strand
    # counting - confirm.
    gff_file: do:Functional Annotation GFF
    map: do:Contig Mapping File
    bam: do:Assembly Coverage BAM
    proj: '{activity_id}'
  Activity:
    name: 'Metatranscriptome Expression Analysis for {id}'
    type: nmdc:MetatranscriptomeExpressionAnalysis
  # One item per workflow output that is registered as a data object.
  Outputs:
  - output: count_table
    data_object_type: Metatranscriptome Expression
    description: Expression counts for {id}
    name: Metatranscriptome expression table
  # The intergenic table is the only output flagged optional.
  - output: count_ig
    data_object_type: Metatranscriptome Expression Intergenic
    description: Metatranscriptome expression intergenic regions for {id}
    name: Metatranscriptome expression intergenic table
    optional: true
  - output: info_file
    data_object_type: Metatranscriptome Expression Info File
    description: Expression info for {id}
    name: Metatranscriptome Expression Info File

# Read-count / expression workflow run with rna_type "non_stranded_RNA".
# Runs readcount.wdl from the metaT_ReadCounts repository and follows the
# Metatranscriptome Annotation entry.
- Name: Expression Analysis Nonstranded
  Type: nmdc:MetatranscriptomeExpressionAnalysis
  Enabled: true
  Analyte Category: Metatranscriptome
  Git_repo: https://github.com/microbiomedata/metaT_ReadCounts
  Version: v0.0.1
  WDL: readcount.wdl
  Collection: metatranscriptome_expression_analysis_set
  Predecessors:
  - Metatranscriptome Annotation
  Input_prefix: nmdc_expression
  Inputs:
    # The GFF and mapping file types are declared by the annotation entry's
    # outputs; the BAM type by the assembly entry's final_bam output.
    gff_file: do:Functional Annotation GFF
    map: do:Contig Mapping File
    bam: do:Assembly Coverage BAM
    rna_type: 'non_stranded_RNA'
    proj: '{activity_id}'
  Activity:
    name: 'Metatranscriptome Expression Analysis for {id}'
    type: nmdc:MetatranscriptomeExpressionAnalysis
  # One item per workflow output that is registered as a data object.
  Outputs:
  - output: count_table
    data_object_type: Metatranscriptome Expression
    description: Expression counts for {id}
    name: Metatranscriptome expression table
  # The intergenic table is the only output flagged optional.
  - output: count_ig
    data_object_type: Metatranscriptome Expression Intergenic
    description: Metatranscriptome expression intergenic regions for {id}
    name: Metatranscriptome expression intergenic table
    optional: true
  - output: info_file
    data_object_type: Metatranscriptome Expression Info File
    description: Expression info for {id}
    name: Metatranscriptome Expression Info File
Loading

0 comments on commit 6b15b5f

Please sign in to comment.