Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates to scheduler #22

Merged
merged 6 commits into from
Jan 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
FROM python:3.9

ADD requirements.txt /tmp/requirements.txt
RUN \
pip install poetry && \
poetry config virtualenvs.create false

RUN pip install -r /tmp/requirements.txt
ADD pyproject.toml poetry.lock README.md /src/
WORKDIR /src
RUN \
poetry install --only=main --no-root

RUN pip install semver

ADD . /src

WORKDIR /src
55 changes: 4 additions & 51 deletions configs/workflows.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,14 @@ Workflows:
Outputs:
- output: filtered_final
name: Reads QC result fastq (clean data)
suffix: "_filtered.fastq.gz"
data_object_type: Filtered Sequencing Reads
description: "Reads QC for {id}"
- output: filtered_stats_final
name: Reads QC summary statistics
suffix: "_filterStats.txt"
data_object_type: QC Statistics
description: "Reads QC summary for {id}"
- output: rqc_info
name: File containing read filtering information
suffix: "_readsQC.info"
data_object_type: Read Filtering Info File
description: "Read filtering info for {id}"

Expand Down Expand Up @@ -85,17 +82,14 @@ Workflows:
Outputs:
- output: filtered_final
name: Reads QC result fastq (clean data)
suffix: "_filtered.fastq.gz"
data_object_type: Filtered Sequencing Reads
description: "Reads QC for {id}"
- output: filtered_stats_final
name: Reads QC summary statistics
suffix: "_filterStats.txt"
data_object_type: QC Statistics
description: "Reads QC summary for {id}"
- output: rqc_info
name: File containing read filtering information
suffix: "_readsQC.info"
data_object_type: Read Filtering Info File
description: "Read filtering info for {id}"

Expand Down Expand Up @@ -145,32 +139,26 @@ Workflows:
Outputs:
- output: contig
name: Final assembly contigs fasta
suffix: "_contigs.fna"
data_object_type: Assembly Contigs
description: "Assembly contigs for {id}"
- output: scaffold
name: Final assembly scaffolds fasta
suffix: "_scaffolds.fna"
data_object_type: Assembly Scaffolds
description: "Assembly scaffolds for {id}"
- output: covstats
name: Assembled contigs coverage information
suffix: "_covstats.txt"
data_object_type: Assembly Coverage Stats
description: "Coverage Stats for {id}"
- output: agp
name: An AGP format file that describes the assembly
suffix: "_assembly.agp"
data_object_type: Assembly AGP
description: "AGP for {id}"
- output: bam
name: Sorted bam file of reads mapping back to the final assembly
suffix: "_pairedMapped_sorted.bam"
data_object_type: Assembly Coverage BAM
description: "Sorted Bam for {id}"
- output: asminfo
name: File containing assembly info
suffix: "_metaAsm.info"
data_object_type: Assembly Info File
description: "Assembly info for {id}"

Expand All @@ -196,22 +184,18 @@ Workflows:
data_object_type: Annotation Amino Acid FASTA
description: FASTA Amino Acid File for {id}
name: FASTA amino acid file for annotated proteins
suffix: _proteins.faa
- output: structural_gff
data_object_type: Structural Annotation GFF
description: Structural Annotation for {id}
name: GFF3 format file with structural annotations
suffix: _structural_annotation.gff
- output: functional_gff
data_object_type: Functional Annotation GFF
description: Functional Annotation for {id}
name: GFF3 format file with functional annotations
suffix: _functional_annotation.gff
- output: ko_tsv
data_object_type: Annotation KEGG Orthology
description: KEGG Orthology for {id}
name: Tab delimited file for KO annotation
suffix: _ko.tsv
- output: ec_tsv
data_object_type: Annotation Enzyme Commission
description: EC Annotations for {id}
Expand All @@ -226,92 +210,74 @@ Workflows:
data_object_type: Clusters of Orthologous Groups (COG) Annotation GFF
description: COGs for {id}
name: GFF3 format file with COGs
suffix: _cog.gff
- output: pfam_gff
data_object_type: Pfam Annotation GFF
description: Pfam Annotation for {id}
name: GFF3 format file with Pfam
suffix: _pfam.gff
- output: tigrfam_gff
data_object_type: TIGRFam Annotation GFF
description: TIGRFam for {id}
name: GFF3 format file with TIGRfam
suffix: _tigrfam.gff
- output: smart_gff
data_object_type: SMART Annotation GFF
description: SMART Annotations for {id}
name: GFF3 format file with SMART
suffix: _smart.gff
- output: supfam_gff
data_object_type: SUPERFam Annotation GFF
description: SUPERFam Annotations for {id}
name: GFF3 format file with SUPERFam
suffix: _supfam.gff
- output: cath_funfam_gff
data_object_type: CATH FunFams (Functional Families) Annotation GFF
description: CATH FunFams for {id}
name: GFF3 format file with CATH FunFams
suffix: _cath_funfam.gff
- output: crt_gff
data_object_type: CRT Annotation GFF
description: CRT Annotations for {id}
name: GFF3 format file with CRT
suffix: _crt.gff
- output: genemark_gff
data_object_type: Genemark Annotation GFF
description: Genemark Annotations for {id}
name: GFF3 format file with Genemark
suffix: _genemark.gff
- output: prodigal_gff
data_object_type: Prodigal Annotation GFF
description: Prodigal Annotations {id}
name: GFF3 format file with Prodigal
suffix: _prodigal.gff
- output: trna_gff
data_object_type: TRNA Annotation GFF
description: TRNA Annotations {id}
name: GFF3 format file with TRNA
suffix: _trna.gff
- output: final_rfam_gff
data_object_type: RFAM Annotation GFF
description: RFAM Annotations for {id}
name: GFF3 format file with RFAM
suffix: _rfam.gff
- output: ko_ec_gff
data_object_type: KO_EC Annotation GFF
description: KO_EC Annotations for {id}
name: GFF3 format file with KO_EC
suffix: _ko_ec.gff
- output: product_names_tsv
data_object_type: Product Names
description: Product names for {id}
name: Product names file
suffix: _product_names.tsv
- output: gene_phylogeny_tsv
data_object_type: Gene Phylogeny tsv
description: Gene Phylogeny for {id}
name: Gene Phylogeny file
suffix: _gene_phylogeny.tsv
- output: crt_crisprs
data_object_type: Crispr Terms
description: Crispr Terms for {id}
name: Crispr Terms
suffix: _crt.crisprs
- output: stats_tsv
data_object_type: Annotation Statistics
description: Annotation Stats for {id}
name: Annotation statistics report
suffix: _stats.tsv
# - output: contig_mapping
# data_object_type: Contig Mapping File
# description: Contig mappings file for {id}
# name: Contig mappings between contigs and scaffolds
# suffix: _contig_names_mapping.tsv
- output: contig_mapping
data_object_type: Contig Mapping File
description: Contig mappings file for {id}
name: Contig mappings between contigs and scaffolds
- output: imgap_version
data_object_type: Annotation Info File
description: Annotation info for {id}
name: File containing annotation info
suffix: _imgap.info

- Name: MAGs
Type: nmdc:MagsAnalysisActivity
Expand Down Expand Up @@ -350,17 +316,14 @@ Workflows:
data_object_type: CheckM Statistics
description: CheckM for {id}
name: CheckM statistics report
suffix: _checkm_qa.out
- output: final_hqmq_bins_zip
data_object_type: Metagenome Bins
description: Metagenome Bins for {id}
name: Metagenome bin tarfiles archive
suffix: _hqmq_bin.zip
- output: final_gtdbtk_bac_summary
data_object_type: GTDBTK Bacterial Summary
description: Bacterial Summary for {id}
name: GTDBTK bacterial summary
suffix: _gtdbtk.bac122.summary.tsv
- output: final_gtdbtk_ar_summary
data_object_type: GTDBTK Archaeal Summary
description: Archaeal Summary for {id}
Expand All @@ -370,7 +333,6 @@ Workflows:
data_object_type: Metagenome Bins Info File
description: Metagenome Bins Info File for {id}
name: Metagenome Bins Info File
suffix: _bin.info

- Name: Readbased Analysis
Type: nmdc:ReadBasedTaxonomyAnalysisActivity
Expand All @@ -394,47 +356,38 @@ Workflows:
data_object_type: GOTTCHA2 Classification Report
description: GOTTCHA2 Classification for {id}
name: GOTTCHA2 classification report file
suffix: _gottcha2_report.tsv
- output: final_gottcha2_full_tsv
data_object_type: GOTTCHA2 Report Full
description: GOTTCHA2 Full Report for {id}
name: GOTTCHA2 report file
suffix: _gottcha2_full_tsv
- output: final_gottcha2_krona_html
data_object_type: GOTTCHA2 Krona Plot
description: GOTTCHA2 Krona for {id}
name: GOTTCHA2 krona plot HTML file
suffix: _gottcha2_krona.html
- output: final_centrifuge_classification_tsv
data_object_type: Centrifuge Taxonomic Classification
description: Centrifuge Classification for {id}
name: Centrifuge output read classification file
suffix: _centrifuge_classification.tsv
- output: final_centrifuge_report_tsv
data_object_type: Centrifuge output report file
description: Centrifuge Report for {id}
name: Centrifuge Classification Report
suffix: _centrifuge_report.tsv
- output: final_centrifuge_krona_html
data_object_type: Centrifuge Krona Plot
description: Centrifuge Krona for {id}
name: Centrifuge krona plot HTML file
suffix: _centrifuge_krona.html
- output: final_kraken2_classification_tsv
data_object_type: Kraken2 Taxonomic Classification
description: Kraken2 Classification for {id}
name: Kraken2 output read classification file
suffix: _kraken2_classification.tsv
- output: final_kraken2_report_tsv
data_object_type: Kraken2 Classification Report
description: Kraken2 Report for {id}
name: Kraken2 output report file
suffix: _kraken2_report.tsv
- output: final_kraken2_krona_html
data_object_type: Kraken2 Krona Plot
description: Kraken2 Krona for {id}
name: Kraken2 Krona plot HTML file
suffix: _kraken2_krona.html
- output: info_file
data_object_type: Read Based Analysis Info File
description: Read based analysis info for {id}
Expand Down
34 changes: 21 additions & 13 deletions nmdc_automation/workflow_automation/activities.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,31 +24,43 @@ def _load_data_objects(db, workflows: List[Workflow]):


def _check(match_types, data_object_ids, data_objs):
if not data_object_ids:
return False
if not match_types or len(match_types) == 0:
return True
match_set = set(match_types)
do_types = set()
for doid in data_object_ids:
do_types.add(data_objs[doid].data_object_type)
if doid in data_objs:
do_types.add(data_objs[doid].data_object_type)
return match_set.issubset(do_types)


def _filter_skip(wf, rec, data_objs):
    """
    Decide whether a record should be skipped for a workflow.

    A record is kept only when its inputs satisfy the workflow's
    input-object type filter AND its outputs satisfy the output-object
    type filter (see _check).

    :param wf: workflow exposing ``filter_input_objects`` and
        ``filter_output_objects``.
    :param rec: record dict; ``has_input``/``has_output`` may be absent.
    :param data_objs: mapping of data object id -> data object.
    :return: True if the record should be skipped, False to keep it.
    """
    # Use .get() so records missing has_input/has_output don't raise
    # KeyError; _check treats a missing list as "no match".
    match_in = _check(wf.filter_input_objects,
                      rec.get("has_input"),
                      data_objs)
    match_out = _check(wf.filter_output_objects,
                       rec.get("has_output"),
                       data_objs)
    return not (match_in and match_out)


def _read_acitivites(db, workflows: List[Workflow], data_objects: dict, filter: dict):
def _read_acitivites(db, workflows: List[Workflow],
data_objects: dict, filter: dict):
"""
Read in all the activities for the defined workflows.
"""
activities = []
for wf in workflows:
logging.debug(f"Checking {wf.name}:{wf.version}")
q = filter
q["git_url"] = wf.git_repo
q["version"] = wf.version
for rec in db[wf.collection].find(q):
if wf.collection == "omics_processing_set" and \
rec["id"].startswith("gold"):
continue
if _filter_skip(wf, rec, data_objects):
continue
act = Activity(rec, wf)
Expand Down Expand Up @@ -86,10 +98,8 @@ def _resolve_relationships(activities, data_obj_act):
# Let's make sure these came from the same source
# This is just a safeguard
if act.was_informed_by != parent_act.was_informed_by:
logging.warning(
"Mismatched informed by found for"
f"{do_id} in {act.id} ({act.name})"
)
logging.warning("Mismatched informed by for "
f"{do_id} in {act.id}")
continue
# We only want to use it as a parent if it is the right
# parent workflow. Some inputs may come from ancestors
Expand All @@ -98,13 +108,11 @@ def _resolve_relationships(activities, data_obj_act):
# This is the one
act.parent = parent_act
parent_act.children.append(act)
logging.debug(f"Found parent: {parent_act.id} {parent_act.name}")
logging.debug(f"Found parent: {parent_act.id}"
f" {parent_act.name}")
break
if len(act.workflow.parents) > 0 and not act.parent:
logging.warning(
"Didn't find a parent for "
f"{act.id} ({act.name}) - {act.workflow.name}"
)
logging.warning(f"Didn't find a parent for {act.id}")
# Now all the activities have their parent
return activities

Expand Down
Loading