From b94fb271cca202f5c12fb775fa5c655e03027951 Mon Sep 17 00:00:00 2001 From: Chienchi Lo Date: Wed, 24 Jul 2024 16:09:08 -0600 Subject: [PATCH 01/10] add check_id_map task --- mbin_nmdc.wdl | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/mbin_nmdc.wdl b/mbin_nmdc.wdl index cbf09ac..a87e7eb 100755 --- a/mbin_nmdc.wdl +++ b/mbin_nmdc.wdl @@ -48,10 +48,17 @@ workflow nmdc_mags { lineage_file=lineage_file } + call check_id_map { + input: + container=container, + contig_file=stage.contig, + proteins_file=stage.proteins + } + call mbin_nmdc { input: name=proj, - fna = stage.contig, + fna = check_id_map.contig, aln = stage.sam, gff = stage.gff, lineage=stage.lineage_tsv, @@ -308,6 +315,43 @@ task stage { } } +task check_id_map{ + input{ + String container + File contig_file + File proteins_file + String contig_file_name=basename(contig_file) + } + command<<< + python <"): + file_id = line[1:].rstrip().split()[0] + contigIDs.append("file_id") + with open(~{proteins_file}) as p_file: + for line in p_file: + if line.startswith(">"): + file_id = line[1:].rstrip().split()[0] + contig_id = "_".join(file_id.split("_")[0:-2]) + if contig_id not in contigIDs: + print(f"{contig_id} is not in ~{contig_file_name}.", file=sys.stderr) + sys.exit(1) + CODE + >>> + + output{ + File contig = contig_file + } + runtime { + memory: "1 GiB" + cpu: 2 + maxRetries: 1 + docker: container + } +} task package{ input{ From 5eac3b86f80af3b2da735fee2a3f0d3db0065628 Mon Sep 17 00:00:00 2001 From: Chienchi Lo Date: Wed, 24 Jul 2024 16:17:58 -0600 Subject: [PATCH 02/10] fix open file variable --- mbin_nmdc.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mbin_nmdc.wdl b/mbin_nmdc.wdl index a87e7eb..9808993 100755 --- a/mbin_nmdc.wdl +++ b/mbin_nmdc.wdl @@ -326,12 +326,12 @@ task check_id_map{ python <"): file_id = line[1:].rstrip().split()[0] contigIDs.append("file_id") - with open(~{proteins_file}) as p_file: + with open("~{proteins_file}","r") as p_file: for line in p_file: if line.startswith(">"): file_id = line[1:].rstrip().split()[0] From 6fc6bbe0d84c4f66132f6e908f1dfe0c2945875b Mon Sep 17 00:00:00 2001 From: Chienchi Lo Date: Wed, 24 Jul 2024 16:25:00 -0600 Subject: [PATCH 03/10] set -euo for tasks --- mbin_nmdc.wdl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mbin_nmdc.wdl b/mbin_nmdc.wdl index 9808993..8683300 100755 --- a/mbin_nmdc.wdl +++ b/mbin_nmdc.wdl @@ -257,7 +257,7 @@ task stage { } command<<< - set -e + set -euo pipefail function stage() { in=$1 @@ -323,6 +323,8 @@ task check_id_map{ String contig_file_name=basename(contig_file) } command<<< + set -euo pipefail + python < Date: Wed, 24 Jul 2024 16:30:41 -0600 Subject: [PATCH 04/10] update cpu to 1 for task check_id_map --- mbin_nmdc.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mbin_nmdc.wdl b/mbin_nmdc.wdl index 8683300..73405bb 100755 --- a/mbin_nmdc.wdl +++ b/mbin_nmdc.wdl @@ -349,7 +349,7 @@ task check_id_map{ } runtime { memory: "1 GiB" - cpu: 2 + cpu: 1 docker: container } } From 4a9cda725ab55e090b3619cb9ea2b2c9c39aff7f Mon Sep 17 00:00:00 2001 From: Chienchi Lo Date: Wed, 24 Jul 2024 16:35:49 -0600 Subject: [PATCH 05/10] add comment of task check_id_map --- mbin_nmdc.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mbin_nmdc.wdl b/mbin_nmdc.wdl index 73405bb..717bbdb 100755 --- a/mbin_nmdc.wdl +++ b/mbin_nmdc.wdl @@ -336,8 +336,8 @@ task check_id_map{ with open("~{proteins_file}","r") as p_file: for line in p_file: if line.startswith(">"): - file_id = line[1:].rstrip().split()[0] - contig_id = "_".join(file_id.split("_")[0:-2]) + file_id = line[1:].rstrip().split()[0] # nmdc:wfmgan-12-gbysvd76.1_0000001_1_225 + contig_id = "_".join(file_id.split("_")[0:-2]) # nmdc:wfmgan-12-gbysvd76.1_0000001 if contig_id not in contigIDs: print(f"{contig_id} is not in ~{contig_file_name}.", file=sys.stderr) sys.exit(1) From 5ba7e4154684167eb6ccad637d05f8af17f0eba3 Mon Sep 17 00:00:00 2001 From: Chienchi Lo Date: Wed, 24 Jul 2024 16:40:36 -0600 Subject: [PATCH 06/10] fix contig_id variable append to array --- mbin_nmdc.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mbin_nmdc.wdl b/mbin_nmdc.wdl index 717bbdb..e762fc6 100755 --- a/mbin_nmdc.wdl +++ b/mbin_nmdc.wdl @@ -331,8 +331,8 @@ task check_id_map{ with open("~{contig_file}","r") as c_file: for line in c_file: if line.startswith(">"): - file_id = line[1:].rstrip().split()[0] - contigIDs.append("file_id") + file_id = line[1:].rstrip().split()[0] # nmdc:wfmgan-12-gbysvd76.1_0000001 + contigIDs.append(file_id) with open("~{proteins_file}","r") as p_file: for line in p_file: if line.startswith(">"): From 4ee118213beee2ec602c0d48862b6ca81596579d Mon Sep 17 00:00:00 2001 From: Chienchi Lo Date: Wed, 24 Jul 2024 16:43:04 -0600 Subject: [PATCH 07/10] rename variable from file_id to seq_id --- mbin_nmdc.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mbin_nmdc.wdl b/mbin_nmdc.wdl index e762fc6..9f8fbd1 100755 --- a/mbin_nmdc.wdl +++ b/mbin_nmdc.wdl @@ -331,13 +331,13 @@ task check_id_map{ with open("~{contig_file}","r") as c_file: for line in c_file: if line.startswith(">"): - file_id = line[1:].rstrip().split()[0] # nmdc:wfmgan-12-gbysvd76.1_0000001 - contigIDs.append(file_id) + seq_id = line[1:].rstrip().split()[0] # nmdc:wfmgan-12-gbysvd76.1_0000001 + contigIDs.append(seq_id) with open("~{proteins_file}","r") as p_file: for line in p_file: if line.startswith(">"): - file_id = line[1:].rstrip().split()[0] # nmdc:wfmgan-12-gbysvd76.1_0000001_1_225 - contig_id = "_".join(file_id.split("_")[0:-2]) # nmdc:wfmgan-12-gbysvd76.1_0000001 + seq_id = line[1:].rstrip().split()[0] # nmdc:wfmgan-12-gbysvd76.1_0000001_1_225 + contig_id = "_".join(seq_id.split("_")[0:-2]) # nmdc:wfmgan-12-gbysvd76.1_0000001 if contig_id not in contigIDs: print(f"{contig_id} is not in ~{contig_file_name}.", file=sys.stderr) sys.exit(1) From fef426d4011aa9b0d6592fda4614c0733f872a06 Mon Sep 17 00:00:00 2001 From: Chienchi Lo Date: Wed, 24 Jul 2024 22:48:41 -0600 Subject: [PATCH 08/10] check 10000 seqs --- mbin_nmdc.wdl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mbin_nmdc.wdl b/mbin_nmdc.wdl index 9f8fbd1..49b4911 100755 --- a/mbin_nmdc.wdl +++ b/mbin_nmdc.wdl @@ -328,6 +328,7 @@ task check_id_map{ python <"): @@ -338,9 +339,12 @@ task check_id_map{ if line.startswith(">"): seq_id = line[1:].rstrip().split()[0] # nmdc:wfmgan-12-gbysvd76.1_0000001_1_225 contig_id = "_".join(seq_id.split("_")[0:-2]) # nmdc:wfmgan-12-gbysvd76.1_0000001 + checknum += 1 if contig_id not in contigIDs: print(f"{contig_id} is not in ~{contig_file_name}.", file=sys.stderr) sys.exit(1) + if checknum > 10000: + break CODE >>> From 6986a8ada5718528f6857018e751a4c6d304f526 Mon Sep 17 00:00:00 2001 From: Chienchi Lo Date: Tue, 30 Jul 2024 08:09:05 -0600 Subject: [PATCH 09/10] use dictionary instead array to speed up id check --- mbin_nmdc.wdl | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mbin_nmdc.wdl b/mbin_nmdc.wdl index 8cb1b15..e29d939 100755 --- a/mbin_nmdc.wdl +++ b/mbin_nmdc.wdl @@ -333,8 +333,7 @@ task check_id_map{ python <"): @@ -349,8 +348,6 @@ task check_id_map{ if contig_id not in contigIDs: print(f"{contig_id} is not in ~{contig_file_name}.", file=sys.stderr) sys.exit(1) - if checknum > 10000: - break CODE >>> From 9f48f9b5f583a996ed28e7f7511164cb0ff7fe4b Mon Sep 17 00:00:00 2001 From: Chienchi Lo Date: Tue, 30 Jul 2024 08:10:18 -0600 Subject: [PATCH 10/10] use dictionary instead array to speed up id check --- mbin_nmdc.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mbin_nmdc.wdl b/mbin_nmdc.wdl index e29d939..7db48c3 100755 --- a/mbin_nmdc.wdl +++ b/mbin_nmdc.wdl @@ -338,7 +338,7 @@ task check_id_map{ for line in c_file: if line.startswith(">"): seq_id = line[1:].rstrip().split()[0] # nmdc:wfmgan-12-gbysvd76.1_0000001 - contigIDs.append(seq_id) + contigIDs[seq_id]=1 with open("~{proteins_file}","r") as p_file: for line in p_file: if line.startswith(">"):