forked from nextgenusfs/genome_scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_ncbi_genomes.py
executable file
·206 lines (200 loc) · 11.4 KB
/
get_ncbi_genomes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#!/usr/bin/env python
#This script downloads all NCBI Fungal genomes with protein annotation
#written by Jon Palmer palmer.jona at gmail dot com
import sys, Bio, os, argparse, os.path, gzip, glob, re
import urllib2, shutil
from Bio import Entrez
from xml.dom import minidom
from Bio import SeqIO
class bcolors:
OKGREEN = '\033[92m'
BLUE = '\033[36m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
parser=argparse.ArgumentParser(
description='''Script searches NCBI BioProject/Assembly Database and automatically downloads genomes in GBK format''',
epilog="""Written by Jon Palmer (2015) [email protected]""")
parser.add_argument('search_term', help='Organism search term')
parser.add_argument('email', help='must tell ncbi who you are')
parser.add_argument('-n', '--num_records', default='1000', help='max number of records to download')
parser.add_argument('--only_annotated', action='store_true', help='only get annotated genomes')
args=parser.parse_args()
max_num = args.num_records
#construct search term based on input
organism = args.search_term
term = '((%s[Organism])' % (organism)
Entrez.email = args.email # Always tell NCBI who you are
handle = Entrez.esearch(db="genome", term=term, retmax=max_num)
search_result = Entrez.read(handle)
handle.close()
count = int(search_result["Count"])
print "------------------------------------------------"
print("Found %i species from NCBI Genome Database." % (count))
print "------------------------------------------------\n"
question = raw_input(bcolors.BLUE + "Do you want to continue?: [y/n] " + bcolors.ENDC)
if question == "n":
print "Script Exited by User"
os._exit(1)
else:
print "\n------------------------------------------------"
print "Using ELink to cross-reference Genome IDs with Assembly IDs."
print "------------------------------------------------"
#look for gi_list and remove if it exists
if os.path.isfile('gi_list.tmp'):
os.remove('gi_list.tmp')
gi_list = ",".join(search_result["IdList"])
#get the link data from assembly and print list to temp file
link_result = Entrez.read(Entrez.elink(dbfrom="genome", db="assembly", id=gi_list))
fo = open('gi_list.tmp', 'a')
for link in link_result[0]["LinkSetDb"][0]["Link"]:
fo.write("%s\n" % (link["Id"]))
fo.close()
#count the number of records from the link
link_count = sum(1 for line in open('gi_list.tmp'))
print ("Found %i records from Link to Assembly DB?\n" % (link_count))
question2 = raw_input(bcolors.BLUE + "Do you want to continue?: [y/n] " + bcolors.ENDC)
if question2 == "n":
print "Script Exited by User"
os._exit(1)
else:
print "\n------------------------------------------------"
#load in temp file to variable and get esummary for each record
with open('gi_list.tmp') as fi:
link_list = fi.read().splitlines()
fi.close()
gi_link = ",".join(link_list)
#look for results.xml and remove if it exists
if os.path.isfile('results.xml'):
os.remove('results.xml')
summary_handle = Entrez.esummary(db="assembly", id=gi_link, report="xml")
out_handle = open('results.xml', "w")
out_handle.write(summary_handle.read())
summary_handle.close()
out_handle.close()
print "Fetched Esummary for all genomes, saved to temp file"
print "------------------------------------------------"
#now need to parse those records to get Assembly accession numbers and download genome
data = minidom.parse("results.xml")
node = data.documentElement
ids = data.getElementsByTagName("DocumentSummary")
for id in ids:
accession = id.getElementsByTagName("AssemblyAccession")[0].childNodes[0].data
name = id.getElementsByTagName("AssemblyName")[0].childNodes[0].data
species = id.getElementsByTagName("Organism")[0].childNodes[0].data
gb_id = id.getElementsByTagName("GbUid")[0].childNodes[0].data
folder = accession + "_" + name.replace(" ", "_")
address = folder + "_genomic.gbff.gz"
out = re.sub(" ", "_", species)
out = re.sub("\.", "", out)
out = re.sub("\/", "-", out)
out = re.sub("\#", "-", out)
out = re.sub("\'", "", out)
out_name = out + "_gi" + gb_id + ".gbk"
if os.path.isfile(out_name):
print "Genome for " + species + " already converted to GBK"
continue
else:
if os.path.isfile(address):
print "Genome for " + species + " already downloaded, converting to GBK"
print bcolors.WARNING + "Now Loading: " + bcolors.ENDC + address
gbf_file = gzip.open(address, 'rb')
num_records = list(SeqIO.parse(gbf_file, "genbank"))
total_records = len(num_records)
have_seq = 0
no_seq = 0
cds_count = 0
for seq_record in SeqIO.parse(gbf_file, "genbank"):
sequence = seq_record.seq
if isinstance(sequence, Bio.Seq.UnknownSeq):
no_seq += 1
else:
have_seq += 1
#check annotation
for f in seq_record.features:
if f.type == "CDS":
cds_count = cds_count + 1
if total_records == have_seq and cds_count >= 100 and args.only_annotated:
print bcolors.OKGREEN + "GBK file validated: found " + bcolors.ENDC + str(total_records) + " records with " + str(cds_count) + " gene models. Writing to file " + bcolors.BLUE + out_name + bcolors.ENDC
gbf_file = gzip.open(address, 'rb')
gb_out = open(out_name, "w")
gb_file = SeqIO.parse(gbf_file, "genbank")
out_file = SeqIO.write(gb_file, gb_out, "genbank")
gbf_file.close()
gb_out.close()
os.remove(address)
elif total_records == have_seq and cds_count <= 99 and args.only_annotated:
print bcolors.FAIL + "GBK file missing annotation: " + bcolors.ENDC + "there were " + str(cds_count) + " gene models found in the file, skipping file"
os.remove(address)
elif total_records == have_seq:
print bcolors.OKGREEN + "GBK file validated: " + bcolors.ENDC + str(total_records) + " records found. Writing to file " + bcolors.BLUE + out_name + bcolors.ENDC
gbf_file = gzip.open(address, 'rb')
gb_out = open(out_name, "w")
gb_file = SeqIO.parse(gbf_file, "genbank")
out_file = SeqIO.write(gb_file, gb_out, "genbank")
gbf_file.close()
gb_out.close()
os.remove(address)
elif total_records != have_seq:
print bcolors.FAIL + "GBK file missing DNA sequence: " + bcolors.ENDC + "there were " + str(total_records) + " records but " + str(have_seq) + " have DNA sequence, skipping file"
os.remove(address)
continue
else:
try:
ftpfile = urllib2.urlopen("ftp://ftp.ncbi.nlm.nih.gov/genomes/all/" + folder + "/" + address)
localfile = open(address, "wb")
shutil.copyfileobj(ftpfile, localfile)
localfile.close()
print bcolors.OKGREEN + "Found: " + bcolors.ENDC + species + ". " + bcolors.OKGREEN + "Downloaded file: " + bcolors.ENDC + address
print bcolors.WARNING + "Now Loading: " + bcolors.ENDC + address
gbf_file = gzip.open(address, 'rb')
num_records = list(SeqIO.parse(gbf_file, "genbank"))
total_records = len(num_records)
have_seq = 0
no_seq = 0
cds_count = 0
gbf_file = gzip.open(address, 'rb')
for seq_record in SeqIO.parse(gbf_file, "genbank"):
sequence = seq_record.seq
if isinstance(sequence, Bio.Seq.UnknownSeq):
no_seq += 1
else:
have_seq += 1
#check annotation
for f in seq_record.features:
if f.type == "CDS":
cds_count = cds_count + 1
if total_records == have_seq and cds_count >= 100 and args.only_annotated:
print bcolors.OKGREEN + "GBK file validated: " + bcolors.ENDC + str(total_records) + " records found, with " + str(cds_count) + " gene models. Writing to file " + bcolors.BLUE + out_name + bcolors.ENDC
gbf_file = gzip.open(address, 'rb')
gb_out = open(out_name, "w")
gb_file = SeqIO.parse(gbf_file, "genbank")
out_file = SeqIO.write(gb_file, gb_out, "genbank")
gbf_file.close()
gb_out.close()
os.remove(address)
elif total_records == have_seq and cds_count <= 99 and args.only_annotated:
print bcolors.FAIL + "GBK file missing annotation: " + bcolors.ENDC + "there were " + str(cds_count) + " gene models found in the file, skipping file"
os.remove(address)
elif total_records == have_seq:
print bcolors.OKGREEN + "GBK file validated: " + bcolors.ENDC + str(total_records) + " records found. Writing to file " + bcolors.BLUE + out_name + bcolors.ENDC
gbf_file = gzip.open(address, 'rb')
gb_out = open(out_name, "w")
gb_file = SeqIO.parse(gbf_file, "genbank")
out_file = SeqIO.write(gb_file, gb_out, "genbank")
gbf_file.close()
gb_out.close()
os.remove(address)
elif total_records != have_seq:
print bcolors.FAIL + "GBK file missing DNA sequence: " + bcolors.ENDC + "there were " + str(total_records) + " records but " + str(have_seq) + " have DNA sequence, skipping file"
os.remove(address)
except:
print bcolors.FAIL + "Error: " + bcolors.ENDC + "file not found for " + species + " " + address + " it is likely an old assembly"
#clean up your mess
os.remove('results.xml')
os.remove('gi_list.tmp')
total = len([name for name in os.listdir('.') if os.path.isfile(name)])
print "------------------------------------------------"
print ("Searched for: %s" % (organism))
print "------------------------------------------------"
print ("Resulted in downloading and unpacking %i files\n" % (total))