diff --git a/README.md b/README.md index 5701f109..9e8450ef 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,15 @@ ![GitHub Workflow Status (with event)](https://img.shields.io/github/actions/workflow/status/HelikarLab/COMO/container_build.yml?style=for-the-badge&logo=docker&logoColor=white&label=Docker%20Build) ![GitHub Workflow Status (with event)](https://img.shields.io/github/actions/workflow/status/HelikarLab/COMO/type_checking.yml?style=for-the-badge&logo=docker&logoColor=white&label=Type%20Checking) -## Docker Quick Start +# Setting up COMO + +Go to COMO's documentation page for full installation and operation instructions, or use one of the Quick Start options below. + +## [COMO Documentation Page](https://helikarlab.github.io/COMO) + +## Quick Start Options + +### Docker Quick Start This installation method **does** require docker @@ -19,7 +27,7 @@ This installation method **does** require docker > the [Gurobi solver](https://www.gurobi.com/). If you would like either (or both) of these features, > please [visit our documentation](https://helikarlab.github.io/COMO) for more details -## Conda Quick Start +### Conda Quick Start This installation method does **not** require docker diff --git a/environment.yaml b/environment.yaml index 42e616d3..705eb7d1 100644 --- a/environment.yaml +++ b/environment.yaml @@ -58,6 +58,7 @@ dependencies: - conda-forge::wget~=1.20.3 - conda-forge::xlrd~=2.0.1 - pip: + - async_bioservices - cobra==0.26.* - Escher==1.7.* - framed==0.5.* diff --git a/main/src/GSEpipelineFast.py b/main/src/GSEpipelineFast.py index 127f75f3..d6fa5c82 100644 --- a/main/src/GSEpipelineFast.py +++ b/main/src/GSEpipelineFast.py @@ -11,23 +11,16 @@ from GSEpipeline import load_gse_soft from instruments import AffyIO from rpy2.robjects import pandas2ri -from async_bioservices import async_bioservices -from async_bioservices.input_database import InputDatabase -from async_bioservices.output_database import OutputDatabase pandas2ri.activate() # Input: Extract Gene Info from GEO DataSets # gse = load_gse_soft(gsename) -from async_bioservices import async_bioservices -from async_bioservices.input_database import InputDatabase -from async_bioservices.output_database import OutputDatabase +from async_bioservices import db2db, InputDatabase, OutputDatabase -# Extract Platform Information - -def download_gsm_id_maps(datadir, gse, gpls: list[str] = None, vendor="affy"): +async def download_gsm_id_maps(datadir, gse, gpls: list[str] = None, vendor="affy"): """ download ID to ENTREZ_GENE_ID maps, create a csv file for each platform, and return dictionary :param gpls: @@ -40,25 +33,18 @@ def download_gsm_id_maps(datadir, gse, gpls: list[str] = None, vendor="affy"): # From: https://florimond.dev/en/posts/2018/08/python-mutable-defaults-are-the-source-of-all-evil/ if gpls is None: gpls: list[str] = ["GPL96", "GPL97", "GPL8300"] - + for gpl in gpls: table = gse.gpls[gpl].table.copy() if vendor.lower() == "affy": temp = table[["ID", "ENTREZ_GENE_ID"]] - + elif vendor.lower() == "agilent": input_values = table.loc[ table["CONTROL_TYPE"] == "FALSE", "SPOT_ID" ].tolist() - - """ - input_values, - input_db="Agilent ID", - output_db: list[str] = None, - delay=30, - """ - - temp = async_bioservices.fetch_gene_info( + + temp = await db2db( input_values=input_values, input_db=InputDatabase.AGILENT_ID, output_db=[ @@ -66,7 +52,7 @@ def download_gsm_id_maps(datadir, gse, gpls: list[str] = None, vendor="affy"): OutputDatabase.ENSEMBL_GENE_ID.value ] ) - + temp.drop(columns=["Ensembl Gene ID"], inplace=True)
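Since `download_gsm_id_maps` becomes a coroutine in this hunk, any synchronous caller now needs an event loop. A minimal driver sketch, assuming only what this diff shows (the external `async_bioservices` package exposing `db2db`, and GEOparse for the SOFT file); the accession and paths are hypothetical:

```python
import asyncio

import GEOparse

from GSEpipelineFast import download_gsm_id_maps


async def build_platform_maps() -> None:
    # Hypothetical accession/paths for illustration; any GEO series whose
    # platforms appear in the default list (GPL96/GPL97/GPL8300) would work.
    gse = GEOparse.get_GEO(geo="GSE2770", destdir="./data")
    # download_gsm_id_maps is now a coroutine, so it must be awaited.
    await download_gsm_id_maps("./data", gse, gpls=["GPL96"], vendor="affy")


if __name__ == "__main__":
    # A single asyncio.run() at the entry point drives the whole async call
    # chain, mirroring the asyncio.run(main(...)) pattern used later in this diff.
    asyncio.run(build_platform_maps())
```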
temp.reset_index(inplace=True) temp.rename( @@ -76,11 +62,11 @@ def download_gsm_id_maps(datadir, gse, gpls: list[str] = None, vendor="affy"): }, inplace=True ) temp.replace(to_replace="-", value=np.nan, inplace=True) - + else: print("Unsupported Platform: {}".format(gpl)) continue - + # Save to file filefullpath = os.path.join(datadir, "{}entrez.csv".format(gpl.lower())) temp.to_csv(filefullpath, index=False) @@ -107,7 +93,7 @@ def __init__(self, gsename, querytable, rootdir="../"): vendors = pairs["Instrument"].tolist() self.platforms = dict(zip(gpls, vendors)) self.download_samples() - + def organize_gse_raw_data(self): """ Organize raw data at local folder @@ -121,7 +107,7 @@ def organize_gse_raw_data(self): print("Path created: {}".format(platformdir)) else: print("Path already exist: {}".format(platformdir)) - + # Move Corresponding Cel files to Folders onlyfiles = [f for f in os.listdir(self.gene_dir) if f.endswith(".gz")] cnt = 0 @@ -138,7 +124,7 @@ def organize_gse_raw_data(self): print("Move {} to {}".format(src_path, dst_path)) cnt += 1 print("{} raw data files moved.".format(cnt)) - + def get_gsm_tables(self): """ get gsm maps in table @@ -158,9 +144,9 @@ def get_gsm_tables(self): df = temp[["ID", "ENTREZ_GENE_ID"]] df.set_index("ID", inplace=True) gsm_tables[gpl] = df - + return gsm_tables - + def get_gsm_platform(self): """ Classify the Samples by Platform @@ -172,7 +158,7 @@ def get_gsm_platform(self): gsm_platform = dict(zip(keys, values)) return gsm_platform - + def gsms_included_by(self, df): for gsm in self.gsm_platform.keys(): included = False @@ -183,15 +169,15 @@ def gsms_included_by(self, df): break if not included: return False - + return True - - def get_entrez_table_pipeline(self, fromcsv=True): + + async def get_entrez_table_pipeline(self, fromcsv=True): """ create ENTREZ ID based table from gse :return: pandas dataframe for table of GSE """ - + filefullpath = os.path.join( self.gene_dir, "{}_sc500_full_table.csv".format(self.gsename) ) @@ -209,14 +195,14 @@ def get_entrez_table_pipeline(self, fromcsv=True): print("Need Append GSMs") except: print("Unable to read {}") - + print("Create new table: {}".format(filefullpath)) gsm_maps = self.get_gsm_tables() - + if not any(gsm_maps): print("Not available, return empty dataframe") return pd.DataFrame([]) - + # Ready Affy files from folders gsm_tables_sc500 = {} for key, vendor in self.platforms.items(): @@ -229,15 +215,15 @@ def get_entrez_table_pipeline(self, fromcsv=True): outputdf = outputdf.readaffydir(platformdir) outputdf = ro.conversion.rpy2py(outputdf) elif vendor.lower() == "agilent": - + outputdf = instruments.readagilent(platformdir, list(self.gsm_platform.keys())) - - gsm_maps[key] = async_bioservices.fetch_gene_info( + + gsm_maps[key] = await db2db( input_values=list(map(str, list(outputdf["ProbeName"]))), input_db=InputDatabase.AGILENT_ID, output_db=[OutputDatabase.GENE_ID] ) - + gsm_maps[key].rename(columns={"Gene ID": "ENTREZ_GENE_ID"}, inplace=True) else: print("Unsupported Platform {} and Vendor {}".format(key, vendor)) @@ -245,19 +231,19 @@ def get_entrez_table_pipeline(self, fromcsv=True): else: print("Path not exist: {}".format(platformdir)) continue - + drop_idx = np.where(gsm_maps[key]["ENTREZ_GENE_ID"] == "-")[0].tolist() outputdf.drop(outputdf.index[drop_idx], axis=0, inplace=True) gsm_maps[key].drop(gsm_maps[key].index[drop_idx], axis=0, inplace=True) outputdf["ENTREZ_GENE_ID"] = gsm_maps[key]["ENTREZ_GENE_ID"].to_list() gsm_tables_sc500[key] = outputdf - + # Drop rows without ENTREZ GENE ID, 
set index to ENTREZ for key in self.platforms.keys(): gsm_tables_sc500[key].dropna(subset=["ENTREZ_GENE_ID"], inplace=True) gsm_tables_sc500[key].set_index("ENTREZ_GENE_ID", inplace=True) print("gsm table drop: ", gsm_tables_sc500[key]) - + # Merge tables of platforms df_outer_sc500 = None for key in self.platforms.keys(): @@ -271,7 +257,7 @@ def get_entrez_table_pipeline(self, fromcsv=True): on="ENTREZ_GENE_ID", how="outer", ) - + df_outer_sc500.dropna(how="all", inplace=True) print("Full: {}".format(df_outer_sc500.shape)) df_outer_sc500.rename(str.lower, axis="columns", inplace=True) @@ -287,10 +273,10 @@ def get_entrez_table_pipeline(self, fromcsv=True): vals.append(newcol) keys.append(col) gsms_loaded.append(gsm) - + df_outer_sc500.rename(columns=dict(zip(keys, vals)), inplace=True) gsms_loaded = list(set(gsms_loaded).union(set(self.gsm_platform.keys()))) - + # Remove duplicated items, keep largest VALUE for each GSM if "df_clean_sc500" not in locals(): df_clean_sc500 = pd.DataFrame([], index=df_outer_sc500.index) @@ -315,7 +301,7 @@ def get_entrez_table_pipeline(self, fromcsv=True): df_clean_sc500 = df_clean_sc500[ ~df_clean_sc500.index.duplicated(keep="last") ] - + for key in gsms_loaded: key_low = key.lower() col1, col2, col3 = ( @@ -326,26 +312,26 @@ def get_entrez_table_pipeline(self, fromcsv=True): try: temp = df_outer_sc500.loc[:, [col1, col2, col3]] - + except: if key in list(self.gsm_platform.keys()): print("{} not in df_outer_sc500".format(key)) - - continue + continue + temp.sort_values(by=["ENTREZ_GENE_ID", col1], inplace=True) temp = temp[~temp.index.duplicated(keep="last")] df_clean_sc500[col1] = temp[col1] df_clean_sc500[col2] = temp[col2] df_clean_sc500[col3] = temp[col3] - + # save to csv file try: df_clean_sc500.set_index("ENTREZ_GENE_ID", inplace=True) - + except: pass - + df_clean_sc500.dropna(axis="columns", how="all", inplace=True) df_clean_sc500.dropna(how="all", inplace=True) @@ -353,13 +339,13 @@ def get_entrez_table_pipeline(self, fromcsv=True): df_clean_sc500.drop(columns=["placeholder"], inplace=True) except: pass - + df_clean_sc500.sort_index(inplace=True) df_clean_sc500.to_csv(filefullpath) print("Full table saved to:\n{}".format(filefullpath)) return df_clean_sc500 - + def download_raw(self, overwrite=False): # check if path created if (not os.path.isdir(self.gene_dir)) or overwrite: @@ -380,10 +366,10 @@ def download_raw(self, overwrite=False): os.remove(filefullpath) print("Remove Raw File: {}".format(filefullpath)) self.organize_gse_raw_data() - + else: pass - + def download_samples(self, overwrite=False): os.makedirs(self.gene_dir, exist_ok=True) for gsm, gpl in self.gsm_platform.items(): @@ -399,17 +385,17 @@ def download_samples(self, overwrite=False): if (not os.path.isfile(filefullpath)) or overwrite: urllib.request.urlretrieve(sample_url, filefullpath) print("Retrieve Sample: {}".format(filefullpath)) - + else: print("Sample exist: {}".format(filefullpath)) continue - + tfile = tarfile.open(filefullpath) tfile.extractall(path=platformdir) # os.remove(filefullpath) # keep to avoid re-download print("Extract to: {}".format(platformdir)) print("Retrieve Samples Completed.") - + def calculate_z_score(self, df, to_csv=False): cols = list(df) result = pd.DataFrame([], index=df.index) @@ -422,5 +408,5 @@ def calculate_z_score(self, df, to_csv=False): self.gene_dir, "{}_data_z.csv".format(self.gsename) ) result.to_csv(filefullpath) - + return result diff --git a/main/src/async_bioservices/__init__.py b/main/src/async_bioservices/__init__.py deleted file 
mode 100644 index 130b0266..00000000 --- a/main/src/async_bioservices/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from . import async_bioservices -from . import input_database -from . import output_database -from . import taxon_ids diff --git a/main/src/async_bioservices/async_bioservices.py b/main/src/async_bioservices/async_bioservices.py deleted file mode 100644 index cc326c87..00000000 --- a/main/src/async_bioservices/async_bioservices.py +++ /dev/null @@ -1,146 +0,0 @@ -import asyncio -import pandas as pd -from bioservices import BioDBNet - -try: - from input_database import InputDatabase - from output_database import OutputDatabase - from taxon_ids import TaxonIDs -except ImportError: - from .input_database import InputDatabase - from .output_database import OutputDatabase - from .taxon_ids import TaxonIDs - - -async def _async_fetch_info( - biodbnet: BioDBNet, - event_loop: asyncio.AbstractEventLoop, - semaphore: asyncio.Semaphore, - input_values: list[str], - input_db: str, - output_db: list[str], - taxon_id: int, - delay: int = 10 -) -> pd.DataFrame: - await semaphore.acquire() - conversion: pd.DataFrame = await asyncio.to_thread( - biodbnet.db2db, - input_db, - output_db, - input_values, - taxon_id - ) - - semaphore.release() - - # If the above db2db conversion didn't work, try again until it does - if not isinstance(conversion, pd.DataFrame): - # Errors will occur on a timeouut. If this happens, split our working dataset in two and try again - first_set: list[str] = input_values[:len(input_values) // 2] - second_set: list[str] = input_values[len(input_values) // 2:] - - await asyncio.sleep(delay) - first_conversion: pd.DataFrame = await _async_fetch_info( - biodbnet, event_loop, semaphore, - first_set, input_db, output_db, - taxon_id, delay - ) - second_conversion: pd.DataFrame = await _async_fetch_info( - biodbnet, event_loop, semaphore, - second_set, input_db, output_db, - taxon_id, delay, - ) - return pd.concat([first_conversion, second_conversion]) - - return conversion - - -async def _fetch_gene_info_manager(tasks: list[asyncio.Task[pd.DataFrame]], batch_length: int) -> list[pd.DataFrame]: - results: list[pd.DataFrame] = [] - - print("Collecting genes... ", end="") - - task: asyncio.Future[pd.DataFrame] - for i, task in enumerate(asyncio.as_completed(tasks)): - results.append(await task) - print(f"\rCollecting genes... {i * batch_length} of ~{len(tasks) * batch_length} finished", end="") - - return results - - -def fetch_gene_info( - input_values: list[str], - input_db: InputDatabase, - output_db: OutputDatabase | list[OutputDatabase] = None, - taxon_id: TaxonIDs | int = TaxonIDs.HOMO_SAPIENS, - delay: int = 5 -) -> pd.DataFrame: - """ - This function returns a dataframe with important gene information for future operations in MADRID. 
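The `_async_fetch_info` worker deleted above carries the one non-obvious idea in the old implementation: bioservices returns a non-DataFrame on timeout, and the worker then waits, halves the batch, and retries each half recursively. If that pattern is ever needed outside the packaged version, it distills to a generic helper; this is a sketch, not the external package's API, and `fetch` is a hypothetical stand-in for any coroutine wrapping `BioDBNet.db2db`:

```python
import asyncio
from typing import Awaitable, Callable

import pandas as pd


async def fetch_with_split_retry(
    fetch: Callable[[list[str]], Awaitable[object]],
    values: list[str],
    delay: int = 10,
) -> pd.DataFrame:
    # A successful call yields a DataFrame; anything else signals a timeout.
    result = await fetch(values)
    if isinstance(result, pd.DataFrame):
        return result
    # Back off, then split the batch in two and retry each half concurrently.
    # Note: like the original, a single value that always fails would recurse
    # forever; a real implementation should cap the retry depth.
    await asyncio.sleep(delay)
    mid = len(values) // 2
    halves = await asyncio.gather(
        fetch_with_split_retry(fetch, values[:mid], delay),
        fetch_with_split_retry(fetch, values[mid:], delay),
    )
    return pd.concat(halves)
```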
- Fetch gene information from BioDBNet - :param input_values: A list of genes in "input_db" format - :param input_db: The input database to use (default: "Ensembl Gene ID") - :param output_db: The output format to use (default: ["Gene Symbol", "Gene ID", "Chromosomal Location"]) - :param delay: The delay in seconds to wait before trying again if bioDBnet is busy (default: 15) - :param taxon_id: The taxon ID to use (default: 9606) - :return: A dataframe with specified columns as "output_db" (Default is HUGO symbol, Entrez ID, and chromosome start and end positions) - """ - - input_db_value = input_db.value - batch_length: int = 500 - output_db_values: list[str] - if output_db is None: - output_db_values = [ - OutputDatabase.GENE_SYMBOL.value, - OutputDatabase.GENE_ID.value, - OutputDatabase.CHROMOSOMAL_LOCATION.value - ] - elif isinstance(output_db, OutputDatabase): - output_db_values = [output_db.value] - else: - output_db_values = [str(i.value) for i in output_db] - - if type(taxon_id) == TaxonIDs: - taxon_id_value = taxon_id.value - else: - taxon_id_value = taxon_id - - biodbnet = BioDBNet() - biodbnet.services.TIMEOUT = 60 - dataframe_maps: pd.DataFrame = pd.DataFrame([], columns=output_db_values) - dataframe_maps.index.name = input_db.value - - # Create a list of tasks to be awaited - event_loop = asyncio.new_event_loop() - asyncio.set_event_loop(event_loop) - async_tasks = [] - semaphore: asyncio.Semaphore = asyncio.Semaphore(15) - for i in range(0, len(input_values), batch_length): - # Define an upper range of values to take from input_values - upper_range = min(i + batch_length, len(input_values)) - task = event_loop.create_task( - _async_fetch_info( - biodbnet=biodbnet, - semaphore=semaphore, - input_values=input_values[i:upper_range], - input_db=input_db_value, - output_db=output_db_values, - taxon_id=taxon_id_value, - delay=delay, - event_loop=event_loop - ) - ) - - async_tasks.append(task) - - database_convert = event_loop.run_until_complete( - _fetch_gene_info_manager(tasks=async_tasks, batch_length=batch_length)) - event_loop.close() # Close the event loop to free resources - - # Loop over database_convert to concat them into dataframe_maps - print("") - for i, df in enumerate(database_convert): - print(f"Concatenating dataframes... {i + 1} of {len(database_convert)}" + " " * 50, end="\r") - dataframe_maps = pd.concat([dataframe_maps, df], sort=False) - print("") - return dataframe_maps diff --git a/main/src/async_bioservices/input_database.py b/main/src/async_bioservices/input_database.py deleted file mode 100644 index cc20d9de..00000000 --- a/main/src/async_bioservices/input_database.py +++ /dev/null @@ -1,71 +0,0 @@ -from enum import Enum - - -class InputDatabase(Enum): - """ - These are valid input database types for the BioDBNet API. 
- """ - AFFY_GENECHIP_ARRAY = "Affy GeneChip Array" - AFFY_ID = "Affy ID" - AFFY_TRANSCRIPT_CLUSTER_ID = "Affy Transcript Cluster ID" - AGILENT_ID = "Agilent ID" - BIOCARTA_PATHWAY_NAME = "Biocarta Pathway Name" - CODELINK_ID = "CodeLink ID" - DBSNP_ID = "dbSNP ID" - DRUGBANK_DRUG_ID = "DrugBank Drug ID" - DRUGBANK_DRUG_NAME = "DrugBank Drug Name" - EC_NUMBER = "EC Number" - ENSEMBL_GENE_ID = "Ensembl Gene ID" - ENSEMBL_PROTEIN_ID = "Ensembl Protein ID" - ENSEMBL_TRANSCRIPT_ID = "Ensembl Transcript ID" - EST_ACCESSION = "EST Accession" - FLYBASE_GENE_ID = "FlyBase Gene ID" - GENBANK_NUCLEOTIDE_ACCESSION = "GenBank Nucleotide Accession" - GENBANK_PROTEIN_ACCESSION = "GenBank Protein Accession" - GENE_ID = "Gene ID" - GENE_SYMBOL = "Gene Symbol" - GENE_SYMBOL_AND_SYNONYMS = "Gene Symbol and Synonyms" - GENE_SYMBOL_ORDERED_LOCUS = "Gene Symbol Ordered Locus" - GENE_SYMBOL_ORF = "Gene Symbol ORF" - GI_NUMBER = "GI Number" - GO_ID = "GO ID" - GSEA_STANDARD_NAME = "GSEA Standard Name" - H_INV_LOCUS_ID = "H-Inv Locus ID" - H_INV_PROTEIN_ID = "H-Inv Protein ID" - H_INV_TRANSCRIPT_ID = "H-Inv Transcript ID" - HGNC_ID = "HGNC ID" - HMDB_METABOLITE = "HMDB Metabolite" - HOMOLOGENE_ID = "HomoloGene ID" - ILLUMINA_ID = "Illumina ID" - INTERPRO_ID = "InterPro ID" - IPI_ID = "IPI ID" - KEGG_COMPOUND_ID = "KEGG Compound ID" - KEGG_COMPOUND_NAME = "KEGG Compound Name" - KEGG_DISEASE_ID = "KEGG Disease ID" - KEGG_DRUG_ID = "KEGG Drug ID" - KEGG_DRUG_NAME = "KEGG Drug Name" - KEGG_GENE_ID = "KEGG Gene ID" - KEGG_PATHWAY_ID = "KEGG Pathway ID" - MAIZEGDB_ID = "MaizeGDB ID" - MGI_ID = "MGI ID" - MIM_ID = "MIM ID" - MIRBASE_ID = "miRBase ID" - MIRBASE_MATURE_MIRNA_ACC = "miRBase Mature miRNA Acc" - NCIPID_PATHWAY_NAME = "NCIPID Pathway Name" - ORGANISM_SCIENTIFIC_NAME = "Organism Scientific Name" - PDB_ID = "PDB ID" - PFAM_ID = "Pfam ID" - PHARMGKB_ID = "PharmGKB ID" - PUBCHEM_ID = "PubChem ID" - REACTOME_PATHWAY_NAME = "Reactome Pathway Name" - REFSEQ_GENOMIC_ACCESSION = "RefSeq Genomic Accession" - REFSEQ_MRNA_ACCESSION = "RefSeq mRNA Accession" - REFSEQ_PROTEIN_ACCESSION = "RefSeq Protein Accession" - SGD_ID = "SGD ID" - TAIR_ID = "TAIR ID" - TAXON_ID = "Taxon ID" - UNIGENE_ID = "UniGene ID" - UNIPROT_ACCESSION = "UniProt Accession" - UNIPROT_ENTRY_NAME = "UniProt Entry Name" - UNIPROT_PROTEIN_NAME = "UniProt Protein Name" - UNISTS_ID = "UniSTS ID" diff --git a/main/src/async_bioservices/output_database.py b/main/src/async_bioservices/output_database.py deleted file mode 100644 index e9091f34..00000000 --- a/main/src/async_bioservices/output_database.py +++ /dev/null @@ -1,158 +0,0 @@ -from enum import Enum - - -class OutputDatabase(Enum): - """ - These are valid output database types for the BioDBNet API. 
- """ - AFFY_ID = "Affy ID" - AGILENT_ID = "Agilent ID" - ALLERGOME_CODE = "Allergome Code" - APIDB_CRYPTODB_ID = "ApiDB_CryptoDB ID" - BIOCARTA_PATHWAY_NAME = "Biocarta Pathway Name" - BIOCYC_ID = "BioCyc ID" - CCDS_ID = "CCDS ID" - CHROMOSOMAL_LOCATION = "Chromosomal Location" - CLEANEX_ID = "CleanEx ID" - CODELINK_ID = "CodeLink ID" - COSMIC_ID = "COSMIC ID" - CPDB_PROTEIN_INTERACTOR = "CPDB Protein Interactor" - CTD_DISEASE_INFO = "CTD Disease Info" - CTD_DISEASE_NAME = "CTD Disease Name" - CYGD_ID = "CYGD ID" - DBSNP_ID = "dbSNP ID" - DICTYBASE_ID = "dictyBase ID" - DIP_ID = "DIP ID" - DISPROT_ID = "DisProt ID" - DRUGBANK_DRUG_ID = "DrugBank Drug ID" - DRUGBANK_DRUG_INFO = "DrugBank Drug Info" - DRUGBANK_DRUG_NAME = "DrugBank Drug Name" - EC_NUMBER = "EC Number" - ECHOBASE_ID = "EchoBASE ID" - ECOGENE_ID = "EcoGene ID" - ENSEMBL_BIOTYPE = "Ensembl Biotype" - ENSEMBL_GENE_ID = "Ensembl Gene ID" - ENSEMBL_GENE_INFO = "Ensembl Gene Info" - ENSEMBL_PROTEIN_ID = "Ensembl Protein ID" - ENSEMBL_TRANSCRIPT_ID = "Ensembl Transcript ID" - FLYBASE_GENE_ID = "FlyBase Gene ID" - FLYBASE_PROTEIN_ID = "FlyBase Protein ID" - FLYBASE_TRANSCRIPT_ID = "FlyBase Transcript ID" - GAD_DISEASE_INFO = "GAD Disease Info" - GAD_DISEASE_NAME = "GAD Disease Name" - GENBANK_NUCLEOTIDE_ACCESSION = "GenBank Nucleotide Accession" - GENBANK_NUCLEOTIDE_GI = "GenBank Nucleotide GI" - GENBANK_PROTEIN_ACCESSION = "GenBank Protein Accession" - GENBANK_PROTEIN_GI = "GenBank Protein GI" - GENE_ID = "Gene ID" - GENE_INFO = "Gene Info" - GENE_SYMBOL = "Gene Symbol" - GENE_SYMBOL_AND_SYNONYMS = "Gene Symbol and Synonyms" - GENE_SYMBOL_ORDERED_LOCUS = "Gene Symbol Ordered Locus" - GENE_SYMBOL_ORF = "Gene Symbol ORF" - GENE_SYNONYMS = "Gene Synonyms" - GENEFARM_ID = "GeneFarm ID" - GO_BIOLOGICAL_PROCESS = "GO - Biological Process" - GO_CELLULAR_COMPONENT = "GO - Cellular Component" - GO_MOLECULAR_FUNCTION = "GO - Molecular Function" - GO_ID = "GO ID" - GSEA_STANDARD_NAME = "GSEA Standard Name" - H_INV_LOCUS_ID = "H-Inv Locus ID" - HAMAP_ID = "HAMAP ID" - HGNC_ID = "HGNC ID" - HMDB_METABOLITE = "HMDB Metabolite" - HOMOLOG_ALL_ENS_GENE_ID = "Homolog - All Ens Gene ID" - HOMOLOG_ALL_ENS_PROTEIN_ID = "Homolog - All Ens Protein ID" - HOMOLOG_ALL_GENE_ID = "Homolog - All Gene ID" - HOMOLOG_HUMAN_ENS_GENE_ID = "Homolog - Human Ens Gene ID" - HOMOLOG_HUMAN_ENS_PROTEIN_ID = "Homolog - Human Ens Protein ID" - HOMOLOG_HUMAN_GENE_ID = "Homolog - Human Gene ID" - HOMOLOG_MOUSE_ENS_GENE_ID = "Homolog - Mouse Ens Gene ID" - HOMOLOG_MOUSE_ENS_PROTEIN_ID = "Homolog - Mouse Ens Protein ID" - HOMOLOG_MOUSE_GENE_ID = "Homolog - Mouse Gene ID" - HOMOLOG_RAT_ENS_GENE_ID = "Homolog - Rat Ens Gene ID" - HOMOLOG_RAT_ENS_PROTEIN_ID = "Homolog - Rat Ens Protein ID" - HOMOLOG_RAT_GENE_ID = "Homolog - Rat Gene ID" - HOMOLOGENE_ID = "HomoloGene ID" - HPA_ID = "HPA ID" - HPRD_ID = "HPRD ID" - HPRD_PROTEIN_COMPLEX = "HPRD Protein Complex" - HPRD_PROTEIN_INTERACTOR = "HPRD Protein Interactor" - ILLUMINA_ID = "Illumina ID" - IMGT_GENE_DB_ID = "IMGT/GENE-DB ID" - INTERPRO_ID = "InterPro ID" - IPI_ID = "IPI ID" - KEGG_DISEASE_ID = "KEGG Disease ID" - KEGG_GENE_ID = "KEGG Gene ID" - KEGG_ORTHOLOGY_ID = "KEGG Orthology ID" - KEGG_PATHWAY_ID = "KEGG Pathway ID" - KEGG_PATHWAY_INFO = "KEGG Pathway Info" - KEGG_PATHWAY_TITLE = "KEGG Pathway Title" - LEGIOLIST_ID = "LegioList ID" - LEPROMA_ID = "Leproma ID" - LOCUS_TAG = "Locus Tag" - MAIZEGDB_ID = "MaizeGDB ID" - MEROPS_ID = "MEROPS ID" - MGC_ZGC_XGC_ID = "MGC(ZGC/XGC) ID" - MGC_ZGC_XGC_IMAGE_ID = "MGC(ZGC/XGC) Image 
ID" - MGC_ZGC_XGC_INFO = "MGC(ZGC/XGC) Info" - MGI_ID = "MGI ID" - MIM_ID = "MIM ID" - MIM_INFO = "MIM Info" - MIRBASE_ID = "miRBase ID" - NCIPID_PATHWAY_NAME = "NCIPID Pathway Name" - NCIPID_PROTEIN_COMPLEX = "NCIPID Protein Complex" - NCIPID_PROTEIN_INTERACTOR = "NCIPID Protein Interactor" - NCIPID_PTM = "NCIPID PTM" - ORPHANET_ID = "Orphanet ID" - PANTHER_ID = "PANTHER ID" - PARALOG_ENS_GENE_ID = "Paralog - Ens Gene ID" - PBR_ID = "PBR ID" - PDB_ID = "PDB ID" - PEROXIBASE_ID = "PeroxiBase ID" - PFAM_ID = "Pfam ID" - PHARMGKB_DRUG_INFO = "PharmGKB Drug Info" - PHARMGKB_GENE_ID = "PharmGKB Gene ID" - PIR_ID = "PIR ID" - PIRSF_ID = "PIRSF ID" - PPTASEDB_ID = "PptaseDB ID" - PRINTS_ID = "PRINTS ID" - PRODOM_ID = "ProDom ID" - PROSITE_ID = "PROSITE ID" - PSEUDOCAP_ID = "PseudoCAP ID" - PUBMED_ID = "PubMed ID" - REACTOME_ID = "Reactome ID" - REACTOME_PATHWAY_NAME = "Reactome Pathway Name" - REBASE_ID = "REBASE ID" - REFSEQ_GENOMIC_ACCESSION = "RefSeq Genomic Accession" - REFSEQ_GENOMIC_GI = "RefSeq Genomic GI" - REFSEQ_MRNA_ACCESSION = "RefSeq mRNA Accession" - REFSEQ_NCRNA_ACCESSION = "RefSeq ncRNA Accession" - REFSEQ_NUCLEOTIDE_GI = "RefSeq Nucleotide GI" - REFSEQ_PROTEIN_ACCESSION = "RefSeq Protein Accession" - REFSEQ_PROTEIN_GI = "RefSeq Protein GI" - RFAM_ID = "Rfam ID" - RGD_ID = "RGD ID" - SGD_ID = "SGD ID" - SMART_ID = "SMART ID" - STRING_PROTEIN_INTERACTOR = "STRING Protein Interactor" - TAIR_ID = "TAIR ID" - TAXON_ID = "Taxon ID" - TCDB_ID = "TCDB ID" - TIGRFAMS_ID = "TIGRFAMs ID" - TUBERCULIST_ID = "TubercuList ID" - UCSC_ID = "UCSC ID" - UNIGENE_ID = "UniGene ID" - UNIPROT_ACCESSION = "UniProt Accession" - UNIPROT_ENTRY_NAME = "UniProt Entry Name" - UNIPROT_INFO = "UniProt Info" - UNIPROT_PROTEIN_NAME = "UniProt Protein Name" - UNISTS_ID = "UniSTS ID" - VECTORBASE_GENE_ID = "VectorBase Gene ID" - VEGA_GENE_ID = "VEGA Gene ID" - VEGA_PROTEIN_ID = "VEGA Protein ID" - VEGA_TRANSCRIPT_ID = "VEGA Transcript ID" - WORMBASE_GENE_ID = "WormBase Gene ID" - WORMPEP_PROTEIN_ID = "WormPep Protein ID" - XENBASE_GENE_ID = "XenBase Gene ID" - ZFIN_ID = "ZFIN ID" diff --git a/main/src/async_bioservices/taxon_ids.py b/main/src/async_bioservices/taxon_ids.py deleted file mode 100644 index f05fbbf0..00000000 --- a/main/src/async_bioservices/taxon_ids.py +++ /dev/null @@ -1,9 +0,0 @@ -from enum import Enum - - -class TaxonIDs(Enum): - """ - The taxon IDs are the same as the taxon IDs used in BioDBNet. 
- """ - HOMO_SAPIENS = 9606 - MUS_MUSCULUS = 10090 diff --git a/main/src/disease_analysis.py b/main/src/disease_analysis.py index 6b199dc3..7d422a81 100644 --- a/main/src/disease_analysis.py +++ b/main/src/disease_analysis.py @@ -1,4 +1,5 @@ #!/usr/bin/python3 +import asyncio import argparse import sys import json @@ -9,9 +10,7 @@ from rpy2.robjects import pandas2ri import rpy2.robjects as ro -from async_bioservices import async_bioservices -from async_bioservices.input_database import InputDatabase -from async_bioservices.output_database import OutputDatabase +from async_bioservices import db2db, InputDatabase, OutputDatabase from rpy2.robjects.packages import importr from pathlib import Path import os @@ -129,7 +128,7 @@ def pharse_configs(config_filepath, sheet): return df, df_target -def get_microarray_diff_gene_exp(config_filepath, disease_name, target_path, taxon_id): +async def get_microarray_diff_gene_exp(config_filepath, disease_name, target_path, taxon_id): """ Get differential gene expression for microarray data @@ -177,7 +176,7 @@ def get_microarray_diff_gene_exp(config_filepath, disease_name, target_path, tax elif inst_name == "agilent": input_db: InputDatabase = InputDatabase.AGILENT_ID - bdnet = async_bioservices.fetch_gene_info( + bdnet = await db2db( input_values=list(map(str, diff_exp_df["Affy ID"].tolist())), input_db=input_db, output_db=[OutputDatabase.ENSEMBL_GENE_ID, OutputDatabase.GENE_ID, OutputDatabase.GENE_SYMBOL], @@ -192,7 +191,7 @@ def get_microarray_diff_gene_exp(config_filepath, disease_name, target_path, tax return diff_exp_df, gse_id -def get_rnaseq_diff_gene_exp(config_filepath, disease_name, context_name, taxon_id): +async def get_rnaseq_diff_gene_exp(config_filepath, disease_name, context_name, taxon_id): """ Get differential gene expression for RNA-seq data @@ -221,7 +220,7 @@ def get_rnaseq_diff_gene_exp(config_filepath, disease_name, context_name, taxon_ diff_exp_df = ro.conversion.rpy2py(diff_exp_df) gse_id = "rnaseq" - bdnet = async_bioservices.fetch_gene_info( + bdnet = await db2db( input_values=list(map(str, diff_exp_df["Ensembl"].tolist())), input_db=InputDatabase.ENSEMBL_GENE_ID, output_db=[OutputDatabase.GENE_ID, OutputDatabase.AFFY_ID, OutputDatabase.GENE_SYMBOL], @@ -314,7 +313,7 @@ def write_outputs(diff_exp_df, gse_id, context_name, disease_name, data_source, os.remove(target_path) -def main(argv): +async def main(argv): targetfile = "targets.txt" count_matrix = None @@ -401,9 +400,10 @@ def main(argv): for disease_name in sheet_names: target_path = os.path.join(configs.rootdir, "data", targetfile) if data_source == "MICROARRAY": - diff_exp_df, gse_id = get_microarray_diff_gene_exp(config_filepath, disease_name, target_path, taxon_id) + diff_exp_df, gse_id = await get_microarray_diff_gene_exp(config_filepath, disease_name, target_path, + taxon_id) elif data_source == "RNASEQ": - diff_exp_df, gse_id = get_rnaseq_diff_gene_exp(config_filepath, disease_name, context_name, taxon_id) + diff_exp_df, gse_id = await get_rnaseq_diff_gene_exp(config_filepath, disease_name, context_name, taxon_id) else: print("data_source should be either 'microarray' or 'rnaseq'") print("Refer to example config file for either type for formatting") @@ -413,4 +413,4 @@ def main(argv): if __name__ == "__main__": - main(sys.argv[1:]) + asyncio.run(main(sys.argv[1:])) diff --git a/main/src/knock_out_simulation.py b/main/src/knock_out_simulation.py index 5ee02be7..e68ced74 100644 --- a/main/src/knock_out_simulation.py +++ b/main/src/knock_out_simulation.py @@ -3,19 +3,18 
@@ import re import sys import cobra +import asyncio import argparse import numpy as np import pandas as pd +from typing import Union from pathlib import Path import multiprocessing.pool import multiprocessing as mp -from multiprocessing.sharedctypes import Synchronized from project import configs from utilities import suppress_stdout -from async_bioservices import async_bioservices -from async_bioservices.input_database import InputDatabase -from async_bioservices.output_database import OutputDatabase +from async_bioservices import db2db, InputDatabase, OutputDatabase def _perform_knockout( @@ -54,9 +53,9 @@ def initialize_pool(synchronizer): def knock_out_simulation( model: cobra.Model, - inhibitors_filepath: str | Path, + inhibitors_filepath: Union[str, Path], drug_db: pd.DataFrame, - reference_flux_filepath: str | Path | None, + reference_flux_filepath: Union[str, Path, None], test_all: bool, pars_flag: bool ): @@ -299,7 +298,7 @@ def load_Inhi_Fratio(filepath): return temp2 -def repurposing_hub_preproc(drug_file): +async def repurposing_hub_preproc(drug_file): drug_db = pd.read_csv(drug_file, sep="\t") drug_db_new = pd.DataFrame() for index, row in drug_db.iterrows(): @@ -320,7 +319,7 @@ def repurposing_hub_preproc(drug_file): ) drug_db_new.reset_index(inplace=True) - entrez_ids = async_bioservices.fetch_gene_info( + entrez_ids = await db2db( input_values=drug_db_new["Target"].tolist(), input_db=InputDatabase.GENE_SYMBOL, output_db=OutputDatabase.GENE_ID @@ -333,10 +332,10 @@ def repurposing_hub_preproc(drug_file): return drug_db_new -def drug_repurposing(drug_db, d_score): +async def drug_repurposing(drug_db, d_score): d_score["Gene"] = d_score["Gene"].astype(str) - d_score_gene_sym = async_bioservices.fetch_gene_info( + d_score_gene_sym = await db2db( input_values=d_score["Gene"].tolist(), input_db=InputDatabase.GENE_ID, output_db=[OutputDatabase.GENE_SYMBOL] @@ -361,7 +360,7 @@ def drug_repurposing(drug_db, d_score): return d_score_trim -def main(argv): +async def main(argv): parser = argparse.ArgumentParser( prog="knock_out_simulation.py", description="This script is responsible for mapping drug targets in metabolic models, performing knock out simulations, and comparing simulation results with disease genes. 
It also identified drug targets and repurposable drugs.", @@ -488,7 +487,7 @@ def main(argv): reformatted_drug_file = os.path.join(configs.datadir, "Repurposing_Hub_Preproc.tsv") if not os.path.isfile(reformatted_drug_file): print("Preprocessing raw Repurposing Hub DB file...") - drug_db = repurposing_hub_preproc(raw_drug_filepath) + drug_db = await repurposing_hub_preproc(raw_drug_filepath) drug_db.to_csv(reformatted_drug_file, index=False, sep="\t") print(f"Preprocessed Repurposing Hub tsv file written to:\n{reformatted_drug_file}") else: @@ -540,7 +539,7 @@ def main(argv): pertubation_effect_score.reset_index(drop=False, inplace=True) # last step: output drugs based on d score - drug_score = drug_repurposing(drug_db, pertubation_effect_score) + drug_score = await drug_repurposing(drug_db, pertubation_effect_score) drug_score_file = os.path.join(output_dir, f"{context}_drug_score.csv") drug_score.to_csv(drug_score_file, index=False) print("Gene D score mapped to repurposing drugs saved to\n{}".format(drug_score_file)) @@ -549,4 +548,4 @@ def main(argv): if __name__ == "__main__": - main(sys.argv[1:]) + asyncio.run(main(sys.argv[1:])) diff --git a/main/src/merge_xomics.py b/main/src/merge_xomics.py index 5f356620..1d70d81c 100644 --- a/main/src/merge_xomics.py +++ b/main/src/merge_xomics.py @@ -4,6 +4,7 @@ import re import sys import json +import asyncio import argparse import pandas as pd from pathlib import Path @@ -15,7 +16,7 @@ import proteomics_gen from project import configs from utilities import split_gene_expression_data -from async_bioservices import async_bioservices +from async_bioservices import db2db, InputDatabase, OutputDatabase # enable r to py conversion # pandas2ri.activate() @@ -56,7 +57,7 @@ class _HighExpressionHeaderNames: SCRNASEQ = f"{_MergedHeaderNames.SCRNASEQ}_high" -def get_transcriptmoic_details(merged_df: pd.DataFrame) -> pd.DataFrame: +async def get_transcriptmoic_details(merged_df: pd.DataFrame) -> pd.DataFrame: """ This function will get the following details of transcriptomic data: - Gene Symbol @@ -105,13 +106,13 @@ def get_transcriptmoic_details(merged_df: pd.DataFrame) -> pd.DataFrame: else: transcriptomic_df: pd.DataFrame = merged_df.copy() - gene_details: pd.DataFrame = async_bioservices.fetch_gene_info( + gene_details: pd.DataFrame = await db2db( input_values=transcriptomic_df.index.astype(str).values.tolist(), - input_db=async_bioservices.InputDatabase.GENE_ID, + input_db=InputDatabase.GENE_ID, output_db=[ - async_bioservices.OutputDatabase.GENE_SYMBOL, - async_bioservices.OutputDatabase.ENSEMBL_GENE_INFO, - async_bioservices.OutputDatabase.GENE_INFO, + OutputDatabase.GENE_SYMBOL, + OutputDatabase.ENSEMBL_GENE_INFO, + OutputDatabase.GENE_INFO, ] ) gene_details["entrez_gene_id"] = gene_details.index @@ -158,7 +159,7 @@ def get_transcriptmoic_details(merged_df: pd.DataFrame) -> pd.DataFrame: return gene_details -def merge_xomics( +async def merge_xomics( context_name: str, expression_requirement, microarray_file=None, @@ -320,7 +321,7 @@ def merge_xomics( split_entrez.to_csv(filepath, index_label="ENTREZ_GENE_ID") files_dict[context_name] = filepath - transcriptomic_details = get_transcriptmoic_details(merge_data) + transcriptomic_details = await get_transcriptmoic_details(merge_data) transcriptomic_details_directory_path = os.path.dirname(filepath) transcriptomic_details_filepath = os.path.join(transcriptomic_details_directory_path, f"TranscriptomicDetails_{context_name}.csv") @@ -331,7 +332,7 @@ def merge_xomics( return files_dict -def 
handle_context_batch( +async def handle_context_batch( microarray_file, trnaseq_file, mrnaseq_file, @@ -426,7 +427,7 @@ def handle_context_batch( "Will be force changed to 1 to prevent output from having 0 active genes. ") exp_req = 1 - files_dict = merge_xomics( + files_dict = await merge_xomics( context_name, expression_requirement=exp_req, microarray_file=microarray_file, @@ -446,7 +447,7 @@ def handle_context_batch( return -def main(argv): +async def main(argv): """ Merge expression tables of multiple sources, microarray, RNA-seq, and/or proteomics into one list User can specify the number of sources with an active gene in order for it to be considered active in the model. @@ -689,7 +690,7 @@ def main(argv): ) sys.exit(1) - handle_context_batch( + await handle_context_batch( microarray_file, trnaseq_file, mrnaseq_file, @@ -712,7 +713,7 @@ def main(argv): if __name__ == "__main__": - main(sys.argv[1:]) + asyncio.run(main(sys.argv[1:])) # df = pd.read_csv("/Users/joshl/PycharmProjects/MADRID/main/data/results/naiveB/merged_naiveB.csv") # x = get_transcriptmoic_details(df) # print("DONE") diff --git a/main/src/rnaseq_preprocess.py b/main/src/rnaseq_preprocess.py index 37f572f7..846c5c26 100644 --- a/main/src/rnaseq_preprocess.py +++ b/main/src/rnaseq_preprocess.py @@ -2,25 +2,23 @@ import pandas as pd -from project import configs import re import os import sys -import argparse -from rpy2.robjects.packages import importr import glob +import asyncio +import argparse import numpy as np from pathlib import Path -from async_bioservices import async_bioservices -from async_bioservices.output_database import OutputDatabase -from async_bioservices.input_database import InputDatabase -from async_bioservices.taxon_ids import TaxonIDs import rpy2_api import utilities +from project import configs +from async_bioservices import db2db, InputDatabase, OutputDatabase, TaxonID r_file_path: Path = Path(configs.rootdir, "src", "rscripts", "generate_counts_matrix.R") + def create_counts_matrix(context_name): """ Create a counts matrix by reading gene counts tables in COMO_input///geneCounts/ @@ -30,7 +28,7 @@ def create_counts_matrix(context_name): print(f"Looking for STAR gene count tables in '{input_dir}'") matrix_output_dir = os.path.join(configs.rootdir, 'data', 'data_matrices', context_name) print(f"Creating Counts Matrix for '{context_name}'") - + # call generate_counts_matrix.R to create count matrix from COMO_input folder rpy2_hook = rpy2_api.Rpy2(r_file_path=r_file_path, data_dir=input_dir, out_dir=matrix_output_dir) rpy2_hook.call_function("generate_counts_matrix_main") @@ -44,26 +42,26 @@ def create_config_df(context_name): """ gene_counts_glob = os.path.join(configs.rootdir, "data", "COMO_input", context_name, "geneCounts", "*", "*.tab") gene_counts_files = glob.glob(gene_counts_glob, recursive=True) - + out_df = pd.DataFrame(columns=['SampleName', 'FragmentLength', 'Layout', 'Strand', 'Group']) - + for gcfilename in gene_counts_files: try: # Match S___R___r___ # \d{1,3} matches 1-3 digits # (r\d{1,3})? matches an option "r" followed by three digits label = re.findall(r"S\d{1,3}R\d{1,3}(?:r\d{1,3})?", gcfilename)[0] - + except IndexError: print(f"\nfilename of {gcfilename} is not valid. Should be 'contextName_SXRYrZ.tab', where X is the " "study/batch number, Y is the replicate number, and Z is the run number. 
If not a multi-run sample, " "exclude 'rZ' from the filename.") sys.exit() - + study_number = re.findall(r"S\d{1,3}", label)[0] rep_number = re.findall(r"R\d{1,3}", label)[0] run = re.findall(r"r\d{1,3}", label) - + multi_flag = 0 if len(run) > 0: if run[0] != "r1": @@ -73,30 +71,30 @@ def create_config_df(context_name): runs = [run for run in gene_counts_files if re.search(label_glob, run)] multi_flag = 1 frag_files = [] - + for r in runs: r_label = re.findall(r"r\d{1,3}", r)[0] R_label = re.findall(r"R\d{1,3}", r)[0] frag_filename = "".join([context_name, "_", study_number, R_label, r_label, "_fragment_size.txt"]) frag_files.append(os.path.join(configs.rootdir, "data", "COMO_input", context_name, "fragmentSizes", study_number, frag_filename)) - + layout_file = context_name + "_" + label + "_layout.txt" strand_file = context_name + "_" + label + "_strandedness.txt" frag_file = context_name + "_" + label + "_fragment_size.txt" prep_file = context_name + "_" + label + "_prep_method.txt" - + context_path = os.path.join(configs.rootdir, "data", "COMO_input", context_name) layout_path = os.path.join(context_path, "layouts", "*", layout_file) strand_path = os.path.join(context_path, "strandedness", "*", strand_file) frag_path = os.path.join(context_path, "fragmentSizes", "*", frag_file) prep_path = os.path.join(context_path, "prepMethods", "*", prep_file) - + layout_glob = glob.glob(layout_path, recursive=False) strand_glob = glob.glob(strand_path, recursive=False) frag_glob = glob.glob(frag_path, recursive=False) prep_glob = glob.glob(prep_path, recursive=False) - + # Get layout if len(layout_glob) < 1: print(f"\nNo layout file found for {label}, writing as 'UNKNOWN', this should be defined by user if using " @@ -109,7 +107,7 @@ def create_config_df(context_name): else: with open(layout_glob[0]) as file: layout = file.read().strip() - + # Get strandedness if len(strand_glob) < 1: print(f"\nNo strandedness file found for {label}, writing as 'UNKNOWN' This will not interfere with the " @@ -123,7 +121,7 @@ def create_config_df(context_name): else: with open(strand_glob[0]) as file: strand = file.read().strip() - + # Get preparation method if len(prep_glob) < 1: print(f"No prep file found for {label}, assuming 'total' as in Total RNA library preparation") @@ -138,7 +136,7 @@ def create_config_df(context_name): if prep not in ["total", "mrna"]: print("prep_method.txt must have either 'mrna' or 'total', or be absent to assume 'total'.") sys.exit() - + # Get fragment length if len(frag_glob) < 1: print(f"\nNo fragment file found for {label}, writing as 'UNKNOWN' This must be defined by the user in " @@ -150,7 +148,7 @@ def create_config_df(context_name): "replicate in COMO_input") sys.exit() else: - + if layout == "single-end": mean_fragment_size = 0 else: @@ -158,7 +156,7 @@ def create_config_df(context_name): frag_df = pd.read_table(frag_glob[0], low_memory=False) frag_df['meanxcount'] = frag_df['frag_mean'] * frag_df['frag_count'] mean_fragment_size = sum(frag_df['meanxcount'] / sum(frag_df['frag_count'])) - + else: mean_fragment_sizes = np.array([]) library_sizes = np.array([]) @@ -168,22 +166,22 @@ def create_config_df(context_name): mean_fragment_size = sum(frag_df['meanxcount'] / sum(frag_df['frag_count'])) mean_fragment_sizes = np.append(mean_fragment_sizes, mean_fragment_size) library_sizes = np.append(library_sizes, sum(frag_df['frag_count'])) - + mean_fragment_size = sum(mean_fragment_sizes * library_sizes) / sum(library_sizes) - + # label = "_".join([context_name, 
re.findall(r"S[1-9][0-9]?[0-9]?R[1-9][0-9]?[0-9]?", label)[0]]) # remove run number if there label = f"{context_name}_{study_number}{rep_number}" - + new_row = pd.DataFrame({'SampleName': [label], 'FragmentLength': [mean_fragment_size], 'Layout': [layout], 'Strand': [strand], 'Group': [study_number], 'LibraryPrep': [prep]}) - + out_df = pd.concat([out_df, new_row], sort=True) out_df.sort_values('SampleName', inplace=True) - + return out_df @@ -193,7 +191,7 @@ def split_config_df(df): """ df_t = df[df['LibraryPrep'] == "total"] df_m = df[df['LibraryPrep'] == "mrna"] - + return df_t, df_m @@ -204,16 +202,16 @@ def split_counts_matrices(count_matrix_all, df_total, df_mrna): matrix_all = pd.read_csv(count_matrix_all) matrix_total = matrix_all[["genes"] + [n for n in matrix_all.columns if n in df_total["SampleName"].tolist()]] matrix_mrna = matrix_all[["genes"] + [n for n in matrix_all.columns if n in df_mrna["SampleName"].tolist()]] - + return matrix_total, matrix_mrna -def create_gene_info_file(matrix_file_list: list[str], form: InputDatabase, taxon_id): +async def create_gene_info_file(matrix_file_list: list[str], form: InputDatabase, taxon_id): """ Create gene info file for specified context by reading first column in its count matrix file at /work/data/results//gene_info_.csv """ - + print(f"Fetching gene info") gene_info_file = os.path.join(configs.datadir, "gene_info.csv") if os.path.exists(gene_info_file): @@ -221,11 +219,11 @@ def create_gene_info_file(matrix_file_list: list[str], form: InputDatabase, taxo genes = current_df["ensembl_gene_id"].tolist() else: genes = [] - + for mfile in matrix_file_list: add_genes = pd.read_csv(mfile)["genes"].tolist() genes = list(set(genes + add_genes)) - + # Create our output database format # Do not include values equal to "form" # Note: This is a refactor; I'm not sure why we are removing values not equal to "form" @@ -238,14 +236,14 @@ def create_gene_info_file(matrix_file_list: list[str], form: InputDatabase, taxo ] if i.value != form.value ] - - gene_info = async_bioservices.fetch_gene_info( + + gene_info = await db2db( input_values=genes, input_db=form, output_db=output_db, taxon_id=taxon_id ) - + gene_info['start_position'] = gene_info['Chromosomal Location'].str.extract("chr_start: (\d+)") gene_info['end_position'] = gene_info['Chromosomal Location'].str.extract("chr_end: (\d+)") gene_info.index.rename("ensembl_gene_id", inplace=True) @@ -255,18 +253,18 @@ def create_gene_info_file(matrix_file_list: list[str], form: InputDatabase, taxo print(f"Gene Info file written at '{gene_info_file}'") -def handle_context_batch(context_names, mode, form: InputDatabase, taxon_id, provided_matrix_file): +async def handle_context_batch(context_names, mode, form: InputDatabase, taxon_id, provided_matrix_file): """ Handle iteration through each context type and create files according to flag used (config, matrix, info) """ trnaseq_config_filename = os.path.join(configs.rootdir, "data", "config_sheets", "trnaseq_data_inputs_auto.xlsx") mrnaseq_config_filename = os.path.join(configs.rootdir, "data", "config_sheets", "mrnaseq_data_inputs_auto.xlsx") - + tflag = False # turn on when any total set is found to prevent writer from being init multiple times or empty mflag = False # turn on when any mrna set is found to prevent writer from being init multiple times or empty - + print(f"Found {len(context_names)} contexts to process: {', '.join(context_names)}") - + tmatrix_files = [] mmatrix_files = [] for context_name in context_names: @@ -277,51 +275,51 @@ def 
handle_context_batch(context_names, mode, form: InputDatabase, taxon_id, pro os.makedirs(gene_output_dir, exist_ok=True) os.makedirs(matrix_output_dir, exist_ok=True) print('Gene info output directory is "{}"'.format(gene_output_dir)) - + matrix_path_all = os.path.join(matrix_output_dir, ("gene_counts_matrix_full_" + context_name + ".csv")) matrix_path_total = os.path.join(matrix_output_dir, ("gene_counts_matrix_total_" + context_name + ".csv")) matrix_path_mrna = os.path.join(matrix_output_dir, ("gene_counts_matrix_mrna_" + context_name + ".csv")) matrix_path_prov = os.path.join(matrix_output_dir, provided_matrix_file) - + if mode == "make": create_counts_matrix(context_name) # TODO: warn user or remove samples that are all 0 to prevent density plot error in zFPKM df = create_config_df(context_name) df_t, df_m = split_config_df(df) - + if not df_t.empty: if not tflag: tflag = True twriter = pd.ExcelWriter(trnaseq_config_filename) - + tmatrix_files.append(matrix_path_total) df_t.to_excel(twriter, sheet_name=context_name, header=True, index=False) - + if not df_m.empty: if not mflag: mflag = True mwriter = pd.ExcelWriter(mrnaseq_config_filename) - + mmatrix_files.append(matrix_path_mrna) df_m.to_excel(mwriter, sheet_name=context_name, header=True, index=False) - + tmatrix, mmatrix = split_counts_matrices(matrix_path_all, df_t, df_m) if len(tmatrix.columns) > 1: tmatrix.to_csv(matrix_path_total, header=True, index=False) if len(mmatrix.columns) > 1: mmatrix.to_csv(matrix_path_mrna, header=True, index=False) - + if mode == "make": if tflag: twriter.close() if mflag: mwriter.close() - - create_gene_info_file(tmatrix_files + mmatrix_files, form, taxon_id) - + + await create_gene_info_file(tmatrix_files + mmatrix_files, form, taxon_id) + else: matrix_files: list[str] = utilities.stringlist_to_list(matrix_path_prov) - create_gene_info_file(matrix_files, form, taxon_id) + await create_gene_info_file(matrix_files, form, taxon_id) def parse_args(argv): @@ -348,7 +346,7 @@ def parse_args(argv): For additional help, please post questions/issues in the MADRID GitHub repo at https://github.com/HelikarLab/MADRID or email babessell@gmail.com""", ) - + parser.add_argument("-n", "--context-names", type=str, nargs="+", @@ -360,7 +358,7 @@ def parse_args(argv): the count matrix as an imported .csv file. If making multiple models in a batch, then use the format: "context1 context2 context3". """ ) - + parser.add_argument("-f", "--gene-format", type=str, required=False, @@ -368,16 +366,16 @@ def parse_args(argv): dest="gene_format", help="Format of Genes, accepts 'Ensembl', 'Entrez', or'HGNC symbol'" ) - + parser.add_argument("-i", "--taxon-id", required=False, default=9606, dest="taxon_id", help="BioDbNet taxon ID number, also accepts 'human', or 'mouse'" ) - + group = parser.add_mutually_exclusive_group(required=True) - + group.add_argument("-p", "--provide-matrix", action="store_true", required=False, @@ -387,7 +385,7 @@ def parse_args(argv): "where colnames are sample names (in contextName_SXRY format) and rownames are genes in " "in format specified by --gene-format" ) # would be nice if this was a directory full matrices in case you want to do in batches - + group.add_argument('-y', "--create-matrix", action="store_true", required=False, @@ -397,7 +395,7 @@ def parse_args(argv): "Requires a correctly structured COMO_input folder in /work/data/. 
Can make one using: " "https://github.com/HelikarLab/FastqToGeneCounts" ) - + parser.add_argument("-m", "--matrix", required="--provide-matrix" in argv, # require if using --provide-matrix flag, dest="provided_matrix_fname", @@ -405,37 +403,37 @@ def parse_args(argv): help="Name of provided counts matrix in " "/work/data/data_matrices//.csv" ) - + args = parser.parse_args(argv) args.context_names = utilities.stringlist_to_list(args.context_names) - + return args -def main(argv): +async def main(argv): args = parse_args(argv) - + if args.gene_format.upper() in ["ENSEMBL", "ENSEMBLE", "ENSG", "ENSMUSG", "ENSEMBL ID", "ENSEMBL GENE ID"]: gene_format_database: InputDatabase = InputDatabase.ENSEMBL_GENE_ID - + elif args.gene_format.upper() in ["HGNC SYMBOL", "HUGO", "HUGO SYMBOL", "SYMBOL", "HGNC", "GENE SYMBOL"]: gene_format_database: InputDatabase = InputDatabase.GENE_SYMBOL - + elif args.gene_format.upper() in ["ENTREZ", "ENTRES", "ENTREZ ID", "ENTREZ NUMBER" "GENE ID"]: gene_format_database: InputDatabase = InputDatabase.GENE_ID - + else: # provided invalid gene format print("Gene format (--gene_format) is invalid") print("Accepts 'Ensembl', 'Entrez', and 'HGNC symbol'") print(f"You provided: {args.gene_format}") sys.exit() - + # handle species alternative ids if type(args.taxon_id) == str: if args.taxon_id.upper() == "HUMAN" or args.taxon_id.upper() == "HOMO SAPIENS": - taxon_id = TaxonIDs.HOMO_SAPIENS + taxon_id = TaxonID.HOMO_SAPIENS elif args.taxon_id.upper() == "MOUSE" or args.taxon_id.upper() == "MUS MUSCULUS": - taxon_id = TaxonIDs.MUS_MUSCULUS + taxon_id = TaxonID.MUS_MUSCULUS else: print("--taxon-id must be either an integer, or accepted string ('mouse', 'human')") sys.exit(1) @@ -444,7 +442,7 @@ def main(argv): else: print("--taxon-id must be either an integer, or accepted string ('mouse', 'human')") sys.exit(1) - + # use mutually exclusive flag to set mode which tells which files to generate if args.provide_matrix: mode = "provide" @@ -453,9 +451,9 @@ def main(argv): else: print("--provide-matrix or --create-matrix must be set") sys.exit(1) - - handle_context_batch(args.context_names, mode, gene_format_database, taxon_id, args.provided_matrix_fname) + + await handle_context_batch(args.context_names, mode, gene_format_database, taxon_id, args.provided_matrix_fname) if __name__ == "__main__": - main(sys.argv[1:]) + asyncio.run(main(sys.argv[1:])) diff --git a/main/src/tests/environment.yaml b/main/src/tests/environment.yaml index d9e32597..5fb75d7d 100644 --- a/main/src/tests/environment.yaml +++ b/main/src/tests/environment.yaml @@ -6,6 +6,7 @@ channels: - defaults dependencies: - aioftp==0.21.3 + - conda-forge::bioservices~=1.11.2 - flake8==5.0.4 - pandas==1.4.4 - python==3.10.10 @@ -18,5 +19,5 @@ dependencies: - tqdm==4.64.1 - pip==22.1.2 - pip: - - bioservices==1.10.1 - - GEOparse==2.0.3 + - async_bioservices + - GEOparse==2.0.3 diff --git a/main/src/tests/test_async_bioservices.py b/main/src/tests/test_async_bioservices.py deleted file mode 100644 index 79a6b56d..00000000 --- a/main/src/tests/test_async_bioservices.py +++ /dev/null @@ -1,73 +0,0 @@ -import os -import sys -import pandas as pd - -# Allow us to import anything we need from the /main/src directory -# From: https://stackoverflow.com/a/30536516/13885200 -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from async_bioservices.input_database import InputDatabase -from async_bioservices.output_database import OutputDatabase -from async_bioservices.taxon_ids import TaxonIDs -from 
async_bioservices.async_bioservices import fetch_gene_info - - -class TestDatabases: - """ - We've chosen a few random input databases to test. - These values should never change. - """ - def test_input_database(self): - assert InputDatabase.GENE_ID.value == "Gene ID" - assert InputDatabase.GENE_SYMBOL.value == "Gene Symbol" - assert InputDatabase.TAXON_ID.value == "Taxon ID" - assert InputDatabase.UNIGENE_ID.value == "UniGene ID" - assert InputDatabase.MIM_ID.value == "MIM ID" - assert InputDatabase.HGNC_ID.value == "HGNC ID" - assert InputDatabase.HOMOLOGENE_ID.value == "HomoloGene ID" - - def test_output_database(self): - assert OutputDatabase.CPDB_PROTEIN_INTERACTOR.value == "CPDB Protein Interactor" - assert OutputDatabase.EC_NUMBER.value == "EC Number" - assert OutputDatabase.GAD_DISEASE_INFO.value == "GAD Disease Info" - assert OutputDatabase.GO_ID.value == "GO ID" - assert OutputDatabase.HOMOLOG_HUMAN_ENS_GENE_ID.value == "Homolog - Human Ens Gene ID" - assert OutputDatabase.KEGG_PATHWAY_ID.value == "KEGG Pathway ID" - assert OutputDatabase.UNIPROT_ACCESSION.value == "UniProt Accession" - - def test_taxon_ids(self): - assert TaxonIDs.HOMO_SAPIENS.value == 9606 - assert TaxonIDs.MUS_MUSCULUS.value == 10090 - - -def test_conversion(): - """ - This test determines if the data collected/converted from async_bioservices.database_convert.fetch_gene_info is correct. - It does so by comparing the results of fetch_gene_info to a pre-collected dataframe, defined as `static_dataframe` - """ - # This is a static, pre-defined dataframe to test against - # "data" is of type Ensembl Gene IDS - # "index" is of type Gene Symbols - static_dataframe: pd.DataFrame = pd.DataFrame( - data={"Ensembl Gene ID": ["ENSG00000141510", "ENSG00000146648"]}, - index=["TP53", "EGFR"] - ) - static_dataframe.index.name = "Gene Symbol" - - input_values: list[str] = static_dataframe.index.to_list() - input_db: InputDatabase = InputDatabase.GENE_SYMBOL - output_db: list[OutputDatabase] = [OutputDatabase.ENSEMBL_GENE_ID] - - # This is the function we're testing - test_dataframe: pd.DataFrame = fetch_gene_info( - input_values=input_values, - input_db=input_db, - output_db=output_db, - taxon_id=TaxonIDs.HOMO_SAPIENS - ) - - assert static_dataframe.equals(test_dataframe) - - -if __name__ == '__main__': - test_conversion()
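With the in-repo test module deleted, an equivalent check against the packaged API could look like the sketch below. It assumes `db2db` accepts the same keyword arguments used throughout this diff (`input_values`, `input_db`, `output_db`, `taxon_id`) and reuses the deleted test's fixture; note it hits the live BioDBNet service, so it is an integration test rather than a unit test:

```python
import asyncio

import pandas as pd

from async_bioservices import db2db, InputDatabase, OutputDatabase, TaxonID


def test_db2db_gene_symbol_to_ensembl():
    # Same fixture the deleted test used: two well-known human genes.
    expected = pd.DataFrame(
        data={"Ensembl Gene ID": ["ENSG00000141510", "ENSG00000146648"]},
        index=["TP53", "EGFR"],
    )
    expected.index.name = "Gene Symbol"

    # db2db is a coroutine in the external package, so drive it with asyncio.run.
    result: pd.DataFrame = asyncio.run(
        db2db(
            input_values=expected.index.to_list(),
            input_db=InputDatabase.GENE_SYMBOL,
            output_db=[OutputDatabase.ENSEMBL_GENE_ID],
            taxon_id=TaxonID.HOMO_SAPIENS,
        )
    )

    assert expected.equals(result)
```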