Use async_bioservices (#157)
* Refactor GitHub workflows for better efficiency

The commit refactors the GitHub workflows `unit_tests.yml`, `type_checking.yml`, and `container_build.yml`. The changes simplify checking for Python file changes by leveraging GitHub's `paths` filters for `pull_request` events, which results in cleaner and more maintainable workflow files. In addition, the `workflow_dispatch` event was removed from the `container_build.yml` workflow; it is unnecessary because the build is already triggered by push events.

* Remove the testing limit on Python files; it collided with the test-pass requirement

* Update README.md

* COMO will now use the `async_bioservices` package instead of maintaining its own copy

* Optimize code by migrating from the local `async_bioservices` module to the package's `db2db` function for async operations

Replaced all calls to the old `async_bioservices.fetch_gene_info` method with `db2db` in every source file in the main directory to improve asynchronous database operations. This update improves performance and reduces complexity by routing all asynchronous lookups through a single function (`db2db`), making the code easier to manage and maintain. Asynchronous calls are handled with async/await syntax, improving readability and maintainability (see the usage sketch after this list).

* Update tests environment.yaml
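
For readers new to the package, here is a minimal sketch of the call pattern this commit adopts. It assumes only what the diff below shows: the `async_bioservices` package exposes `db2db`, `InputDatabase`, and `OutputDatabase`, and `db2db` is awaited. The `map_agilent_probes` helper and the probe IDs are hypothetical placeholders, not part of the commit.

```python
# Minimal sketch of the db2db call pattern adopted in this commit.
# The helper name and the probe IDs below are hypothetical placeholders.
import asyncio

from async_bioservices import db2db, InputDatabase, OutputDatabase


async def map_agilent_probes(probe_ids: list[str]):
    # Await db2db, mirroring the `temp = await db2db(...)` calls in
    # GSEpipelineFast.py: map Agilent probe IDs to Entrez Gene IDs.
    return await db2db(
        input_values=probe_ids,
        input_db=InputDatabase.AGILENT_ID,
        output_db=[OutputDatabase.GENE_ID],
    )


if __name__ == "__main__":
    # Drive the coroutine from synchronous code with asyncio.run
    mapping = asyncio.run(map_agilent_probes(["A_23_P100001", "A_23_P100011"]))
    print(mapping)
```

Callers that are themselves coroutines (such as the now-async `get_entrez_table_pipeline` below) would simply `await` such a helper directly instead of using `asyncio.run`.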
JoshLoecker authored Oct 17, 2023
1 parent 4f823a3 commit d2eaf58
Showing 14 changed files with 172 additions and 639 deletions.
12 changes: 10 additions & 2 deletions README.md
@@ -5,7 +5,15 @@
![GitHub Workflow Status (with event)](https://img.shields.io/github/actions/workflow/status/HelikarLab/COMO/container_build.yml?style=for-the-badge&logo=docker&logoColor=white&label=Docker%20Build)
![GitHub Workflow Status (with event)](https://img.shields.io/github/actions/workflow/status/HelikarLab/COMO/type_checking.yml?style=for-the-badge&logo=docker&logoColor=white&label=Type%20Checking)

## Docker Quick Start
# Setting up COMO

Go to COMO's documentation page for full installation and operation instructions or use one of the Quick Start options

## [COMO Documentation Page](https://helikarlab.github.io/COMO)

## Quick Start Options

### Docker Quick Start

This installation method **does** require docker

@@ -19,7 +27,7 @@ This installation method **does** require docker
> the [Gurobi solver](https://www.gurobi.com/). If you would like either (or both) of these features,
> please [visit our documentation](https://helikarlab.github.io/COMO) for more details
## Conda Quick Start
### Conda Quick Start

This installation method does **not** require docker

1 change: 1 addition & 0 deletions environment.yaml
@@ -58,6 +58,7 @@ dependencies:
- conda-forge::wget~=1.20.3
- conda-forge::xlrd~=2.0.1
- pip:
- async_bioservices
- cobra==0.26.*
- Escher==1.7.*
- framed==0.5.*
108 changes: 47 additions & 61 deletions main/src/GSEpipelineFast.py
@@ -11,23 +11,16 @@
from GSEpipeline import load_gse_soft
from instruments import AffyIO
from rpy2.robjects import pandas2ri
from async_bioservices import async_bioservices
from async_bioservices.input_database import InputDatabase
from async_bioservices.output_database import OutputDatabase

pandas2ri.activate()
# Input: Extract Gene Info from GEO DataSets

# gse = load_gse_soft(gsename)

from async_bioservices import async_bioservices
from async_bioservices.input_database import InputDatabase
from async_bioservices.output_database import OutputDatabase
from async_bioservices import db2db, InputDatabase, OutputDatabase

# Extract Platform Information


def download_gsm_id_maps(datadir, gse, gpls: list[str] = None, vendor="affy"):
async def download_gsm_id_maps(datadir, gse, gpls: list[str] = None, vendor="affy"):
"""
download ID to ENTREZ_GENE_ID maps, create a csv file for each platform, and return dictionary
:param gpls:
@@ -40,33 +33,26 @@ def download_gsm_id_maps(datadir, gse, gpls: list[str] = None, vendor="affy"):
# From: https://florimond.dev/en/posts/2018/08/python-mutable-defaults-are-the-source-of-all-evil/
if gpls is None:
gpls: list[str] = ["GPL96", "GPL97", "GPL8300"]

for gpl in gpls:
table = gse.gpls[gpl].table.copy()
if vendor.lower() == "affy":
temp = table[["ID", "ENTREZ_GENE_ID"]]

elif vendor.lower() == "agilent":
input_values = table.loc[
table["CONTROL_TYPE"] == "FALSE", "SPOT_ID"
].tolist()

"""
input_values,
input_db="Agilent ID",
output_db: list[str] = None,
delay=30,
"""

temp = async_bioservices.fetch_gene_info(

temp = await db2db(
input_values=input_values,
input_db=InputDatabase.AGILENT_ID,
output_db=[
OutputDatabase.GENE_ID.value,
OutputDatabase.ENSEMBL_GENE_ID.value
]
)

temp.drop(columns=["Ensembl Gene ID"], inplace=True)
temp.reset_index(inplace=True)
temp.rename(
@@ -76,11 +62,11 @@ def download_gsm_id_maps(datadir, gse, gpls: list[str] = None, vendor="affy"):
}, inplace=True
)
temp.replace(to_replace="-", value=np.nan, inplace=True)

else:
print("Unsupported Platform: {}".format(gpl))
continue

# Save to file
filefullpath = os.path.join(datadir, "{}entrez.csv".format(gpl.lower()))
temp.to_csv(filefullpath, index=False)
@@ -107,7 +93,7 @@ def __init__(self, gsename, querytable, rootdir="../"):
vendors = pairs["Instrument"].tolist()
self.platforms = dict(zip(gpls, vendors))
self.download_samples()

def organize_gse_raw_data(self):
"""
Organize raw data at local folder
@@ -121,7 +107,7 @@ def organize_gse_raw_data(self):
print("Path created: {}".format(platformdir))
else:
print("Path already exist: {}".format(platformdir))

# Move Corresponding Cel files to Folders
onlyfiles = [f for f in os.listdir(self.gene_dir) if f.endswith(".gz")]
cnt = 0
@@ -138,7 +124,7 @@ def organize_gse_raw_data(self):
print("Move {} to {}".format(src_path, dst_path))
cnt += 1
print("{} raw data files moved.".format(cnt))

def get_gsm_tables(self):
"""
get gsm maps in table
@@ -158,9 +144,9 @@ def get_gsm_tables(self):
df = temp[["ID", "ENTREZ_GENE_ID"]]
df.set_index("ID", inplace=True)
gsm_tables[gpl] = df

return gsm_tables

def get_gsm_platform(self):
"""
Classify the Samples by Platform
@@ -172,7 +158,7 @@ def get_gsm_tables(self):
gsm_platform = dict(zip(keys, values))

return gsm_platform

def gsms_included_by(self, df):
for gsm in self.gsm_platform.keys():
included = False
@@ -183,15 +169,15 @@ def gsms_included_by(self, df):
break
if not included:
return False

return True

def get_entrez_table_pipeline(self, fromcsv=True):
async def get_entrez_table_pipeline(self, fromcsv=True):
"""
create ENTREZ ID based table from gse
:return: pandas dataframe for table of GSE
"""

filefullpath = os.path.join(
self.gene_dir, "{}_sc500_full_table.csv".format(self.gsename)
)
@@ -209,14 +195,14 @@ def get_entrez_table_pipeline(self, fromcsv=True):
print("Need Append GSMs")
except:
print("Unable to read {}")

print("Create new table: {}".format(filefullpath))
gsm_maps = self.get_gsm_tables()

if not any(gsm_maps):
print("Not available, return empty dataframe")
return pd.DataFrame([])

# Ready Affy files from folders
gsm_tables_sc500 = {}
for key, vendor in self.platforms.items():
@@ -229,35 +215,35 @@ def get_entrez_table_pipeline(self, fromcsv=True):
outputdf = outputdf.readaffydir(platformdir)
outputdf = ro.conversion.rpy2py(outputdf)
elif vendor.lower() == "agilent":

outputdf = instruments.readagilent(platformdir, list(self.gsm_platform.keys()))

gsm_maps[key] = async_bioservices.fetch_gene_info(
gsm_maps[key] = await db2db(
input_values=list(map(str, list(outputdf["ProbeName"]))),
input_db=InputDatabase.AGILENT_ID,
output_db=[OutputDatabase.GENE_ID]
)

gsm_maps[key].rename(columns={"Gene ID": "ENTREZ_GENE_ID"}, inplace=True)
else:
print("Unsupported Platform {} and Vendor {}".format(key, vendor))
continue
else:
print("Path not exist: {}".format(platformdir))
continue

drop_idx = np.where(gsm_maps[key]["ENTREZ_GENE_ID"] == "-")[0].tolist()
outputdf.drop(outputdf.index[drop_idx], axis=0, inplace=True)
gsm_maps[key].drop(gsm_maps[key].index[drop_idx], axis=0, inplace=True)
outputdf["ENTREZ_GENE_ID"] = gsm_maps[key]["ENTREZ_GENE_ID"].to_list()
gsm_tables_sc500[key] = outputdf

# Drop rows without ENTREZ GENE ID, set index to ENTREZ
for key in self.platforms.keys():
gsm_tables_sc500[key].dropna(subset=["ENTREZ_GENE_ID"], inplace=True)
gsm_tables_sc500[key].set_index("ENTREZ_GENE_ID", inplace=True)
print("gsm table drop: ", gsm_tables_sc500[key])

# Merge tables of platforms
df_outer_sc500 = None
for key in self.platforms.keys():
@@ -271,7 +257,7 @@ def get_entrez_table_pipeline(self, fromcsv=True):
on="ENTREZ_GENE_ID",
how="outer",
)

df_outer_sc500.dropna(how="all", inplace=True)
print("Full: {}".format(df_outer_sc500.shape))
df_outer_sc500.rename(str.lower, axis="columns", inplace=True)
@@ -287,10 +273,10 @@ def get_entrez_table_pipeline(self, fromcsv=True):
vals.append(newcol)
keys.append(col)
gsms_loaded.append(gsm)

df_outer_sc500.rename(columns=dict(zip(keys, vals)), inplace=True)
gsms_loaded = list(set(gsms_loaded).union(set(self.gsm_platform.keys())))

# Remove duplicated items, keep largest VALUE for each GSM
if "df_clean_sc500" not in locals():
df_clean_sc500 = pd.DataFrame([], index=df_outer_sc500.index)
@@ -315,7 +301,7 @@ def get_entrez_table_pipeline(self, fromcsv=True):
df_clean_sc500 = df_clean_sc500[
~df_clean_sc500.index.duplicated(keep="last")
]

for key in gsms_loaded:
key_low = key.lower()
col1, col2, col3 = (
@@ -326,40 +312,40 @@ def get_entrez_table_pipeline(self, fromcsv=True):

try:
temp = df_outer_sc500.loc[:, [col1, col2, col3]]

except:
if key in list(self.gsm_platform.keys()):
print("{} not in df_outer_sc500".format(key))

continue

continue

temp.sort_values(by=["ENTREZ_GENE_ID", col1], inplace=True)
temp = temp[~temp.index.duplicated(keep="last")]
df_clean_sc500[col1] = temp[col1]
df_clean_sc500[col2] = temp[col2]
df_clean_sc500[col3] = temp[col3]

# save to csv file
try:
df_clean_sc500.set_index("ENTREZ_GENE_ID", inplace=True)

except:
pass

df_clean_sc500.dropna(axis="columns", how="all", inplace=True)
df_clean_sc500.dropna(how="all", inplace=True)

try:
df_clean_sc500.drop(columns=["placeholder"], inplace=True)
except:
pass

df_clean_sc500.sort_index(inplace=True)
df_clean_sc500.to_csv(filefullpath)
print("Full table saved to:\n{}".format(filefullpath))

return df_clean_sc500

def download_raw(self, overwrite=False):
# check if path created
if (not os.path.isdir(self.gene_dir)) or overwrite:
@@ -380,10 +366,10 @@ def download_raw(self, overwrite=False):
os.remove(filefullpath)
print("Remove Raw File: {}".format(filefullpath))
self.organize_gse_raw_data()

else:
pass

def download_samples(self, overwrite=False):
os.makedirs(self.gene_dir, exist_ok=True)
for gsm, gpl in self.gsm_platform.items():
@@ -399,17 +385,17 @@ def download_samples(self, overwrite=False):
if (not os.path.isfile(filefullpath)) or overwrite:
urllib.request.urlretrieve(sample_url, filefullpath)
print("Retrieve Sample: {}".format(filefullpath))

else:
print("Sample exist: {}".format(filefullpath))
continue

tfile = tarfile.open(filefullpath)
tfile.extractall(path=platformdir)
# os.remove(filefullpath) # keep to avoid re-download
print("Extract to: {}".format(platformdir))
print("Retrieve Samples Completed.")

def calculate_z_score(self, df, to_csv=False):
cols = list(df)
result = pd.DataFrame([], index=df.index)
@@ -422,5 +408,5 @@ def calculate_z_score(self, df, to_csv=False):
self.gene_dir, "{}_data_z.csv".format(self.gsename)
)
result.to_csv(filefullpath)

return result
4 changes: 0 additions & 4 deletions main/src/async_bioservices/__init__.py

This file was deleted.

