Use async_bioservices (#157)
* Refactor GitHub workflows for better efficiency

The commit refactors the GitHub workflows `unit_tests.yml`, `type_checking.yml`, and `container_build.yml`. The changes simplify checking for Python file changes by leveraging GitHub's `paths` filters for `pull_request` events, which results in cleaner and more maintainable workflow files. In addition, the `workflow_dispatch` event was removed from the `container_build.yml` workflow; it is unnecessary because the build is already triggered by push events.

* Remove the testing limit on Python files; it collided with the test-pass requirement

* Update README.md

* COMO will now use the `async_bioservices` package instead of maintaining its own copy

* Optimize code by migrating from the local `async_bioservices` module to the package's `db2db` function for async operations

Replaced all calls to the old `async_bioservices.fetch_gene_info` method with `db2db` in every source file in the main directory to improve asynchronous database operations. This update improves performance and reduces complexity by routing all asynchronous lookups through a single function (`db2db`), making the code easier to manage and maintain. Asynchronous calls are handled with async/await syntax, improving readability and maintainability (see the usage sketch after this list).

* Update tests environment.yaml
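
For readers new to the package, here is a minimal sketch of the call pattern this commit adopts. It assumes only what the diff below shows: the `async_bioservices` package exposes `db2db`, `InputDatabase`, and `OutputDatabase`, and `db2db` is awaited. The `map_agilent_probes` helper and the probe IDs are hypothetical placeholders, not part of the commit.

```python
# Minimal sketch of the db2db call pattern adopted in this commit.
# The helper name and the probe IDs below are hypothetical placeholders.
import asyncio

from async_bioservices import db2db, InputDatabase, OutputDatabase


async def map_agilent_probes(probe_ids: list[str]):
    # Await db2db, mirroring the `temp = await db2db(...)` calls in
    # GSEpipelineFast.py: map Agilent probe IDs to Entrez Gene IDs.
    return await db2db(
        input_values=probe_ids,
        input_db=InputDatabase.AGILENT_ID,
        output_db=[OutputDatabase.GENE_ID],
    )


if __name__ == "__main__":
    # Drive the coroutine from synchronous code with asyncio.run
    mapping = asyncio.run(map_agilent_probes(["A_23_P100001", "A_23_P100011"]))
    print(mapping)
```

Callers that are themselves coroutines (such as the now-async `get_entrez_table_pipeline` below) would simply `await` such a helper directly instead of using `asyncio.run`.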
JoshLoecker authored Oct 17, 2023
1 parent 4f823a3 commit d2eaf58
Showing 14 changed files with 172 additions and 639 deletions.
12 changes: 10 additions & 2 deletions README.md
@@ -5,7 +5,15 @@
![GitHub Workflow Status (with event)](https://img.shields.io/github/actions/workflow/status/HelikarLab/COMO/container_build.yml?style=for-the-badge&logo=docker&logoColor=white&label=Docker%20Build)
![GitHub Workflow Status (with event)](https://img.shields.io/github/actions/workflow/status/HelikarLab/COMO/type_checking.yml?style=for-the-badge&logo=docker&logoColor=white&label=Type%20Checking)

## Docker Quick Start
# Setting up COMO

Go to COMO's documentation page for full installation and operation instructions or use one of the Quick Start options

## [COMO Documentation Page](https://helikarlab.github.io/COMO)

## Quick Start Options

### Docker Quick Start

This installation method **does** require docker

@@ -19,7 +27,7 @@ This installation method **does** require docker
> the [Gurobi solver](https://www.gurobi.com/). If you would like either (or both) of these features,
> please [visit our documentation](https://helikarlab.github.io/COMO) for more details
## Conda Quick Start
### Conda Quick Start

This installation method does **not** require docker

1 change: 1 addition & 0 deletions environment.yaml
@@ -58,6 +58,7 @@ dependencies:
- conda-forge::wget~=1.20.3
- conda-forge::xlrd~=2.0.1
- pip:
- async_bioservices
- cobra==0.26.*
- Escher==1.7.*
- framed==0.5.*
108 changes: 47 additions & 61 deletions main/src/GSEpipelineFast.py
@@ -11,23 +11,16 @@
from GSEpipeline import load_gse_soft
from instruments import AffyIO
from rpy2.robjects import pandas2ri
from async_bioservices import async_bioservices
from async_bioservices.input_database import InputDatabase
from async_bioservices.output_database import OutputDatabase

pandas2ri.activate()
# Input: Extract Gene Info from GEO DataSets

# gse = load_gse_soft(gsename)

from async_bioservices import async_bioservices
from async_bioservices.input_database import InputDatabase
from async_bioservices.output_database import OutputDatabase
from async_bioservices import db2db, InputDatabase, OutputDatabase

# Extract Platform Information


def download_gsm_id_maps(datadir, gse, gpls: list[str] = None, vendor="affy"):
async def download_gsm_id_maps(datadir, gse, gpls: list[str] = None, vendor="affy"):
"""
download ID to ENTREZ_GENE_ID maps, create a csv file for each platform, and return dictionary
:param gpls:
@@ -40,33 +33,26 @@ def download_gsm_id_maps(datadir, gse, gpls: list[str] = None, vendor="affy"):
# From: https://florimond.dev/en/posts/2018/08/python-mutable-defaults-are-the-source-of-all-evil/
if gpls is None:
gpls: list[str] = ["GPL96", "GPL97", "GPL8300"]

for gpl in gpls:
table = gse.gpls[gpl].table.copy()
if vendor.lower() == "affy":
temp = table[["ID", "ENTREZ_GENE_ID"]]

elif vendor.lower() == "agilent":
input_values = table.loc[
table["CONTROL_TYPE"] == "FALSE", "SPOT_ID"
].tolist()

"""
input_values,
input_db="Agilent ID",
output_db: list[str] = None,
delay=30,
"""

temp = async_bioservices.fetch_gene_info(

temp = await db2db(
input_values=input_values,
input_db=InputDatabase.AGILENT_ID,
output_db=[
OutputDatabase.GENE_ID.value,
OutputDatabase.ENSEMBL_GENE_ID.value
]
)

temp.drop(columns=["Ensembl Gene ID"], inplace=True)
temp.reset_index(inplace=True)
temp.rename(
@@ -76,11 +62,11 @@ def download_gsm_id_maps(datadir, gse, gpls: list[str] = None, vendor="affy"):
}, inplace=True
)
temp.replace(to_replace="-", value=np.nan, inplace=True)

else:
print("Unsupported Platform: {}".format(gpl))
continue

# Save to file
filefullpath = os.path.join(datadir, "{}entrez.csv".format(gpl.lower()))
temp.to_csv(filefullpath, index=False)
@@ -107,7 +93,7 @@ def __init__(self, gsename, querytable, rootdir="../"):
vendors = pairs["Instrument"].tolist()
self.platforms = dict(zip(gpls, vendors))
self.download_samples()

def organize_gse_raw_data(self):
"""
Organize raw data at local folder
@@ -121,7 +107,7 @@ def organize_gse_raw_data(self):
print("Path created: {}".format(platformdir))
else:
print("Path already exist: {}".format(platformdir))

# Move Corresponding Cel files to Folders
onlyfiles = [f for f in os.listdir(self.gene_dir) if f.endswith(".gz")]
cnt = 0
@@ -138,7 +124,7 @@ def organize_gse_raw_data(self):
print("Move {} to {}".format(src_path, dst_path))
cnt += 1
print("{} raw data files moved.".format(cnt))

def get_gsm_tables(self):
"""
get gsm maps in table
@@ -158,9 +144,9 @@ def get_gsm_tables(self):
df = temp[["ID", "ENTREZ_GENE_ID"]]
df.set_index("ID", inplace=True)
gsm_tables[gpl] = df

return gsm_tables

def get_gsm_platform(self):
"""
Classify the Samples by Platform
@@ -172,7 +158,7 @@ def get_gsm_tables(self):
gsm_platform = dict(zip(keys, values))

return gsm_platform

def gsms_included_by(self, df):
for gsm in self.gsm_platform.keys():
included = False
@@ -183,15 +169,15 @@ def gsms_included_by(self, df):
break
if not included:
return False

return True

def get_entrez_table_pipeline(self, fromcsv=True):
async def get_entrez_table_pipeline(self, fromcsv=True):
"""
create ENTREZ ID based table from gse
:return: pandas dataframe for table of GSE
"""

filefullpath = os.path.join(
self.gene_dir, "{}_sc500_full_table.csv".format(self.gsename)
)
@@ -209,14 +195,14 @@ def get_entrez_table_pipeline(self, fromcsv=True):
print("Need Append GSMs")
except:
print("Unable to read {}")

print("Create new table: {}".format(filefullpath))
gsm_maps = self.get_gsm_tables()

if not any(gsm_maps):
print("Not available, return empty dataframe")
return pd.DataFrame([])

# Ready Affy files from folders
gsm_tables_sc500 = {}
for key, vendor in self.platforms.items():
@@ -229,35 +215,35 @@ def get_entrez_table_pipeline(self, fromcsv=True):
outputdf = outputdf.readaffydir(platformdir)
outputdf = ro.conversion.rpy2py(outputdf)
elif vendor.lower() == "agilent":

outputdf = instruments.readagilent(platformdir, list(self.gsm_platform.keys()))

gsm_maps[key] = async_bioservices.fetch_gene_info(
gsm_maps[key] = await db2db(
input_values=list(map(str, list(outputdf["ProbeName"]))),
input_db=InputDatabase.AGILENT_ID,
output_db=[OutputDatabase.GENE_ID]
)

gsm_maps[key].rename(columns={"Gene ID": "ENTREZ_GENE_ID"}, inplace=True)
else:
print("Unsupported Platform {} and Vendor {}".format(key, vendor))
continue
else:
print("Path not exist: {}".format(platformdir))
continue

drop_idx = np.where(gsm_maps[key]["ENTREZ_GENE_ID"] == "-")[0].tolist()
outputdf.drop(outputdf.index[drop_idx], axis=0, inplace=True)
gsm_maps[key].drop(gsm_maps[key].index[drop_idx], axis=0, inplace=True)
outputdf["ENTREZ_GENE_ID"] = gsm_maps[key]["ENTREZ_GENE_ID"].to_list()
gsm_tables_sc500[key] = outputdf

# Drop rows without ENTREZ GENE ID, set index to ENTREZ
for key in self.platforms.keys():
gsm_tables_sc500[key].dropna(subset=["ENTREZ_GENE_ID"], inplace=True)
gsm_tables_sc500[key].set_index("ENTREZ_GENE_ID", inplace=True)
print("gsm table drop: ", gsm_tables_sc500[key])

# Merge tables of platforms
df_outer_sc500 = None
for key in self.platforms.keys():
@@ -271,7 +257,7 @@ def get_entrez_table_pipeline(self, fromcsv=True):
on="ENTREZ_GENE_ID",
how="outer",
)

df_outer_sc500.dropna(how="all", inplace=True)
print("Full: {}".format(df_outer_sc500.shape))
df_outer_sc500.rename(str.lower, axis="columns", inplace=True)
@@ -287,10 +273,10 @@ def get_entrez_table_pipeline(self, fromcsv=True):
vals.append(newcol)
keys.append(col)
gsms_loaded.append(gsm)

df_outer_sc500.rename(columns=dict(zip(keys, vals)), inplace=True)
gsms_loaded = list(set(gsms_loaded).union(set(self.gsm_platform.keys())))

# Remove duplicated items, keep largest VALUE for each GSM
if "df_clean_sc500" not in locals():
df_clean_sc500 = pd.DataFrame([], index=df_outer_sc500.index)
@@ -315,7 +301,7 @@ def get_entrez_table_pipeline(self, fromcsv=True):
df_clean_sc500 = df_clean_sc500[
~df_clean_sc500.index.duplicated(keep="last")
]

for key in gsms_loaded:
key_low = key.lower()
col1, col2, col3 = (
@@ -326,40 +312,40 @@ def get_entrez_table_pipeline(self, fromcsv=True):

try:
temp = df_outer_sc500.loc[:, [col1, col2, col3]]

except:
if key in list(self.gsm_platform.keys()):
print("{} not in df_outer_sc500".format(key))

continue

continue

temp.sort_values(by=["ENTREZ_GENE_ID", col1], inplace=True)
temp = temp[~temp.index.duplicated(keep="last")]
df_clean_sc500[col1] = temp[col1]
df_clean_sc500[col2] = temp[col2]
df_clean_sc500[col3] = temp[col3]

# save to csv file
try:
df_clean_sc500.set_index("ENTREZ_GENE_ID", inplace=True)

except:
pass

df_clean_sc500.dropna(axis="columns", how="all", inplace=True)
df_clean_sc500.dropna(how="all", inplace=True)

try:
df_clean_sc500.drop(columns=["placeholder"], inplace=True)
except:
pass

df_clean_sc500.sort_index(inplace=True)
df_clean_sc500.to_csv(filefullpath)
print("Full table saved to:\n{}".format(filefullpath))

return df_clean_sc500

def download_raw(self, overwrite=False):
# check if path created
if (not os.path.isdir(self.gene_dir)) or overwrite:
@@ -380,10 +366,10 @@ def download_raw(self, overwrite=False):
os.remove(filefullpath)
print("Remove Raw File: {}".format(filefullpath))
self.organize_gse_raw_data()

else:
pass

def download_samples(self, overwrite=False):
os.makedirs(self.gene_dir, exist_ok=True)
for gsm, gpl in self.gsm_platform.items():
@@ -399,17 +385,17 @@ def download_samples(self, overwrite=False):
if (not os.path.isfile(filefullpath)) or overwrite:
urllib.request.urlretrieve(sample_url, filefullpath)
print("Retrieve Sample: {}".format(filefullpath))

else:
print("Sample exist: {}".format(filefullpath))
continue

tfile = tarfile.open(filefullpath)
tfile.extractall(path=platformdir)
# os.remove(filefullpath) # keep to avoid re-download
print("Extract to: {}".format(platformdir))
print("Retrieve Samples Completed.")

def calculate_z_score(self, df, to_csv=False):
cols = list(df)
result = pd.DataFrame([], index=df.index)
@@ -422,5 +408,5 @@ def calculate_z_score(self, df, to_csv=False):
self.gene_dir, "{}_data_z.csv".format(self.gsename)
)
result.to_csv(filefullpath)

return result
4 changes: 0 additions & 4 deletions main/src/async_bioservices/__init__.py

This file was deleted.

