
Modularize io #47

Merged
merged 7 commits on Sep 3, 2024
67 changes: 67 additions & 0 deletions src/malco/analysis/check_lens.py
@@ -0,0 +1,67 @@
from pathlib import Path
from typing import List
import sys

import pandas as pd
import yaml
#from malco.post_process.post_process_results_format import read_raw_result_yaml

def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
    """
    Read the raw result file.

    Args:
        raw_result_path (Path): Path to the raw result file.

    Returns:
        List[dict]: Contents of the raw result file.
    """
    with open(raw_result_path, 'r') as raw_result:
        # Strip stray end-of-transmission characters (\x04) that otherwise break the YAML parser.
        return list(yaml.safe_load_all(raw_result.read().replace('\x04', '')))  # Load and convert to list

unique_ppkts = {}
#model=str(sys.argv[1])
models = ["gpt-3.5-turbo", "gpt-4-turbo", "gpt-4", "gpt-4o"]
for model in models:
    print("==="*10, "\nEvaluating now: ", model, "\n"+"==="*10)

    yamlfile = f"out_openAI_models/raw_results/multimodel/{model}/results.yaml"
    all_results = read_raw_result_yaml(yamlfile)

    counter = 0
    labelvec = []

    # Cannot have further files in raw_result_path!
    for this_result in all_results:
        extracted_object = this_result.get("extracted_object")
        if extracted_object:
            label = extracted_object.get('label')
            labelvec.append(label)
            terms = extracted_object.get('terms')
            if terms:
                counter += 1

    full_df_file = f"out_openAI_models/multimodel/{model}/results.tsv"
    df = pd.read_csv(full_df_file, sep='\t')
    num_ppkts = df['label'].nunique()
    unique_ppkts[model] = df['label'].unique()
    # The first count should be equivalent to grepping "raw_" in the corresponding results.yaml
    print("The number of prompts that have something in results.yaml is:", len(labelvec))
    print("The number of prompts that have a non-empty differential (i.e. terms is not None) is:", counter)
    print("The number of unique prompts/ppkts with a non-empty differential in results.tsv is:", num_ppkts, "\n")

# We know a posteriori that gpt-4o and gpt-4-turbo both cover 5213 phenopackets.
# Thus, print out what is missing in the others.
for i in unique_ppkts["gpt-4-turbo"]:
    if i not in unique_ppkts["gpt-4"]:
        print("Missing ppkt in gpt-4 is:\t", i)
print("\n")

for i in unique_ppkts["gpt-4-turbo"]:
    if i not in unique_ppkts["gpt-3.5-turbo"]:
        print("Missing ppkt in gpt-3.5-turbo is:\t", i)
20 changes: 20 additions & 0 deletions src/malco/analysis/check_shelved_cache.py
@@ -0,0 +1,20 @@
# check shelved cache. Can maxsize be changed at a later point in time?

from cachetools import LRUCache
from cachetools.keys import hashkey
from shelved_cache import PersistentCache

file_name = "test_increasing_cache"

pc = PersistentCache(LRUCache, file_name, maxsize=4096)

pc["a"] = 42

pc.close()
breakpoint()

pc2 = PersistentCache(LRUCache, file_name, maxsize=16384)

breakpoint()
pc2.close()
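A non-interactive version of the same experiment, as a sketch (the breakpoints above exist to inspect state by hand; this assumes shelved_cache persists entries across reopens with a different maxsize, which is exactly what the script sets out to verify):

from cachetools import LRUCache
from shelved_cache import PersistentCache

file_name = "test_increasing_cache"

pc = PersistentCache(LRUCache, file_name, maxsize=4096)
pc["a"] = 42
pc.close()

# Reopen the same backing file with a larger maxsize.
pc2 = PersistentCache(LRUCache, file_name, maxsize=16384)
assert pc2["a"] == 42  # the entry survived the maxsize change
pc2.close()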

55 changes: 40 additions & 15 deletions src/malco/analysis/eval_diagnose_category.py
@@ -1,13 +1,20 @@
import pandas as pd
import numpy as np
import sys

from oaklib.datamodels.vocabulary import IS_A, PART_OF
from oaklib.interfaces import MappingProviderInterface
from oaklib.interfaces import OboGraphInterface
from oaklib.interfaces.obograph_interface import GraphTraversalMethod

from oaklib import get_adapter

from cachetools import cached, LRUCache
from cachetools.keys import hashkey
from shelved_cache import PersistentCache

pc_cache_file = "trial_diagnose_cache"
pc = PersistentCache(LRUCache, pc_cache_file, maxsize=4096)


def mondo_adapter() -> OboGraphInterface:
    """
@@ -19,55 +26,67 @@ def mondo_adapter() -> OboGraphInterface:
    return get_adapter("sqlite:obo:mondo")

def mondo_mapping(term, adapter):
    print(term)
    mondos = []
    for m in adapter.sssom_mappings([term], source="OMIM"):
        if m.predicate_id == "skos:exactMatch":
            mondos.append(m.subject_id)
    return mondos

@cached(pc, key=lambda omim_term, disease_categories, mondo: hashkey(omim_term))
def find_category(omim_term, disease_categories, mondo):
    if not isinstance(mondo, MappingProviderInterface):
        raise ValueError("Adapter is not a MappingProviderInterface")
    # What is the best algorithm to avoid traversing the mondo graph a billion times?
    # Find ancestors
    mondo_term = mondo_mapping(omim_term, mondo)
    if not mondo_term:
        print(omim_term)
        return None

    ancestor_list = mondo.ancestors(mondo_term, predicates=[IS_A, PART_OF])  #, reflexive=True) # method=GraphTraversalMethod.ENTAILMENT

    for mondo_ancestor in ancestor_list:
        if mondo_ancestor in disease_categories:
            return mondo_ancestor  # This should be something like MONDO:0045024 (cancer or benign tumor)

    print("Special issue with the following term:")
    print(omim_term)


#=====================================================
# Script starts here
# Find the 42 disease categories
#=====================================================

mondo = mondo_adapter()
disease_categories = mondo.relationships(objects=["MONDO:0700096"], predicates=[IS_A])

# Make a contingency table with columns [label, correct, incorrect] and initialize all to 0.
header = ["label", "correct", "incorrect"]
#header = ["diseases_category", "correct", "incorrect"]
dc_list = [i[0] for i in list(disease_categories)]
#contingency_table = pd.DataFrame(0, index=np.arange(len(dc_list)), columns=header)
contingency_table = pd.DataFrame(0, index=dc_list, columns=header)
#dc_labels = []
for j in dc_list:
    contingency_table.loc[j, "label"] = mondo.label(j)


model = str(sys.argv[1])
filename = f"out_openAI_models/multimodel/{model}/full_df_results.tsv"
# label term score rank correct_term is_correct reciprocal_rank
# PMID_35962790_Family_B_Individual_3__II_6__en-prompt.txt MONDO:0008675 1.0 1.0 OMIM:620545 False 0.0

df = pd.read_csv(filename, sep="\t")

ppkts = df.groupby("label")[["term", "correct_term", "is_correct"]]
count_fails = 0

omim_wo_match = {}
for ppkt in ppkts:
    # Find this phenopacket's category <cat> from OMIM
    category_index = find_category(ppkt[1].iloc[0]["correct_term"], dc_list, mondo)
    if not category_index:
        count_fails += 1
        #print(f'Category index for {ppkt[1].iloc[0]["correct_term"]} ')
        omim_wo_match[ppkt[0]] = ppkt[1].iloc[0]["correct_term"]
        continue
    #cat_ind = find_cat_index(category)
    # Is there a correct hit? ppkt is a tuple ("filename", dataframe) --> ppkt[1] is a dataframe
    if not any(ppkt[1]["is_correct"]):
@@ -77,5 +96,11 @@ def find_category(omim_term, disease_categories, mondo):
        # yes --> increase <cat> correct
        contingency_table.loc[category_index, "correct"] += 1

print("\n\n", "==="*15, "\n")
print(f"For whatever reason find_category() returned None in {count_fails} cases, which follow:\n")  # TODO: print to file
#print(contingency_table)
print(omim_wo_match, "\n\nOf which the following are unique OMIMs:\n", set(omim_wo_match.values()))

cont_table_file = f"disease_groups/{model}.tsv"
# Will overwrite
#contingency_table.to_csv(cont_table_file, sep='\t')
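A note on the @cached decorator used above: the key function hashes only omim_term, so the unhashable adapter and the category list stay out of the cache key, and lookups persist across runs via the shelved cache. A minimal sketch of the same pattern (cache file name and function body are made up):

from cachetools import cached, LRUCache
from cachetools.keys import hashkey
from shelved_cache import PersistentCache

pc = PersistentCache(LRUCache, "demo_cache_file", maxsize=4096)

# Only `term` goes into the key: the adapter is identical across calls anyway.
@cached(pc, key=lambda term, categories, adapter: hashkey(term))
def lookup(term, categories, adapter):
    # Stand-in for the expensive ancestor walk over the Mondo graph.
    return term in categories

print(lookup("OMIM:620545", {"OMIM:620545"}, object()))  # computed once, then served from the cache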
15 changes: 15 additions & 0 deletions src/malco/post_process/df_save_util.py
@@ -0,0 +1,15 @@
import shutil
import os
import pandas as pd

def safe_save_tsv(path, filename, df):
    full_path = path / filename
    # If full_path already exists, keep a backup copy prefixed with "old_".
    # It's the user's responsibility to know that at most two versions are kept; older data is lost.
    if os.path.isfile(full_path):
        old_full_path = path / ("old_" + filename)
        if os.path.isfile(old_full_path):
            os.remove(old_full_path)
        shutil.copy(full_path, old_full_path)
        os.remove(full_path)
    df.to_csv(full_path, sep='\t', index=False)
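A usage sketch for safe_save_tsv (hypothetical path and data; the directory must already exist):

from pathlib import Path
import pandas as pd

from malco.post_process.df_save_util import safe_save_tsv

out_dir = Path("out_openAI_models/multimodel/gpt-4o")
df = pd.DataFrame({"label": ["ppkt_1"], "is_correct": [True]})

safe_save_tsv(out_dir, "results.tsv", df)  # writes results.tsv
safe_save_tsv(out_dir, "results.tsv", df)  # results.tsv -> old_results.tsv, then writes anew
safe_save_tsv(out_dir, "results.tsv", df)  # the previous old_results.tsv is dropped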
18 changes: 5 additions & 13 deletions src/malco/post_process/generate_plots.py
@@ -6,7 +6,10 @@

# Make a nice plot; usable as a function or as a script.

def make_plots(mrr_file, data_dir, languages, num_ppkt, models, topn_aggr_file, comparing):
    plot_dir = data_dir.parents[0] / "plots"
    plot_dir.mkdir(exist_ok=True)

    if comparing == "model":
        name_string = str(len(models))
    else:
@@ -30,19 +33,8 @@ def make_plots(mrr_file, plot_dir, languages, num_ppkt, models, topn_file, comparing):
    plt.close()

    # Plotting bar-plots with top<n> ranks
    df_aggr = pd.read_csv(topn_aggr_file, delimiter='\t')

    sns.barplot(x="Rank_in", y="percentage", data=df_aggr, hue=comparing)

    plt.xlabel("Number of Ranks in")
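The cumulative top-n aggregation that used to run inline here (building top1/top3/top5/top10/not_found from the per-rank counts and melting to long form) now has to produce topn_aggr_file upstream before make_plots is called. For reference, a sketch of that aggregation assembled from the removed lines (topn_file, comparing, num_ppkt, and topn_aggr_file are supplied by the caller):

import pandas as pd

df = pd.read_csv(topn_file, delimiter='\t')  # per-rank counts n1..n10 plus nf (not found)
df["top1"] = df["n1"]
df["top3"] = df["n1"] + df["n2"] + df["n3"]
df["top5"] = df["top3"] + df["n4"] + df["n5"]
df["top10"] = df["top5"] + df["n6"] + df["n7"] + df["n8"] + df["n9"] + df["n10"]
df["not_found"] = df["nf"]

df_aggr = pd.melt(df, id_vars=comparing,
                  value_vars=["top1", "top3", "top5", "top10", "not_found"],
                  var_name="Rank_in", value_name="counts")
df_aggr["percentage"] = df_aggr["counts"] / num_ppkt
df_aggr.to_csv(topn_aggr_file, sep='\t', index=False)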
2 changes: 1 addition & 1 deletion src/malco/post_process/post_process.py
@@ -24,7 +24,7 @@ def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple, models:
output_dir=output_lang, output_file_name="results.tsv")

'''

# TODO: should this duplicated code become a single function with a parameter?
for model in models:
    raw_results_model = raw_results_dir / "multimodel" / model
    output_model = output_dir / "multimodel" / model
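One possible shape for that TODO, as a sketch (the helper name and the "multilingual" directory are assumptions; create_standardised_results is the function defined in post_process_results_format.py below):

def _standardise_all(raw_results_dir, output_dir, subdir, values):
    # Shared loop for the per-language and per-model passes.
    for value in values:
        create_standardised_results(
            raw_results_dir=raw_results_dir / subdir / value,
            output_dir=output_dir / subdir / value,
            output_file_name="results.tsv",
        )

# _standardise_all(raw_results_dir, output_dir, "multilingual", langs)
# _standardise_all(raw_results_dir, output_dir, "multimodel", models)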
13 changes: 9 additions & 4 deletions src/malco/post_process/post_process_results_format.py
@@ -2,12 +2,14 @@
import os
from pathlib import Path
from typing import List

import shutil
import pandas as pd
import yaml
from pheval.post_processing.post_processing import PhEvalGeneResult, generate_pheval_result
from pheval.utils.file_utils import all_files
from pheval.utils.phenopacket_utils import GeneIdentifierUpdater, create_hgnc_dict
from malco.post_process.df_save_util import safe_save_tsv



def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
@@ -21,14 +23,15 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
        List[dict]: Contents of the raw result file.
    """
    with open(raw_result_path, 'r') as raw_result:
        # Strip stray end-of-transmission characters (\x04) that otherwise break the YAML parser.
        return list(yaml.safe_load_all(raw_result.read().replace('\x04', '')))  # Load and convert to list


def create_standardised_results(raw_results_dir: Path, output_dir: Path,
                                output_file_name: str) -> pd.DataFrame:
    data = []
    for raw_result_path in raw_results_dir.iterdir():
        if raw_result_path.is_file():
            # Cannot have further files in raw_result_path!
            all_results = read_raw_result_yaml(raw_result_path)

            for this_result in all_results:
@@ -37,6 +40,8 @@ def create_standardised_results(raw_results_dir: Path, output_dir: Path,
                label = extracted_object.get('label')
                terms = extracted_object.get('terms')
                if terms:
                    # Note: this check allows rerunning ppkts that failed due to connection issues;
                    # results.yaml may contain multiple identical ppkts/prompts as long as only one has a terms field.
                    num_terms = len(terms)
                    score = [1 / (i + 1) for i in range(num_terms)]  # score is the reciprocal rank
                    rank_list = [i + 1 for i in range(num_terms)]
Expand All @@ -47,8 +52,8 @@ def create_standardised_results(raw_results_dir: Path, output_dir: Path,
    df = pd.DataFrame(data)

    # Save DataFrame to TSV (keeps one "old_" backup of any previous version)
    safe_save_tsv(output_dir, output_file_name, df)

    return df

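The scoring in create_standardised_results assigns each term of the differential its reciprocal rank. A minimal sketch of the row construction (made-up terms; the real rows also carry label and correct_term fields):

terms = ["MONDO:0008675", "MONDO:0019391"]

rows = [
    {"term": term, "rank": i + 1, "score": 1 / (i + 1)}  # score = reciprocal rank
    for i, term in enumerate(terms)
]
print(rows)
# [{'term': 'MONDO:0008675', 'rank': 1, 'score': 1.0},
#  {'term': 'MONDO:0019391', 'rank': 2, 'score': 0.5}]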