
Modularize io #47

Merged
merged 7 commits on Sep 3, 2024
67 changes: 67 additions & 0 deletions src/malco/analysis/check_lens.py
@@ -0,0 +1,67 @@
from pathlib import Path
from typing import List
import sys

import pandas as pd
import yaml
#from malco.post_process.post_process_results_format import read_raw_result_yaml

def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
    """
    Read the raw result file.

    Args:
        raw_result_path (Path): Path to the raw result file.

    Returns:
        List[dict]: Contents of the raw result file.
    """
    with open(raw_result_path, 'r') as raw_result:
        # Strip stray end-of-transmission characters (\x04) that otherwise break the YAML parser.
        return list(yaml.safe_load_all(raw_result.read().replace('\x04', '')))  # Load and convert to list

unique_ppkts = {}
#model=str(sys.argv[1])
models = ["gpt-3.5-turbo", "gpt-4-turbo", "gpt-4", "gpt-4o"]
for model in models:
    print("==="*10, "\nEvaluating now: ", model, "\n"+"==="*10)

    yamlfile = f"out_openAI_models/raw_results/multimodel/{model}/results.yaml"
    all_results = read_raw_result_yaml(yamlfile)

    counter = 0
    labelvec = []

    # Cannot have further files in raw_result_path!
    for this_result in all_results:
        extracted_object = this_result.get("extracted_object")
        if extracted_object:
            label = extracted_object.get('label')
            labelvec.append(label)
            terms = extracted_object.get('terms')
            if terms:
                counter += 1

    full_df_file = f"out_openAI_models/multimodel/{model}/results.tsv"
    df = pd.read_csv(full_df_file, sep='\t')
    num_ppkts = df['label'].nunique()
    unique_ppkts[model] = df['label'].unique()
    # The first count should be equivalent to grepping "raw_" in the corresponding results.yaml
    print("The number of prompts that have something in results.yaml is:", len(labelvec))
    print("The number of prompts that have a non-empty differential (i.e. terms is not None) is:", counter)
    print("The number of unique prompts/ppkts with a non-empty differential in results.tsv is:", num_ppkts, "\n")

# We know a posteriori that gpt-4o and gpt-4-turbo both cover 5213 phenopackets.
# Thus, print out what is missing in the others.
for i in unique_ppkts["gpt-4-turbo"]:
    if i not in unique_ppkts["gpt-4"]:
        print("Missing ppkt in gpt-4 is:\t", i)
print("\n")

for i in unique_ppkts["gpt-4-turbo"]:
    if i not in unique_ppkts["gpt-3.5-turbo"]:
        print("Missing ppkt in gpt-3.5-turbo is:\t", i)
20 changes: 20 additions & 0 deletions src/malco/analysis/check_shelved_cache.py
@@ -0,0 +1,20 @@
# check shelved cache. Can maxsize be changed at a later point in time?

from cachetools import LRUCache
from cachetools.keys import hashkey
from shelved_cache import PersistentCache

file_name = "test_increasing_cache"

pc = PersistentCache(LRUCache, file_name, maxsize=4096)

pc["a"] = 42

pc.close()
breakpoint()

pc2 = PersistentCache(LRUCache, file_name, maxsize=16384)

breakpoint()
pc2.close()
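A non-interactive version of the same experiment, as a sketch (the breakpoints above exist to inspect state by hand; this assumes shelved_cache persists entries across reopens with a different maxsize, which is exactly what the script sets out to verify):

from cachetools import LRUCache
from shelved_cache import PersistentCache

file_name = "test_increasing_cache"

pc = PersistentCache(LRUCache, file_name, maxsize=4096)
pc["a"] = 42
pc.close()

# Reopen the same backing file with a larger maxsize.
pc2 = PersistentCache(LRUCache, file_name, maxsize=16384)
assert pc2["a"] == 42  # the entry survived the maxsize change
pc2.close()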

55 changes: 40 additions & 15 deletions src/malco/analysis/eval_diagnose_category.py
@@ -1,13 +1,20 @@
import pandas as pd
import numpy as np
import sys

from oaklib.datamodels.vocabulary import IS_A, PART_OF
from oaklib.interfaces import MappingProviderInterface
from oaklib.interfaces import OboGraphInterface
from oaklib.interfaces.obograph_interface import GraphTraversalMethod

from oaklib import get_adapter

from cachetools import cached, LRUCache
from cachetools.keys import hashkey
from shelved_cache import PersistentCache

pc_cache_file = "trial_diagnose_cache"
pc = PersistentCache(LRUCache, pc_cache_file, maxsize=4096)


def mondo_adapter() -> OboGraphInterface:
    """
@@ -19,55 +26,67 @@ def mondo_adapter() -> OboGraphInterface:
    return get_adapter("sqlite:obo:mondo")

def mondo_mapping(term, adapter):
    print(term)
    mondos = []
    for m in adapter.sssom_mappings([term], source="OMIM"):
        if m.predicate_id == "skos:exactMatch":
            mondos.append(m.subject_id)
    return mondos

@cached(pc, key=lambda omim_term, disease_categories, mondo: hashkey(omim_term))
def find_category(omim_term, disease_categories, mondo):
    if not isinstance(mondo, MappingProviderInterface):
        raise ValueError("Adapter is not a MappingProviderInterface")
    # What is the best algorithm to avoid traversing the mondo graph a billion times?
    # Find ancestors
    mondo_term = mondo_mapping(omim_term, mondo)
    if not mondo_term:
        print(omim_term)
        return None

    ancestor_list = mondo.ancestors(mondo_term, predicates=[IS_A, PART_OF])  #, reflexive=True) # method=GraphTraversalMethod.ENTAILMENT

    for mondo_ancestor in ancestor_list:
        if mondo_ancestor in disease_categories:
            return mondo_ancestor  # This should be something like MONDO:0045024 (cancer or benign tumor)

    print("Special issue with the following term:")
    print(omim_term)


#=====================================================
# Script starts here
# Find the 42 disease categories
#=====================================================

mondo = mondo_adapter()
disease_categories = mondo.relationships(objects=["MONDO:0700096"], predicates=[IS_A])

# Make a contingency table with columns [label, correct, incorrect] and initialize all to 0.
header = ["label", "correct", "incorrect"]
#header = ["diseases_category", "correct", "incorrect"]
dc_list = [i[0] for i in list(disease_categories)]
#contingency_table = pd.DataFrame(0, index=np.arange(len(dc_list)), columns=header)
contingency_table = pd.DataFrame(0, index=dc_list, columns=header)
#dc_labels = []
for j in dc_list:
    contingency_table.loc[j, "label"] = mondo.label(j)


model = str(sys.argv[1])
filename = f"out_openAI_models/multimodel/{model}/full_df_results.tsv"
# label term score rank correct_term is_correct reciprocal_rank
# PMID_35962790_Family_B_Individual_3__II_6__en-prompt.txt MONDO:0008675 1.0 1.0 OMIM:620545 False 0.0

df = pd.read_csv(filename, sep="\t")

ppkts = df.groupby("label")[["term", "correct_term", "is_correct"]]
count_fails = 0

omim_wo_match = {}
for ppkt in ppkts:
    # Find this phenopacket's category <cat> from OMIM
    category_index = find_category(ppkt[1].iloc[0]["correct_term"], dc_list, mondo)
    if not category_index:
        count_fails += 1
        #print(f'Category index for {ppkt[1].iloc[0]["correct_term"]} ')
        omim_wo_match[ppkt[0]] = ppkt[1].iloc[0]["correct_term"]
        continue
    #cat_ind = find_cat_index(category)
    # Is there a correct hit? ppkt is a tuple ("filename", dataframe) --> ppkt[1] is a dataframe
    if not any(ppkt[1]["is_correct"]):
@@ -77,5 +96,11 @@ def find_category(omim_term, disease_categories, mondo):
        # yes --> increase <cat> correct
        contingency_table.loc[category_index, "correct"] += 1

print("\n\n", "==="*15, "\n")
print(f"For whatever reason find_category() returned None in {count_fails} cases, which follow:\n")  # TODO: print to file
#print(contingency_table)
print(omim_wo_match, "\n\nOf which the following are unique OMIMs:\n", set(omim_wo_match.values()))

cont_table_file = f"disease_groups/{model}.tsv"
# Will overwrite
#contingency_table.to_csv(cont_table_file, sep='\t')
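A note on the @cached decorator used above: the key function hashes only omim_term, so the unhashable adapter and the category list stay out of the cache key, and lookups persist across runs via the shelved cache. A minimal sketch of the same pattern (cache file name and function body are made up):

from cachetools import cached, LRUCache
from cachetools.keys import hashkey
from shelved_cache import PersistentCache

pc = PersistentCache(LRUCache, "demo_cache_file", maxsize=4096)

# Only `term` goes into the key: the adapter is identical across calls anyway.
@cached(pc, key=lambda term, categories, adapter: hashkey(term))
def lookup(term, categories, adapter):
    # Stand-in for the expensive ancestor walk over the Mondo graph.
    return term in categories

print(lookup("OMIM:620545", {"OMIM:620545"}, object()))  # computed once, then served from the cache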
15 changes: 15 additions & 0 deletions src/malco/post_process/df_save_util.py
@@ -0,0 +1,15 @@
import shutil
import os
import pandas as pd

def safe_save_tsv(path, filename, df):
    full_path = path / filename
    # If full_path already exists, keep a backup copy prefixed with "old_".
    # It's the user's responsibility to know that at most two versions are kept; older data is lost.
    if os.path.isfile(full_path):
        old_full_path = path / ("old_" + filename)
        if os.path.isfile(old_full_path):
            os.remove(old_full_path)
        shutil.copy(full_path, old_full_path)
        os.remove(full_path)
    df.to_csv(full_path, sep='\t', index=False)
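A usage sketch for safe_save_tsv (hypothetical path and data; the directory must already exist):

from pathlib import Path
import pandas as pd

from malco.post_process.df_save_util import safe_save_tsv

out_dir = Path("out_openAI_models/multimodel/gpt-4o")
df = pd.DataFrame({"label": ["ppkt_1"], "is_correct": [True]})

safe_save_tsv(out_dir, "results.tsv", df)  # writes results.tsv
safe_save_tsv(out_dir, "results.tsv", df)  # results.tsv -> old_results.tsv, then writes anew
safe_save_tsv(out_dir, "results.tsv", df)  # the previous old_results.tsv is dropped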
18 changes: 5 additions & 13 deletions src/malco/post_process/generate_plots.py
@@ -6,7 +6,10 @@

# Make a nice plot; usable as a function or as a script.

def make_plots(mrr_file, data_dir, languages, num_ppkt, models, topn_aggr_file, comparing):
    plot_dir = data_dir.parents[0] / "plots"
    plot_dir.mkdir(exist_ok=True)

    if comparing == "model":
        name_string = str(len(models))
    else:
@@ -30,19 +33,8 @@ def make_plots(mrr_file, plot_dir, languages, num_ppkt, models, topn_file, comparing):
    plt.close()

    # Plotting bar-plots with top<n> ranks
    df_aggr = pd.read_csv(topn_aggr_file, delimiter='\t')

    sns.barplot(x="Rank_in", y="percentage", data=df_aggr, hue=comparing)

    plt.xlabel("Number of Ranks in")
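The cumulative top-n aggregation that used to run inline here (building top1/top3/top5/top10/not_found from the per-rank counts and melting to long form) now has to produce topn_aggr_file upstream before make_plots is called. For reference, a sketch of that aggregation assembled from the removed lines (topn_file, comparing, num_ppkt, and topn_aggr_file are supplied by the caller):

import pandas as pd

df = pd.read_csv(topn_file, delimiter='\t')  # per-rank counts n1..n10 plus nf (not found)
df["top1"] = df["n1"]
df["top3"] = df["n1"] + df["n2"] + df["n3"]
df["top5"] = df["top3"] + df["n4"] + df["n5"]
df["top10"] = df["top5"] + df["n6"] + df["n7"] + df["n8"] + df["n9"] + df["n10"]
df["not_found"] = df["nf"]

df_aggr = pd.melt(df, id_vars=comparing,
                  value_vars=["top1", "top3", "top5", "top10", "not_found"],
                  var_name="Rank_in", value_name="counts")
df_aggr["percentage"] = df_aggr["counts"] / num_ppkt
df_aggr.to_csv(topn_aggr_file, sep='\t', index=False)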
2 changes: 1 addition & 1 deletion src/malco/post_process/post_process.py
@@ -24,7 +24,7 @@ def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple, models:
output_dir=output_lang, output_file_name="results.tsv")

'''

# TODO: should this duplicated code become a single function with a parameter?
for model in models:
    raw_results_model = raw_results_dir / "multimodel" / model
    output_model = output_dir / "multimodel" / model
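One possible shape for that TODO, as a sketch (the helper name and the "multilingual" directory are assumptions; create_standardised_results is the function defined in post_process_results_format.py below):

def _standardise_all(raw_results_dir, output_dir, subdir, values):
    # Shared loop for the per-language and per-model passes.
    for value in values:
        create_standardised_results(
            raw_results_dir=raw_results_dir / subdir / value,
            output_dir=output_dir / subdir / value,
            output_file_name="results.tsv",
        )

# _standardise_all(raw_results_dir, output_dir, "multilingual", langs)
# _standardise_all(raw_results_dir, output_dir, "multimodel", models)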
13 changes: 9 additions & 4 deletions src/malco/post_process/post_process_results_format.py
@@ -2,12 +2,14 @@
import os
from pathlib import Path
from typing import List

import shutil
import pandas as pd
import yaml
from pheval.post_processing.post_processing import PhEvalGeneResult, generate_pheval_result
from pheval.utils.file_utils import all_files
from pheval.utils.phenopacket_utils import GeneIdentifierUpdater, create_hgnc_dict
from malco.post_process.df_save_util import safe_save_tsv



def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
@@ -21,14 +23,15 @@ def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
        List[dict]: Contents of the raw result file.
    """
    with open(raw_result_path, 'r') as raw_result:
        # Strip stray end-of-transmission characters (\x04) that otherwise break the YAML parser.
        return list(yaml.safe_load_all(raw_result.read().replace('\x04', '')))  # Load and convert to list


def create_standardised_results(raw_results_dir: Path, output_dir: Path,
                                output_file_name: str) -> pd.DataFrame:
    data = []
    for raw_result_path in raw_results_dir.iterdir():
        if raw_result_path.is_file():
            # Cannot have further files in raw_result_path!
            all_results = read_raw_result_yaml(raw_result_path)

            for this_result in all_results:
@@ -37,6 +40,8 @@ def create_standardised_results(raw_results_dir: Path, output_dir: Path,
                label = extracted_object.get('label')
                terms = extracted_object.get('terms')
                if terms:
                    # Note: this check allows rerunning ppkts that failed due to connection issues;
                    # results.yaml may contain multiple identical ppkts/prompts as long as only one has a terms field.
                    num_terms = len(terms)
                    score = [1 / (i + 1) for i in range(num_terms)]  # score is the reciprocal rank
                    rank_list = [i + 1 for i in range(num_terms)]
Expand All @@ -47,8 +52,8 @@ def create_standardised_results(raw_results_dir: Path, output_dir: Path,
    df = pd.DataFrame(data)

    # Save DataFrame to TSV (keeps one "old_" backup of any previous version)
    safe_save_tsv(output_dir, output_file_name, df)

    return df

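The scoring in create_standardised_results assigns each term of the differential its reciprocal rank. A minimal sketch of the row construction (made-up terms; the real rows also carry label and correct_term fields):

terms = ["MONDO:0008675", "MONDO:0019391"]

rows = [
    {"term": term, "rank": i + 1, "score": 1 / (i + 1)}  # score = reciprocal rank
    for i, term in enumerate(terms)
]
print(rows)
# [{'term': 'MONDO:0008675', 'rank': 1, 'score': 1.0},
#  {'term': 'MONDO:0019391', 'rank': 2, 'score': 0.5}]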