Skip to content

Commit

Permalink
mostly added IC to analysis. almost done
Browse files Browse the repository at this point in the history
  • Loading branch information
leokim-l committed Sep 30, 2024
1 parent 7e90dd8 commit 2d2877b
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 29 deletions.
2 changes: 1 addition & 1 deletion src/malco/analysis/count_grounding_failures.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Quick check how often the grounding failed
# Need to be in short_letter branch
import pandas as pd
mfile = "outputdir_all_2024_07_04/en/results.tsv"
mfile = "../outputdir_all_2024_07_04/en/results.tsv"

df = pd.read_csv(
mfile, sep="\t" #, header=None, names=["description", "term", "label"]
Expand Down
35 changes: 35 additions & 0 deletions src/malco/analysis/count_translated_prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import os
import re

# Root directory holding one subdirectory of prompt files per language.
fp = "/Users/leonardo/IdeaProjects/phenopacket2prompt/prompts/"

langs = ["en",
        "es",
        "de",
        "it",
        "nl",
        "tr",
        "zh",
]

# Map each language to the list of prompt-file basenames found in its directory.
# The trailing 14 characters are stripped so names are comparable across
# languages — presumably a "_<lang>-prompt.txt"-style suffix; TODO confirm length.
promptfiles = {}
for lang in langs:
    promptfiles[lang] = []
    for (dirpath, dirnames, filenames) in os.walk(fp+lang):
        for fn in filenames:
            promptfiles[lang].append(fn[0:-14])  # strip language-specific suffix
        break  # top-level directory only; do not recurse into subdirectories

# Phenopackets translated into every language: fold the intersection over all
# languages instead of hard-coding one set per language, so adding or removing
# an entry in `langs` cannot silently desynchronize this computation.
intersection = set(promptfiles[langs[0]])
for lang in langs[1:]:
    intersection &= set(promptfiles[lang])

print("Common ppkts are: ", len(intersection))
128 changes: 101 additions & 27 deletions src/malco/analysis/disease_avail_knowledge.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,30 @@
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This script looks for correlations between the ability of an LLM to
# diagnose the correct disease and certain parameters.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The main points are using time, namely dates of discovery, as a way to capture how much of a
# disease is present in the web. This is a proxy for how much an LLM knows about such a diseases.
# We use HPOA, we do not parse out disease genes discovered after 2008 though (first thing in HPOA)
#
# Then we could look at some IC(prompt) as a second proxy.
#
# Finally, if the two things correlate, can we use them to train a logit or SVM to predict whether
# the LLM will be successfull or not?
"""This script looks for correlations between the ability of an LLM to
diagnose the correct disease and certain parameters.
(1) The first idea is using time, namely dates of discovery, as a way to capture how much of a
disease is present in the web. This is a proxy for how much an LLM knows about such diseases.
We use HPOA, we do not parse out disease genes discovered after 2008 though (first thing in HPOA)
(2) Then we could look at some IC(prompt) as a second proxy. To start, avg(IC) as computed with
`runoak -g hpoa_file -G hpoa -i hpo_file information-content -p i --use-associations .all`
Finally, if the two things correlate, can we use them to train a logit or SVM to predict whether
the LLM will be successful or not?
"""
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

import sys
import os
import pandas as pd
import numpy as np
import datetime as dt
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import json
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) HPOA for dates
# HPOA import and setup
Expand Down Expand Up @@ -65,8 +70,8 @@
index_of_match = ppkt[1]["is_correct"].to_list().index(True)
try:
#inverse_rank = 1/ppkt[1].iloc[index_of_match]["rank"] # np.float64
inverse_rank = ppkt[1].iloc[index_of_match]["rank"] # np.float64
rank_date_dict[ppkt[0]] = [inverse_rank.item(),
rank = ppkt[1].iloc[index_of_match]["rank"] # np.float64
rank_date_dict[ppkt[0]] = [rank.item(),
hpoa_unique.loc[ppkt[1].iloc[0]["correct_term"]]]
except (ValueError, KeyError) as e:
print(f"Error {e} for {ppkt[0]}, disease {ppkt[1].iloc[0]['correct_term']}.")
Expand All @@ -85,45 +90,118 @@
# len(ppkts) --> 6687

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Do linear regression of box plot of ppkts' 1/r vs time
# Do linear regression of box plot of ppkts' rank vs time
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Plot TODO
#plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
#plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=365))
dates = []
invranks = []
ranks = []
for key, data in rank_date_dict.items():
#rank, date_str = zip(*data_list) # Unpack
# necessary to convert to date object?
#dates = convert_str_to_dates(dates_str) # Not handled in example
#plt.plot(date_str, rank, label=key)
dates.append(dt.datetime.strptime(data[1], '%Y-%m-%d').date())
invranks.append(data[0])
ranks.append(data[0])

#plt.legend()
#plt.plot(dates, invranks, 'xr')
#plt.plot(dates, ranks, 'xr')
#plt.gcf().autofmt_xdate()
#plt.show()

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Correlation coefficient TODO
# Correlation? Not evident from the following:
years_only = []
for i in range(len(dates)):
years_only.append(dates[i].year)

sns.boxplot(x=years_only,y=invranks)
sns.boxplot(x=years_only,y=ranks)
plt.xlabel("Year of HPOA annotation")
plt.ylabel("Rank")
plt.title("LLM performance uncorrelated with date of discovery")
plt.show()
#plt.show()

#years_range = np.array([i for i in range(2009,2025)]) # bins
#year_indices = np.digitize(years_only,years_range)
breakpoint()

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Statistical test, simplest idea: chi2 of contingency table with:
# y<=2009 and y>2009 clmns and found vs not-found counts, one count per ppkt
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Statistical test TODO


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# IC: For each phenpacket, list observed HPOs and compute average IC. Is it correlated with
# success? I.e., start with f/nf, 1/0 on y-axis vs avg(IC) on x-axis

# Import file as dict
ic_file = "data/ic_hpoa.txt"
with open(ic_file) as f:
ic_dict = dict(i.rstrip().split(None, 1) for i in f)


original_ppkt_dir = Path.home() / "data" / "phenopacket-store"
ppkt_ic = {}
missing_in_ic_dict = []
ppkts_with_zero_hpos = []

# Iterate over ppkts, which are json.
for subdir, dirs, files in os.walk(original_ppkt_dir):
# For each ppkt
for filename in files:
if filename.endswith('.json'):
file_path = os.path.join(subdir, filename)
with open(file_path, mode="r", encoding="utf-8") as read_file:
ppkt = json.load(read_file)
ic = 0
num_hpos = 0
# For each HPO
for i in ppkt['phenotypicFeatures']:
try:
if i["excluded"]: # skip excluded
continue
except KeyError:
pass
hpo = i["type"]["id"]
try:
ic += float(ic_dict[hpo])
num_hpos += 1
except KeyError as e:
missing_in_ic_dict.append(e.args[0])
#print(f"No entry for {e}.")

# For now we are fine with average IC
try:
ppkt_ic[ppkt["id"]] = ic/num_hpos
except ZeroDivisionError as e:
ppkts_with_zero_hpos.append(ppkt["id"])
#print(f"No HPOs for {ppkt["id"]}.")

missing_in_ic_dict_unique = set(missing_in_ic_dict)
print(f"\nNumber of HPOs without IC-value is {len(missing_in_ic_dict_unique)}.") # 191
print(f"Number of ppkts with zero observed HPOs is {len(ppkts_with_zero_hpos)}.\n") # 141
breakpoint()
ppkt_ic_df = pd.DataFrame.from_dict(ppkt_ic, orient='index', columns=['avg(IC)'])
ppkt_ic_df['Diagnosed'] = 0

still_missing = []

for ppkt in ppkts:
if any(ppkt[1]["is_correct"]):
ppkt_label = ppkt[0][0:-14]
try:
ppkt_ic_df.loc[ppkt_label,'Diagnosed'] = 1
# somehow this code generates new entries in df. From a code perspective it's bad and
# should be changed, but before, why? Is there some error? TODO
except :
if ppkt_label in ppkts_with_zero_hpos:
continue
else:
still_missing.append(ppkt_label)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Analysis of found vs not-found
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand All @@ -142,11 +220,8 @@
print(f"Number of found diseases by {model} is {len(found_set)}.")
print(f"Number of not found diseases by {model} is {len(notfound_set)}.")
print(f"Found diseases also present in not-found set, by {model} is {len(overlap)}.\n")
# Need some more statistic


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# One Idea
# Look at the 263-129 (gpt-4o) found diseases not present in not-found set ("always found")
# and the opposite namely "never found" diseases. Average date of two sets is?

Expand All @@ -164,7 +239,6 @@
hpoa_df.drop_duplicates(subset='database_id', inplace=True)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

for af in always_found:
try:
results_dict[af] = [True, hpoa_df.loc[hpoa_df['database_id'] == af, 'date'].item() ]
Expand Down
1 change: 0 additions & 1 deletion src/malco/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from malco.post_process.generate_plots import make_plots
import os

@dataclass # necessary if PhevalRunner is already one?
class MalcoRunner(PhEvalRunner):
input_dir: Path
testdata_dir: Path
Expand Down

0 comments on commit 2d2877b

Please sign in to comment.