added plot of inverse rank vs date, still messy
leokim-l committed Sep 23, 2024
1 parent 5bb9ab5 commit ec56a24
Showing 1 changed file with 102 additions and 40 deletions.
142 changes: 102 additions & 40 deletions src/malco/analysis/disease_avail_knowledge.py
@@ -1,33 +1,41 @@
# Let us try to parametrize how much is known about the diseases. Beyond eval_diagnose_category (which looks at the MONDO categories), there are two ideas:
# Idea (0): are (number of HPOs present, number of HPOs excluded) correlated with the disease being found?
# (1) HPOA and (2) Monarch KG
# (1) Parse out disease genes discovered after 2008/9 (First thing in HPOA)
# Look for a correlation between date annotated and disease correctly diagnosed.
# Hypothesis: the older the easier to diagnose
# PNR suggests: for each ppkt we have a date
# (2) To start, looking at the two broad categories found/not-found, count the average number of all links.
# After that, count the average number of links of some kind.
# Then, something more graph-based, such as centrality? Maybe we need to project something out first to find signal in the noise...
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This script looks for correlations between the ability of an LLM to
# diagnose the correct disease and certain parameters.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The main point is to use time, namely dates of discovery, as a way to capture how much information
# about a disease is present on the web. This is a proxy for how much an LLM knows about such diseases.
# We use HPOA; we need to parse out disease genes discovered after 2008 or 2009 (the first entries in HPOA).
#
# Then we could look at some IC(prompt) as a second proxy.
#
# Finally, if the two things correlate, can we use them to train a logit or SVM to predict whether
# the LLM will be successful or not?
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

import sys
import pandas as pd
import numpy as np
import datetime as dt

from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) HPOA for dates
# HPOA import and setup
hpoa_file_path = "/Users/leonardo/IdeaProjects/maxodiff/data/phenotype.hpoa"
hpoa_file_path = Path.home() / "data" / "phenotype.hpoa"
hpoa_df = pd.read_csv(
    hpoa_file_path, sep="\t", header=4
)

labels_to_drop = ["disease_name", "qualifier", "hpo_id", "reference", "evidence", "onset", "frequency", "sex", "modifier", "aspect"]
hpoa_df = hpoa_df.drop(columns=labels_to_drop)

hpoa_df['date'] = hpoa_df["biocuration"].str.extract(r'\[(.*?)\]')
hpoa_df = hpoa_df.drop(columns='biocuration')
hpoa_df = hpoa_df[hpoa_df['database_id'].str.startswith("OMIM")]

hpoa_unique = hpoa_df.groupby("database_id").date.min()
# Now length 8251, and e.g. hpoa_unique.loc["OMIM:620662"] -> '2024-04-15'

# import df of results
model = str(sys.argv[1])
@@ -36,17 +44,72 @@
ranking_results_filename, sep="\t"
)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# As some exploratory data analysis, let us divide the set into two and look for
# some properties. Meanwhile, create the dictionary needed for the linear regression.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Go through results data and make set of found vs not found diseases.
found_diseases = []
not_found_diseases = []
rank_date_dict = {} # Maybe dict is not a super good idea
ppkts = rank_results_df.groupby("label")[["term", "correct_term", "is_correct", "rank"]]
for ppkt in ppkts:
    # Is there a True? ppkt is a tuple ("filename", dataframe) --> ppkt[1] is a dataframe.
    disease = ppkt[1].iloc[0]['correct_term']
    if any(ppkt[1]["is_correct"]):
        found_diseases.append(disease)
        index_of_match = ppkt[1]["is_correct"].to_list().index(True)
        try:
            inverse_rank = 1 / ppkt[1].iloc[index_of_match]["rank"]  # np.float64
            rank_date_dict[ppkt[0]] = [inverse_rank.item(),
                                       hpoa_unique.loc[ppkt[1].iloc[0]["correct_term"]]]
        except (ValueError, KeyError) as e:
            print(f"Error {e} for {ppkt[0]}, disease {ppkt[1].iloc[0]['correct_term']}.")
    else:
        not_found_diseases.append(disease)
        try:
            rank_date_dict[ppkt[0]] = [0.0,
                                       hpoa_unique.loc[ppkt[1].iloc[0]["correct_term"]]]
        except (ValueError, KeyError) as e:
            print(f"Error {e} for {ppkt[0]}, disease {ppkt[1].iloc[0]['correct_term']}.")

# gpt-4o output: reasonable enough to throw out 62 cases (~1%); 3 OMIM IDs to check and 3 NaN.
# len(rank_date_dict) --> 6625
# len(ppkts) --> 6687

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Do a linear regression and a plot of the ppkts' 1/rank vs. time
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Scatter plot of inverse rank vs. date (still messy, TODO tidy up)
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=365))
dates = []
invranks = []
for key, data in rank_date_dict.items():
    # data is [inverse_rank, date_string]; convert the string to a date object for plotting.
    dates.append(dt.datetime.strptime(data[1], '%Y-%m-%d').date())
    invranks.append(data[0])

plt.plot(dates, invranks, 'xr')
#plt.legend()
plt.gcf().autofmt_xdate()
plt.show()
breakpoint()  # pause here to inspect dates/invranks interactively
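# A minimal sketch of the linear-regression TODO above, assuming scipy is available and
# reusing the `dates` and `invranks` lists built for the plot. Dates are converted to
# ordinal day numbers so they can be regressed on.
from scipy import stats

reg_x = np.array([d.toordinal() for d in dates], dtype=float)
reg_y = np.array(invranks, dtype=float)
lin_fit = stats.linregress(reg_x, reg_y)
print(f"1/rank vs date: slope per day = {lin_fit.slope:.3e}, r = {lin_fit.rvalue:.3f}, p = {lin_fit.pvalue:.3e}")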
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Correlation coefficient TODO
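# A possible sketch for the correlation coefficient, reusing the scipy.stats import from the
# regression sketch above. Spearman is rank-based, so it is robust to the skewed 1/rank values.
rho, rho_pval = stats.spearmanr([d.toordinal() for d in dates], invranks)
print(f"Spearman rho = {rho:.3f}, p-value = {rho_pval:.3e}")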

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Statistical test TODO
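# A possible sketch for the statistical test, again with scipy.stats: compare the HPOA
# annotation dates of cases the model got right (inverse rank > 0) with those it missed,
# using a Mann-Whitney U test (no normality assumption).
hit_dates = [dt.datetime.strptime(d, '%Y-%m-%d').toordinal() for r, d in rank_date_dict.values() if r > 0]
miss_dates = [dt.datetime.strptime(d, '%Y-%m-%d').toordinal() for r, d in rank_date_dict.values() if r == 0]
mwu = stats.mannwhitneyu(hit_dates, miss_dates, alternative='two-sided')
print(f"Mann-Whitney U = {mwu.statistic:.1f}, p-value = {mwu.pvalue:.3e}")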


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Analysis of found vs not-found
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

found_set = set(found_diseases)
notfound_set = set(not_found_diseases)
@@ -63,13 +126,12 @@
print(f"Found diseases also present in not-found set, by {model} is {len(overlap)}.\n")
# Need some more statistic

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# One idea:
# Look at the 263-129 (gpt-4o) found diseases not present in the not-found set ("always found")
# and the opposite, namely the "never found" diseases. What is the average date of the two sets?

always_found = found_set - notfound_set # 134
never_found = notfound_set - found_set # 213

@@ -78,20 +140,23 @@
found_dict = {}
notfound_dict = {}

# TODO
results_df = pd.DataFrame(columns=["disease", "found", "date"])
# TODO get rid of next line
hpoa_df.drop_duplicates(subset='database_id', inplace=True)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

for af in always_found:
    try:
        results_dict[af] = [True, hpoa_df.loc[hpoa_df['database_id'] == af, 'date'].item()]
        found_dict[af] = hpoa_df.loc[hpoa_df['database_id'] == af, 'date'].item()
        # TODO also populate results_df here
    except ValueError:
        print(f"No HPOA for {af}.")
for nf in never_found:
    try:
        results_dict[nf] = [False, hpoa_df.loc[hpoa_df['database_id'] == nf, 'date'].item()]
        notfound_dict[nf] = hpoa_df.loc[hpoa_df['database_id'] == nf, 'date'].item()
    except ValueError:
        print(f"No HPOA for {nf}.")

@@ -102,7 +167,4 @@
print(final_avg)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (0) Number of phenotypes in each phenopacket
# Try to make a classifier with the number of phenotypes asserted and excluded.
# Another dimension could be the IC present in each ppkt, though measured how?
# Or maybe the 3 above + date as 4 features: 4-dimensional logistic regression or SVM with Gaussian kernel.
# 2-dimensional logistic regression or SVM with Gaussian kernel?
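# A hypothetical sketch of the classifier idea above, assuming scikit-learn is available and
# that a feature matrix X (e.g. columns: n_observed_hpos, n_excluded_hpos, date_ordinal) and a
# binary target y (whether the LLM found the diagnosis) have been assembled per phenopacket.
# X, y, and the column names are placeholders, not something this script builds yet.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


def evaluate_classifiers(X, y):
    # Compare a logistic regression and an RBF-kernel SVM with 5-fold cross-validation.
    candidates = [
        ("logit", make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))),
        ("svm_rbf", make_pipeline(StandardScaler(), SVC(kernel="rbf"))),
    ]
    for name, clf in candidates:
        scores = cross_val_score(clf, X, y, cv=5)
        print(f"{name}: mean accuracy {scores.mean():.3f} +/- {scores.std():.3f}")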
