benchmarks
sbordt committed May 22, 2024
1 parent db8db1c commit 2837b07
Showing 28 changed files with 35,842 additions and 5,984 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,4 +1,7 @@
data/
+benchmarks/data
+benchmarks/models
+*.pkl

# Byte-compiled / optimized / DLL files
__pycache__/
122 changes: 122 additions & 0 deletions benchmarks/benchmark/function-recognition-hard.json


202 changes: 202 additions & 0 deletions benchmarks/benchmark/function-recognition.json


402 changes: 402 additions & 0 deletions benchmarks/benchmark/jumps.json


402 changes: 402 additions & 0 deletions benchmarks/benchmark/monotonicity.json


402 changes: 402 additions & 0 deletions benchmarks/benchmark/read-value.json


402 changes: 402 additions & 0 deletions benchmarks/benchmark/wide-confidence.json
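The six benchmark files above are new in this commit, but their diffs are not rendered. A minimal sketch of how one might inspect them, assuming only that each file is an ordinary JSON document (the internal task schema is not shown in this commit):

import json

# Load one of the new benchmark files (path from the listing above);
# its internal structure is an assumption, so inspect it generically.
with open("benchmarks/benchmark/read-value.json") as f:
    benchmark = json.load(f)

if isinstance(benchmark, list):
    print(f"{len(benchmark)} entries; first entry: {benchmark[0]}")
else:
    print(f"top-level keys: {list(benchmark)[:5]}")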


108 changes: 79 additions & 29 deletions benchmarks/utils.py → benchmarks/benchmark_utils.py
@@ -3,9 +3,8 @@

import pandas as pd
import os
import openai
import guidance
import pickle
import numpy as np

from sklearn.model_selection import train_test_split

@@ -16,44 +15,35 @@

RANDOM_STATE = 1498672

-IRIS_SETOSA = (
-    "Iris-setosa"  # binary classification of Iris-setosa vs. all other species
-)
+IRIS = "Iris"
TITANIC = "Titanic"
SPACESHIP_TITANIC = "Spaceship-Titanic"
CALIFORNIA_HOUSING = "California-Housing"
OPENML_DIABETES = "OpenML-Diabetes"
ADULT = "Adult-Income"
KAGGLE_FLOOD = "Kaggle-Flood"
KAGGLE_HEART_FAILURE = "Kaggle-Heart-Failure"
BANK_CHURN = "Kaggle-Bank-Churn"
CANCER = "Wisconsin-Cancer"

DATASETS = [
    CALIFORNIA_HOUSING,
    OPENML_DIABETES,
-    IRIS_SETOSA,
+    IRIS,
    TITANIC,
    SPACESHIP_TITANIC,
    ADULT,
    KAGGLE_FLOOD,
    KAGGLE_HEART_FAILURE,
    BANK_CHURN,
    CANCER,
]


def openai_setup_gpt3_5():
    openai.organization = os.environ["OPENAI_API_ORG"]
    openai.api_key = os.environ["OPENAI_API_KEY"]
    return guidance.llms.OpenAI("gpt-3.5-turbo-0125")


def openai_setup_gpt4():
    openai.organization = os.environ["OPENAI_API_ORG"]
    openai.api_key = os.environ["OPENAI_API_KEY"]
    return guidance.llms.OpenAI("gpt-4o-2024-05-13")


def get_available_datasets():
    return DATASETS


def get_dataset_description(dataset_name):
    """Returns: dataset description (str), dataset y axis description (str)"""
    pass


def get_dataset(dataset_name):
    """
    Loads the dataset with the given name and returns the train and test splits.
@@ -76,11 +66,11 @@ def get_dataset(dataset_name):
        # drop rows with missing values
        df = df.dropna()
        y_data = df["Survived"].values
-        df = df.drop(columns=["PassengerId", "Name", "Survived"])
+        df = df.drop(columns=["PassengerId", "Name", "Survived", "Cabin", "Ticket"])
        X_data = df.values
        feature_names = df.columns.tolist()
-    elif dataset_name == IRIS_SETOSA:
-        df = pd.read_csv("../data/IRIS.csv")
+    elif dataset_name == IRIS:
+        df = pd.read_csv("../data/iris.csv")
        df = df.dropna()
        y_data = df["species"].values == "Iris-setosa"  # binary classification
        df = df.drop(columns=["species"])
@@ -101,6 +91,57 @@ def get_dataset(dataset_name):
        df = df.drop(columns=["Outcome"])
        X_data = df.values
        feature_names = df.columns.tolist()
+    elif dataset_name == ADULT:
+        df = pd.read_csv("../data/adult-train.csv")
+        # drop fnlwgt
+        df = df.drop(columns=["fnlwgt"])
+        y_data = df["Income"].values == " >50K"  # the raw label has a leading space
+        df = df.drop(columns=["Income"])
+        # convert categorical columns to numbers
+        for col in df.columns:
+            if df[col].dtype == "object":
+                df[col] = df[col].astype("category").cat.codes
+        # drop na and inf values
+        df = df.replace([np.inf, -np.inf], np.nan).dropna()
+        X_data = df.values
+        feature_names = df.columns.tolist()
+    elif dataset_name == KAGGLE_FLOOD:
+        df = pd.read_csv("../data/kaggle-flood-train.csv")
+        df = df.dropna()
+        # subsample 10,000 observations
+        df = df.sample(n=10000, random_state=RANDOM_STATE)
+        y_data = df["FloodProbability"].values
+        df = df.drop(columns=["FloodProbability"])
+        X_data = df.values
+        feature_names = df.columns.tolist()
+    elif dataset_name == KAGGLE_HEART_FAILURE:
+        df = pd.read_csv("../data/kaggle_heart_failure_clinical_records.csv")
+        df = df.dropna()
+        y_data = df["DEATH_EVENT"].values
+        df = df.drop(columns=["DEATH_EVENT"])
+        X_data = df.values
+        feature_names = df.columns.tolist()
+    elif dataset_name == BANK_CHURN:
+        df = pd.read_csv("../data/kaggle-bank-churn.csv")
+        df = df.dropna()
+        # drop surname feature
+        df = df.drop(columns=["Surname"])
+        # subsample 10,000 observations
+        df = df.sample(n=10000, random_state=RANDOM_STATE)
+        y_data = df["Exited"].values == 0  # binary classification: True = customer stayed
+        df = df.drop(columns=["Exited"])
+        # drop na and inf values
+        df = df.replace([np.inf, -np.inf], np.nan).dropna()
+        X_data = df.values
+        feature_names = df.columns.tolist()
+    elif dataset_name == CANCER:
+        df = pd.read_csv("../data/Wisconsin-cancer.csv")
+        # drop 'Unnamed: 32'
+        df = df.drop(columns=["Unnamed: 32"])
+        y_data = df["diagnosis"].values == "M"  # malignant vs. benign
+        df = df.drop(columns=["diagnosis"])
+        X_data = df.values
+        feature_names = df.columns.tolist()
    else:
        raise ValueError(f"Unknown dataset: {dataset_name}")

@@ -112,7 +153,7 @@ def get_dataset(dataset_name):

def get_ebm(dataset_name):
    """
-    Returns: the ebm
+    Returns: An EBM trained on the dataset.
    """
    X_train, X_test, y_train, y_test, feature_names = get_dataset(dataset_name)
    model_file = f"../models/{dataset_name}"
@@ -121,7 +162,16 @@ def get_ebm(dataset_name):
        with open(model_file, "rb") as file:
            ebm = pickle.load(file)
    else:  # otherwise train and save
-        if dataset_name in [SPACESHIP_TITANIC, IRIS_SETOSA, TITANIC, OPENML_DIABETES]:
+        if dataset_name in [
+            SPACESHIP_TITANIC,
+            IRIS,
+            TITANIC,
+            OPENML_DIABETES,
+            ADULT,
+            KAGGLE_HEART_FAILURE,
+            BANK_CHURN,
+            CANCER,
+        ]:
            # classification
            ebm = ExplainableBoostingClassifier(
                interactions=0,
@@ -132,7 +182,7 @@ def get_ebm(dataset_name):
            # store for later use
            with open(model_file, "wb") as file:
                pickle.dump(ebm, file)
-        elif dataset_name in [CALIFORNIA_HOUSING]:
+        elif dataset_name in [CALIFORNIA_HOUSING, KAGGLE_FLOOD]:
            # regression
            ebm = ExplainableBoostingRegressor(
                interactions=0,
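Taken together, the renamed benchmark_utils.py exposes a small data-and-model API: constants naming the datasets, get_dataset for the train/test splits, and get_ebm for training or unpickling a cached EBM. A minimal usage sketch of how the pieces appear to fit together (an assumption, not code from this commit; the relative ../data and ../models paths suggest running from inside the benchmarks/ directory):

from benchmark_utils import TITANIC, get_available_datasets, get_dataset, get_ebm

# Enumerate the datasets the benchmark suite knows about.
for name in get_available_datasets():
    X_train, X_test, y_train, y_test, feature_names = get_dataset(name)
    print(name, X_train.shape, len(feature_names))

# Train (or load a previously pickled) EBM for one dataset and evaluate it;
# EBMs follow the scikit-learn estimator API, so .score() is available.
ebm = get_ebm(TITANIC)
X_train, X_test, y_train, y_test, _ = get_dataset(TITANIC)
print("test accuracy:", ebm.score(X_test, y_test))

Because the EBMs are trained with interactions=0, they contain only univariate shape functions, so each feature's contribution can be read and plotted on its own.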