Skip to content

Commit

Permalink
test(benchmarks): add info/describe benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud committed Jul 25, 2024
1 parent 7254f65 commit 1fe7c90
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 12 deletions.
12 changes: 0 additions & 12 deletions ibis/expr/types/relations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3021,13 +3021,10 @@ def describe(
│ island │ 1 │ string │ 344 │ 0 │ 3 │ Biscoe │
└─────────┴───────┴────────┴───────┴───────┴────────┴────────┘
"""
import ibis.selectors as s
from ibis import literal as lit

quantile = sorted(quantile)
aggs = []
string_col = False
numeric_col = False
for pos, colname in enumerate(self.columns):
col = self[colname]
typ = col.type()
Expand All @@ -3044,7 +3041,6 @@ def describe(
}

if typ.is_numeric():
numeric_col = True
col_mean = col.mean()
col_std = col.std()
col_min = col.min().cast(float)
Expand All @@ -3054,10 +3050,8 @@ def describe(
for q in quantile
}
elif typ.is_string():
string_col = True
col_mode = col.mode()
elif typ.is_boolean():
numeric_col = True
col_mean = col.mean()
else:
# Will not calculate statistics for other types
Expand All @@ -3081,12 +3075,6 @@ def describe(

t = ibis.union(*aggs)

# TODO(jiting): Need a better way to remove columns with all NULL
if string_col and not numeric_col:
t = t.select(~s.of_type("float"))
elif numeric_col and not string_col:
t = t.drop("mode")

return t

def join(
Expand Down
37 changes: 37 additions & 0 deletions ibis/tests/benchmarks/test_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -976,3 +976,40 @@ def test_selectors(benchmark, cols):
n = cols - cols // 10
sel = s.across(s.c(*[f"col{i}" for i in range(n)]), lambda c: c.cast("str"))
benchmark(sel.expand, t)


@pytest.fixture(scope="module")
def info_t():
    """Module-scoped fixture: an ``ibis.table`` with 450 ``float64`` columns.

    Columns are named ``col_0`` through ``col_449``; the table is built from
    a schema mapping only, so no data is attached here.
    """
    column_names = (f"col_{i}" for i in range(450))
    schema = dict.fromkeys(column_names, "float64")
    return ibis.table(schema)


@pytest.mark.parametrize("method", ["describe", "info"])
def test_summarize_construct(benchmark, info_t, method):
    """Benchmark building the ``describe``/``info`` expression for ``info_t``.

    The bound method itself (not its result) is handed to ``benchmark`` so
    that the expression construction is what gets timed.
    """
    summarize = getattr(info_t, method)
    benchmark(summarize)


@pytest.mark.parametrize("method", ["describe", "info"])
def test_summarize_compile(benchmark, info_t, method):
    """Benchmark compiling the ``describe``/``info`` expression to DuckDB SQL.

    The expression is constructed once, outside the timed call; only
    ``ibis.to_sql(..., dialect="duckdb")`` is passed to ``benchmark``.
    """
    summarize = getattr(info_t, method)
    summary = summarize()
    benchmark(ibis.to_sql, summary, dialect="duckdb")


@pytest.fixture(scope="module")
def info_t_with_data():
    """Module-scoped fixture: an ``ibis.memtable`` over random numeric data.

    Generates a ``1_500 x 450`` array with ``np.random.randn`` and wraps its
    transpose in a pyarrow table whose columns are named ``col_0`` ..
    ``col_449``.
    """
    # Imported lazily, mirroring the original fixture.
    import pyarrow as pa

    num_rows, num_cols = 1_500, 450

    # Transpose so that iterating the array yields one sequence per column.
    column_values = np.random.randn(num_rows, num_cols).T
    column_names = [f"col_{i}" for i in range(num_cols)]

    arrow_table = pa.Table.from_arrays(column_values, names=column_names)
    return ibis.memtable(arrow_table)


@pytest.mark.parametrize("method", ["describe", "info"])
def test_summarize_execute(benchmark, info_t_with_data, method):
    """Benchmark executing the ``describe``/``info`` expression on DuckDB.

    A fresh DuckDB connection and the expression are created outside the
    timed call; only ``con.execute(expr)`` is passed to ``benchmark``.
    """
    backend = ibis.duckdb.connect()
    summarize = getattr(info_t_with_data, method)
    summary = summarize()
    benchmark(backend.execute, summary)

0 comments on commit 1fe7c90

Please sign in to comment.