From 1fe7c90b13d773718a7ae050667168be4dbb0a23 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Wed, 24 Jul 2024 07:36:53 -0400 Subject: [PATCH] test(benchmarks): add info/describe benchmark --- ibis/expr/types/relations.py | 12 -------- ibis/tests/benchmarks/test_benchmarks.py | 37 ++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/ibis/expr/types/relations.py b/ibis/expr/types/relations.py index 20b7469a1a177..35da5c2139c61 100644 --- a/ibis/expr/types/relations.py +++ b/ibis/expr/types/relations.py @@ -3021,13 +3021,10 @@ def describe( │ island │ 1 │ string │ 344 │ 0 │ 3 │ Biscoe │ └─────────┴───────┴────────┴───────┴───────┴────────┴────────┘ """ - import ibis.selectors as s from ibis import literal as lit quantile = sorted(quantile) aggs = [] - string_col = False - numeric_col = False for pos, colname in enumerate(self.columns): col = self[colname] typ = col.type() @@ -3044,7 +3041,6 @@ def describe( } if typ.is_numeric(): - numeric_col = True col_mean = col.mean() col_std = col.std() col_min = col.min().cast(float) @@ -3054,10 +3050,8 @@ def describe( for q in quantile } elif typ.is_string(): - string_col = True col_mode = col.mode() elif typ.is_boolean(): - numeric_col = True col_mean = col.mean() else: # Will not calculate statistics for other types @@ -3081,12 +3075,6 @@ def describe( t = ibis.union(*aggs) - # TODO(jiting): Need a better way to remove columns with all NULL - if string_col and not numeric_col: - t = t.select(~s.of_type("float")) - elif numeric_col and not string_col: - t = t.drop("mode") - return t def join( diff --git a/ibis/tests/benchmarks/test_benchmarks.py b/ibis/tests/benchmarks/test_benchmarks.py index 4faf306dd0e89..dcfc1adf8cde8 100644 --- a/ibis/tests/benchmarks/test_benchmarks.py +++ b/ibis/tests/benchmarks/test_benchmarks.py @@ -976,3 +976,40 @@ def test_selectors(benchmark, cols): n = cols - cols // 10 sel = s.across(s.c(*[f"col{i}" for i in range(n)]), lambda c: c.cast("str")) benchmark(sel.expand, t) + + +@pytest.fixture(scope="module") +def info_t(): + num_cols = 450 + return ibis.table({f"col_{i}": "float64" for i in range(num_cols)}) + + +@pytest.mark.parametrize("method", ["describe", "info"]) +def test_summarize_construct(benchmark, info_t, method): + benchmark(getattr(info_t, method)) + + +@pytest.mark.parametrize("method", ["describe", "info"]) +def test_summarize_compile(benchmark, info_t, method): + expr = getattr(info_t, method)() + benchmark(ibis.to_sql, expr, dialect="duckdb") + + +@pytest.fixture(scope="module") +def info_t_with_data(): + import pyarrow as pa + + num_cols = 450 + num_rows = 1_500 + data = pa.Table.from_arrays( + np.random.randn(num_rows, num_cols).T, + names=list(map("col_{}".format, range(num_cols))), + ) + return ibis.memtable(data) + + +@pytest.mark.parametrize("method", ["describe", "info"]) +def test_summarize_execute(benchmark, info_t_with_data, method): + con = ibis.duckdb.connect() + expr = getattr(info_t_with_data, method)() + benchmark(con.execute, expr)