test(benchmarks): add info/describe benchmark

ibis-project · Jul 25, 2024 · 1fe7c90 · 1fe7c90
1 parent 7254f65
commit 1fe7c90
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 12 deletions.
diff --git a/ibis/expr/types/relations.py b/ibis/expr/types/relations.py
@@ -3021,13 +3021,10 @@ def describe(
         │ island  │     1 │ string │   344 │     0 │      3 │ Biscoe │
         └─────────┴───────┴────────┴───────┴───────┴────────┴────────┘
         """
-        import ibis.selectors as s
         from ibis import literal as lit
 
         quantile = sorted(quantile)
         aggs = []
-        string_col = False
-        numeric_col = False
         for pos, colname in enumerate(self.columns):
             col = self[colname]
             typ = col.type()
@@ -3044,7 +3041,6 @@ def describe(
             }
 
             if typ.is_numeric():
-                numeric_col = True
                 col_mean = col.mean()
                 col_std = col.std()
                 col_min = col.min().cast(float)
@@ -3054,10 +3050,8 @@ def describe(
                     for q in quantile
                 }
             elif typ.is_string():
-                string_col = True
                 col_mode = col.mode()
             elif typ.is_boolean():
-                numeric_col = True
                 col_mean = col.mean()
             else:
                 # Will not calculate statistics for other types
@@ -3081,12 +3075,6 @@ def describe(
 
         t = ibis.union(*aggs)
 
-        # TODO(jiting): Need a better way to remove columns with all NULL
-        if string_col and not numeric_col:
-            t = t.select(~s.of_type("float"))
-        elif numeric_col and not string_col:
-            t = t.drop("mode")
-
         return t
 
     def join(

diff --git a/ibis/tests/benchmarks/test_benchmarks.py b/ibis/tests/benchmarks/test_benchmarks.py
@@ -976,3 +976,40 @@ def test_selectors(benchmark, cols):
     n = cols - cols // 10
     sel = s.across(s.c(*[f"col{i}" for i in range(n)]), lambda c: c.cast("str"))
     benchmark(sel.expand, t)
+
+
+@pytest.fixture(scope="module")  # module scope: built once, shared by the summarize benchmarks below
+def info_t():  # wide, schema-only table used by the construct/compile benchmarks
+    num_cols = 450  # number of float64 columns in the benchmark table
+    return ibis.table({f"col_{i}": "float64" for i in range(num_cols)})  # schema mapping col_0..col_449 -> "float64"; no data is supplied here
+
+
+@pytest.mark.parametrize("method", ["describe", "info"])  # one benchmark case per summary method
+def test_summarize_construct(benchmark, info_t, method):  # benchmarks building the describe/info expression
+    benchmark(getattr(info_t, method))  # the bound method is passed uncalled, so the call itself is what gets timed
+
+
+@pytest.mark.parametrize("method", ["describe", "info"])  # one benchmark case per summary method
+def test_summarize_compile(benchmark, info_t, method):  # benchmarks SQL generation for the describe/info expression
+    expr = getattr(info_t, method)()  # expression is built up front, outside the timed call
+    benchmark(ibis.to_sql, expr, dialect="duckdb")  # times only ibis.to_sql(expr, dialect="duckdb")
+
+
+@pytest.fixture(scope="module")  # module scope: the random table is generated once per module
+def info_t_with_data():  # in-memory table with real data, used by the execute benchmark
+    import pyarrow as pa  # imported locally; only this fixture uses it
+
+    num_cols = 450  # same column count as the info_t fixture
+    num_rows = 1_500
+    data = pa.Table.from_arrays(
+        np.random.randn(num_rows, num_cols).T,  # transposed to (num_cols, num_rows) so each row is one column's values; unseeded, so data differs between runs
+        names=list(map("col_{}".format, range(num_cols))),  # col_0 .. col_449, matching info_t's column names
+    )
+    return ibis.memtable(data)
+
+
+@pytest.mark.parametrize("method", ["describe", "info"])  # one benchmark case per summary method
+def test_summarize_execute(benchmark, info_t_with_data, method):  # benchmarks executing describe/info on duckdb
+    con = ibis.duckdb.connect()  # no arguments passed; presumably an in-memory database — confirm
+    expr = getattr(info_t_with_data, method)()  # expression is built outside the timed call
+    benchmark(con.execute, expr)  # times only con.execute(expr)