perf(exprs): speed up .info() and .describe()

ibis-project · Jul 24, 2024 · 667e75d · 667e75d
1 parent 8e09e4f
commit 667e75d
Show file tree

Hide file tree

Showing 5 changed files with 407 additions and 77 deletions.
diff --git a/ibis/backends/sql/compilers/base.py b/ibis/backends/sql/compilers/base.py
@@ -6,6 +6,7 @@
 import math
 import operator
 import string
+from collections import deque
 from functools import partial, reduce
 from typing import TYPE_CHECKING, Any, ClassVar
 
@@ -22,7 +23,9 @@
     LastValue,
     add_one_to_nth_value_input,
     add_order_by_to_empty_ranking_window_functions,
+    describe_to_generic_describe,
     empty_in_values_right_side,
+    info_to_generic_info,
     lower_bucket,
     lower_capitalize,
     lower_sample,
@@ -238,6 +241,8 @@ class SQLGlotCompiler(abc.ABC):
         add_order_by_to_empty_ranking_window_functions,
         one_to_zero_index,
         add_one_to_nth_value_input,
+        info_to_generic_info,
+        describe_to_generic_describe,
     )
     """A sequence of rewrites to apply to the expression tree before compilation."""
 
@@ -1527,6 +1532,269 @@ def visit_DropColumns(self, op, *, parent, columns_to_drop):
         )
         return sg.select(*columns_to_keep).from_(parent)
 
+    def visit_GenericInfo(self, op, *, parent, **_):
+        quoted = self.quoted
+        schema = op.parent.schema
+
+        table = sg.to_identifier(parent.alias_or_name, quoted=quoted)
+
+        aggs = deque()
+        for colname, pos in schema._name_locs.items():
+            typ = schema[colname]
+
+            col = sge.column(colname, table=table, quoted=quoted).is_(None)
+            isna = self.cast(col, dt.int32)
+
+            aggs.append(
+                sg.select(
+                    sge.convert(colname).as_("name", quoted=quoted),
+                    sge.convert(str(typ)).as_("type", quoted=quoted),
+                    sge.convert(typ.nullable).as_("nullable", quoted=quoted),
+                    self.f.sum(isna).as_("nulls", quoted=quoted),
+                    self.f.sum(1 - isna).as_("non_nulls", quoted=quoted),
+                    self.f.avg(isna).as_("null_frac", quoted=quoted),
+                    sge.convert(pos).as_("pos", quoted=quoted),
+                ).from_(parent)
+            )
+
+        # rebalance aggs, this speeds up sqlglot compilation of huge unions
+        # significantly
+        while len(aggs) > 1:
+            left = aggs.popleft()
+            right = aggs.popleft()
+            aggs.append(sg.union(left, right, distinct=False))
+
+        unions = aggs.popleft()
+
+        assert not aggs, "not all unions processed"
+
+        return unions.order_by(sg.column("pos", quoted=quoted).asc())
+
+    def visit_FastInfo(self, op, *, parent, **_):
+        names = []
+        types = []
+        nullables = []
+        nullses = []
+        non_nullses = []
+        null_fracs = []
+        poses = []
+        quoted = self.quoted
+        schema = op.parent.schema
+
+        table = sg.to_identifier(parent.alias_or_name, quoted=quoted)
+
+        for colname, pos in schema._name_locs.items():
+            typ = schema[colname]
+
+            col = sge.column(colname, table=table, quoted=quoted).is_(None)
+            isna = self.cast(col, dt.int32)
+
+            names.append(sge.convert(colname))
+            types.append(sge.convert(str(typ)))
+            nullables.append(sge.convert(typ.nullable))
+            nullses.append(self.f.sum(isna))
+            non_nullses.append(self.f.sum(1 - isna))
+            null_fracs.append(self.f.avg(isna))
+            poses.append(sge.convert(pos))
+
+        return (
+            sg.select(
+                self.f.unnest(self.f.array(*names)).as_("name", quoted=quoted),
+                self.f.unnest(self.f.array(*types)).as_("type", quoted=quoted),
+                self.f.unnest(self.f.array(*nullables)).as_("nullable", quoted=quoted),
+                self.f.unnest(self.f.array(*nullses)).as_("nulls", quoted=quoted),
+                self.f.unnest(self.f.array(*non_nullses)).as_(
+                    "non_nulls", quoted=quoted
+                ),
+                self.f.unnest(self.f.array(*null_fracs)).as_(
+                    "null_frac", quoted=quoted
+                ),
+                self.f.unnest(self.f.array(*poses)).as_("pos", quoted=quoted),
+            )
+            .from_(parent)
+            .order_by(sg.column("pos", quoted=quoted).asc())
+        )
+
+    def visit_GenericDescribe(self, op, *, parent, quantile, **_):
+        quantile = sorted(quantile)
+        schema = op.parent.schema
+        quoted = self.quoted
+
+        quantile_keys = tuple(
+            f"p{100 * q:.6f}".rstrip("0").rstrip(".") for q in quantile
+        )
+        default_quantiles = dict.fromkeys(quantile_keys, NULL)
+        table = sg.to_identifier(parent.alias_or_name, quoted=quoted)
+        aggs = deque()
+
+        for colname, pos in schema._name_locs.items():
+            col = sge.column(colname, table=table, quoted=quoted)
+            typ = schema[colname]
+
+            # statistics default to NULL
+            col_mean = col_std = col_min = col_max = col_mode = NULL
+            quantile_values = default_quantiles.copy()
+
+            if typ.is_numeric():
+                col_mean = self.f.avg(col)
+                col_std = self.f.stddev(col)
+                col_min = self.f.min(col)
+                col_max = self.f.max(col)
+                for key, q in zip(quantile_keys, quantile):
+                    quantile_values[key] = sge.Quantile(
+                        this=col, quantile=sge.convert(q)
+                    )
+
+            elif typ.is_string():
+                col_mode = self.f.mode(col)
+            elif typ.is_boolean():
+                col_mean = self.f.avg(self.f.cast(col, dt.int32))
+            else:
+                # Will not calculate statistics for other types
+                continue
+
+            aggs.append(
+                sg.select(
+                    sge.convert(colname).as_("name", quoted=quoted),
+                    sge.convert(pos).as_("pos", quoted=quoted),
+                    sge.convert(str(typ)).as_("type", quoted=quoted),
+                    self.f.count(col).as_("count", quoted=quoted),
+                    self.f.sum(self.cast(col.is_(NULL), dt.int32)).as_(
+                        "nulls", quoted=quoted
+                    ),
+                    self.f.count(sge.Distinct(expressions=[col])).as_(
+                        "unique", quoted=quoted
+                    ),
+                    col_mode.as_("mode", quoted=quoted),
+                    col_mean.as_("mean", quoted=quoted),
+                    col_std.as_("std", quoted=quoted),
+                    col_min.as_("min", quoted=quoted),
+                    *(val.as_(q, quoted=quoted) for q, val in quantile_values.items()),
+                    col_max.as_("max", quoted=quoted),
+                ).from_(parent)
+            )
+
+        # rebalance aggs, this speeds up sqlglot compilation of huge unions
+        # significantly
+        while len(aggs) > 1:
+            left = aggs.popleft()
+            right = aggs.popleft()
+            aggs.append(sg.union(left, right, distinct=False))
+
+        unions = aggs.popleft()
+
+        assert not aggs, "not all unions processed"
+
+        return unions
+
+    def visit_FastDescribe(self, op, *, parent, quantile, **_):
+        quantile = sorted(quantile)
+        schema = op.parent.schema
+        quoted = self.quoted
+
+        names = list(map(sge.convert, schema._name_locs.keys()))
+        poses = list(map(sge.convert, schema._name_locs.values()))
+        types = list(map(sge.convert, map(str, schema.values())))
+        counts = []
+        nulls = []
+        uniques = []
+        modes = []
+        means = []
+        stds = []
+        mins = []
+        quantiles = {}
+        maxs = []
+
+        quantile_keys = tuple(
+            f"p{100 * q:.6f}".rstrip("0").rstrip(".") for q in quantile
+        )
+        default_quantiles = dict.fromkeys(quantile_keys, NULL)
+        quantiles = {key: [] for key in quantile_keys}
+        table = sg.to_identifier(parent.alias_or_name, quoted=quoted)
+
+        for colname in schema._name_locs.keys():
+            col = sge.column(colname, table=table, quoted=quoted)
+            typ = schema[colname]
+
+            # statistics default to NULL
+            col_mean = col_std = col_min = col_max = col_mode = NULL
+            quantile_values = default_quantiles.copy()
+
+            if typ.is_numeric():
+                col_mean = self.f.avg(col)
+                col_std = self.f.stddev(col)
+                col_min = self.f.min(col)
+                col_max = self.f.max(col)
+                for key, q in zip(quantile_keys, quantile):
+                    quantile_values[key] = sge.Quantile(
+                        this=col, quantile=sge.convert(q)
+                    )
+
+            elif typ.is_string():
+                col_mode = self.f.mode(col)
+            elif typ.is_boolean():
+                col_mean = self.f.avg(self.f.cast(col, dt.int32))
+            else:
+                # Will not calculate statistics for other types
+                continue
+
+            counts.append(self.f.count(col))
+            nulls.append(self.f.sum(self.cast(col.is_(NULL), dt.int32)))
+            uniques.append(self.f.count(sge.Distinct(expressions=[col])))
+            modes.append(col_mode)
+            means.append(col_mean)
+            stds.append(col_std)
+            mins.append(col_min)
+
+            for q, val in quantile_values.items():
+                quantiles[q].append(val)
+
+            maxs.append(col_max)
+
+        opschema = op.schema
+
+        return sg.select(
+            self.f.unnest(
+                self.cast(self.f.array(*names), dt.Array(opschema["name"]))
+            ).as_("name", quoted=quoted),
+            self.f.unnest(
+                self.cast(self.f.array(*poses), dt.Array(opschema["pos"]))
+            ).as_("pos", quoted=quoted),
+            self.f.unnest(
+                self.cast(self.f.array(*types), dt.Array(opschema["type"]))
+            ).as_("type", quoted=quoted),
+            self.f.unnest(
+                self.cast(self.f.array(*counts), dt.Array(opschema["count"]))
+            ).as_("count", quoted=quoted),
+            self.f.unnest(
+                self.cast(self.f.array(*nulls), dt.Array(opschema["nulls"]))
+            ).as_("nulls", quoted=quoted),
+            self.f.unnest(
+                self.cast(self.f.array(*uniques), dt.Array(opschema["unique"]))
+            ).as_("unique", quoted=quoted),
+            self.f.unnest(
+                self.cast(self.f.array(*modes), dt.Array(opschema["mode"]))
+            ).as_("mode", quoted=quoted),
+            self.f.unnest(
+                self.cast(self.f.array(*means), dt.Array(opschema["mean"]))
+            ).as_("mean", quoted=quoted),
+            self.f.unnest(
+                self.cast(self.f.array(*stds), dt.Array(opschema["std"]))
+            ).as_("std", quoted=quoted),
+            self.f.unnest(
+                self.cast(self.f.array(*mins), dt.Array(opschema["min"]))
+            ).as_("min", quoted=quoted),
+            *(
+                self.f.unnest(
+                    self.cast(self.f.array(*vals), dt.Array(opschema[q]))
+                ).as_(q, quoted=quoted)
+                for q, vals in quantiles.items()
+            ),
+            self.f.unnest(
+                self.cast(self.f.array(*maxs), dt.Array(opschema["max"]))
+            ).as_("max", quoted=quoted),
+        ).from_(parent)
+
 
 # `__init_subclass__` is uncalled for subclasses - we manually call it here to
 # autogenerate the base class implementations as well.

diff --git a/ibis/backends/sql/compilers/duckdb.py b/ibis/backends/sql/compilers/duckdb.py
@@ -12,7 +12,11 @@
 import ibis.expr.operations as ops
 from ibis.backends.sql.compilers.base import NULL, STAR, AggGen, SQLGlotCompiler
 from ibis.backends.sql.datatypes import DuckDBType
-from ibis.backends.sql.rewrites import exclude_nulls_from_array_collect
+from ibis.backends.sql.rewrites import (
+    describe_to_fast_describe,
+    exclude_nulls_from_array_collect,
+    info_to_fast_info,
+)
 from ibis.util import gen_name
 
 _INTERVAL_SUFFIXES = {
@@ -37,6 +41,8 @@ class DuckDBCompiler(SQLGlotCompiler):
 
     rewrites = (
         exclude_nulls_from_array_collect,
+        info_to_fast_info,
+        describe_to_fast_describe,
         *SQLGlotCompiler.rewrites,
     )
 

diff --git a/ibis/backends/sql/rewrites.py b/ibis/backends/sql/rewrites.py
@@ -87,6 +87,80 @@ def dtype(self):
         return self.arg.dtype
 
 
+@public
+class GenericInfo(ops.Relation):
+    """Generic info node."""
+
+    parent: ops.Relation
+    schema: Schema
+
+    @attribute
+    def values(self):
+        return {}
+
+
+@public
+class FastInfo(ops.Relation):
+    """Fast info node for backends that support arrays."""
+
+    parent: ops.Relation
+    schema: Schema
+
+    @attribute
+    def values(self):
+        return {}
+
+
+@replace(p.Info)
+def info_to_generic_info(_, **kwargs):
+    """Convert Info node to GenericInfo node."""
+    return GenericInfo(parent=_.parent, schema=_.schema)
+
+
+@replace(p.Info)
+def info_to_fast_info(_, **kwargs):
+    """Convert Info node to GenericInfo node."""
+    return FastInfo(parent=_.parent, schema=_.schema)
+
+
+@public
+class GenericDescribe(ops.Relation):
+    """Generic describe node."""
+
+    parent: ops.Relation
+    quantile: VarTuple[float]
+    schema: Schema
+
+    @attribute
+    def values(self):
+        return {}
+
+
+@public
+class FastDescribe(ops.Relation):
+    """Fast describe node for backends that support arrays."""
+
+    parent: ops.Relation
+    quantile: VarTuple[float]
+    schema: Schema
+
+    @attribute
+    def values(self):
+        return {}
+
+
+@replace(p.Describe)
+def describe_to_generic_describe(_, **kwargs):
+    """Convert Info node to GenericInfo node."""
+    return GenericDescribe(parent=_.parent, quantile=_.quantile, schema=_.schema)
+
+
+@replace(p.Describe)
+def describe_to_fast_describe(_, **kwargs):
+    """Convert Info node to GenericInfo node."""
+    return FastDescribe(parent=_.parent, quantile=_.quantile, schema=_.schema)
+
+
 # TODO(kszucs): there is a better strategy to rewrite the relational operations
 # to Select nodes by wrapping the leaf nodes in a Select node and then merging
 # Project, Filter, Sort, etc. incrementally into the Select node. This way we