diff --git a/ibis/backends/sql/compilers/base.py b/ibis/backends/sql/compilers/base.py index a040609e01189..c1e420fc3d9a7 100644 --- a/ibis/backends/sql/compilers/base.py +++ b/ibis/backends/sql/compilers/base.py @@ -6,6 +6,7 @@ import math import operator import string +from collections import deque from functools import partial, reduce from typing import TYPE_CHECKING, Any, ClassVar @@ -22,7 +23,9 @@ LastValue, add_one_to_nth_value_input, add_order_by_to_empty_ranking_window_functions, + describe_to_generic_describe, empty_in_values_right_side, + info_to_generic_info, lower_bucket, lower_capitalize, lower_sample, @@ -238,6 +241,8 @@ class SQLGlotCompiler(abc.ABC): add_order_by_to_empty_ranking_window_functions, one_to_zero_index, add_one_to_nth_value_input, + info_to_generic_info, + describe_to_generic_describe, ) """A sequence of rewrites to apply to the expression tree before compilation.""" @@ -1527,6 +1532,269 @@ def visit_DropColumns(self, op, *, parent, columns_to_drop): ) return sg.select(*columns_to_keep).from_(parent) + def visit_GenericInfo(self, op, *, parent, **_): + quoted = self.quoted + schema = op.parent.schema + + table = sg.to_identifier(parent.alias_or_name, quoted=quoted) + + aggs = deque() + for colname, pos in schema._name_locs.items(): + typ = schema[colname] + + col = sge.column(colname, table=table, quoted=quoted).is_(None) + isna = self.cast(col, dt.int32) + + aggs.append( + sg.select( + sge.convert(colname).as_("name", quoted=quoted), + sge.convert(str(typ)).as_("type", quoted=quoted), + sge.convert(typ.nullable).as_("nullable", quoted=quoted), + self.f.sum(isna).as_("nulls", quoted=quoted), + self.f.sum(1 - isna).as_("non_nulls", quoted=quoted), + self.f.avg(isna).as_("null_frac", quoted=quoted), + sge.convert(pos).as_("pos", quoted=quoted), + ).from_(parent) + ) + + # rebalance aggs, this speeds up sqlglot compilation of huge unions + # significantly + while len(aggs) > 1: + left = aggs.popleft() + right = aggs.popleft() + 
aggs.append(sg.union(left, right, distinct=False)) + + unions = aggs.popleft() + + assert not aggs, "not all unions processed" + + return unions.order_by(sg.column("pos", quoted=quoted).asc()) + + def visit_FastInfo(self, op, *, parent, **_): + names = [] + types = [] + nullables = [] + nullses = [] + non_nullses = [] + null_fracs = [] + poses = [] + quoted = self.quoted + schema = op.parent.schema + + table = sg.to_identifier(parent.alias_or_name, quoted=quoted) + + for colname, pos in schema._name_locs.items(): + typ = schema[colname] + + col = sge.column(colname, table=table, quoted=quoted).is_(None) + isna = self.cast(col, dt.int32) + + names.append(sge.convert(colname)) + types.append(sge.convert(str(typ))) + nullables.append(sge.convert(typ.nullable)) + nullses.append(self.f.sum(isna)) + non_nullses.append(self.f.sum(1 - isna)) + null_fracs.append(self.f.avg(isna)) + poses.append(sge.convert(pos)) + + return ( + sg.select( + self.f.unnest(self.f.array(*names)).as_("name", quoted=quoted), + self.f.unnest(self.f.array(*types)).as_("type", quoted=quoted), + self.f.unnest(self.f.array(*nullables)).as_("nullable", quoted=quoted), + self.f.unnest(self.f.array(*nullses)).as_("nulls", quoted=quoted), + self.f.unnest(self.f.array(*non_nullses)).as_( + "non_nulls", quoted=quoted + ), + self.f.unnest(self.f.array(*null_fracs)).as_( + "null_frac", quoted=quoted + ), + self.f.unnest(self.f.array(*poses)).as_("pos", quoted=quoted), + ) + .from_(parent) + .order_by(sg.column("pos", quoted=quoted).asc()) + ) + + def visit_GenericDescribe(self, op, *, parent, quantile, **_): + quantile = sorted(quantile) + schema = op.parent.schema + quoted = self.quoted + + quantile_keys = tuple( + f"p{100 * q:.6f}".rstrip("0").rstrip(".") for q in quantile + ) + default_quantiles = dict.fromkeys(quantile_keys, NULL) + table = sg.to_identifier(parent.alias_or_name, quoted=quoted) + aggs = deque() + + for colname, pos in schema._name_locs.items(): + col = sge.column(colname, table=table, 
quoted=quoted) + typ = schema[colname] + + # statistics default to NULL + col_mean = col_std = col_min = col_max = col_mode = NULL + quantile_values = default_quantiles.copy() + + if typ.is_numeric(): + col_mean = self.f.avg(col) + col_std = self.f.stddev(col) + col_min = self.f.min(col) + col_max = self.f.max(col) + for key, q in zip(quantile_keys, quantile): + quantile_values[key] = sge.Quantile( + this=col, quantile=sge.convert(q) + ) + + elif typ.is_string(): + col_mode = self.f.mode(col) + elif typ.is_boolean(): + col_mean = self.f.avg(self.f.cast(col, dt.int32)) + else: + # Will not calculate statistics for other types + continue + + aggs.append( + sg.select( + sge.convert(colname).as_("name", quoted=quoted), + sge.convert(pos).as_("pos", quoted=quoted), + sge.convert(str(typ)).as_("type", quoted=quoted), + self.f.count(col).as_("count", quoted=quoted), + self.f.sum(self.cast(col.is_(NULL), dt.int32)).as_( + "nulls", quoted=quoted + ), + self.f.count(sge.Distinct(expressions=[col])).as_( + "unique", quoted=quoted + ), + col_mode.as_("mode", quoted=quoted), + col_mean.as_("mean", quoted=quoted), + col_std.as_("std", quoted=quoted), + col_min.as_("min", quoted=quoted), + *(val.as_(q, quoted=quoted) for q, val in quantile_values.items()), + col_max.as_("max", quoted=quoted), + ).from_(parent) + ) + + # rebalance aggs, this speeds up sqlglot compilation of huge unions + # significantly + while len(aggs) > 1: + left = aggs.popleft() + right = aggs.popleft() + aggs.append(sg.union(left, right, distinct=False)) + + unions = aggs.popleft() + + assert not aggs, "not all unions processed" + + return unions + + def visit_FastDescribe(self, op, *, parent, quantile, **_): + quantile = sorted(quantile) + schema = op.parent.schema + quoted = self.quoted + + names = list(map(sge.convert, schema._name_locs.keys())) + poses = list(map(sge.convert, schema._name_locs.values())) + types = list(map(sge.convert, map(str, schema.values()))) + counts = [] + nulls = [] + uniques = [] 
+ modes = [] + means = [] + stds = [] + mins = [] + quantiles = {} + maxs = [] + + quantile_keys = tuple( + f"p{100 * q:.6f}".rstrip("0").rstrip(".") for q in quantile + ) + default_quantiles = dict.fromkeys(quantile_keys, NULL) + quantiles = {key: [] for key in quantile_keys} + table = sg.to_identifier(parent.alias_or_name, quoted=quoted) + + for colname in schema._name_locs.keys(): + col = sge.column(colname, table=table, quoted=quoted) + typ = schema[colname] + + # statistics default to NULL + col_mean = col_std = col_min = col_max = col_mode = NULL + quantile_values = default_quantiles.copy() + + if typ.is_numeric(): + col_mean = self.f.avg(col) + col_std = self.f.stddev(col) + col_min = self.f.min(col) + col_max = self.f.max(col) + for key, q in zip(quantile_keys, quantile): + quantile_values[key] = sge.Quantile( + this=col, quantile=sge.convert(q) + ) + + elif typ.is_string(): + col_mode = self.f.mode(col) + elif typ.is_boolean(): + col_mean = self.f.avg(self.f.cast(col, dt.int32)) + else: + # Will not calculate statistics for other types + continue + + counts.append(self.f.count(col)) + nulls.append(self.f.sum(self.cast(col.is_(NULL), dt.int32))) + uniques.append(self.f.count(sge.Distinct(expressions=[col]))) + modes.append(col_mode) + means.append(col_mean) + stds.append(col_std) + mins.append(col_min) + + for q, val in quantile_values.items(): + quantiles[q].append(val) + + maxs.append(col_max) + + opschema = op.schema + + return sg.select( + self.f.unnest( + self.cast(self.f.array(*names), dt.Array(opschema["name"])) + ).as_("name", quoted=quoted), + self.f.unnest( + self.cast(self.f.array(*poses), dt.Array(opschema["pos"])) + ).as_("pos", quoted=quoted), + self.f.unnest( + self.cast(self.f.array(*types), dt.Array(opschema["type"])) + ).as_("type", quoted=quoted), + self.f.unnest( + self.cast(self.f.array(*counts), dt.Array(opschema["count"])) + ).as_("count", quoted=quoted), + self.f.unnest( + self.cast(self.f.array(*nulls), dt.Array(opschema["nulls"])) 
+ ).as_("nulls", quoted=quoted), + self.f.unnest( + self.cast(self.f.array(*uniques), dt.Array(opschema["unique"])) + ).as_("unique", quoted=quoted), + self.f.unnest( + self.cast(self.f.array(*modes), dt.Array(opschema["mode"])) + ).as_("mode", quoted=quoted), + self.f.unnest( + self.cast(self.f.array(*means), dt.Array(opschema["mean"])) + ).as_("mean", quoted=quoted), + self.f.unnest( + self.cast(self.f.array(*stds), dt.Array(opschema["std"])) + ).as_("std", quoted=quoted), + self.f.unnest( + self.cast(self.f.array(*mins), dt.Array(opschema["min"])) + ).as_("min", quoted=quoted), + *( + self.f.unnest( + self.cast(self.f.array(*vals), dt.Array(opschema[q])) + ).as_(q, quoted=quoted) + for q, vals in quantiles.items() + ), + self.f.unnest( + self.cast(self.f.array(*maxs), dt.Array(opschema["max"])) + ).as_("max", quoted=quoted), + ).from_(parent) + # `__init_subclass__` is uncalled for subclasses - we manually call it here to # autogenerate the base class implementations as well. diff --git a/ibis/backends/sql/compilers/duckdb.py b/ibis/backends/sql/compilers/duckdb.py index fe8f53e2d3a16..7ab9a35ea7ac3 100644 --- a/ibis/backends/sql/compilers/duckdb.py +++ b/ibis/backends/sql/compilers/duckdb.py @@ -12,7 +12,11 @@ import ibis.expr.operations as ops from ibis.backends.sql.compilers.base import NULL, STAR, AggGen, SQLGlotCompiler from ibis.backends.sql.datatypes import DuckDBType -from ibis.backends.sql.rewrites import exclude_nulls_from_array_collect +from ibis.backends.sql.rewrites import ( + describe_to_fast_describe, + exclude_nulls_from_array_collect, + info_to_fast_info, +) from ibis.util import gen_name _INTERVAL_SUFFIXES = { @@ -37,6 +41,8 @@ class DuckDBCompiler(SQLGlotCompiler): rewrites = ( exclude_nulls_from_array_collect, + info_to_fast_info, + describe_to_fast_describe, *SQLGlotCompiler.rewrites, ) diff --git a/ibis/backends/sql/rewrites.py b/ibis/backends/sql/rewrites.py index 584cf748bbcd5..c99d4040e6ec7 100644 --- a/ibis/backends/sql/rewrites.py +++ 
b/ibis/backends/sql/rewrites.py
@@ -87,6 +87,80 @@ def dtype(self):
         return self.arg.dtype
 
 
+@public
+class GenericInfo(ops.Relation):
+    """Generic info node."""
+
+    parent: ops.Relation
+    schema: Schema
+
+    @attribute
+    def values(self):
+        return {}
+
+
+@public
+class FastInfo(ops.Relation):
+    """Fast info node for backends that support arrays."""
+
+    parent: ops.Relation
+    schema: Schema
+
+    @attribute
+    def values(self):
+        return {}
+
+
+@replace(p.Info)
+def info_to_generic_info(_, **kwargs):
+    """Convert Info node to GenericInfo node."""
+    return GenericInfo(parent=_.parent, schema=_.schema)
+
+
+@replace(p.Info)
+def info_to_fast_info(_, **kwargs):
+    """Convert Info node to FastInfo node."""
+    return FastInfo(parent=_.parent, schema=_.schema)
+
+
+@public
+class GenericDescribe(ops.Relation):
+    """Generic describe node."""
+
+    parent: ops.Relation
+    quantile: VarTuple[float]
+    schema: Schema
+
+    @attribute
+    def values(self):
+        return {}
+
+
+@public
+class FastDescribe(ops.Relation):
+    """Fast describe node for backends that support arrays."""
+
+    parent: ops.Relation
+    quantile: VarTuple[float]
+    schema: Schema
+
+    @attribute
+    def values(self):
+        return {}
+
+
+@replace(p.Describe)
+def describe_to_generic_describe(_, **kwargs):
+    """Convert Describe node to GenericDescribe node."""
+    return GenericDescribe(parent=_.parent, quantile=_.quantile, schema=_.schema)
+
+
+@replace(p.Describe)
+def describe_to_fast_describe(_, **kwargs):
+    """Convert Describe node to FastDescribe node."""
+    return FastDescribe(parent=_.parent, quantile=_.quantile, schema=_.schema)
+
+
 # TODO(kszucs): there is a better strategy to rewrite the relational operations
 # to Select nodes by wrapping the leaf nodes in a Select node and then merging
 # Project, Filter, Sort, etc. incrementally into the Select node.
This way we diff --git a/ibis/expr/operations/relations.py b/ibis/expr/operations/relations.py index 104b5d752c34f..d19ad4120a7cf 100644 --- a/ibis/expr/operations/relations.py +++ b/ibis/expr/operations/relations.py @@ -301,7 +301,7 @@ class Aggregate(Relation): parent: Relation groups: FrozenOrderedDict[str, Unaliased[Value]] - metrics: FrozenOrderedDict[str, Unaliased[Scalar]] + metrics: FrozenOrderedDict[str, Unaliased[Value]] def __init__(self, parent, groups, metrics): _check_integrity(groups.values(), {parent}) @@ -517,4 +517,59 @@ def schema(self): return Schema(base) +@public +class Info(Relation): + parent: Relation + + @attribute + def schema(self) -> Schema: + return Schema( + { + "name": dt.string, + "type": dt.string, + "nullable": dt.boolean, + "nulls": dt.int64, + "non_nulls": dt.int64, + "null_frac": dt.float64, + "pos": dt.int16, + } + ) + + @attribute + def values(self): + return {} + + +@public +class Describe(Relation): + parent: Relation + quantile: VarTuple[float] + + @attribute + def schema(self) -> Schema: + return Schema( + { + "name": dt.string, + "pos": dt.int16, + "type": dt.string, + "count": dt.int64, + "nulls": dt.int64, + "unique": dt.int64, + "mode": dt.string, + "mean": dt.float64, + "std": dt.float64, + "min": dt.float64, + **{ + f"p{100 * q:.6f}".rstrip("0").rstrip("."): dt.float64 + for q in sorted(self.quantile) + }, + "max": dt.float64, + } + ) + + @attribute + def values(self): + return {} + + # TODO(kszucs): support t.select(*t) syntax by implementing Table.__iter__() diff --git a/ibis/expr/types/relations.py b/ibis/expr/types/relations.py index 35da5c2139c61..c328c49c7f5fa 100644 --- a/ibis/expr/types/relations.py +++ b/ibis/expr/types/relations.py @@ -2934,26 +2934,7 @@ def info(self) -> Table: │ year │ int64 │ True │ 0 │ 344 │ 0.000000 │ … │ └───────────────────┴─────────┴──────────┴───────┴───────────┴───────────┴───┘ """ - from ibis import literal as lit - - aggs = [] - - for pos, colname in enumerate(self.columns): - 
col = self[colname] - typ = col.type() - agg = self.select( - isna=ibis.case().when(col.isnull(), 1).else_(0).end() - ).agg( - name=lit(colname), - type=lit(str(typ)), - nullable=lit(typ.nullable), - nulls=lambda t: t.isna.sum(), - non_nulls=lambda t: (1 - t.isna).sum(), - null_frac=lambda t: t.isna.mean(), - pos=lit(pos, type=dt.int16), - ) - aggs.append(agg) - return ibis.union(*aggs).order_by(ibis.asc("pos")) + return ops.Info(self).to_expr() def describe( self, quantile: Sequence[ir.NumericValue | float] = (0.25, 0.5, 0.75) @@ -3021,61 +3002,7 @@ def describe( │ island │ 1 │ string │ 344 │ 0 │ 3 │ Biscoe │ └─────────┴───────┴────────┴───────┴───────┴────────┴────────┘ """ - from ibis import literal as lit - - quantile = sorted(quantile) - aggs = [] - for pos, colname in enumerate(self.columns): - col = self[colname] - typ = col.type() - - # default statistics to None - col_mean = lit(None).cast(float) - col_std = lit(None).cast(float) - col_min = lit(None).cast(float) - col_max = lit(None).cast(float) - col_mode = lit(None).cast(str) - quantile_values = { - f"p{100*q:.6f}".rstrip("0").rstrip("."): lit(None).cast(float) - for q in quantile - } - - if typ.is_numeric(): - col_mean = col.mean() - col_std = col.std() - col_min = col.min().cast(float) - col_max = col.max().cast(float) - quantile_values = { - f"p{100*q:.6f}".rstrip("0").rstrip("."): col.quantile(q).cast(float) - for q in quantile - } - elif typ.is_string(): - col_mode = col.mode() - elif typ.is_boolean(): - col_mean = col.mean() - else: - # Will not calculate statistics for other types - continue - - agg = self.agg( - name=lit(colname), - pos=lit(pos, type=dt.int16), - type=lit(str(typ)), - count=col.isnull().count(), - nulls=col.isnull().sum(), - unique=col.nunique(), - mode=col_mode, - mean=col_mean, - std=col_std, - min=col_min, - **quantile_values, - max=col_max, - ) - aggs.append(agg) - - t = ibis.union(*aggs) - - return t + return ops.Describe(self, quantile=quantile).to_expr() def join( left: 
Table,