-
Notifications
You must be signed in to change notification settings - Fork 598
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
test: add test for impure function correlation behavior #9014
base: main
Are you sure you want to change the base?
Changes from all commits
ab2ae3b
eab4206
c577d6f
0f30227
674b92b
751f2a1
e15cb02
8339160
73e8e35
e187200
f4c2989
78506b1
117d15b
4180104
de49350
552c2cf
8b51625
e877e0d
055da77
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,221 @@ | ||
from __future__ import annotations | ||
|
||
import sys | ||
|
||
import pytest | ||
|
||
import ibis | ||
import ibis.common.exceptions as com | ||
from ibis import _ | ||
from ibis.backends.tests.errors import Py4JJavaError | ||
|
||
tm = pytest.importorskip("pandas.testing") | ||
|
||
# Keep every test in this module in a single xdist group. Concurrent
# `CREATE OR REPLACE FUNCTION` calls in postgres don't seem to work; with
# `--dist=loadgroup` all tests of one group run in the same process.
pytestmark = pytest.mark.xdist_group("impure")
|
||
# Marks attached to the `ibis.random()` parametrizations: on these backends
# the operation is declared to raise OperationNotDefinedError.
no_randoms = [
    pytest.mark.notimpl(
        ["polars", "druid", "risingwave"],
        raises=com.OperationNotDefinedError,
    )
]
|
||
# Marks attached to the parametrizations that go through the `my_random`
# Python UDF. Backends are always given as lists, matching the other marks in
# this module.
no_udfs = [
    pytest.mark.notyet(["datafusion"], raises=NotImplementedError),
    pytest.mark.notimpl(
        [
            "bigquery",
            "clickhouse",
            "druid",
            "exasol",
            "impala",
            "mssql",
            "mysql",
            "oracle",
            "trino",
            "risingwave",
        ]
    ),
    # Only applies when the test interpreter is 3.11+ (see `condition`).
    pytest.mark.notyet(
        ["flink"],
        condition=sys.version_info >= (3, 11),
        raises=Py4JJavaError,
        reason="Docker image has Python 3.10, results in `cloudpickle` version mismatch",
    ),
]
|
||
# Marks attached to the `ibis.uuid()` parametrizations. Backends are always
# given as lists, matching the other marks in this module.
no_uuids = [
    pytest.mark.notimpl(
        ["druid", "exasol", "oracle", "polars", "pyspark", "risingwave"],
        raises=com.OperationNotDefinedError,
    ),
    pytest.mark.notyet(["mssql"], reason="Unrelated bug: Incorrect syntax near '('"),
]
|
||
|
||
# Scalar Python UDF used as the "udf" impure case in the parametrizations.
# It is registered with side_effects=True, and its argument `x` is never read:
# the return value comes only from `random.random()`.
@ibis.udf.scalar.python(side_effects=True)
def my_random(x: float) -> float:
    # need to make the whole UDF self-contained for postgres to work
    # (hence the import inside the function body rather than at module level)
    import random

    return random.random()  # noqa: S311
|
||
|
||
# Parametrization over the three impure expression builders used by the
# "correlated" tests. Each builder takes the table and returns an expression;
# only the UDF builder actually reads a column from it.
mark_impures = pytest.mark.parametrize(
    "impure",
    [
        pytest.param(lambda _: ibis.random(), marks=no_randoms, id="random"),
        pytest.param(
            # Reduce each uuid to 1/0 depending on whether its string form
            # contains an "a".
            lambda _: ibis.uuid().cast(str).contains("a").ifelse(1, 0),
            marks=[
                *no_uuids,
                pytest.mark.notyet(["impala"], reason="instances are uncorrelated"),
            ],
            id="uuid",
        ),
        pytest.param(
            lambda table: my_random(table.float_col),
            marks=[
                *no_udfs,
                pytest.mark.notyet(["flink"], reason="instances are uncorrelated"),
            ],
            id="udf",
        ),
    ],
)
|
||
|
||
# You can work around this by .cache()ing the table.
@pytest.mark.notyet(["sqlite"], reason="instances are uncorrelated")
@mark_impures
def test_impure_correlated(alltypes, impure):
    # An "impure" expression is random(), uuid(), or some other
    # non-deterministic UDF. If we evaluate it for two different rows in the
    # same relation, we might get different results. This is expected.
    # But as soon as we .select() it into a new relation, that "locks in" the
    # value, and any further references to it will be the same.
    # E.g. if you look at the following SQL:
    #   WITH
    #     t AS (SELECT random() AS common)
    #   SELECT common AS x, common AS y FROM t
    # then both x and y should have the same value.
    expr = alltypes.select(common=impure(alltypes)).select(x=_.common, y=_.common)
    df = expr.execute()
    # Column names differ by construction (x vs y), so only compare values.
    tm.assert_series_equal(df.x, df.y, check_names=False)
|
||
|
||
# You can work around this by .cache()ing the table.
@pytest.mark.notyet(["sqlite"], reason="instances are uncorrelated")
@mark_impures
def test_chained_selections(alltypes, impure):
    # https://github.com/ibis-project/ibis/issues/8921#issue-2234327722
    # This is a slightly more complex version of test_impure_correlated.
    # Consider this SQL:
    #   WITH
    #     t AS (SELECT random() AS num)
    #   SELECT num, num > 0.5 AS isbig FROM t
    # We would expect the values of num and isbig to be consistent,
    # since we "lock in" the value of num by selecting it into t.
    t = alltypes.select(num=impure(alltypes))
    t = t.mutate(isbig=(t.num > 0.5))
    df = t.execute()
    # Recompute the flag locally from the materialized `num` values; keep it
    # as a separate series instead of writing it back into the result frame.
    expected = df.num > 0.5
    tm.assert_series_equal(df.isbig, expected, check_names=False)
|
||
|
||
# Parametrization over the impure expression builders used by the
# "uncorrelated" tests. Same builders as in the correlated tests, but with the
# marks for backends where two instances unexpectedly come out correlated.
impure_params_uncorrelated = pytest.mark.parametrize(
    "impure",
    [
        pytest.param(
            lambda _: ibis.random(),
            marks=[
                *no_randoms,
                pytest.mark.notyet(["impala"], reason="instances are correlated"),
            ],
            id="random",
        ),
        pytest.param(
            # Reduce each uuid to 1/0 (whether its string form contains "a")
            # so that two columns can be compared for inequality.
            lambda _: ibis.uuid().cast(str).contains("a").ifelse(1, 0),
            marks=[
                *no_uuids,
                pytest.mark.notyet(["mysql"], reason="instances are correlated"),
            ],
            id="uuid",
        ),
        pytest.param(
            lambda table: my_random(table.float_col),
            marks=[
                *no_udfs,
                # notimpl rather than notyet: per the PR review, spark itself
                # supports this, ibis just does not pass it through yet
                # (left for a follow-up).
                pytest.mark.notimpl(
                    ["pyspark"],
                    reason='no "impure" argument for pyspark yet',
                ),
            ],
            id="udf",
        ),
    ],
)
|
||
|
||
# You can work around this by doing .select().cache().select()
@pytest.mark.notyet(["clickhouse"], reason="instances are correlated")
@impure_params_uncorrelated
def test_impure_uncorrelated_different_id(alltypes, impure):
    # This is the opposite of test_impure_correlated.
    # If we evaluate an impure expression twice in the same relation,
    # the two results should be uncorrelated.
    # E.g. if you look at the following SQL:
    #   SELECT random() AS x, random() AS y
    # then x and y should be uncorrelated.
    first = impure(alltypes)
    second = impure(alltypes)
    result = alltypes.select(x=first, y=second).execute()
    # At least one row must differ between the two columns.
    differs = result.x != result.y
    assert differs.any()
|
||
|
||
# You can work around this by doing .select().cache().select()
@pytest.mark.notyet(["clickhouse"], reason="instances are correlated")
@impure_params_uncorrelated
def test_impure_uncorrelated_same_id(alltypes, impure):
    # Like test_impure_uncorrelated_different_id, except that one and the same
    # expression object is selected under two names. The resulting columns
    # should still be uncorrelated.
    shared = impure(alltypes)
    result = alltypes.select(x=shared, y=shared).execute()
    # At least one row must differ between the two columns.
    differs = result.x != result.y
    assert differs.any()
|
||
|
||
@pytest.mark.notyet(
    [
        "duckdb",
        "clickhouse",
        "datafusion",
        "mysql",
        "impala",
        "mssql",
        "trino",
        "flink",
        "bigquery",
    ],
    raises=AssertionError,
    reason="instances are not correlated but ideally they would be",
)
@pytest.mark.notyet(
    ["sqlite"],
    raises=AssertionError,
    reason="instances are *sometimes* correlated but ideally they would always be",
    strict=False,
)
@pytest.mark.notimpl(
    ["polars", "risingwave", "druid", "exasol", "oracle", "pyspark"],
    raises=com.OperationNotDefinedError,
)
def test_self_join_with_generated_keys(con):
    # Even with CTEs in the generated SQL, the backends still materialize a
    # new value every time it is referenced. That isn't ideal, but there is
    # nothing we can do about it on the ibis side. The best you can do is to
    # .cache() the table right after you assign the uuid().
    # https://github.com/ibis-project/ibis/pull/9014#issuecomment-2399449665
    keyed = ibis.memtable({"idx": list(range(5))}).mutate(key=ibis.uuid())
    # Rows with idx 0, 1, 2 -- joined back on the generated key, these are the
    # only rows that can match if the keys are stable.
    subset = keyed.filter(keyed.idx < 3)
    joined = keyed.join(subset, "key")
    n_matches = con.execute(joined.count())
    assert n_matches == 3
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,7 +38,6 @@ | |
ops.StringContains: "contains", | ||
ops.StringSQLILike: "ilike", | ||
ops.StringSQLLike: "like", | ||
ops.TimestampNow: "now", | ||
} | ||
|
||
|
||
|
@@ -84,6 +83,11 @@ def translate(op, *args, **kwargs): | |
raise NotImplementedError(op) | ||
|
||
|
||
@translate.register(ops.TimestampNow)
def now(_):
    """Translate a ``TimestampNow`` op into the source text ``ibis.now()``."""
    # NOTE(review): per the PR discussion this explicit handler is needed for
    # the impure-correlation change, but the exact reason was not pinned down.
    # The op itself is ignored; the output is a fixed string.
    return "ibis.now()"
|
||
|
||
@translate.register(ops.Value) | ||
def value(op, *args, **kwargs): | ||
method = _get_method_name(op) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why is this needed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Concurrent execution of `CREATE OR REPLACE FUNCTION` in postgres doesn't seem to work. This ensures that all tests in this module run in the same process as long as `--dist=loadgroup` is passed, which it is.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Makes sense, that seems like something we shouldn't worry about further. Maybe add a comment here? Also fine to not, if someone removes it they will find out from failing tests.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'll add a comment, because I already forgot when revisiting this in review 😅 !