
make discretization backend agnostic #896

Merged: 21 commits, Sep 20, 2023

Changes from 19 commits

Commits (21)
0194ead
make discretization backend agnostic
divyashreepathihalli Sep 15, 2023
0108d10
Merge branch 'keras-team:main' into tf_discretization
divyashreepathihalli Sep 15, 2023
130dfb3
Merge branch 'keras-team:main' into tf_discretization
divyashreepathihalli Sep 18, 2023
b2652eb
make encode_categorical_inputs backend agnostic
divyashreepathihalli Sep 18, 2023
74a88b7
reformatted
divyashreepathihalli Sep 18, 2023
46795d7
fix import error and ops.identity error
divyashreepathihalli Sep 18, 2023
8d1e276
reformatted
divyashreepathihalli Sep 18, 2023
26e76fb
fix tf_data test failure
divyashreepathihalli Sep 19, 2023
3400330
code reformat
divyashreepathihalli Sep 19, 2023
92dcd7a
undo updating numpy op
divyashreepathihalli Sep 19, 2023
2ff6286
updated encode_categorical_inputs
divyashreepathihalli Sep 19, 2023
1ca4309
code reformat
divyashreepathihalli Sep 19, 2023
84d9a5c
revert changes to tf_utils
divyashreepathihalli Sep 19, 2023
0e1187d
update backend_utils.py
divyashreepathihalli Sep 19, 2023
09ab996
code reformat
divyashreepathihalli Sep 19, 2023
972761c
Merge branch 'keras-team:main' into tf_discretization
divyashreepathihalli Sep 19, 2023
740c4ce
update discretization.py
divyashreepathihalli Sep 19, 2023
85013b5
Merge branch 'keras-team:main' into tf_discretization
divyashreepathihalli Sep 19, 2023
315b10b
update descretization
divyashreepathihalli Sep 19, 2023
d66f135
add back output type test
divyashreepathihalli Sep 20, 2023
fa12ff5
update tests
divyashreepathihalli Sep 20, 2023
45 changes: 45 additions & 0 deletions keras_core/backend/common/backend_utils.py
@@ -255,3 +255,48 @@
else:
output_shape = [input_shape[0], filters] + output_shape
return output_shape


def encode_categorical_inputs(
inputs,
output_mode,
depth,
dtype="float32",
sparse=False,
count_weights=None,
):
"""Encodes categorical inputs according to `output_mode`."""
from keras_core import ops

if output_mode == "int":
return ops.cast(inputs, dtype=dtype)

original_shape = inputs.shape
# In all cases, we should uprank scalar input to a single sample.
if len(ops.shape(inputs)) == 0:
inputs = ops.expand_dims(inputs, -1)
# One hot will uprank only if the final output dimension is not already 1.
if output_mode == "one_hot":
if ops.shape(inputs)[-1] != 1:
inputs = ops.expand_dims(inputs, -1)

if len(ops.shape(inputs)) > 2:
raise ValueError(
"When output_mode is not `'int'`, maximum supported output rank "
f"is 2. Received output_mode {output_mode} and input shape "
f"{original_shape}, "
f"which would result in output rank {inputs.shape.rank}."
)

binary_output = output_mode in ("multi_hot", "one_hot")
bincounts = ops.bincount(
inputs,
weights=count_weights,
minlength=depth,
)
if binary_output:
one_hot_input = ops.one_hot(inputs, depth)
bincounts = ops.where(ops.any(one_hot_input, axis=-2), 1, 0)
bincounts = ops.cast(bincounts, dtype)

return bincounts
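
For reference, here is a minimal sketch of how the new helper's output modes behave, assuming a local checkout of this branch so that `encode_categorical_inputs` is importable from `keras_core.backend.common.backend_utils`; the index values below are made up for illustration.

```python
import numpy as np

from keras_core.backend.common.backend_utils import encode_categorical_inputs

# Hypothetical bin indices, e.g. as produced by the digitize step in
# Discretization. With 3 bin boundaries there are 4 possible bins, so depth=4.
indices = np.array([[0, 1, 1, 3]])

# "int" simply casts the indices to the requested dtype.
as_int = encode_categorical_inputs(indices, output_mode="int", depth=4)

# "multi_hot" marks which bins occurred at least once in each sample.
multi_hot = encode_categorical_inputs(indices, output_mode="multi_hot", depth=4)

# "count" tallies how many values fell into each bin per sample.
counts = encode_categorical_inputs(indices, output_mode="count", depth=4)

# "one_hot" encodes one index per sample, so it takes a 1D batch of indices.
one_hot = encode_categorical_inputs(np.array([0, 3]), output_mode="one_hot", depth=4)
```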
46 changes: 9 additions & 37 deletions keras_core/layers/preprocessing/discretization.py
@@ -1,30 +1,23 @@
import numpy as np

from keras_core import backend
from keras_core import ops
from keras_core.api_export import keras_core_export
from keras_core.layers.layer import Layer
from keras_core.backend.common.backend_utils import encode_categorical_inputs
from keras_core.layers.preprocessing.tf_data_layer import TFDataLayer
from keras_core.utils import argument_validation
from keras_core.utils import backend_utils
from keras_core.utils import tf_utils
from keras_core.utils.module_utils import tensorflow as tf


@keras_core_export("keras_core.layers.Discretization")
class Discretization(Layer):
class Discretization(TFDataLayer):
"""A preprocessing layer which buckets continuous features by ranges.

This layer will place each element of its input data into one of several
contiguous ranges and output an integer index indicating which range each
element was placed in.

**Note:** This layer uses TensorFlow internally. It cannot
be used as part of the compiled computation graph of a model with
any backend other than TensorFlow.
It can however be used with any backend when running eagerly.
It can also always be used as part of an input preprocessing pipeline
with any backend (outside the model itself), which is how we recommend
to use this layer.

**Note:** This layer is safe to use inside a `tf.data` pipeline
(independently of which backend you're using).

@@ -78,14 +71,14 @@ class Discretization(Layer):

Examples:

Bucketize float values based on provided buckets.
Discretize float values based on provided buckets.
>>> input = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]])
>>> layer = Discretization(bin_boundaries=[0., 1., 2.])
>>> layer(input)
array([[0, 2, 3, 1],
[1, 3, 2, 1]])

Bucketize float values based on a number of buckets to compute.
Discretize float values based on a number of buckets to compute.
>>> input = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]])
>>> layer = Discretization(num_bins=4, epsilon=0.01)
>>> layer.adapt(input)
@@ -238,29 +231,12 @@ def load_own_variables(self, store):
return

def call(self, inputs):
if not isinstance(
inputs,
(
tf.Tensor,
tf.SparseTensor,
tf.RaggedTensor,
np.ndarray,
backend.KerasTensor,
),
):
inputs = tf.convert_to_tensor(
backend.convert_to_numpy(inputs), dtype=self.input_dtype
)

from keras_core.backend.tensorflow.numpy import digitize

indices = digitize(inputs, self.bin_boundaries)

outputs = tf_utils.encode_categorical_inputs(
self._convert_input_args = True
indices = ops.digitize(inputs, self.bin_boundaries)
outputs = encode_categorical_inputs(
indices,
output_mode=self.output_mode,
depth=len(self.bin_boundaries) + 1,
sparse=self.sparse,
dtype=self.compute_dtype,
)
if (
@@ -370,7 +346,3 @@ def compress_summary(summary, epsilon):
)
summary = np.stack((new_bins, new_weights))
return summary.astype("float32")


def bucketize(inputs, boundaries):
return tf.raw_ops.Bucketize(input=inputs, boundaries=boundaries)
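
To make the new call path concrete, the following is a rough standalone equivalent of what `Discretization.call` now does for fixed bin boundaries (a sketch only; the real layer also handles dtype selection and its other output options):

```python
import numpy as np

from keras_core import ops
from keras_core.backend.common.backend_utils import encode_categorical_inputs

bin_boundaries = [0.0, 0.5, 1.0]
x = np.array([[-1.0, 0.0, 0.1, 0.8, 1.2]])

# Step 1: map each value to a bin index in 0..len(bin_boundaries).
indices = ops.digitize(x, bin_boundaries)

# Step 2: encode the bin indices according to the layer's output_mode.
# For output_mode="int" this should reproduce the int-mode case in
# test_correctness below: [[0, 1, 1, 2, 3]].
outputs = encode_categorical_inputs(
    indices,
    output_mode="int",
    depth=len(bin_boundaries) + 1,
    dtype="int32",
)
```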
9 changes: 3 additions & 6 deletions keras_core/layers/preprocessing/discretization_test.py
@@ -3,7 +3,6 @@
import numpy as np
from tensorflow import data as tf_data

from keras_core import backend
from keras_core import layers
from keras_core import models
from keras_core import testing
@@ -41,36 +40,34 @@ def test_correctness(self):
bin_boundaries=[0.0, 0.5, 1.0], output_mode="int"
)
output = layer(np.array([[-1.0, 0.0, 0.1, 0.8, 1.2]]))
self.assertTrue(backend.is_tensor(output))
self.assertAllClose(output, np.array([[0, 1, 1, 2, 3]]))

# one_hot mode
layer = layers.Discretization(
bin_boundaries=[0.0, 0.5, 1.0], output_mode="one_hot"
)
output = layer(np.array([0.1, 0.8]))
self.assertTrue(backend.is_tensor(output))
self.assertAllClose(output, np.array([[0, 1, 0, 0], [0, 0, 1, 0]]))

# multi_hot mode
layer = layers.Discretization(
bin_boundaries=[0.0, 0.5, 1.0], output_mode="multi_hot"
)
output = layer(np.array([[0.1, 0.8]]))
self.assertTrue(backend.is_tensor(output))
self.assertAllClose(output, np.array([[0, 1, 1, 0]]))

# count mode
layer = layers.Discretization(
bin_boundaries=[0.0, 0.5, 1.0], output_mode="count"
)
output = layer(np.array([[0.1, 0.8, 0.9]]))
self.assertTrue(backend.is_tensor(output))
self.assertAllClose(output, np.array([[0, 1, 2, 0]]))

def test_tf_data_compatibility(self):
# With fixed bins
layer = layers.Discretization(bin_boundaries=[0.0, 0.35, 0.5, 1.0])
layer = layers.Discretization(
bin_boundaries=[0.0, 0.35, 0.5, 1.0], dtype="float32"
)
x = np.array([[-1.0, 0.0, 0.1, 0.2, 0.4, 0.5, 1.0, 1.2, 0.98]])
self.assertAllClose(layer(x), np.array([[0, 1, 1, 1, 2, 3, 4, 4, 3]]))
ds = tf_data.Dataset.from_tensor_slices(x).batch(1).map(layer)
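
Since the layer is now documented as safe inside a `tf.data` pipeline regardless of backend, a quick local check along the lines of `test_tf_data_compatibility` might look like this (a sketch; the values mirror the fixed-boundary test above):

```python
import numpy as np
from tensorflow import data as tf_data

from keras_core import layers

layer = layers.Discretization(
    bin_boundaries=[0.0, 0.35, 0.5, 1.0], dtype="float32"
)
x = np.array([[-1.0, 0.0, 0.1, 0.2, 0.4, 0.5, 1.0, 1.2, 0.98]])

# Mapping the preprocessing layer over a tf.data pipeline should work
# independently of the selected Keras Core backend.
ds = tf_data.Dataset.from_tensor_slices(x).batch(1).map(layer)
for batch in ds:
    # Expected bins per the fixed-boundary test: [[0, 1, 1, 1, 2, 3, 4, 4, 3]]
    print(batch.numpy())
```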